CoCalc -- hecurve1x.c

Harvard Workshop: Distribution of modular symbols and L-values
code / alex / psage / psage / libs / smalljac / hecurve1x.c
²⁴¹⁸¹⁸ views
1
#include <stdlib.h>
2
#include <stdio.h>
3
#include <math.h>
4
#include <memory.h>
5
#include "gmp.h"
6
#include "mpzutil.h"
7
#include "hecurve.h"
8
#include "ffwrapper.h"
9
#include "ffpoly.h"
10
#include "cstd.h"
11

12
/*
13
    Copyright 2008 Andrew V. Sutherland
14

15
    This file is part of smalljac.
16

17
    smalljac is free software: you can redistribute it and/or modify
18
    it under the terms of the GNU General Public License as published by
19
    the Free Software Foundation, either version 2 of the License, or
20
    (at your option) any later version.
21

22
    smalljac is distributed in the hope that it will be useful,
23
    but WITHOUT ANY WARRANTY; without even the implied warranty of
24
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
25
    GNU General Public License for more details.
26

27
    You should have received a copy of the GNU General Public License
28
    along with smalljac.  If not, see <http://www.gnu.org/licenses/>.
29
*/
30

31
/*
32
	This module implements operations for genus 1 curves, both low level
33
	point operations and functions for exponentiation computing the
34
	group order, group structure, etc...
35
	
36
	These functions are heavily optimized and responsible for a more than a 2x
37
	improvement in genus 1 performance from  smalljac version 3 over version 2.
38
	
39
	We begin with basic point arithmetic functions (mostly inlined), the higher
40
	level functions appear below.  	At the end of the module are a collection of functions
41
	that were implement and test but are not used because they were found to be
42
	sub-optimal in the current application.
43
*/
44

45

46
long hecurve_g1_JC_pp_order (hec_jc_t *a, long p, long q, ff_t f1);
47
int hecurve_g1_dlog (hec_jc_t *a, hec_jc_t *b, long o, ff_t f1);
48
int hecurve_g1_bsgs_search (long *ord, hec_jc_t *b, long low, long high, int m, int a, ff_t f1);
49
long hecurve_g1_fastorder (ff_t x, ff_t y, long k, ff_t f1);
50
int hecurve_g1_test_exponent (long e, ff_t f[4]);
51
void hecurve_g1_p_reduce (hec_jc_t *b1, long *q1, hec_jc_t *b2, long *q2, long p, ff_t f1);
52

53
unsigned long hecurve_expbits;
54
unsigned long hecurve_steps;
55
unsigned long hecurve_retries;
56

57

58
/*
59
	We use a reduced form of the Chudnovsky Jacobian representation (JC) which uses (x,y,z^2,z^3) to represent the affine point (x/z^2,y/z^3), but does not maintain z.
60
	Not computing z saves 1 field multiplication when compsing an affine point with JC point (usually called mixed addition).  Squaring (doubling) costs an extra multiplication
61
	but two fewer additions and the performance is comparable (in fact, when tested on an AMD Athlon 64, JC doubling was slighly faster, not clear why).
62

63
	In terms of the Mumford representation used elsewhere, the point point (x,y,z^2,z^3) is u[1]=z, u[0]=-x/z^2, v[0]=y/x^3. (notice the sign on x)
64
        When z is 1 this corresponds to u(t)=(t-x) and v(t)=y, so that x is a root of u and y=v(x).
65

66
	Negating y inverts an element, as in the affine rep, and 2-torsion elements have y=0.
67
	Any element with z=0 (or z2 for reduced Chudnovsky) is considered the identity, regardless of the value of x and y.
68

69
	As we use multiplicative notation for group operations elsewhere, we generally speak of the elliptic curve group operation multiplicatively,
70
        so we multiply, square, and exponentiate points, rather than adding, doubling, or multiplying by a scalar.
71

72
	In our operation counts we don't distinguish field multiplication and squaring.  For prime fields where p fits in a machine word, they are effectively the same.
73
        We do count additions, as these are not negligible--roughly speaking, 1M is about 2.5A.  Field inversions are horribly expensive relative to Montgomery multiplication,
74
        costing 40M or more (for p~2^30, say).  Whenever possible, we do n field inversions in parallel for a cost of I+3(n-1)M, effectively 3M per inverted element, for n large.
75
*/
76

77
// We trust the compiler to be smart about register allocation and don't hassle with macros (the speed difference appears to be negligible in testing)
78
// It would be nice to have C++ reference data types here...
79

80

81
static inline int hecurve_g1_JC_id (hec_jc_t *p) { return _ff_zero(p->z2); }
82
static inline int hecurve_g1_JC_2tor (hec_jc_t *p) { return _ff_zero(p->y); }
83
static inline void hecurve_g1_JC_invert (hec_jc_t *p) { ff_negate(p->y); }
84

85
static inline int hecurve_g1_JC_cmp (hec_jc_t *p1, hec_jc_t *p2)
86
{
87
	register ff_t t0,t1;
88
	
89
	_ff_mult(t0,p1->x,p2->z2); _ff_mult(t1,p2->x,p1->z2);
90
	if ( ! _ff_equal(t0,t1) ) return 0;
91
	_ff_mult(t0,p1->y,p2->z3); _ff_mult(t1,p2->y,p1->z3);
92
	return ( _ff_equal(t0,t1) ? 1 : -1 );
93
}
94

95
// converts reduced Chudnovsky Jacobian to affine coords, given the inverse of z3, overlap ok
96
static inline void hecurve_g1_JC_to_A (ff_t *px, ff_t *py, hec_jc_t *p, ff_t z3inv)
97
{
98
	register ff_t t0,t1;
99

100
	ff_mult(*py,p->y,z3inv);
101
	_ff_mult(t0,p->z2,z3inv);
102
	_ff_square(t1,t0);
103
	ff_mult(*px,p->x,t1);
104
	// 4M
105
}
106

107
// squares (doubles) an affine point into reduced Chudnovsky Jacobian coords
108
static inline void hecurve_g1_2AJC (hec_jc_t *p, ff_t x, ff_t y, ff_t f1)
109
{
110
	register ff_t t0,t1,t2,a,b;
111

112
	_ff_add(t0,y,y);  _ff_mult(t1,t0,y); _ff_add(p->z2,t1,t1); _ff_mult(a,x,p->z2);		// a=4xy^2, t1=2y^2, z2=4y^2
113
	_ff_mult(p->z3,t0,p->z2);												// z3=8y^3
114
	_ff_square(t1,x); _ff_add(b,t1,t1); _ff_addto(b,t1); _ff_addto(b,f1);				// b = 3x^2+f1*1^4
115
	_ff_square(t0,b); _ff_add(t2,a,a); _ff_sub(p->x,t0,t2);						// x = b^2-2a
116
	_ff_sub(t1,a,p->x); _ff_mult(t2,t1,b); _ff_mult(t0, y,p->z3); _ff_sub(p->y,t2,t0);	// y = b(a-x)-8y^4 (note we use the new x here and new z3=8y^3)
117
	// 7M+9A
118
}
119

120
// squares (doubles) an affine point into reduced Chudnovsky Jacobian coords and also sets az4 to f1*z3^4=16y^4 (cached value for squaring used below)
121
static inline void hecurve_g1_2AJC_cache (hec_jc_t *p, ff_t x, ff_t y, ff_t f1, ff_t *az4)
122
{
123
	register ff_t t0,t1,t2,a,b;
124

125
	_ff_add(t0,y,y);  _ff_mult(t1,t0,y); _ff_add(p->z2,t1,t1); _ff_mult(a,x,p->z2);		// a=4xy^2, t1=2y^2, z2=4y^2
126
	_ff_mult(p->z3,t0,p->z2);												// z3=8y^3
127
	_ff_square(t1,x); _ff_add(b,t1,t1); _ff_addto(b,t1); _ff_addto(b,f1);				// b = 3x^2+f1*1^4
128
	_ff_square(t0,b); _ff_add(t2,a,a); _ff_sub(p->x,t0,t2);						// x = b^2-2a
129
	_ff_sub(t1,a,p->x); _ff_mult(t2,t1,b); _ff_mult(t0, y,p->z3); _ff_sub(p->y,t2,t0);	// y = b(a-x)-8y^4 (note we use the new x here and new z3=8y^3)
130
	_ff_add(*az4,t0,t0); 
131
	// 7M+10A
132
}
133

134
// squares a point p1 in reduced Chudnovsky Jacobian coords (p3 is the output, may be equal to p1)
135
// This code requires one more multiplication than doubling in standard Jacobian coordinates (but two fewer additions, which makes it a close call)
136
// Surprisingly, it is actually slightly faster than doubling in Jacobian coords when tested on an AMD Athlon 64 (YMMV).
137
static inline void hecurve_g1_2JC (hec_jc_t *p3, hec_jc_t *p1, ff_t f1)
138
{
139
	register ff_t a, b, c, t0, t1, t2;
140
	
141
	_ff_square(t0,p1->x); _ff_add(t1,t0,t0); _ff_addto(t1,t0); 					// t1 = 3x^2
142
	_ff_square(t2,p1->z2);  _ff_mult(t0,f1,t2); _ff_add(b,t1,t0);					// b = 3x^2+f1*z^4	(note that f1=a4 in 13.2.1.c  of HECHECC p.282)
143
	_ff_add(c,p1->y,p1->y); _ff_square(t2,c); _ff_mult(a,t2,p1->x);				// a=4xy^2, c=2y, t2=4y^2
144
	_ff_add(t0,a,a); _ff_square(t1,b); _ff_sub(p3->x,t1,t0);						// x = b^2-2a
145
	_ff_mult(p3->z2,p1->z2,t2); _ff_mult(t1,c,t2); _ff_mult(p3->z3,p1->z3,t1);		// z2=4y^2z2, z3=8y^3z3, t1=8y^3
146
	_ff_mult(c,t1,p1->y);												// c = 8y^4
147
	_ff_sub(t0,a,p3->x); _ff_mult(t2,t0,b); _ff_sub(p3->y,t2,c);					// y = b(a-x)-c   -- note we use the new x value here
148
	// 11M+8A
149
}
150

151
// same as above except the parameter az4 contains a_4*z1^4 = f1*z1^4 and is updated to hold a_4*z3^4 (cost 1M less, 1A more)
152
static inline void hecurve_g1_2JC_cache (hec_jc_t *p3, hec_jc_t *p1, ff_t *az4)
153
{
154
	register ff_t a, b, c, t0, t1, t2;
155
	
156
	_ff_square(t0,p1->x); _ff_add(t1,t0,t0); _ff_addto(t1,t0);  _ff_add(b,t1,*az4);	// b = 3x^2+f1*z^4
157
	_ff_add(c,p1->y,p1->y); _ff_square(t2,c); _ff_mult(a,t2,p1->x);				// a=4xy^2, c=2y, t2=4y^2
158
	_ff_add(t0,a,a); _ff_square(t1,b); _ff_sub(p3->x,t1,t0);						// x = b^2-2a
159
	_ff_mult(p3->z2,p1->z2,t2); _ff_mult(t1,c,t2); _ff_mult(p3->z3,p1->z3,t1);		// z2=4y^2z2, z3=8y^3z3, t1=8y^3
160
	_ff_mult(c,t1,p1->y); _ff_add(t0,c,c); ff_mult(*az4,*az4,t0);					// c = 8y^4, az4 = az4*16y^4
161
	_ff_sub(t0,a,p3->x); _ff_mult(t2,t0,b); _ff_sub(p3->y,t2,c);					// y = b(a-x)-c   -- note we use the new x value here
162
	// 10M+9A
163
}
164

165
// multiplies a non-identity point p1 in reduced Chudnovsky Jacobians coords by a (non-identity) affine point and puts result in p
166
// using the reduced Chudnosky form (no z, only z^2 and z^3) saves one field mult (this is faster than any of the alternatives in table 13.3 of HECHECC on p.284)
167
static inline void hecurve_g1_AJC (hec_jc_t *p, hec_jc_t *p1, ff_t x0, ff_t y0, ff_t f1)
168
{
169
	register ff_t a, c, e, f, t0, t1, t2;
170
	
171
	if ( _ff_zero(p1->z2) ) { _ff_set(p->x,x0); _ff_set(p->y,y0); _ff_set_one(p->z2); _ff_set_one(p->z3); return; }
172
	_ff_mult(a,x0,p1->z2);											// a = x0z^2, b = x*1^2=x (z0=1)
173
	_ff_sub(e,p1->x,a);												// e = a-b
174
	_ff_mult(c,y0,p1->z3);											// c = y0z^3, d=y*1^3=y (z0=1)
175
	_ff_sub(f,p1->y,c);												// f = d-c
176
	if ( _ff_zero(e) && _ff_zero(f) ) { hecurve_g1_2AJC (p, x0, y0, f1); return; }	// must use doubling code here, but at least it's an affine double (7M+9A) (inverses are ok)
177
	_ff_square(t0,e); _ff_mult(t1,t0,e); _ff_mult(t2,t0,a);					// e2=e^2, t1=e^3, t2=ae^2
178
	ff_mult(p->z2,p1->z2,t0); ff_mult(p->z3,p1->z3,t1);					// z2 = 1*z2*e^2, z3=1*z3*e^3
179
	_ff_square(t0,f); _ff_sub(p->x,t0,t1); _ff_add(t0,t2,t2); _ff_subfrom(p->x,t0);	// x = f^2-e^3-2ae^2
180
	_ff_sub(t0,t2,p->x); _ff_mult(t2,f,t0); _ff_mult(t0,t1,c); _ff_sub(p->y,t2,t0);	// y = f(ae^2-x) - ce^3
181
	// 10M+6A
182
}
183

184
// multiplies p1 in reduced Chudnovsky Jacobian coords by p2 in reduced Chudnovsky Jacobian coords  and puts the result in p1.
185
// Handles identity and will square (double) if needed.  Cost is 13M+7A, one less than with standard Chudnovsky Jacobian coords.
186
static inline void hecurve_g1_JCJC (hec_jc_t *p1, hec_jc_t *p2, ff_t f1)
187
{
188
	register ff_t a, b, c, d, e, f, t0, t1, t2;
189

190
	// we need to handle identity cases in general (although in some cases we might be able to rule this out)
191
	if ( _ff_zero(p2->z2) ) return;
192
	if ( _ff_zero(p1->z2) ) { *p1=*p2; return; }
193
	_ff_mult(a,p1->x,p2->z2);  _ff_mult(b,p2->x,p1->z2);  _ff_sub(e,b,a);			// a = x1z2^2, b = x2z1^2, e = a-b
194
	_ff_mult(c,p1->y,p2->z3);  _ff_mult(d,p2->y,p1->z3);  _ff_sub(f,d,c);			// c = y1z2^2, d = y2z1^3, f = d-c
195
	if ( _ff_zero(e) && _ff_zero(f) ) { hecurve_g1_2JC (p1,p1,f1); return; }			// must double if pts are equal (11M+8A), inverses will end up with z2=0, so we let them through
196
	_ff_square(t1,e); ff_mult(t0,p1->z2,p2->z2);  _ff_mult(p1->z2,t0,t1);			// z^2 = z1^2z2^2e^2, t1=e^2
197
	_ff_mult(t2,t1,e); ff_mult(t0,p1->z3,p2->z3);  _ff_mult(p1->z3,t0,t2);			// z^2 = z1^2z2^2e^2, t2=e^3
198
	ff_mult(t1,t1,a);													// t1=ae^2
199
	_ff_square(t0,f); _ff_sub(p1->x,t0,t2); _ff_add(t0,t1,t1); _ff_subfrom(p1->x,t0);	// x = f^2-e^3-2ae^2
200
	_ff_sub(t0,t1,p1->x); _ff_mult(t1,f,t0); _ff_mult(t0,t2,c); _ff_sub(p1->y,t1,t0);	// y = f(ae^2-x) - ce^3 -- note we use the new x here
201
	// 13M+7A
202
}
203

204
/*
205
	Computes p=(x,y)^n where (x,y) !=1 is in affine coordinates and p is in Jacobian coordinates
206
	It takes advantage of the fact that all additions are of the form J+A, requiring only 11M, doubling is done in J+J form, using 10M
207
*/
208
void hecurve_g1_AJC_exp_ui (hec_jc_t *p, ff_t x0, ff_t y0, unsigned long n, ff_t f1)
209
{
210
	register unsigned long m;
211
	unsigned long pbits, nbits;
212
	register ff_t negy0;
213
	int i;
214

215
	if ( n == 0 ) { _ff_set_zero(p->z2); return; }
216
	if ( n == 1 ) { _ff_set(p->x,x0); _ff_set(p->y,y0); _ff_set_one(p->z2); _ff_set_one(p->z3);  return; }
217
	hecurve_g1_2AJC(p,x0,y0,f1);
218
	if ( n == 2 ) return;
219
	ui_NAF(&pbits,&nbits,n);
220
	i = _asm_highbit(pbits)-2;						// we know the top two bits of the NAF are 10
221
	_ff_neg(negy0,y0);
222
	m = (1UL<<i);
223
	for ( ; m ; m >>= 1 ) {
224
		hecurve_g1_2JC(p,p,f1);	 					// 11M+8A
225
		if ( m&pbits ) hecurve_g1_AJC(p,p,x0,y0,f1);		// 10M+6A
226
		if ( m&nbits ) hecurve_g1_AJC(p,p,x0,negy0,f1);	// 10M+6A
227
	}
228
}
229

230
/*
231
	Computes p=a^e where a!=1 is in reduced Jacobian Chudnovsky coords, and so is p. overlap is ok.
232
*/
233
void hecurve_g1_JC_exp_ui (hec_jc_t *p, hec_jc_t *a, unsigned long n, ff_t f1)
234
{
235
	register unsigned long m;
236
	unsigned long pbits, nbits;
237
	hec_jc_t t, ai;
238
	int i;
239
	
240
	if ( n == 0 ) { _ff_set_zero(p->z2); return; }
241
	if ( n == 1 ) { *p=*a; return; }
242
	hecurve_g1_2JC(&t,a,f1);							// 11M+8A
243

244
	if ( n == 2 ) { *p = t; return; }
245
	ui_NAF(&pbits,&nbits,n);
246
	i = _asm_highbit(pbits)-2;
247
	ai=*a;  hecurve_g1_JC_invert(&ai);
248
	m = (1UL<<i);
249
	for ( ; m ; m >>= 1 ) {
250
		hecurve_g1_2JC(&t,&t,f1);					// 11M+8A
251
		if ( m&pbits ) hecurve_g1_JCJC(&t,a,f1);			// 13M+7A
252
		if ( m&nbits ) hecurve_g1_JCJC(&t,&ai,f1);		// 13M+7A
253
	}
254
	*p = t;
255
}
256

257
// Computes b=a^e where a and b are both in affine coordinates
258
void hecurve_g1_exp_ui (ff_t u[HECURVE_GENUS+1], ff_t v[HECURVE_GENUS], ff_t u1[HECURVE_GENUS+1], ff_t v1[HECURVE_GENUS], unsigned long n, ff_t f[HECURVE_DEGREE+1])
259
{
260
	ff_t zinv,x0;
261
	hec_jc_t o[1];
262
	
263
	if ( _hecurve_is_identity(u1,v1) ) { _hecurve_set_identity(u,v);  return; }
264
	if ( ! _ff_one(u1[1]) ) { printf ("p=%ld, input to hecurve_g1_exp_ui most be in affine coords!\n",_ff_p); hecurve_print(u,v); exit(0); }
265
	_ff_neg(x0,u1[0]);
266
	hecurve_g1_AJC_exp_ui (o, x0, v1[0], n, f[1]);
267
	if ( hecurve_g1_JC_id(o) ) { _hecurve_set_identity(u,v);  return; }
268
	ff_invert(zinv,o->z3);
269
	hecurve_g1_JC_to_A(&x0,v,o,zinv);
270
	_ff_neg(u[0],x0);
271
	_ff_set_one(u[1]);
272
}
273

274
// Combines precomputed values p[i]=p[0]^(2^i) to compute p[0]^n.  Assumes all required powers are present: NAF potentially requires one more bit!
275
// CAUTION: if this is false, the resulting behavior is very strange and unpredictable.  You have been warned.
276
void hecurve_g1_JC_exp_powers(hec_jc_t *o, hec_jc_t *p, unsigned long n, ff_t f1)
277
{
278
	register unsigned long m;
279
	unsigned long pbits, nbits;
280
	hec_jc_t t;
281
	int i;
282

283
	// handle small n quickly
284
	switch (n) {
285
	case 0:	_ff_set_zero(o->z2); return; 
286
	case 1:	*o=p[0]; return;
287
	case 2:	*o=p[1]; return;
288
	case 3:	*o=p[0]; hecurve_g1_JCJC(o,p+1,f1); return;
289
	case 4:	*o=p[2]; return;
290
	case 5:	*o=p[2]; hecurve_g1_JCJC(o,p,f1); return;
291
	case 6:	*o=p[2]; hecurve_g1_JCJC(o,p+1,f1); return;
292
	case 7:	*o=p[0]; hecurve_g1_JC_invert(o); hecurve_g1_JCJC(o,p+3,f1); return;
293
	case 8:	*o=p[3]; return;
294
	}
295
	ui_NAF(&pbits,&nbits,n);
296
	i = _asm_highbit(pbits);
297
	*o = p[i];
298
	i-=2;
299
	m = (1UL<<i);
300
	for ( ; m ; m >>= 1, i-- ) {
301
		if ( (m&pbits) ) hecurve_g1_JCJC(o,p+i,f1);								// 13M+8A
302
		if ( (m&nbits) ) {													// 13M+10A (slightly more expensive)
303
			hecurve_g1_JC_invert(o); hecurve_g1_JCJC(o,p+i,f1); hecurve_g1_JC_invert(o);
304
		}
305
	}
306
}
307

308
// Sets p[i] = p[0]^(2^i) for i in [0,k]. Input is an affine pt not equal to the identity, output is in reduced Jacobian Chudnovsky coords.
309
void hecurve_g1_AJC_powers (hec_jc_t p[], ff_t x0, ff_t y0, int k, ff_t f1)
310
{
311
	register hec_jc_t *e;
312
	ff_t az4;
313
	
314
	_ff_set(p->x,x0);  _ff_set(p->y,y0);  _ff_set_one(p->z2); _ff_set_one(p->z3);
315
	hecurve_g1_2AJC_cache (p+1, x0, y0, f1, &az4);						// 7M + 10A
316
	for ( e=p+k,p+=2 ; p <= e ; p++ ) hecurve_g1_2JC_cache(p,p-1,&az4);	// 10M + 9A
317
}
318

319
// Sets p[i] = p[0]^(2^i) for i in [1,k]. Input is an pt not equal to the identity, input and output is in reduced Jacobian Chudnovsky coords.
320
void hecurve_g1_JC_powers (hec_jc_t p[], int k, ff_t f1)
321
{
322
	register hec_jc_t *e;
323

324
	// don't bother caching
325
	for ( e=p+k,p++ ; p <= e ; p++ ) hecurve_g1_2JC(p,p-1,f1);		 	// 11M + 8A
326
}
327

328
// Sets s[i] = s[0]*(x0,y0)^i for i < n.  Returns k<n if s[k] is the identity (in which case no further elements are computed), otherwise returns n.
329
int hecurve_g1_AJC_steps_1 (hec_jc_t s[], ff_t x0, ff_t y0, int n, ff_t f1)
330
{
331
	register hec_jc_t *end;
332
	
333
	if ( hecurve_g1_JC_id(s) ) return 0;
334
	end = s+n;
335
	for ( s++ ; s < end ; s++ ) {
336
		hecurve_g1_AJC (s,s-1,x0,y0,f1);								// 10M + 6A
337
		if ( hecurve_g1_JC_id(s) ) break;
338
	}
339
	hecurve_steps += n-(end-s);
340
	return n-(end-s);
341
}
342

343
// Sets s[-i] = s[0]*(x0,-y0)^i for i < n.  Returns k<n if s[k] is the identity (in which case no further elements are computed), otherwise returns n.
344
// Used for downward stepping - NOTE THAT s SHOULD POINT TO THE LAST ENTRY IN THE ARRAY
345
int hecurve_g1_AJC_dsteps_1 (hec_jc_t *s, ff_t x0, ff_t y0, int n, ff_t f1)
346
{
347
	register hec_jc_t *end;
348
	register ff_t negy0;
349
	
350
	if ( hecurve_g1_JC_id(s) ) return 0;
351
	end = s-n;
352
	_ff_neg(negy0,y0);
353
	for ( s-- ; s > end ; s-- ) {
354
		hecurve_g1_AJC (s,s+1,x0,negy0,f1);							// 10M + 6A
355
		if ( hecurve_g1_JC_id(s) ) break;
356
	}
357
	hecurve_steps += n-(s-end);
358
	return n-(s-end);
359
}
360

361
// sets s[2*i+j] = s[0]*(x0,y0)^(i+j)*(x1,y1)^i for 2*i+j < n with j=0 or 1.  Returns k<n if s[k] is the identity (in which case no further elements are computed), otherwise returns n.
362
int hecurve_g1_AJC_steps_2 (hec_jc_t s[], ff_t x0, ff_t y0, ff_t x1, ff_t y1, int n, ff_t f1)
363
{
364
	register hec_jc_t *end;
365
	register int i;
366
	
367
	if ( hecurve_g1_JC_id(s) ) return 0;
368
	end = s+n;
369
	for ( i=1,s++ ; s < end ; i++,s++ ) {
370
		if ( i&1) {
371
			hecurve_g1_AJC (s,s-1,x0,y0,f1);		// 10M + 6A
372
		} else {
373
			hecurve_g1_AJC (s,s-1,x1,y1,f1);		// 10M + 6A
374
		}
375
		if ( hecurve_g1_JC_id(s) ) break;
376
	}
377
	hecurve_steps += n-(end-s);
378
	return n-(end-s);
379
}
380

381
static inline int inv_mod3 (int e) { return ((e%3)==1?1:2); }
382

383
/*
384
	Fast group order computation for elliptic curves over F_p, designed for p<2^40, optimized for p~2^30.
385
	Returns 1 if successful and sets *o to the group order and if pd is non-null, sets *pd to gcd(m,6), where the group structure is Z/mZ x Z/nZ with m dividing n (possibly m=1).
386
	This information can be used to speed up group structure computations.
387

388
	Failure can only happen for p < 229, in which case 0 is returned and *o is a divisor of the group order highly likely equal to the group exponent (*pd is as also set, as above).
389
*/
390
long hecurve_g1_order (long *pd, ff_t f[4])
391
{
392
	long r, d, e, E, min, max, low, high, exp, M;
393
	hec_jc_t t[1];
394
	ff_t g[4],x,y,*h;
395
	int a,i,j,m,twist,s2known,tor3,tor4,flag8;
396

397
	/*
398
		We compute the 3-torsion subgroup first, because this information may lead us to work in the twist.
399
		The function ff_poly_g1_3tor() computes the 3-torsion subgroup of y^2=f(x), but if it is trivial, it will try to
400
		determine the 3-torsion subgroup of the twist, using the factorization pattern of the 3-division polynomial (a quartic).
401
	*/
402
	h = f;  twist = 0;
403
	tor3 = ff_poly_g1_3tor(f);	
404
	if ( tor3<0 ) { ff_poly_twist(g,f,3); h = g; twist = 1; tor3=-tor3; }		// negative return value indicates 3-torsion in the twist (but not in the primary)
405
	d = ( tor3==9 ? 3 : 1);										// d is a known divisor of |G|/lambda(|G|), i.e. it divides the group exponent (lambda(G)) and its square divides |G|.
406

407
	if ( _ff_p < HECURVE_G1_4TOR_MINP ) {							// when p is small, only compute 2-torsion but not 4-torsion (the marginal benefit does not justify the cost)
408
		 i = ff_poly_roots_d3(0,h);
409
		if ( i == 0 ) {											// no roots means there is no 2-torsion and the group order must be odd
410
			s2known = 1;  tor4 = 1;								// the flag s2known indicates we know the entire 2-Sylow subgroup
411
		} else {												// this means that once we factor it out, the remaining exponent is known to be odd
412
			s2known = 0;  tor4 = 2;								// if there is one root, we have 2-torsion Z/2Z, otherwise there are 3 roots and 2-torsion Z/2Z x Z/2Z
413
			if ( i > 1 ) { tor4 = 4;  d *= 2; }			
414
		}		
415
	} else {
416
		s2known = 0;
417
		flag8 = ( _ff_p < HECURVE_G1_8TOR_MINP ? 0 : 1);				// flag8 set if we also want to check whether the 2-Sylow subgroup is a cyclic group containing Z/8Z
418
		d *= ff_poly_g1_4tor(&tor4,h,flag8);						// compute the 4-torsion subgroup (and, optionally, check Z/8Z as well)
419
		if ( flag8 ) {
420
			if ( tor4 <= 4 ) s2known = 1;							// In these cases we know the entire 2-Sylow subgoup
421
		} else {
422
			if ( tor4 < 4 || (tor4==4 && d==2) ) s2known = 1;			// if the 4-torsion subgroup contains no elements order 4, it must be equal to the 2-Sylow subgroup.
423
		}
424
	}
425
	e = tor4*tor3/d;											// e is our known divisor of lambda(|G|)
426
	E = d*e;													// E is our known divisor of |G|
427
	m = (s2known==1?2:1)*(tor3==1?3:1);							// we know that gcd(|G|/tor4,m) = 1, which we can use to speed up the BSGS search
428
	
429
	a = ( tor3==1 && _ff_p1mod3 ? -1 : 0 );							// if neither the curve or its twist has 3-torsion and p=1mod3, we must have a_p=0 and group order 2 mod 3
430
															// For any divisor x of |G| this means |G|/x must be equal to -1/x mod 3 (and also mod 6 if m=6)
431
	r = (long)(2.0*sqrt(_ff_p));
432
	exp = _ff_p+1;
433
	min = exp-r;
434
	max = exp+r;
435
	low = _ui_ceil_ratio(min,E);
436
	high = max/E;
437
	if ( low > high ) { printf ("Error, no multiple of torsion derived e=%ld exists in interval [%ld,%ld] for p=%ld\n", e, min, max, _ff_p); exit(0); }
438
	if ( pd ) { *pd = ( twist && tor3==9 ? d/3 : d );  if ( !((*pd)&3) ) *pd/=2; }	// set *pd to reflect 2-torsion and 3-torsion information in y^2=f(x) (but not the twist)
439

440
//printf("%ld: tor3=%d, tor4=%d, e=%d, d=%d ", _ff_p, tor3, tor4, e, d); ff_poly_print(h,3);
441
	
442
	// Note that for a non-CM curve the probability that hecurve_g1_bsgs succeeds on the first try is close to 1 and grows with p (say 99.9% for p~2^20)
443
	for ( i = 0 ; i < HECURVE_G1_ORDER_RETRIES ; i++ ) {
444
		if ( low==high ) {exp=low; break; }
445
		if ( ! hecurve_random_point(&x,&y,h) ) continue;				// this can fail, e.g.  y^2=x^3+2x+2 over F_3 has no finite rational points
446
//printf ("%ld: e=%d, Random pt (%ld,%ld) on curve ", _ff_p, e, _ff_get_ui(x), _ff_get_ui(y));  ff_poly_print(h,3);
447
		hecurve_g1_AJC_exp_ui (t, x, y, e, h[1]);  if ( hecurve_g1_JC_id(t) ) continue;
448
		// handle order 2 elements here so that bsgs search can assume |t| > 2
449
		if ( hecurve_g1_JC_2tor(t) ) { 
450
			exp = 2;
451
 		//On paper, the code below should speed things up (it uses no inversions and fewer operations for small intervals), but in testing it actually slows things down slightly
452
		//} else if ( high-low < HECURVE_G1_SHORT_INTERVAL ) {
453
		//	if ( hecurve_g1_bsgs_short (&exp, &t, low, high, h[1]) ) break;
454
		} else {
455
			if ( a ) { a = -inv_mod3(E); if ( m==6 && a==-2 ) a = -5; }
456
			if ( hecurve_g1_bsgs_search (&exp, t, low, high, m, a, h[1]) ) break;
457
		}
458
		e *= exp;  E *= exp;
459
		low = _ui_ceil_ratio(min,E);
460
		high = max/E;
461
		// there are lot's of optimizations we could insert here, but none of them apply often enough to be worth doing, so keep it simple.
462
		hecurve_retries++;
463
	}
464
	if ( i < HECURVE_G1_ORDER_RETRIES ) {
465
		E *= exp;
466
		if ( twist ) E = 2*(_ff_p+1)-E;
467
		if ( E < min || E > max ) { printf ("Error, computed order %ld is not in Hasse-Weil interval [%ld,%ld] for p=%ld, h= ", E, min, max, _ff_p); ff_poly_print(h,3); exit(0); }
468
		return E;
469
	}
470
	// For non-CM curves we will almost never reach this point.
471
	
472
	if ( e > 2*r ) { printf("Error, exponent %ld for p=%ld has unique multiple in [%ld,%ld] that was not detected.\n", e, _ff_p, min, max);  exit(0); }
473
	if ( e < sqrt(min) ) { printf ("Error, exponent %ld is impossibly small for p=%ld\n", e, _ff_p); exit(0); }
474
	
475
	/*
476
		With very high probability (exponential in HECURVE_G1_ORDER_RETRIES), e is now the group exponent.   For all but 21 primes (the largest of which is 547)
477
		this uniquely determines the group order (this result is due to Cremona and Harley, or see Washington, "Elliptic Curves", Prop. 4.19).
478
		In fact, taking our knowledge of 2-torsion and 3-torsion into account, the group order is uniquely determined in every case (AVS: to be written up).
479
		Unfortunately, we can't prove that e is definitely the group exponent without computing the group structure.
480
	
481
		Instead, we apply results of Schoof and Mestre (see Washington, Prop. 4.18) which tells us that either the curve or its twist contains an element whose
482
		order has a unique multiple in the Hasse interval provided p > 229.  If one examines the exceptional cases for p<=229, one finds that if we also
483
		consider the information provided by 2-torsion, in every case but two (y^2=x^3+2 over F_13 and F_19) we obtain a known divisor of the group order
484
		with a unique multiple in the Hasse interval, and the two exceptional cases are addressed if we consider 3-torsion. (AVS: to be written up).
485

486
		In the two 3-torsion cases, we will have chosen h to be the twist with 3-torsion and should have succeeded above.  If not we mustn't have computed
487
		the full group exponent and we should try again (by recursively calling ourselves below)
488
	
489
		For the 2-torsion cases, the twist will also have 2-torsion, we just need to use this information when we try to find a unique multiple of the exponent
490
		in the twist below.  If we fail, it must mean we got unlucky and should try again.
491
	
492
		In summary, this yields a Las Vegas algorithm with provably correct output, for all odd p (for p=2 the curve y^2=f(x) is singular).
493
	*/
494
	
495
	m = (d%2?1:2);		// m is a known divisor of the twist group order based on the rank of the 2-torsion subgroup, note that we don't keep track of d anymore and reuse it below
496

497
	for ( M = e*_ui_ceil_ratio(min,e) ; M <= max ; M += e ) { d = M/e;  if ( !(e%d) && !((_ff_p-1)%d) ) break; }
498
	if ( M > max ) { printf ("Error: no multiple M of ambiguous e=%ld satisfies M/e | gcd(e,p-1) (p=%ld).\n", e, _ff_p); exit (0); }
499
	if ( ! twist ) { ff_poly_twist(g,f,3); h = g; } else { h = f; }	
500

501
	do {
502
		r = 2*(_ff_p+1)-M;
503
		// Attempt to prove that r is the order of the twist.  We don't bother trying to be efficient here, we expect to succeed on the first try.
504
		for ( i = 0 ; i < HECURVE_G1_ORDER_RETRIES ; i++ ) {
505
			if ( ! hecurve_random_point(&x,&y,h) ) { printf("hecurve_random_point failed!"); exit (0); }
506
			hecurve_g1_AJC_exp_ui (t, x, y, r, h[1]);
507
			if ( ! hecurve_g1_JC_id(t) ) break;											// can't be the right M
508
			exp = m*hecurve_g1_fastorder(x,y,r,h[1]);									// compute the order of our random point, times our 2-torsion derived info
509
			low = exp*_ui_ceil_ratio(min,exp);											// low is the first multiple of exp >= min
510
			if ( low <= max && low+exp > max ) { return ( twist ? 2*(_ff_p+1)-M : M ); }		// success
511
		}
512
		// try another candidate, if there is one (this can happen for small p)
513
		for ( M+= e; M <= max ; M += e ) { d = M/e;  if ( !(e%d) && !((_ff_p-1)%d) ) break; }			
514
	} while ( M <= max );
515
	// Otherwise, try again -- we must have not gotten the full group exponent (this essentially never happens unless HECURVE_G1_ORDER_RETRIES is set quite low).
516
	return hecurve_g1_order(pd,f);
517
}
518

519
/*
520
	Given the group order N, compute the isomorphism type of the group structure of an elliptic curve, of the form Z/n1Z x Z/n2Z with n1 dividing n2 (and also p-1), possibly n1=1.
521
	The parameter d specifies gcd(6,n1), which can be derived from 2-torsion and 3-torsion info by hecurve_g1_order above (specify 0 if not known).
522
	The rank of the group (1 or 2) is returned, and n[0]=n2 if n1=1, o.w. n[0]=n1 and n[1]=n2.  Note that n2 is the group exponent.
523

524
	It is possible to compute the group invariants (n1 and n2) in probabilistic (Las Vegas) polynomial time using the Weil pairing via Miller's algorithm
525
	(although this doesn't actually determine a basis, i.e. independent elements of order n1 and n2).
526

527
	We take a simpler generic approach which is very fast in the typical case.  We may in the future want to invoke (refinements of) Miller's algorithm for
528
	hard cases (large prime dividing p-1 whose square divides N).  As it stands, the algorithm below has complexity O(q^(1/2)) where q is the largest prime dividing p-1
529
	whose square divides N.  This is O(N^(1/4)) in the worst case, but this rarely happens and in practice computing the group structure takes about 10% the
530
	time it takes to compute the group order (for a non-CM curve).  (It might be interesting to do a performance comparison wth Miller's algorithm at some point)
531
	
532
	We just return the group invariants here, and optimize for computing these as quickly as possible, but the algorithm below can easily be modified to return a basis
533
	without changing the complexity (but the constant factors will be slightly worse).  This is a Las Vegas algorithm, i.e. provably correct output and bounded expected running time.
534
*/
535
int hecurve_g1_group_structure (long n[2], long N, long d, ff_t f[4])
536
{
537
	long m,n1,n2,e;
538
	hec_jc_t b1, b2;
539
	unsigned long p[MAX_UI_PP_FACTORS], q[MAX_UI_PP_FACTORS], h[MAX_UI_PP_FACTORS];
540
	ff_t x,y;
541
	long q0, q1, q2, r;
542
	int i, j, k;
543
	
544
	/*
545
		For each prime q|N, if q does not divide p-1 we know the q-Sylow subgroup is cyclic.  Even when q does divide p-1, if q^2 does not divide N, we
546
		again know the q-Sylow is cyclic (of prime order in this case).  Applying just these 2 facts will often suffice to determine the group structure without
547
		using any group operations, particularly if information in d is also applied (e.g. 2-torsion and 3-torsion info).
548
	*/
549
	k = ui_factor(p,h,ui_gcd(_ff_p-1,N));
550
	for ( i = j = 0 ; i < k ; i++ ) {												// make a list of primes p[i] dividing p-1 whose square divides N
551
		if ( p[i]==2 && d && d%2 ) continue;									// d=gcd(6,n1) not divisible by 2 implies 2-Sylow must be cyclic
552
		if ( p[i]==3 && d && d%3 ) continue;									// d=gcd(6,n1) not divisible by 3 implies 3-Sylow must be cyclic
553
		if ( !(N%(p[i]*p[i])) ) p[j++] = p[i];
554
	}
555
	n1 = 1;  n2 = N;
556
	for ( i = 0 ; i < j ; i++ ) {
557
		for ( q[i] = p[i]*p[i] ; !(N%(q[i]*p[i])) ; q[i] *= p[i] );						// determine power of p[i] dividing N
558
		n2 /= q[i];														// remove q[i] from n2
559
	}
560
//printf("%ld: N=%d ", _ff_p, N); ff_poly_print(f,3);
561
	// Now determine the p[i]-Sylow subgroups for p[i] dividing p-1 and p[i]^2 dividing N (if there are none, we have no work to do).
562
	for ( i = 0 ; i < j ; i++ ) {
563
		if ( d ) for ( q0 = 1 ; ! (d%(p[i]*q0)) ; q0 *= p[i] );	else q0 = 1;				// if d was specified, use d to compute q0, a known divisor of q1 (and q2), possibly 1
564
		e = N/(q[i]*n1);													// the p[i]-Sylow subgroup must lie in the image of the e-power map (we can take out the n1 we know)
565
		for ( q1 = q2 = q0, k=0 ; q1*q2 < q[i] && k < 1000 ; k++ ) {				// bound the number of retries as a safety valve, just in case the group order was invalid
566
			if ( ! hecurve_random_point(&x,&y,f) ) continue;						// get a random finite point
567
			hecurve_g1_AJC_exp_ui (&b1, x, y, e, f[1]);							// get an element of the p[i]-Sylow via the e-power map
568
			r = hecurve_g1_JC_pp_order (&b1,p[i],q[i],f[1]);						// compute its order, which will be a power of p[i]
569
			if ( r <= q1 ) continue;											// if the order of b1 is not strictly larger than q1, it isn't going to help us
570
			if ( q2==q0 ) { b2=b1; q2 = r; continue; }							// first time through we will just set q2
571
			hecurve_g1_p_reduce (&b1, &r, &b2, &q2, p[i], f[1]);					// reduce to a basis for <b1,b2>
572
			q1 = ( r > q0 ? r : q0);											// note that if r<q0, then q1 is not the order of b1, but we don't care
573
		}
574
		if ( k == 1000 ) { printf ("Group structure computation failed on %d-Sylow of size %d with group of order %d on curve over F_%d: ", p[i], q[i], N, _ff_p); ff_poly_print(f,3); exit (0); }
575
		n1 *= q1;  n2 *= q2;
576
	}
577
	if ( n1*n2 != N ) { printf ("bug in hecurve_g1_group_order, %d*%d != %d,  F_%ld curve ", n1, n2, N, _ff_p);  ff_poly_print(f,3); exit(0); }		// sanity check
578
	if ( n1 == 1 ) { n[0] = n2; return 1; }
579
	n[0] = n1;  n[1] = n2;
580
	return 2;
581
}
582

583
/*
584
	Slow Monte Carlo algorithm to test the group exponent, use for debugging/testing only.
585
*/
586
int hecurve_g1_test_exponent (long e, ff_t f[4])
587
{
588
	hec_jc_t t[1];
589
	ff_t x,y;
590
	long m, n;
591
	int i;
592
	
593
	n = 1;
594
	for ( i = 0 ; i < 100 ; i++ ) {
595
		if ( ! hecurve_random_point(&x,&y,f) ) continue;
596
		hecurve_g1_AJC_exp_ui(t,x,y,e,f[1]);
597
		if ( ! hecurve_g1_JC_id(t) ) return 0;
598
		m = hecurve_g1_fastorder (x,y,e,f[1]);
599
		if ( n%m ) n = m*(n/ui_gcd(m,n));
600
	}
601
	return (n==e);
602
}
603

604
/*
605
	Given non-trivial elements b1 and b2 of the p-Sylow subgroup with |b1|=q1 and |b2|=q2  powers of p (p here is a divisor of the group order, not the characteristic of the field),
606
	The function below computes a basis for <b1,b2> and updates b1,b2,q1, and q2 appropriately, with q1 < q2 (possibly q1=1 if <b1,b2> is cyclic).
607

608
	The complexity is O(log_p(q1)*(sqrt(p)+log(q2)).
609
	AVS: the code below is a special case of a new algorithm for group structure computation (a simpler and faster version of Algorithm 9.2 in my thesis which also incorporates
610
	a few ideas from Teske's Pohlig-Hellman paper).  I am in the process of writing a paper which I will eventually reference here.
611
*/
612
void hecurve_g1_p_reduce (hec_jc_t *b1, long *q1, hec_jc_t *b2, long *q2, long p, ff_t f1)
613
{
614
	hec_jc_t t[1],a1[1],a2[1];
615
	long k;
616
	int i;
617
	
618
	if ( *q1 > *q2 ) { t[0] = *b1;  *b1 = *b2; *b2 = t[0];  k = *q1; *q1 = *q2; *q2 = k; }	// swap inputs if needed so that q1 <= q2
619

620
	hecurve_g1_JC_exp_ui(a2,b2,*q2/p,f1);										// a2=b2^(q2/p), <a2> is the subgroup of <b2> of order p
621
	while (*q1>1) {
622
		hecurve_g1_JC_exp_ui(a1,b1,(*q1)/p,f1);									// a1=b1^(q1/p), <a1> is the subgroup of <b1> of order p
623
		k = hecurve_g1_dlog (a2,a1,p,f1);										// compute least k>0 s.t. a1=a2^k, if possible
624
		if ( ! k ) break;													// if we can't, then a1 and a2 are independent, hence so are b1 and b2, and we are done
625
		hecurve_g1_JC_exp_ui(t,b2,(*q2)/(*q1)*k,f1);  hecurve_g1_JC_invert(t);			// compute t=b2^-(q2/q1*k)
626
		hecurve_g1_JCJC (b1,t,f1);											// replace b1 with b1*b2^-(q2/q1*k), this reduces the order of b1 to at most q1/p since
627
																		// (b1*b2^-(q2/q1*k))^(q1/p) = a1*a2^{-k} = 1, and we do not change <b1,b2> by doing this.
628
		k = hecurve_g1_JC_pp_order(b1,p,*q1,f1);								// recompute q1 (it could be less than q1/p)
629
		if ( k >= *q1 ) { printf ("%ld: Error in hecurve_g1_p_reduce(%d,%d) element order %d not reduced\n", _ff_p, *q1, *q2, k); exit (0); }		// sanity check
630
		*q1 = k;
631
	}
632
}
633

634

635
long hecurve_g1_JC_pp_order (hec_jc_t *a, long p, long q, ff_t f1)
636
{
637
	register long r;
638
	hec_jc_t t[1];
639
	
640
	if ( hecurve_g1_JC_id(a) ) return 1;
641
	if ( p==q ) return p;
642
	hecurve_g1_JC_exp_ui (t,a,p,f1);
643
	if ( hecurve_g1_JC_id(t) ) return p;
644
	for ( r = p*p ; r < q ; r *= p ) {
645
		hecurve_g1_JC_exp_ui (t,t,p,f1);
646
		if ( hecurve_g1_JC_id(t) ) return r;
647
	}
648
	return q;
649
}
650

651
/*
652
	Fast and simple small hash table lookup used by hecurve_g1_bsgs_search and also by hecurve_g1_dlog.
653
*/
654

655
#if FF_HALF_WORDS == 1
656
#define BSGS_MAX_STEPS		256					// guaranteed to handle  p<2^31 (single word ff_t), assuming 2 torsion is known, enlarge if needed but it's nice to fit in L1 cache
657
#else
658
#define BSGS_MAX_STEPS		1024				// will BSGS up to 2^40, assuming 2 torsion is known, and nearly as high for dlogs
659
#endif
660
#define BSGS_TABSIZE			BSGS_MAX_STEPS		// don't make this too big, it takes time to initialize it.  A few collisions won't kill us.
661
#define BSGS_TABMASK			((unsigned long)(BSGS_TABSIZE-1))
662

663
static hec_jc_t babys[BSGS_MAX_STEPS];
664
static hec_jc_t giants[BSGS_MAX_STEPS];
665

666
static ff_t stepzs[2*BSGS_MAX_STEPS];
667
#if FF_HALF_WORDS == 1
668
static unsigned char hashtab[BSGS_TABSIZE];
669
#else
670
static unsigned short hashtab[BSGS_TABSIZE];
671
#endif
672
static struct tab_entry {
673
	ff_t x;
674
	short i;
675
	short next;
676
} entries[BSGS_MAX_STEPS+1];
677
short nexttabentry;
678

679
static inline void tab_clear() { memset(hashtab,0,sizeof(hashtab)); nexttabentry = 1; } // don't use entry 0
680

681
// we require inserts to have unique x values, return -1 if unique, otherwise return index value for existing entry
682
static inline int tab_insert(ff_t x, short i)
683
{
684
	register struct tab_entry *p;
685
	register int n,h;
686

687
	h = x&BSGS_TABMASK;
688
	n = hashtab[h];
689
	if ( n ) {
690
		p=entries+n;
691
		for(;;) {
692
			if ( _ff_equal(p->x,x) ) return p->i;
693
			if ( ! p->next ) break;
694
			p=entries+p->next;
695
		}
696
	}
697
	p = entries+nexttabentry;
698
	_ff_set(p->x,x);
699
	p->i = i;
700
	p->next = n;
701
	hashtab[h] = nexttabentry++;
702
	return -1;
703
}
704

705
static inline int tab_lookup(ff_t x)
706
{
707
	register struct tab_entry *p;
708
	register int n;
709

710
	n = hashtab[x&BSGS_TABMASK];
711
	if ( ! n ) return -1;
712
	p=entries+n;
713
	for(;;) {
714
		if ( _ff_equal(p->x,x) ) return p->i;
715
		if ( ! p->next ) return -1;
716
		p=entries+p->next;
717
	}
718
	return -1;
719
}
720

721
/*
722
	Computes the discrete log of b wrt a given the order of a using a BSGS search.  Optimized for small searches (shares table lookup code with hecurve_g1_bsgs_search).
723
	Returns 0 if b is not an element of <a> (this is not assumed).
724

725
	Note that we are only called for prime values whose square divides the group order, so we only use O(N^(1/4)) steps in the worst case, and even this is very rare
726
	since the prime must also divide p-1.
727
*/
728
int hecurve_g1_dlog (hec_jc_t *a, hec_jc_t *b, long o, ff_t f1)
729
{
730
	hec_jc_t c[1];
731
	long bsteps, giant, gstep, gsteps;
732
	ff_t zinv[2], baby_x[1], baby_y[1], giant_x[1], giant_y[1];
733
	ff_t t0, t1;
734
	register int i, j;
735
		
736
	// hard-wire small o cases
737
	if ( hecurve_g1_JC_id(a) ) return o;
738
	switch ( o ) {
739
	case 1:	return 0;
740
	case 2:	return ( hecurve_g1_JC_cmp (a,b) ? 1 : 0 );
741
	case 3: 	i = hecurve_g1_JC_cmp (a,b);  return ( i ? (i>0?1:2) : 0 );
742
	}
743

744
//printf ("dlog(%d) a=(%d,%d,%d,%d) b=(%d,%d,%d,%d)\n", o, _ff_get_ui(a->x), _ff_get_ui(a->y), _ff_get_ui(a->z2), _ff_get_ui(a->z3), _ff_get_ui(b->x), _ff_get_ui(b->y), _ff_get_ui(b->z2), _ff_get_ui(b->z3));
745
	
746
	// For small searches we will just search the whole range so we only match once (using just one inversion).   The only optimization we use is the usual one for inverses.
747
	bsteps = (long)sqrt(o/2);
748
	gstep = 2*bsteps+1;
749
	gsteps = _ui_ceil_ratio(o,gstep);
750
	if ( bsteps > BSGS_MAX_STEPS || gsteps > BSGS_MAX_STEPS ) { printf ("%ld: order %ld caused BSGS_MAX_STEPS to be exceeded in hecurve_g1_dlog\n", _ff_p, o); exit (0); }
751
//printf("bsteps=%d, gsteps=%d, giant step=%d\n", bsteps, gsteps, gstep);
752
	
753
	// Convert baby and giant step to affine coords for stepping
754
	hecurve_g1_JC_exp_ui (c,a,gstep,f1);
755
	hecurve_g1_JC_invert(c);
756
	_ff_set(zinv[0],a->z3);
757
	_ff_set(zinv[1],c->z3);
758
	ff_parallel_invert (zinv, zinv, 2);
759
	hecurve_g1_JC_to_A (baby_x, baby_y, a, zinv[0]);
760
	hecurve_g1_JC_to_A (giant_x, giant_y, c, zinv[1]);
761
//printf ("baby step(a) = (%d,%d)\n", _ff_get_ui(baby_x[0]), _ff_get_ui(baby_y[0]));
762
//printf ("giant step(a^-%d) = (%d,%d)\n", gstep, _ff_get_ui(giant_x[0]), _ff_get_ui(giant_y[0]));
763

764
	// Step
765
	babys[0] = *a;  giants[0] = *b;
766
	j = hecurve_g1_AJC_steps_1 (babys, baby_x[0], baby_y[0], bsteps, f1);							// baby steps
767
	if ( j < bsteps ) { printf ("%ld: baby step hit identity in dlog with o=%ld\n", _ff_p, o); exit(0); }
768
	j = hecurve_g1_AJC_steps_1 (giants, giant_x[0], giant_y[0], gsteps, f1);						// giant steps
769
	if ( j < gsteps ) return j*gstep;
770

771
	// Convert to affine to get unique x coords
772
	for ( i = j = 0 ; i < bsteps ; i++, j++ ) _ff_set(stepzs[j], babys[i].z2);
773
	for ( i = 0 ; i < gsteps ; i++, j++ ) _ff_set(stepzs[j], giants[i].z2);
774
	ff_parallel_invert(stepzs, stepzs, j);																	// we assume j < FF_MAX_PARALLEL_INVERTS here
775
	for ( i = j = 0 ; i < bsteps ; i++,j++ ) ff_mult(babys[i].x, babys[i].x, stepzs[j]);								// only invert x-coord here, we never need to invert y
776
	for ( i = 0 ; i < gsteps ; i++,j++ ) ff_mult(giants[i].x, giants[i].x, stepzs[j]);
777

778
/*
779
printf ("affine baby x's:\n", j);
780
for ( i = 0 ; i < bsteps ; i++ ) printf ("   %d: %ld\n", i+1, _ff_get_ui(babys[i].x));
781
printf ("affine giant x's:\n", j);
782
for ( i = 0 ; i < gsteps ; i++ ) printf ("   %ld: %ld\n", i*gstep, _ff_get_ui(giants[i].x));
783
*/
784
	// Populate the table with babys.  Inserts should never fail (baby steps can't be inverses provided bsteps < o/2)
785
	tab_clear();
786
	for ( i = 0 ; i < bsteps ; i++ ) if ( (j=tab_insert(babys[i].x,i)) >= 0 ) break;
787
	if ( i < bsteps ) { printf("%ld: baby step insert failed in dlog\n", _ff_p); exit (0); }
788

789
	// Now match giant steps
790
	for ( i = 0 ; i < gsteps ; i++ ) {
791
		if ( (j=tab_lookup(giants[i].x)) >= 0 ) {
792
			_ff_mult(t0,babys[j].y,giants[i].z3);	_ff_mult(t1,giants[i].y,babys[j].z3);								// normalize y values for comparison
793
			if ( _ff_equal(t0,t1) ) return i*gstep+j+1;
794
			return (i?i*gstep-j-1:o-j-1);
795
		}
796
	}
797
	return 0;
798
}
799

800

801
/*
802
	Support functions for hecurve_g1_bsgs_search...
803
*/
804

805
double baby_stretch[8] = {0.0,1.0,2.0,1.5,3.0,0.0,3.0,6.0};
806

807
// returns the (j+1)-th integer satisfying property determined by modcase.
808
static inline int baby_index(int j, int modcase)
809
{
810
	register int k;
811
	
812
	k=j&1;
813
	switch (modcase) {
814
	case 1: return j+1;				// all values
815
	case 2: return 2*j+1;				// odd values	(bspan,gsteps and giants even)
816
	case 3: return 3*(j-k)/2+1+k;		// prime to 3    (bspan, gstep, and giants multiples of 3)
817
	case 4: return 3*(j+1);				// 0 mod 3        (bspan and gstep multiples of 3, giants a mod 3)
818
	case 6: return 6*(j-k)/2+1+4*k;		// prime to 6    (bspan, gstep, and giants multiples of 6)
819
	case 7: return 6*(j+1);				// 0 mod 6        (bspan and gstep multiples of 6, giants a mod 6)
820
	}
821
}
822

823
/*
824
	sets exp to the unique multiple of k in [low,high] if there is one (and returns 1) or sets exp=k and returns 0
825
*/
826
static inline int set_exp (long *exp, long k, long low, long high)
827
{
828
	register int i;
829
	i = ui_multiples_in_range(k,low,high);
830
	if ( ! i ) { printf ("no multiple of element order %ld in interval [%ld,%ld]\n", k, low, high); exit(0); }
831
	if ( i == 1 ) { *exp = (high/k)*k;  return 1; } else { *exp = k; return 0; }
832
}
833

834

835
/*
836
	Uses BSGS (and fastorder) to compute the order k of the element b, given that some multiple of k lies in [low,high] (this is assumed, not verified).
837
	The values m and a specify modular constraints on k as follows:
838

839
		m=1		no constraint
840
		m=2		k is odd						(use odd baby steps, even giant steps)
841
		m=3		k is prime to 3					(use baby steps 1,2 mod 3, giant steps 0 mod 3)
842
		m=3 (a!=0)	k = a mod 3					(use baby steps 0 mod 3, giant steps congruent to a mod 3)
843
		m=6		k is prime to 6					(use baby steps 1,5 mod 6, giant steps 0 mod 6)
844
		m=6 (a!=0)	k = a mod 6					(use baby steps 0 mod 6, giant steps congruent to a mod 6)
845

846
	The value modcase = m+(a?1:0) is used internally to uniquely identify these 6 situations.
847
	Note that the giant steps are congruent to a mod m in every case.
848

849
	If the return value is 1, then exp is the unique multiuple of k in [low,high].
850
        If the return value is 0, exp=k is the order of b.  This almost always means there is more than one multiple of k in [low,high], but not necessarily.
851
*/
852
int hecurve_g1_bsgs_search (long *exp, hec_jc_t *b, long low, long high, int m,int a, ff_t f1)
853
{
854
	hec_jc_t p[64];
855
	hec_jc_t baby_steps[3], giant_steps[1];
856
	ff_t zinv[4], baby_x[3], baby_y[3], giant_x[1], giant_y[1];
857
	register ff_t t0, t1, t2;
858
	register int i,j,k,bsteps,gsteps,dsteps,usteps;
859
	int bspan,gstep,modcase;
860
	long o, o1, o2, gbase, range;
861
	
862
//printf ("%d: hecurve_g1_bsgs pt (%ld,%ld,%ld,%ld), low=%d, high=%d, m=%d, a=%d\n", _ff_p, _ff_get_ui(b->x), _ff_get_ui(b->y),_ff_get_ui(b->z2),_ff_get_ui(b->z3), low, high, m, a); 
863
	
864
	range = high-low+1;												// range is inclusive, we assume high>low, so range > 1
865
	bsteps = (long)sqrt((double)(range) / (2.0*baby_stretch[m]));
866
	if ( ! bsteps ) bsteps = 1;
867
	if ( m>2 && !a && (bsteps&1) ) bsteps++;								// make sure bsteps is even if we are covering steps prime to 3 or 6
868
	modcase = m + (a?1:0);
869
	bspan = bsteps * baby_stretch[modcase];								// effective baby coverage depends on modular contraints m and a
870
	if ( m > 1 ) bspan = m * _ui_ceil_ratio(bspan,m);							// make sure bspan is a multiple of modulus m (we can safely round up)
871
	gstep= 2*bspan;													// giant span is effectively 2*bspan+1 due to inverses, but we use 2*bspan so bspan divides gspan...
872
	if ( m==1 ) gstep++;												// ...except when m=1, where we can use 2*bspan+1.
873

874
	/*
875
		Pick first giant step to be a multiple of as large a value of 2 as possible (and still lie in the interval [low,high]).
876
		We then need to adjust to make it congruent to a mod m, but this changes at most 2 bits.
877
		(An optimal approach might search for the value in [low,high] congruent to a mod m with minimal hamming weight, but this would likely take more time than it is worth).
878
		
879
		The overall savings is only a few percent (it cuts the cost of computing the first giant step by 1/6, but computing the first giant step is less than 1/4 the total cost).
880
		However, it's an easy optimization, so there is no reason not to do it.
881
		
882
		This idea was suggested by Dan Bernstein.
883
	*/
884
	k = ui_lg_floor(range);
885
	for (;;) {
886
		o1 = (1L<<(k+1));  o = o1*_ui_ceil_ratio(low,o1);
887
		if ( o > high ) break;
888
		k++;
889
	}
890
	o1 = 1L<<k;
891
	gbase = o1*_ui_ceil_ratio(low,o1);										// gbase is now a multiple of a largish power of 2 in [low,high]
892
	if ( m > 1 ) {
893
		i = (gbase-a)%m;
894
		gbase -= i;													// gbase is now congruent to a mod m
895
		if ( gbase < low ) gbase += m;									// make sure we stay in [low,high]
896
		if ( gbase > high ) gbase -= m;
897
	}
898
	for ( dsteps = (gbase-low)/gstep ; gbase - (dsteps-1)*gstep - bspan > low ; dsteps++ );
899
	while ( gbase-(dsteps-1)*gstep <= 0 ) gbase += m;						// make sure we don't step on zero or negative values
900
	for ( usteps = (high-gbase)/gstep ; gbase + (usteps-1)*gstep + bspan < high ; usteps++ );
901
	if ( dsteps+usteps+1 > BSGS_MAX_STEPS )  { printf ("Exceeded BSGS_MAX_STEPS=%d! p=%ld, low=%ld, high=%ld, m=%d, a=%d\n", BSGS_MAX_STEPS, _ff_p, low, high, m, a);  exit (0); }
902
	gsteps = dsteps+usteps-1;											// dsteps and usteps both include starting step, so total is one less then the sum
903
	
904
//printf ("%d: bsteps=%d, gsteps=%d(%d,%d), bspan=%d, gstep=%d, first giant = %ld\n", _ff_p, bsteps, gsteps, dsteps, usteps, bspan, gstep, gbase);
905
	
906
	// compute 2^k powers of b sufficient to compute b^gstep, and b^gbase
907
	o = _ui_max(gstep,gbase);
908
	k = ui_lg_floor(o);
909
	p[0] = *b;
910
	hecurve_g1_JC_powers (p, k+1, f1);									// add 1 extra power for NAF
911
	baby_steps[0] = *b; k = 1;
912
	
913
	// compute baby step sizes depending on the value of modcase
914
	switch (modcase) {																				// if m=1, fist step=step size=1
915
	case 1: break;
916
	case 2:
917
	case 3: hecurve_g1_2JC (baby_steps+1, baby_steps, f1); k = 2; break;										// first step is 1, steps size is 1 or 1,2
918
	case 4: hecurve_g1_2JC (baby_steps+1,baby_steps,f1);  hecurve_g1_JCJC(baby_steps+1,baby_steps,f1); k = 2;  break;	// first step = step size is 3
919
	case 6: hecurve_g1_2JC (baby_steps+1, baby_steps, f1);											
920
		     hecurve_g1_2JC (baby_steps+2, baby_steps+1, f1); k = 3; break;										// first step is 1, step size is 2 or 4
921
	case 7: hecurve_g1_2JC (baby_steps+1,baby_steps,f1);  hecurve_g1_JCJC(baby_steps+1,baby_steps,f1);
922
		     hecurve_g1_2JC(baby_steps+1,baby_steps+1,f1); k = 2;  break;										// first step = step size is 6
923
	default:
924
		printf("Invalid modcase value %d in hecure_g1_bsgs\n", m); exit (0);
925
	}
926

927
	hecurve_g1_JC_exp_powers(giant_steps, p, gstep, f1);
928
	hecurve_g1_JC_exp_powers(giants+dsteps-1, p, gbase, f1);
929
	
930
	// If any of our baby steps is the identity, we won't get very far, so we need to check this situation (it can happen).   The giant steps are handled below.
931
	if ( k > 1 && hecurve_g1_JC_id(baby_steps+1) ) return set_exp(exp,(a?m:2),low,high);
932
	if ( k > 2 && hecurve_g1_JC_id(baby_steps+2) ) return set_exp(exp,4,low,high);
933
	
934
	// We need to convert the steps to affine coords before using them--do this with one field inversion.
935
	for ( i = 0 ; i < k ; i++ ) _ff_set(zinv[i], baby_steps[i].z3);
936
	j = k;
937
	if ( ! hecurve_g1_JC_id(giant_steps) ) { _ff_set(zinv[j], giant_steps[0].z3); j++; }						 			// giant step could be the identity
938
	// invert all the z's
939
	ff_parallel_invert (zinv, zinv, j);
940
	for ( i = 0 ; i < k ; i++ ) hecurve_g1_JC_to_A (baby_x+i, baby_y+i, baby_steps+i, zinv[i]);
941
	j=k;
942
	if ( ! hecurve_g1_JC_id(giant_steps) ) { hecurve_g1_JC_to_A (giant_x, giant_y, giant_steps, zinv[j]); j++; }
943

944
	// Now handle giant step = identity (we needed to get the baby in affine coords for the fastorder computation first)
945
	if ( hecurve_g1_JC_id(giant_steps) ) { o = hecurve_g1_fastorder (baby_x[0], baby_y[0], gstep, f1); return set_exp(exp,o,low,high); }
946
	if ( hecurve_g1_JC_id(giants+dsteps-1) ) { o = hecurve_g1_fastorder (baby_x[0], baby_y[0], gbase, f1); return set_exp(exp,o,low,high); }
947

948
/*
949
printf("affine baby step 0 (%ld,%ld)\n", _ff_get_ui(baby_x[0]),_ff_get_ui(baby_y[0]));
950
if (m>1) printf("affine baby step 1 (%ld,%ld)\n", _ff_get_ui(baby_x[1]),_ff_get_ui(baby_y[1]));
951
if (m==6) printf("affine baby step 2 (%ld,%ld)\n", _ff_get_ui(baby_x[2]),_ff_get_ui(baby_y[2]));
952
printf("affine giant step %d: (%ld,%ld)\n", gstep, _ff_get_ui(giant_x[0]),_ff_get_ui(giant_y[0])); 
953
*/
954
		
955
	// Baby steps
956
	babys[0] = ( a ? baby_steps[1] : baby_steps[0] );														// take first baby step
957
	switch (modcase) {
958
	case 1:  j=hecurve_g1_AJC_steps_1 (babys, baby_x[0], baby_y[0], bsteps, f1);  break;						// step on everything (use step size 1)
959
	case 2:  j=hecurve_g1_AJC_steps_1 (babys, baby_x[1], baby_y[1], bsteps, f1);  break;						// step on odd powers (use step size 2)
960
	case 3:  j=hecurve_g1_AJC_steps_2 (babys, baby_x[0], baby_y[0], baby_x[1], baby_y[1], bsteps, f1);  break;		// step on powers prime to 3 (alternate step sizes 1 and 2: 1,2,4,5,7,8,...)
961
	case 4:  j=hecurve_g1_AJC_steps_1 (babys, baby_x[1], baby_y[1], bsteps, f1);  break;						// step on multiples of 3
962
	case 6:  j=hecurve_g1_AJC_steps_2 (babys, baby_x[2], baby_y[2], baby_x[1], baby_y[1], bsteps, f1);  break;		// step on powers prime to 6 (alternate step sizes 4 and 2: 1,5,7,11,13,17,...)
963
	case 7:  j=hecurve_g1_AJC_steps_1 (babys, baby_x[1], baby_y[1], bsteps, f1);  break;						// step on multiples of 6
964
	}
965
	if ( j < bsteps ) { 																				// if a baby step is the identity, we know the order of the element
966
		k = baby_index(j,modcase);																	// note that we rely on the order being relatively prime to 2,3,6 (per m)
967
		if ( a ) k/=m;  																				// need to adjust for common divisor of baby steps when a!=0
968
		return set_exp(exp,k,low,high);
969
	}	
970
	
971
	// Giant steps
972
	j = hecurve_g1_AJC_dsteps_1 (giants+dsteps-1, giant_x[0], giant_y[0], dsteps, f1);							// downward steps first
973
	if ( j < dsteps ) {																				// if a giant step hit the identity, it could be a multiple of the element order
974
		// It might be faster to just continue in this situation, but it's rare in any event
975
		o = hecurve_g1_fastorder (baby_x[0], baby_y[0], gbase-j*gstep, f1);
976
		return set_exp(exp,o,low,high);
977
	}
978
	j = hecurve_g1_AJC_steps_1 (giants+dsteps-1, giant_x[0], giant_y[0], usteps, f1);							// now upward steps
979
	if ( j < usteps ) {																				// if a giant step hit the identity, it could be a multiple of the element order
980
		// It might be faster to just continue in this situation, but it's rare in any event
981
		o = hecurve_g1_fastorder (baby_x[0], baby_y[0], gbase+j*gstep, f1);
982
		return set_exp(exp,o,low,high);
983
	}
984
	gbase -= (dsteps-1)*gstep;																		// reset gbase to match giants[0]
985
	
986
	// Convert to affine to get unique x coords
987
	for ( i = j = 0 ; i < bsteps ; i++, j++ ) _ff_set(stepzs[j], babys[i].z2);
988
	for ( i = 0 ; i < gsteps ; i++, j++ ) _ff_set(stepzs[j], giants[i].z2);
989
	ff_parallel_invert(stepzs, stepzs, j);																	// we assume j < FF_MAX_PARALLEL_INVERTS here
990
	for ( i = j = 0 ; i < bsteps ; i++,j++ ) ff_mult(babys[i].x, babys[i].x, stepzs[j]);								// only invert x-coord here, we never need to invert y
991
	for ( i = 0 ; i < gsteps ; i++,j++ ) ff_mult(giants[i].x, giants[i].x, stepzs[j]);
992
/*
993
printf ("affine baby x's:\n", j);
994
for ( i = 0 ; i < bsteps ; i++ ) printf ("   %d(%d): %ld\n", i, baby_index(i,modcase), _ff_get_ui(babys[i].x));
995
printf ("affine giant x's:\n", j);
996
for ( i = 0 ; i < gsteps ; i++ ) printf ("   %ld: %ld\n", gbase+i*gstep, _ff_get_ui(giants[i].x));
997
*/
998
	// Populate the table with babys.  Insert will fail if we try to insert the same x value twice - this can only happen if two babys are inverses (they can't be equal because we didn't hit the identity).
999
	tab_clear();
1000
	for ( i = 0 ; i < bsteps ; i++ ) if ( (j=tab_insert(babys[i].x,i)) >= 0 ) break;
1001
	/*
1002
	   If we encountered two baby steps with the same x value, they must be inverses, since otherwise we would have hit the identity in between.
1003
	   If m=1,2,3,6 we have stepped on every possible element order between the two, and if m=4,5 implies we have stepped on every multiple of a common divisor of the two,
1004
	   and hence on every possible value of the difference of their indices (which would be the identity if the steps were equal).
1005
	*/
1006
	if ( i < bsteps ) {
1007
		k = baby_index(i,modcase)+baby_index(j,modcase); 
1008
		if ( a ) k/=m; 																				// need to adjust for common divisor of baby steps when a != 0
1009
		return set_exp(exp,k,low,high);
1010
	}
1011

1012
	// Now match giant steps
1013
	o1 = o2 = 0;
1014
	for ( i = 0 ; i < gsteps ; i++ ) {
1015
		if ( (j=tab_lookup(giants[i].x)) >= 0 ) {
1016
			_ff_mult(t0,babys[j].y,giants[i].z3);	_ff_mult(t1,giants[i].y,babys[j].z3);								// normalize y values for comparison
1017
			if ( _ff_zero(t0) ) { k = 2*baby_index(j,modcase);  return set_exp(exp,k,low,high); }					// handle 2-torsion case
1018
			k = baby_index(j, modcase);
1019
			if ( _ff_equal(t0,t1) ) k = -k;
1020
			o = gbase+i*gstep+k;
1021
			if ( o==o1 ) continue;																	// ignore duplicate matches, this can happen for a != 0
1022
			if ( o1 )  { o2 = o; break; }
1023
			o1 = o;
1024
		}
1025
	}
1026
	if ( ! o1 ) { printf ("%ld: Error, BSGS search of entire Hasse-Weil interval failed, low=%ld, high=%ld, m=%d\n", _ff_p, low, high, m);  exit (0); }
1027
	if ( ! o2 ) { *exp = o1; return 1; }									// we know the o1 is the unique multiple of the element order in [low,high]
1028
	k=i_abs(o2-o1);												// otherwise, we know |o2-o1| is a multiple of the element order and there are at least two multiples in [low,high]
1029
	*exp = (m==1 ? k : hecurve_g1_fastorder (baby_x[0], baby_y[0], k, f1) );	// if m is not 1 we don't know that o1 and o2 are adjacent multiples, but a fastorder computation will figure it out
1030
	return 0;
1031
}
1032

1033
/*
1034
	Compute the order of affine point (x,y) given that (x,y)^e=1 using the classical algorithm.
1035

1036
	This algorithm is not particularly fast, but it doesn't need to be, as it is not invoked for most p.
1037
	The average number of bits exponentiated per prime for a non-CM curve (fixed curve, varying p) is less than 1.
1038
*/
1039
long hecurve_g1_fastorder (ff_t x, ff_t y, long e, ff_t f1)
1040
{
1041
	hec_jc_t b[MAX_UI_PP_FACTORS];
1042
	unsigned long q, p[MAX_UI_PP_FACTORS], h[MAX_UI_PP_FACTORS];
1043
	register long o;
1044
	register int i, j, k, w;
1045

1046
	w = ui_factor(p,h,e);				// takes about 2 microseconds for e~2^32 (2.5MHz AMD-64), but e is generally much smaller
1047
	if ( w == 1 && h[0] == 1 ) return e;	// not hard when the exponent is prime
1048
		
1049
	for ( i = 0 ; i < w ; i++ ) {
1050
		for ( q=p[i],j=1 ; j < h[i] ; j++ ) q *= p[i]; 
1051
		o = e/q;
1052
		hecurve_g1_AJC_exp_ui (b+i, x, y, o, f1);
1053
	}
1054
	o = 1;
1055
	for ( i = 0 ; i < w ; i++ ) {
1056
		for ( j = 0 ; j < h[i]-1 ; j++ ) {
1057
			if ( hecurve_g1_JC_id(b+i) ) break;
1058
			hecurve_g1_JC_exp_ui (b+i, b+i, p[i], f1);
1059
		}
1060
		if ( ! hecurve_g1_JC_id (b+i) ) j++;
1061
		for ( k = 0 ; k < j ; k++ ) o *= p[i];
1062
	}
1063
 //printf ("fastorder computed order of (%ld,%ld) is %ld from exponent %ld\n", _ff_get_ui(x), _ff_get_ui(y), o, e);
1064
	return o;
1065
}
1066

1067
/*
1068
	BSGS search hardwired for very short intervals (typically less than 50), which uses only one or two baby steps.
1069
	These intervals can arise when the first order computation does not uniquely determine the exponent of the group (or when p is small).
1070

1071
	We assume b does not have 2 torsion (hence its order is at least 3) and high > low.
1072
	We rely on the existence of an exponent of b in [low,high] and do not always verify this (e.g., if high=low+1 and b^low != id then we conclude b^high == id)
1073
*/
1074
int hecurve_g1_bsgs_short (long *exp, hec_jc_t *b, long low, long high, ff_t f1)
1075
{
1076
	long range;
1077
	hec_jc_t b2[1],b5[1],g[1];
1078
	register ff_t t0, t1;
1079
	register long e;
1080
	long o1,o2;
1081
	int i;
1082

1083
	range = high-low+1;
1084
	switch (range) {
1085
	case 2:		// E no computation other than the g exponentiation, cost E
1086
		hecurve_g1_JC_exp_ui(g, b, low, f1);
1087
		*exp = ( hecurve_g1_JC_id(g) ? low : high );
1088
		return 1;
1089
	case 3:		// E+4/3M (on average)
1090
		hecurve_g1_JC_exp_ui(g, b, low+1, f1);
1091
		if ( hecurve_g1_JC_id(g) ) { *exp = low+1;  return 1; }
1092
		// it must be the case that g = +/- b, we don't verify this but simply compare y values to distinguish
1093
		_ff_mult(t0,b->y,g->z3);  _ff_mult(t1,g->y,b->z3);
1094
		*exp =  ( _ff_equal (t0, t1) ? low : high );
1095
		return 1;
1096
	case 4:		// E+1/4D+2M = E+5M+2A (roughly) (D indicates doubling(squaring) a point on the curve, which costs 11M+8A field operations)
1097
		hecurve_g1_JC_exp_ui(g, b, low+1, f1);
1098
		if ( hecurve_g1_JC_id(g) ) { *exp = low+1;  return 1; }
1099
		_ff_mult(t0,b->x,g->z2);  _ff_mult(t1,g->x,b->z2);
1100
		if ( _ff_equal (t0, t1) ) {
1101
			_ff_mult(t0,b->y,g->z3);  _ff_mult(t1,g->y,b->z3);
1102
			if ( ! _ff_equal (t0, t1) ) { *exp = low+2;  return 1; }
1103
			// we know b^low = id, but it could also be that b^high = id if b has order 3, so we need to check this
1104
			hecurve_g1_2JC (b2,b, f1);
1105
			_ff_mult(t0,b->x,b2->z2);  _ff_mult(t1,b2->x,b->z2);					// given b != id, b^3==id iff b and b^2 have the same x coord
1106
			if ( _ff_equal(t0,t1) ) { *exp = 3; return 0; }
1107
			*exp = low;
1108
			return 1;
1109
		}
1110
		*exp = high;
1111
		return 1;
1112
	case 5:		// < E+4/5D+5M (on average)
1113
		hecurve_g1_JC_exp_ui(g, b, low+2, f1);
1114
		if ( hecurve_g1_JC_id(g) ) { *exp = low+2;  return 1; }
1115
		hecurve_g1_2JC (b2,b, f1);
1116
		if ( hecurve_g1_JC_id(b2) ) { *exp = 2; return 0; }
1117
		if ( hecurve_g1_JC_2tor(b2) ) { *exp = 4; return 0; }
1118
		_ff_mult(t0,b->x,b2->z2);  _ff_mult(t1,b2->x,b->z2);
1119
		if ( _ff_equal(t0,t1) ) { *exp = 3; return 0; }
1120
		// we now know |b|>4, so there is a unique multiple in the interval
1121
		_ff_mult(t0,b->x,g->z2);  _ff_mult(t1,g->x,b->z2);
1122
		if ( _ff_equal (t0, t1) ) {
1123
			_ff_mult(t0,b->y,g->z3);  _ff_mult(t1,g->y,b->z3);
1124
			*exp = ( _ff_equal(t0,t1) ? low+1 : low+3 );
1125
			return 1;
1126
		}
1127
		_ff_mult(t0,b2->y,g->z3);  _ff_mult(t1,g->y,b2->z3);
1128
		*exp =  ( _ff_equal (t0, t1) ? low : low+4 );
1129
		return 1;
1130
	}
1131
	hecurve_g1_2JC (b2,b, f1);
1132
	if ( hecurve_g1_JC_id(b2) ) { *exp = 2; return 0; }
1133
	if ( hecurve_g1_JC_2tor (b2) ) { *exp = 4; return 0; }	
1134
	_ff_mult(t0,b->x,b2->z2);  _ff_mult(t1,b2->x,b->z2);
1135
	if ( _ff_equal(t0,t1) ) { *exp = 3; return 0; }
1136
	hecurve_g1_2JC (b5,b2, f1);
1137
	hecurve_g1_JCJC (b5,b, f1);
1138
	if ( hecurve_g1_JC_id(b5) ) { *exp = 5; return 0; }
1139
	// we know |b|>5
1140
	e = low+2;
1141
	hecurve_g1_JC_exp_ui(g, b, e, f1);
1142
	o1 = o2 = 0;
1143
	while ( e < high+3 ) {
1144
		if ( hecurve_g1_JC_id(g) ) { o2 = o1; o1 = e; goto next; }
1145
		_ff_mult(t0,b->x,g->z2);  _ff_mult(t1,g->x,b->z2);
1146
		if ( _ff_equal (t0, t1) ) {
1147
			_ff_mult(t0,b->y,g->z3);  _ff_mult(t1,g->y,b->z3);
1148
			i = ( _ff_equal(t0,t1) ? -1 : 1 );
1149
			o2 = o1;  o1 = e+i; goto next;
1150
		}
1151
		_ff_mult(t0,b2->x,g->z2);  _ff_mult(t1,g->x,b2->z2);
1152
		if ( _ff_equal (t0, t1) ) {
1153
			_ff_mult(t0,b2->y,g->z3);  _ff_mult(t1,g->y,b2->z3);
1154
			i = ( _ff_equal(t0,t1) ? -2 : 2 );
1155
			o2 = o1;  o1 = e+i; goto next;
1156
		}
1157
next:	if ( o2 ) break;
1158
		hecurve_g1_JCJC (g,b5,f1);
1159
		e += 5;
1160
	}
1161
	if ( ! o1 ) { printf ("%ld: No match found in short interval [%ld,%ld]\n", _ff_p, low, high);  exit (0); }
1162
	if ( o2 ) { *exp = o1-o2; return 0; }
1163
	*exp = o1;
1164
	// total cost is roughly E + 2D + (1+(range/5-1))*A + (range/5)*4M (D indicates point doubling, A indicates point addition)
1165
	return 1;
1166
}
1167

1168
/*
1169
	The rest of this file contains code that is not currently used because it was found to to be suboptimal in testing (on an AMD Athlon) but
1170
	may be useful on other platforms and/or other applications.
1171
*/
1172
#if 0
1173

1174
/*
1175
	Below are elliptic curve arithmetic functions for Jacobian coordinates.  The reduced Chudnovsky coordinates
1176
        are faster at everything except 2JC versus 2J is (11M+8A) vs (10M+10A), but this difference is negligible (1M~2.5A),
1177
        and in testing it was found to be faster to use JC everywhere.
1178
*/
1179

1180
// converts Jacobian to affine coords, given the inverse of z
1181
static inline void hecurve_g1_J_to_A (ff_t *px, ff_t *py, hec_j_t *p, ff_t zinv)
1182
{
1183
	register ff_t t0,t1;
1184
	
1185
	_ff_square(t0,zinv);
1186
	_ff_mult(*px,p->x,t0);
1187
	_ff_mult(t1,t0,zinv);
1188
	_ff_mult(*py,p->y,t1);
1189
	// 4M
1190
}
1191

1192
// converts Jacobian to reduced Chudnovsky Jacobian coords
1193
static inline void hecurve_g1_J_to_JC (hec_jc_t *o, hec_j_t *p)
1194
{
1195
	_ff_set(o->x,p->x);
1196
	_ff_set(o->y,p->y);
1197
	_ff_square(o->z2,p->z);
1198
	_ff_mult(o->z3,o->z2,p->z);
1199
	// 2M
1200
}
1201

1202
// squares an affine point into Jacobian coords
1203
static inline void hecurve_g1_2AJ (hec_j_t *p, ff_t x, ff_t y, ff_t f1)
1204
{
1205
	register ff_t t0,t1,t2,t3,a,b;
1206

1207
	_ff_add(p->z,y,y);  _ff_mult(t1,p->z,y); _ff_add(t2,t1,t1); _ff_mult(a,x,t2);		// z=2y, a=4xy^2, t1=2y^2, t2=4y^2
1208
	_ff_mult(t3,t1,t2);													// t3=8y^4
1209
	_ff_square(t1,x); _ff_add(b,t1,t1); _ff_addto(b,t1); _ff_addto(b,f1);				// b = 3x^2+f1*1^4
1210
	_ff_square(t0,b); _ff_add(t2,a,a); _ff_sub(p->x,t0,t2);						// x = b^2-2a
1211
	_ff_sub(t1,a,p->x); _ff_mult(t2,t1,b); _ff_sub(p->y,t2,t3);					// y = b(a-x)-8y^4 (note we use the new x here)
1212
	// 6M+9A
1213
}
1214

1215

1216
// squares a point p1 in Jacobian coords (p3 is the output, may be equal to p1)
1217
static inline void hecurve_g1_2J (hec_j_t *p3, hec_j_t *p1, ff_t f1)
1218
{
1219
	register ff_t a, b, c, t0, t1, t2;
1220
	
1221
	_ff_square(t0,p1->x); _ff_add(t1,t0,t0); _ff_addto(t1,t0); 					// t1 = 3x^2
1222
	_ff_square(t0,p1->z); _ff_square(t2,t0);  _ff_mult(t0,f1,t2); _ff_add(b,t1,t0);		// b = 3x^2+f1*z^4	(note that f1=a4 in 13.2.1.c  of HECHECC p.282)
1223
	_ff_add(c,p1->y,p1->y); ff_mult(p3->z,p1->z,c);							// c=2y, z = 2yz
1224
	_ff_mult(t2,c,p1->y); _ff_add(t0,t2,t2);  _ff_mult(a,t0,p1->x);					// a=4xy^2,t2=2y^2
1225
	_ff_add(t0,a,a); _ff_square(t1,b); _ff_sub(p3->x,t1,t0);						// x = b^2-2a
1226
	_ff_square(t0,t2); _ff_add(c,t0,t0);										// c = 8y^4
1227
	_ff_sub(t0,a,p3->x); _ff_mult(t2,t0,b); _ff_sub(p3->y,t2,c);					// y = b(a-x)-c   -- note we use the new x value here
1228
	// 10M+10A
1229
}
1230

1231

1232
// multiplies a point p in Jacobian coords by a (non-identity) affine point (p is an input and an output)
1233
static inline void hecurve_g1_AJ (hec_j_t *p, ff_t x0, ff_t y0, ff_t f1)
1234
{
1235
	register ff_t a, c, e, f, t0, t1, t2;
1236
	
1237
	if ( _ff_zero(p->z) ) { _ff_set(p->x,x0); _ff_set(p->y,y0); _ff_set_one(p->z); return; }
1238
	_ff_square(t0,p->z);  _ff_mult(a,x0,t0);									// a = x0z^2, b = x*1^2=x (since z0=1), and t0=z^2
1239
	_ff_sub(e,p->x,a);													// e = a-b
1240
	_ff_mult(t1,t0,p->z); _ff_mult(c,y0,t1);									// c = y0z^3, d=y*1^3=y
1241
	_ff_sub(f,p->y,c);													// f = d-c
1242
	if ( _ff_zero(e) && _ff_zero(f) ) { hecurve_g1_2AJ (p, x0,y0,f1); return; }		// must use doubling code here, but at least its an affine double (6M+9A)
1243
	_ff_square(t0,e); _ff_mult(t1,t0,e); _ff_mult(t2,t0,a);						// t1=e^3, t2=ae^2
1244
	_ff_square(t0,f); _ff_sub(p->x,t0,t1); _ff_add(t0,t2,t2); _ff_subfrom(p->x,t0);		// x = f^2-e^3-2ae^2
1245
	_ff_sub(t0,t2,p->x); _ff_mult(t2,f,t0); _ff_mult(t0,t1,c); _ff_sub(p->y,t2,t0);		// y = f(ae^2-x) - ce^3
1246
	ff_mult(p->z,p->z,e);												// z = 1*z*e
1247
	// 11M+6A
1248
}
1249

1250

1251
// multiplies p1 in Jacobian coords by p2 Jacobian coords  and puts the result in p1.  Handles identity and will square (double) if needed
1252
// this is the slowest case and should be avoided when possible, AJ, AJC, and JCJC are all better
1253
static inline void hecurve_g1_JJ (hec_j_t *p1, hec_j_t *p2, ff_t f1)
1254
{
1255
	register ff_t a, b, c, d, e, f, t0, t1, t2;
1256
	
1257
	// we need to handle identity cases in general (although in some cases we might be able to rule this out)
1258
	if ( _ff_zero(p2->z) ) return;
1259
	if ( _ff_zero(p1->z) ) { *p1=*p2; return; }
1260
	_ff_square(t0,p2->z);  _ff_mult(a,p1->x,t0);								// a = x1z2^2, and t0=z2^2
1261
	_ff_square(t1,p1->z);  _ff_mult(b,p2->x,t1);								// b = x2z1^2, and t1=z1^2
1262
	_ff_sub(e,b,a);														// e = a-b
1263
	_ff_mult(t2,t0,p2->z); _ff_mult(c,p1->y,t2);								// c = y1z2^3
1264
	_ff_mult(t2,t1,p1->z); _ff_mult(d,p2->y,t2);								// d = y2z1^3
1265
	_ff_sub(f,d,c);														// f = d-c
1266
	if ( _ff_zero(e) && _ff_zero(f) ) { hecurve_g1_2J (p1,p1,f1); return; }			// must double if pts are equal (10M+10A), inverses will end up with z=0, so we let them through
1267
	_ff_square(t0,e); _ff_mult(t1,t0,e); _ff_mult(t2,t0,a);						// t1=e^3, t2=ae^2
1268
	_ff_square(t0,f); _ff_sub(p1->x,t0,t1); _ff_add(t0,t2,t2); _ff_subfrom(p1->x,t0);	// x = f^2-e^3-2ae^2
1269
	_ff_sub(t0,t2,p1->x); _ff_mult(t2,f,t0); _ff_mult(t0,t1,c); _ff_sub(p1->y,t2,t0);	// y = f(ae^2-x) - ce^3	-- note we use the new x here
1270
	ff_mult(t0,p1->z,p2->z);	_ff_mult(p1->z,t0,e);							// z = z1z2e
1271
	// 16M+7A (ouch)
1272
}
1273

1274

1275
// Computes p=(x,y)^n where (x,y) !=1 is in affine coordinates and p is in Jacobian coordinates
1276
// It takes advantage of the fact that all additions are of the form J+A, requiring only 11M, doubling is done in J+J form, using 10M
1277
void hecurve_g1_AJ_exp_ui (hec_j_t *p, ff_t x0, ff_t y0, unsigned long n, ff_t f1)
1278
{
1279
	register unsigned long m;
1280
	unsigned long pbits, nbits;
1281
	register ff_t negy0;
1282
	int i;
1283

1284
	if ( n == 0 ) { _ff_set_zero(p->z); return; }
1285
	if ( n == 1 ) { _ff_set(p->x,x0); _ff_set(p->y,y0); _ff_set_one(p->z); return; }
1286
	hecurve_g1_2AJ(p,x0,y0,f1);
1287
	if ( n == 2 ) return;
1288
	ui_NAF(&pbits,&nbits,n);
1289
	i = _asm_highbit(pbits)-2;						// we know the top two bits of the NAF are 10
1290
hecurve_expbits+=i+1;
1291
	_ff_neg(negy0,y0);
1292
	m = (1UL<<i);
1293
	for ( ; m ; m >>= 1 ) {
1294
		hecurve_g1_2J(p,p,f1);	 					// 10M+10A
1295
		if ( m&pbits ) hecurve_g1_AJ(p,x0,y0,f1);		// 11M+6A
1296
		if ( m&nbits ) hecurve_g1_AJ(p,x0,negy0,f1);		// 11M+6A
1297
	}
1298
}
1299

1300

1301
// Computes p=a^e where a!=1 is in Jacobian coords, and so is p.
1302
// This is slow and should be avoided.  Overlap is ok
1303
void hecurve_g1_J_exp_ui (hec_j_t *p, hec_j_t *a, unsigned long n, ff_t f1)
1304
{
1305
	register unsigned long m;
1306
	unsigned long pbits, nbits;
1307
	hec_j_t t, ai;
1308
	int i;
1309
	
1310
	if ( n == 0 ) { _ff_set_zero(p->z); return; }
1311
	if ( n == 1 ) { *p=*a; return; }
1312
	hecurve_g1_2J(&t,a,f1);							// 10M+10A
1313
	if ( n == 2 ) { *p = t; return; }
1314
	ui_NAF(&pbits,&nbits,n);
1315
	i = _asm_highbit(pbits)-2;
1316
hecurve_expbits+=i+1;
1317
	ai=*a;  ff_negate(ai.y);
1318
	m = (1UL<<i);
1319
	for ( ; m ; m >>= 1 ) {
1320
		hecurve_g1_2J(&t,&t,f1);						// 10M+10A
1321
		if ( m&pbits ) hecurve_g1_JJ(&t,a,f1);			// 16M+7A
1322
		if ( m&nbits ) hecurve_g1_JJ(&t,&ai,f1);			// 16M+7A
1323
	}
1324
	*p = t;
1325
}
1326

1327

1328
// Combines precomputed values p[i]=p[0]^(2^i) to compute p[0]^n.  Assumes all required powers are present: NAF potentially requires one more bit!
1329
// CAUTION: if this is false, the resulting behavior is very strange and unpredictable.  You have been warned.
1330
void hecurve_g1_J_exp_powers(hec_j_t *o, hec_j_t *p, unsigned long n, ff_t f1)
1331
{
1332
	register unsigned long m;
1333
	unsigned long pbits, nbits;
1334
	hec_j_t t;
1335
	int i;
1336

1337
	// handle small n quickly
1338
	switch (n) {
1339
	case 0:	_ff_set_zero(o->z); return; 
1340
	case 1:	*o=p[0]; return;
1341
	case 2:	*o=p[1]; return;
1342
	case 3:	*o=p[0]; hecurve_g1_JJ(o,p+1,f1); return;
1343
	case 4:	*o=p[2]; return;
1344
	case 5:	*o=p[2]; hecurve_g1_JJ(o,p,f1); return;
1345
	case 6:	*o=p[2]; hecurve_g1_JJ(o,p+1,f1); return;
1346
	case 7:	*o=p[0]; ff_negate(o->y); hecurve_g1_JJ(o,p+3,f1); return;
1347
	case 8:	*o=p[3]; return;
1348
	}
1349
	ui_NAF(&pbits,&nbits,n);
1350
	i = _asm_highbit(pbits);
1351
	*o = p[i];
1352
	i-=2;
1353
hecurve_expbits+=i+1;
1354
	m = (1UL<<i);
1355
	for ( ; m ; m >>= 1, i-- ) {
1356
		if ( (m&pbits) ) hecurve_g1_JJ(o,p+i,f1);						 	// 16M+7A (these are expensive)
1357
		if ( (m&nbits) )
1358
			{ ff_negate(o->y); hecurve_g1_JJ(o,p+i,f1); ff_negate(o->y); }	// 16M+9A (slightly more expensive)
1359
	}
1360
}
1361

1362
// Sets p[i] = p[0]^(2^i) for i in [0,k]. Input is an affine pt not equal to the identity, output is in Jacobian coords.
1363
void hecurve_g1_AJ_powers (hec_j_t p[], ff_t x0, ff_t y0, int k, ff_t f1)
1364
{
1365
	ff_t a, b, c, x, y, z, t0, t1, t2;
1366
	register hec_j_t *e;
1367
	
1368
	_ff_set(p->x,x0);  _ff_set(p->y,y0);  _ff_set_one(p->z);
1369
	hecurve_g1_2AJ (p+1, x0, y0, f1);
1370
	for ( e=p+k,p+=2 ; p <= e ; p++ ) hecurve_g1_2J(p,p-1,f1); 			// 10M + 10A
1371
}
1372
#endif
1373

1374
Product

Resources

Company