CoCalc -- ffpolyext.c

Harvard Workshop: Distribution of modular symbols and L-values
code / alex / psage / psage / libs / smalljac / ffpolyext.c
²⁴¹⁸¹⁸ views
1
#include <stdlib.h>
2
#include <stdio.h>
3
#include "ffext.h"
4
#include "ffpoly.h"
5
#include "cstd.h"
6

7
/*
8
    Copyright 2007-2008 Andrew V. Sutherland
9

10
    This file is part of smalljac.
11

12
    smalljac is free software: you can redistribute it and/or modify
13
    it under the terms of the GNU General Public License as published by
14
    the Free Software Foundation, either version 2 of the License, or
15
    (at your option) any later version.
16

17
    smalljac is distributed in the hope that it will be useful,
18
    but WITHOUT ANY WARRANTY; without even the implied warranty of
19
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
    GNU General Public License for more details.
21

22
    You should have received a copy of the GNU General Public License
23
    along with smalljac.  If not, see <http://www.gnu.org/licenses/>.
24
*/
25

26
/*
27
	This module contains performance-oriented code for operations on polynomials of low degree,
28
	and support functions for computations on genus 1 and genus 2 curves.
29
	
30
	It is assumed here that the ff_t type does not require initialization (no support for GMP's mpz type),
31
	and in a number of cases it is further assumed that the ff_t type fits in an unsigned long (this assumption is verified when made).
32
*/
33

34

35
void ff_poly_xpan_mod_d2 (ff_t g[2], ff_t a, unsigned long n, ff_t f[1]);
36
void ff_poly_xn_mod_d3 (ff_t g[3], unsigned long n, ff_t f[4]);
37
void ff_poly_xpan_mod_d3 (ff_t g[2], ff_t a, unsigned long n, ff_t f[4]);
38
void ff_poly_mult_mod_d3 (ff_t w[3], ff_t u[3], ff_t v[3], ff_t f[4]);
39
void ff_poly_square_mod_d3 (ff_t w[3], ff_t u[3], ff_t f[4]);
40
void ff_poly_exp4_mod_d3 (ff_t g[3], ff_t p[12], unsigned long n, ff_t f[4]);
41
void ff_poly_exp8_mod_d3 (ff_t g[3], ff_t p[24], unsigned long n, ff_t f[4]);
42

43
void ff_poly_xn_mod_d4 (ff_t g[4], unsigned long n, ff_t f[5]);
44
void ff_poly_xpan_mod_d4 (ff_t g[3], ff_t a, unsigned long n, ff_t f[5]);
45
void ff_poly_mult_mod_d4 (ff_t w[4], ff_t u[4], ff_t v[4], ff_t f[5]);
46
void ff_poly_square_mod_d4 (ff_t w[4], ff_t u[4], ff_t f[5]);
47
void ff_poly_exp4_mod_d4 (ff_t g[4], ff_t p[16], unsigned long n, ff_t f[5]);
48
void ff_poly_exp8_mod_d4 (ff_t g[4], ff_t p[32], unsigned long n, ff_t f[5]);
49

50

51
// assumes f monic, char > 3, sets t to translation value: depressed cubic is f(x-t)
52
void ff_depress_cubic (ff_t t[1], ff_t f[4])
53
{
54
	_ff_t_declare_reg t1, t2, f1;
55
	
56
	_ff_mult(t1,_ff_third,f[2]);														// t1 = f2/3
57
	_ff_set(t[0],t1);
58
	_ff_mult(f1,t1,f[2]);															// f1 = f2^2/3
59
	_ff_square(t2,t1); ff_mult(t2,t2,t1); _ff_x2(t2); ff_mult(t1,t1,f[1]);						// t2 = f2^3/27, t1 = f1f2/3
60
	_ff_addto(f[0],t2);  _ff_subfrom(f[0],t1);
61
	_ff_subfrom(f[1],f1);
62
	_ff_set_zero(f[2]);
63
	// 5M+3A
64
}
65

66
// computes cubic resolvent of f=x^4+f2x^2+f1x+f0 and depresses it
67
void ff_depressed_cubic_resolvent (ff_t t[1], ff_t g[4], ff_t f[5])
68
{
69
	_ff_t_declare_reg t0,t1,t2;
70

71
	_ff_set_one(g[3]);
72
	_ff_add(t1,f[2],f[2]);
73
	_ff_neg(g[2],t1);															// g2 = -2f2
74
	_ff_add(t1,f[0],f[0]); _ff_x2(t1);
75
	_ff_square(t2,f[2]);
76
	_ff_sub(g[1],t2,t1);															// g1 = f2^2-4f0
77
	_ff_square(g[0],f[1]);														// g0 = f1^2
78
	ff_depress_cubic (t,g);
79
	// total 7M+5A
80
}
81

82

83
// assumes f monic, char > 3, sets t to translation value: depressed cubic is f(x-t)
84
void ff_depress_quartic (ff_t t[1], ff_t f[5])
85
{
86
	_ff_t_declare_reg t0,t1, t2, t3, t4, w1, w2;
87
	
88
	_ff_mult(t1,f[3],_ff_half);														// t1 = f3/2
89
	ff_mult(t0,t1,_ff_half);														// t0 = f3/4
90
	_ff_set(t[0],t0);			
91
	_ff_square(t2,t0);  _ff_square(t4,t3);											// t2 = f3^2/16, t4=f3^4/256
92
	_ff_add(t3,t4,t4);  _ff_addto(t3,t4);												// t3 = 3f3^4/256
93
	_ff_mult(w1,f[2],t2);															// w1 = f2f3^2/16
94
	_ff_mult(w2,f[1],t0);															// w2 = f1f3/4
95
	_ff_subfrom(w1,w2);
96
	_ff_subfrom(w1,t3);
97
	_ff_addto(f[0],w1);															// new f0 = f0 + f2f3^2/16 - f1f3/4 - 3f^4/256
98
	_ff_mult(t3,t0,t2);															// t3 = f3^2/8
99
	_ff_mult(w1,t3,f[3]);
100
	_ff_mult(w2,f[2],t1);
101
	_ff_subfrom(w1,w2);
102
	_ff_addto(f[1],w1);															// new f1 = f1 + f3^3/8 - f2f3/2
103
	_ff_add(w1,t3,t3);  _ff_addto(w1,t3);											// w1 = 3f3^2/8
104
	_ff_subfrom(f[2],w1);														// new f2 = f2-3f3^2/8
105
	_ff_set_zero(f[3]);
106
	// 9M+10A
107
}
108

109
// computes the gcd of f and g where f=x^3+ax+b 
110
void ff_poly_gcd_g1 (ff_t h[4], int *d_h, ff_t f[4], ff_t g[4], int d_g)
111
{
112
	_ff_t_declare_reg t1,t2;
113
	ff_t r[4], s[4];
114
	int d_r, d_s;
115
	
116
	if ( d_g == 3 ) {
117
		_ff_neg(r[2],g[2]);
118
		_ff_mult(t1,f[1],g[3]);
119
		_ff_sub(r[1],t1,g[1]);
120
		_ff_mult(t1,f[0],g[3]);
121
		_ff_sub(f[0],t1,g[0]);
122
		for ( d_r = 2 ; d_r >= 0 && _ff_zero(r[d_r]) ; d_r-- );
123
	} else {
124
		ff_poly_copy (r,&d_r,g,d_g);
125
	}
126
	if ( d_r < 0 ) { ff_poly_copy (h, d_h, f, 3);  return; }
127
	if ( ! d_r ) { _ff_set_one(h[0]); *d_h = 0;  return; }
128
	_ff_neg(s[2],r[d_r-1]);
129
	_ff_mult(t1,f[1],r[d_r]);
130
	if ( d_r > 1 ) { _ff_sub(s[1],t1,r[d_r-2]); } else { _ff_set(s[1],t1); }
131
	_ff_mult(s[0],f[0],r[d_r]);
132
	for ( d_s = 2 ; d_s >= 0 && _ff_zero(s[d_s]) ; d_s-- );
133
	if ( d_s == 2 && d_r == 2) {
134
		_ff_mult(t1,s[1],r[2]);
135
		_ff_mult(t2,r[1],s[2]);
136
		_ff_sub(s[1],t1,t2);
137
		_ff_mult(t1,s[0],r[2]);
138
		_ff_mult(t2,r[0],s[2]);
139
		_ff_sub(s[0],t1,t2);
140
		for ( d_s = 1 ; d_s >= 0 && _ff_zero(s[d_s]) ; d_s-- );
141
	}
142
	if ( d_s < 0 ) { ff_poly_copy(h,d_h,r,d_r);  return; }
143
	if ( ! d_s ) { _ff_set_one(h[0]); *d_h = 0;  return; }
144
	if ( d_r == 2 && d_s == 1 ) {
145
		_ff_mult(t1,r[1],s[1]);
146
		_ff_mult(t2,s[0],r[2]);
147
		_ff_sub(r[1],t1,t2);
148
		ff_mult(r[0],r[0],s[1]);
149
		for ( d_r = 1 ; d_r >= 0 && _ff_zero(r[d_r]) ; d_r-- );
150
		if ( d_r < 0 ) { ff_poly_copy (h, d_h, s, d_s);  return; }
151
		if ( ! d_r ) { _ff_set_one(h[0]); *d_h = 0;  return; }
152
	}
153
	if ( d_s == 2 && d_r == 1 ) {
154
		_ff_mult(t1,s[1],r[1]);
155
		_ff_mult(t2,r[0],s[2]);
156
		_ff_sub(s[1],t1,t2);
157
		ff_mult(s[0],s[0],r[1]);
158
		for ( d_s = 1 ; d_s >= 0 && _ff_zero(s[d_s]) ; d_s-- );
159
		if ( d_s < 0 ) { ff_poly_copy (h, d_h, r, d_r);  return; }
160
		if ( ! d_s ) { _ff_set_one(h[0]); *d_h = 0;  return; }
161
	}
162
	// we must have d_r==d_s==1 at this point
163
	_ff_mult(t1,r[0],s[1]);
164
	_ff_mult(t2,s[0],r[1]);
165
	_ff_sub(s[0],t2,t1);
166
	if ( _ff_zero(s[0]) ) { ff_poly_copy(h,d_h,r,d_r);  return; }
167
	_ff_set_one(h[0]); *d_h = 0;
168
	// worst case 13M+7A and some copies
169
	return;	
170
}
171

172
/*
173
	Standard root finding algorithm specialized to degree 3, note that roots may be null if only the count is required
174
	Returns the number of roots found.  This function is superseded by ff_poly_roots_d3 below which solves the cubic with radicals.
175
*/
176
int ff_old_poly_roots_d3 (ff_t roots[3], ff_t f[4])
177
{
178
	_ff_t_declare g[3], h[3], hh[3], a;
179
	_ff_t_declare_reg t0, t1, t2;
180
	int i, j, d_g, d_h, d_hh;
181

182
#if FF_WORDS > 1
183
	err_printf ("ff_poly_factors_g1 only implemented for single word fields\n");  exit (0);
184
#endif
185

186
	// this is wasteful if the caller already knows the discriminant!
187
	_ff_square(t0,f[0]);  _ff_set_ui(t1,27);  _ff_mult(t2,t0,t1);			// t2 = 27f1^2
188
	_ff_square(t0,f[1]);  _ff_mult(t1,t0,f[1]);  _ff_x2(t1); _ff_x2(t1);		// t1 = 4f0^3
189
	_ff_addto(t1,t2);
190
	if ( _ff_zero(t1) ) {											// D=0 => curve has a repeated root
191
		if ( roots ) {											
192
			if ( _ff_zero(f[1]) ) {									// if f[1]=0 then f[0]=0 and we have a triple root at 0
193
				_ff_set_zero(roots[0]);  _ff_set_zero(roots[1]); _ff_set_zero(roots[2]);
194
			} else {											// otherwise return the distinct root is 3b/a and the double root is (-3b/(2a)
195
				_ff_invert(t0,f[1]);								// 1/a
196
				_ff_add(t1,f[0],f[0]);  _ff_addto(t1,f[0]);				// 3b
197
				_ff_mult(roots[0],t0,t1);							// 3b/a
198
				_ff_neg(t0,roots[0]);
199
				_ff_mult(roots[1],t0,_ff_half);						// -3b/(2a)
200
				_ff_set(roots[2],roots[1]);
201
			}
202
		}
203
		return 3;											
204
	}
205
	
206
	// compute x^p mod f
207
	ff3_zn_mod (g, _ff_p, f);	// g = x^p mod f
208
	_ff_dec(g[1]);			// g = x^p-x mod f
209
	for ( d_g = 2 ; d_g >= 0 && _ff_zero(g[d_g]) ; d_g-- );
210
	ff_poly_gcd_red (h, &d_h, f, 3, g, d_g);
211
	switch (d_h) {
212
	case 0:
213
		return 0;
214
	case 1:
215
		if ( roots ) {
216
			_ff_invert(t1,h[1]);
217
			_ff_mult(t2,t1,h[0]);
218
			_ff_neg(roots[0], t2);
219
		}
220
		return 1;
221
	case 2:
222
		err_printf ("Impossible, non-singular cubic with 2 roots in ff_old_poly_roots_d3: ");  ff_poly_print (f,3);  ff_poly_print(h,d_h);  exit (0);
223
	case 3:
224
		if ( ! roots ) return 3;
225
		// We know there are three roots, and separate the roots using the standard probabilistic algorithm
226
		// Pick a random a and compute g=(x+a)^((p-1)/2).  Half of the elements in F_p* will be roots of g, 
227
		// and with probability 3/4 gcd(f,g) will have degree 1 or 2 allowing us to pick out a root
228
		// As a minor optimization, we pick a=0 as our first "random" choice, since we can compute g faster in this case.
229
		for ( i = 0 ; ; i++ ) {
230
			if ( ! i ) {
231
				ff3_zn_mod (g, (_ff_p-1)/2, f);
232
			} else {
233
				ff_random(&a);
234
				ff_poly_xpan_mod_d3 (g, a, (_ff_p-1)/2, f);
235
			}
236
			_ff_dec (g[0]);		// g = (x+a)^((p-1)/2) mod f
237
			for ( d_g = 2 ; d_g >= 0 && _ff_zero(g[d_g]) ; d_g-- );
238
			ff_poly_gcd_red (h, &d_h, f, 3, g, d_g);
239
			if ( d_h > 0 && d_h < 3 ) break;
240
		}
241
		i = ff_poly_roots_d2(roots,h,d_h);
242
		if ( i != d_h ) { err_printf ("poly_roots_d2 failed in ff_old_poly_roots_d3!");  exit (0); }
243
		ff_poly_div (h, &d_h, g, &d_g, f, 3, h, d_h);
244
		if ( d_g != -1 ) { err_printf ("inexact poly division in ff_old_poly_factors_g1!\n");  exit(0); }
245
		j = ff_poly_roots_d2(roots+i,h,d_h);
246
		if ( j != d_h ) { err_printf ("poly_roots_d2 failed in ff_old_poly_roots_d3!");  exit (0); }
247
		if ( i+j != 3 ) { err_printf ("missing root in ff_old_poly_roots_d3!");  exit (0); }
248
		return 3;
249
	}
250
	err_printf ("Unreachable code in ff_old_poly_factors_g1!\n");
251
	exit (0);
252
}
253

254

255

256

257
/*
258
	Finds the roots of a monic depreseed cubic x^3+ax+b over F_p.
259
	If roots is NULL only the # of roots is determined, which can be significantly quicker (e.g. when D is not a QR)
260
	The value d is optional.  If non-null it must contain sqrt(-D/3) (this is used when computing 3-torsion)
261

262
	Assumes _ff_p fits in an unsigned long
263
*/
264
int _ff_poly_roots_d3 (ff_t roots[3], ff_t f[4], ff_t *pd)
265
{
266
	_ff_t_declare g[3], h[2], w[2], v[2], vk[2], d, t, r, s;
267
	_ff_t_declare_reg a, t0, t1, t2;
268
	int i, k, sts;
269

270
#if FF_WORDS > 1
271
	err_printf ("ff_poly_factors_g1 only implemented for single word fields\n");  exit (0);
272
#endif
273
	
274
	// handle p=3 separately
275
	if ( _ff_p == 3 ) {
276
		if ( _ff_zero(f[0]) ) { if ( roots ) _ff_set_zero(roots[0]); if ( _ff_one(f[1]) ) return 1; if ( roots ) {  _ff_set_one(roots[1]); _ff_set(roots[2],_ff_negone); } return 3; }
277
		if ( _ff_zero(f[1]) ) { if ( roots ) { _ff_neg(roots[0],f[0]); _ff_neg(roots[0],f[0]); _ff_neg(roots[0],f[0]); } return 3; }
278
		if ( _ff_one(f[1]) ) { if ( roots ) _ff_set(roots[0],f[0]);  return 1; } else return 0;
279
	}
280
	
281
	// f=x^3+ax
282
	if ( _ff_zero(f[0]) ) {
283
		_ff_neg(t,f[1]);  
284
		if ( ! roots ) { return (ff_residue(t)?3:1); }
285
		_ff_set_zero(roots[0]);
286
		if ( ! ff_sqrt(roots+1,&t) ) return 1;
287
		_ff_neg(roots[2],roots[1]);
288
		return 3;
289
	}
290
	
291
	// f = x^3+b
292
	if ( _ff_zero(f[1]) ) {
293
		_ff_neg(t,f[0]);
294
		if ( _ff_p1mod3 ) {
295
			if ( ! ff_cbrt(&d,&t) ) return 0;
296
			if ( roots ) { _ff_set(roots[0],d);  _ff_mult(roots[1], roots[0], _ff_cbrt_unity);  _ff_mult(roots[2], roots[1], _ff_cbrt_unity); }
297
			return 3;
298
		} else {
299
			if ( roots ) if ( ! ff_cbrt(roots,&t) ) { printf ("Impossible, ff_cbrt failed on input %ld when p=%ld is 2 mod 3\n", _ff_get_ui(t), _ff_p); exit(0); }
300
			return 1;
301
		}
302
	}
303

304
	if ( ! pd ) {
305
		_ff_square(t0,f[0]);  _ff_set_ui(t1,27);  _ff_mult(t2,t0,t1);			// t2 = 27f1^2
306
		_ff_square(t0,f[1]);  _ff_mult(t1,t0,f[1]);  _ff_x2(t1); _ff_x2(t1);		// t1 = 4f0^3
307
		_ff_add(t0,t1,t2);  _ff_mult(t,t0,_ff_third);						// t = -D/3 = (27f1^2+4f0^3)/3
308
		if ( _ff_zero(t) ) {											// t=0 => D=0 => curve is singular
309
			if ( roots ) {											// distinct root is 3b/a (we know a!=0), and (-3b/(2a) is a double root).
310
				_ff_invert(t0,f[1]);									// 1/a
311
				_ff_add(t1,f[0],f[0]);  _ff_addto(t1,f[0]);					// 3b
312
				_ff_mult(roots[0],t0,t1);								// 3b/a
313
				_ff_neg(t0,roots[0]);
314
				_ff_mult(roots[1],t0,_ff_half);							// -3b/(2a)
315
				_ff_set(roots[2],roots[1]);
316
			}
317
			return 3;											
318
		}
319
		sts = ff_sqrt_ext(&d,&t);										// use extended sqrt so we get a solution in F_p^2 if necessary
320
	} else {
321
		_ff_set(d,*pd);
322
	}
323
	_ff_mult(t2,_ff_half,_ff_third);									// t2=1/6 (used later)
324
	_ff_mult(t0, _ff_half, f[0]);
325
	_ff_neg(a,t0);												// a = -f[0]/2 (used later)
326
	if ( _ff_p1mod3 ) {
327
		if ( ! sts ) {											// p=1mod3 => -1/3 is a QR => (t not a QR => D not a QR => f has an even # of factors (by Stickleberger), so 1 root
328
			if ( ! roots ) return 1;								// if all the caller wants is the number of roots, we are done
329
			_ff_square(t0,t2);									// 1/36
330
			_ff_mult(s,t,t0);									// s = -D/108 is a not a QR, we will now work in the field F_p^2 = F_p[z]/(z^2-s) to compute (z+a)^1/3
331
			// compute (z+t1)^((p+1)*m) in F_p^2, this will be in the 3-Sylow subgroup of F_p^2, necessarily in F_p (here p=3^e*m+1, m not divisible by 3)
332
			// in the process, we compute g=(z+1)^((m-i)/3) where i is congruent to m mod 3 and h=(z+a)^m
333
			if ( _ff_p3_m1mod3 ) {
334
				ff_poly_xpan_mod_d2(g,a,(_ff_p3_m-1)/3,&s);			// g = (z+a)^((m-1)/3)
335
				ff2_square_s(h,g,s);  ff2_mult_s (h,h,g,s);			// h = g^3 = (z+a)^(m-1)
336
				ff2_mult_zpa_s(h,h,a,s);							// h = g^3*(z+a) = (z+a)^m
337
			} else {
338
				ff_poly_xpan_mod_d2(g,a,(_ff_p3_m-2)/3,&s);			// g = (z+a)^((m-2)/3)
339
				ff2_square_s (h, g, s);  ff2_mult_s (h, h, g, s);		// h = g^3 = (z+a)^(m-2)
340
				_ff_square(t0,a); _ff_add(w[1],a,a);  _ff_add(w[0],t0,s);	// w = (z+a)^2
341
				ff2_mult_s (h,h,w,s);							// h = g^3*(z+a)^2 = (z+a)^m
342
			}
343
			ff2_norm_s (&t,h,s);								// norm(h)=((z+a)^(m*(p+1)) is in the 3-Sylow of F_p
344
			if ( ! ff_3Sylow_invcbrt(&r,&t) ) {						// (z+a) is not a cubic residue - this should be impossible
345
				printf ("(z+%ld)^%ld = ", _ff_get_ui(a), _ff_p3_m); ff_poly_print(h,1);
346
				printf ("norm(h)=%ld is not a cube in F_%ld\n", _ff_get_ui(t), _ff_p);
347
				printf ("z+%ld is not a CR in F_p[z]/(z^2-%ld)\n", _ff_get_ui(a), _ff_get_ui(s));
348
				ff_poly_print(f,3);
349
				exit (0);
350
			}
351
			// we now know (z+a)^-((p+1)m)/3
352
			ff2_norm_s(&t,g,s);									// t = norm(g) = g^(p+1)
353
			ff2_exp_ui_s(h,h,(_ff_p+1)/(3*_ff_p3_m),s);				// compute h^(3^(e-1))
354
			if ( !_ff_p3_m1mod3 ) {
355
				// We need to construct (z+a)^n where n = (2(p+1)m+1)/3 = 2(p+1)(m-2)/3 + 2(2p+1)/3 + 1 = 2(p+1)(m-2)/3 + 2(2*3^(e-1)*m+1) + 1
356
				// We then have (z+a)^n = norm(g)^2*((h^(3^(e-1))^2*(z+a))^2*(z+a)   (note that we also need to square r to get (z+a)^(-2(p+1)m/3) here)
357
				ff_square(t,t);  ff_square(r,r);
358
				ff2_square_s(h,h,s);  ff2_mult_zpa_s (h,h,a,s);  ff2_square_s(h,h,s);
359
			}// when m is 1 mod 3:
360
				// We need to construct (z+a)^n where n = ((p+1)m+1)/3 = (p+1)(m-1)/3 + (p-1)/3 + 1 = (p+1)(m-1)/3 + 3^(e-1)*m + 1
361
				// We then have (z+a)^n = norm(g)*h^(3^(e-1))*(z+a) 
362
			ff2_scalar_mult(g,t,h);								
363
			ff2_mult_zpa_s(g,g,a,s);								// g = (z+a)^n
364
			ff2_scalar_mult(g,r,g);								// g = (z+a)^(1/3)
365
			// We know that (g-f1/(3g)) is a root of f.  To get others we multiply by cube roots of unity (which are conveniently in F_p).
366
			// We use this to find the root in F_p (there must be exactly one, since we know f has 2 factors by Stickleberger)
367
			// The multiple h of g yielding a root in F_p must have the property that -f1/3=norm(h), which we can test without inverting, and we then have tr(h) as our root
368
			_ff_mult(t0,f[1],_ff_third);
369
			ff_negate(t0);
370
			for ( i = 0 ; i < 3 ; i++ ) {  ff2_norm_s(&t,g,s);  if ( _ff_equal(t,t0) ) break;  ff2_scalar_mult(g,_ff_cbrt_unity,g); }
371
			if ( i == 3 ) {
372
				printf ("g=%ldz+%ld is a cbrt of (z+%ld) in F_p[z]/(z^2-%ld), but couldn't find an F_%ld root of 2-factor f = ", _ff_get_ui(g[1]), _ff_get_ui(g[0]),_ff_get_ui(a), _ff_get_ui(s), _ff_p);
373
				ff_poly_print(f,3);
374
				printf ("t0=%ld, cube root of unity is %ld\n", _ff_get_ui(t0), _ff_get_ui(_ff_cbrt_unity));
375
				exit (0);
376
			}
377
			ff2_trace(roots,g);
378
			return 1;
379
		} else {												// in this case have a sqrt of t and we know that f has 1 or 3 factors
380
			ff_mult(d,d,t2);									// d = sqrt(t)/6 = sqrt(f1^3/27+f0^2/4) (this is the square root in Cardano's formula for depressed cubics)
381
			_ff_add(t,a,d);										// t = -f0/2 + d (first cube to check)
382
			if ( ! ff_cbrt(&r,&t) ) return 0;
383
			_ff_sub(t,a,d);										// t = -f0/2 - d (second cube to check - we really ought to be able to compute this from the first one)
384
			if ( ! ff_cbrt(&s,&t) ) { printf("Impossible cube root failure in ff_poly_roots_d3 for p=%ld, f = ", _ff_p); ff_poly_print(f,3);  exit(0); }
385
			if ( ! roots ) return 3;								// we know the roots exist but we don't need to find them
386
			// We have to cycle through the choices of one of our cube roots to verify that we have the correct s and t, unforutnately this means we may need to evaluate f twice
387
			_ff_add(t,r,s);
388
			_ff_square(t0,t);  _ff_addto(t0,f[1]);  _ff_multadd(t1,t0,t,f[0]);
389
			if ( ! _ff_zero(t1) ) {
390
				ff_mult(s,s,_ff_cbrt_unity);						// try second s
391
				_ff_add(t,r,s);
392
				_ff_square(t0,t);  _ff_addto(t0,f[1]);  _ff_multadd(t1,t0,t,f[0]);
393
				if ( ! _ff_zero(t1) ) ff_mult(s,s,_ff_cbrt_unity);			// must be the third s
394
			}
395
			_ff_add(roots[0],r,s);								// third choice must work since first two didn't
396
			_ff_square(t0,_ff_cbrt_unity);							// could cache this in ff.c
397
			_ff_mult(t1,r,_ff_cbrt_unity);
398
			_ff_mult(t2,s,t0);
399
			_ff_add(roots[1],t1,t2);								// second root is wr+w^2s where w is a primitive cbrt of unity
400
			_ff_mult(t1,r,t0);
401
			_ff_mult(t2,s,_ff_cbrt_unity);
402
			_ff_add(roots[2],t1,t2);								// third root is wr^2+ws
403
			return 3;
404
		}
405
	} else {													// p=2mod3
406
		if ( sts ) {											// p=2mod3 => -1/3 not a QR => (-D/3 a QR => D not a QR => f has an even # of factors (by Stickleberger))
407
			if ( ! roots ) return 1;								// if we only need the # of factors, we are done.
408
			ff_mult(d,d,t2);									// d = sqrt(t)/6 = sqrt(f1^3/27+f0^2/4) (this is the square root in Cardano's formula for depressed cubics)
409
			_ff_add(t,a,d);										// t = -f0/2 + d (first cube)
410
			ff_cbrt(&r,&t);										// p is 2 mod 3 so we know there is exactly 1 cube root
411
			_ff_sub(t,a,d);										// t = -f0/2 - d (second cube)
412
			ff_cbrt(&s,&t);
413
			_ff_add(t,r,s);										// there was no choice of cuberoots (and changing sign of d doesn't change r+s=t), so this has got to be the root
414
			_ff_set(roots[0],t);
415
			return 1;
416
		} else {												// in this case t = -D/3 is not a QR, D=d^2 is a QR, and we know that f has 1 or 3 factors
417
			// for p=2mod3, handle roots=NULL separately, since we can be much more efficient
418
			if ( ! roots ) {
419
				_ff_square(t0,t2);								// 1/36
420
				_ff_mult(s,t,t0);								// s = -D/108 is a not a QR, we will now work in the field F_p^2 = F_p[z]/(z^2-s)
421
				ff_poly_xpan_mod_d2(g,a,(_ff_p+1)/3,&s);			// (z+a)^((p+1)/3) is in F_p iff (z+a)^((p+1)(p-1)/3)=1 iff (z+a) is a cubic residue
422
				return ( _ff_zero(g[1]) ? 3 : 0 );
423
			}
424

425
			// in this case we will be slightly less efficient and work in the standard basis for F_p^2=F_p[z](z^2-s) because we
426
			// want don't want to have to compute a new generator of the Sylow 3-subgroup in F_p^2 every time 
427
			_ff_set(v[0],a);  _ff_mult(v[1],d,t2);						// v is the element of F_p^2 (in the standard basis) whose cuberoot we need
428
			// this code is slightly inefficient, but we are trying to compute h=v^((p+1)/3) as quickly as possible (if h is not in F_p, f is irreducible and we are done)
429
			// while also saving values we will need to compute a cube root of v when f splits
430
			if ( _ff_p3_m1mod3 ){ k=1; ff2_set(vk,v); }
431
			else { k=2; ff2_square(vk,v); }						// vk = v^k
432
			ff2_exp_ui(g,v,(_ff_p3_m-k)/3);							// g = v^((m-k)/3)
433
			ff2_square(h,g);  ff2_mult(h,h,g);						// h = v^(m-k)
434
			ff2_mult(h,h,vk);									// h = v^m
435
			if ( _ff_p3_e > 1 ) {
436
				ff2_exp_ui(w,h,(_ff_p+1)/(3*_ff_p3_m)-1);				// w = v^((3^(e-1)-1)m) = v^(((p+1)-3m)/3)
437
				_ff_mult(t1,w[1],h[0]);  _ff_mult(t2,w[0],h[1]);			// compute z coefficient of w*h
438
				_ff_add(t0,t1,t2);
439
				if ( ! _ff_zero(t0) ) return 0;						// if w*h=v^((p+1)/3) is not in F_p, then v^((p^2-1)/3)=h^(p-1) is not 1 and v is not a cube, and o.w. it is
440
			} else {
441
				if ( ! _ff_zero(h[1]) ) return 0;						// here h=v^m=v^((p+1)/3) since e=1
442
			}
443
			// We now know v is a cube and there are three roots
444
			if ( ! roots ) return 3;
445
			ff2_norm(&t,g);									// t = N(g)=g^(p+1)=v^((p+1)(m-k)/3) where k=m mod 3
446
			ff2_scalar_mult(g,t,g);								// g = v^((p+2)(m-k)/3)
447
			// When e==1, life is pretty simple: we know v is a cube, and there is only one cube in the Sylow 3-subgroup, name 1, and its inverse cube root is 1
448
			// It follows that we can assume v^((p-1)m/3)=v^((p-2)m/3)=1.
449
			if ( _ff_p3_e == 1 ) {
450
				if ( k==2 ) {
451
					ff2_mult(g,g,h);							// g = v^((p+2)(m-2)/3+m)=v^((pm+2m-2p-4+3m)/3)=v^((pm+2m-2(3m-1)-4+3m)/3)=v^(((p-1)m-2)/3)
452
				} else {										// note that e=1 and k=1 implies g=v^((p+1)(m-1)/3) = v^(((p-1)m-1)/3)
453
					ff2_square(g,g);							// g=v^((2(p-1)m-2)/3)
454
				}
455
				ff2_mult(g,g,v);								// g = v^(((3-k)(p-1)m+1)/3) = v^(1/3}
456
				ff2_trace(roots,g);								// Every choice of cube root of v corresponds to a root of f, so we can just take the trace
457
				ff2_setup_cbrt();								// we still need to get the cube root of unity (this is annoying)
458
				ff2_mult(g,g,_ff2_cbrt_unity);
459
				ff2_trace(roots+1,g);
460
				ff2_mult(g,g,_ff2_cbrt_unity);
461
				ff2_trace(roots+2,g);
462
				return 3;
463
			}
464
			// now we have e>1
465
			if ( k==2 ) { ff2_square(w,w);  ff2_mult(w,w,h); }			// w = v^((k*3^(e-1)-1)m)
466
			ff2_mult(g,g,w);									// g = v^(((p-1)m-k)/3)  (note that (p+2)(m-k)/3 + (k*3^(e-1)-1)m = (pm+2m-kp-2k+k3^em-3m)/3
467
															//                                                                                                     = (pm-m-kp-2k+k(p+1))/3=((p-1)m-k)/3
468
			ff2_square(h,g);  ff2_mult(h,h,g);						// h = v^((p-1)m-k)
469
			ff2_mult(h,h,vk);									// h = v^((p-1)m) is in the 3-Sylow subgroup, and it must be a cubic residue
470
			if ( ! ff2_3Sylow_invcbrt(h,h) ) {						// h = v^(-(p-1)m)/3)  if e=1 
471
				printf ("Impossible situation in ff_poly_factors_g1 with p=%ld,m=%ld:  v^((p-1)m)=(%ldz+%ld) is not a cube in 3-Sylow of F_p^2=F_p[z]/(z^2-%ld) ",
472
					   _ff_p, _ff_p3_m, _ff_get_ui(a), _ff_get_ui(h[1]), _ff_get_ui(h[0]), _ff_get_ui(s));
473
				ff_poly_print(f,3);
474
				exit (0);
475
			}
476
			if ( k==1 ) {
477
				ff2_square(h,h);								// h = v^(-2(p-1)m/3)
478
				ff2_square(g,g);								// g = v^(2((p-1)m-1)/3)
479
				ff2_mult(g,g,v);								// g = v^(2((p-1)m-1)/3+1) = v^((2(p-1)m+1)/3
480
			} else {
481
				ff2_mult(g,g,v);								// g = v^((p-1)m-2)/3+1 = v^((p-1)m+1/3
482
			}
483
			ff2_mult(g,g,h);									// g = v^(1/3)
484
			ff2_trace(roots,g);									// Every choice of cube root of v corresponds to a root of f, so we can just take the trace of each
485
			ff2_mult(g,g,_ff2_cbrt_unity);
486
			ff2_trace(roots+1,g);
487
			ff2_mult(g,g,_ff2_cbrt_unity);
488
			ff2_trace(roots+2,g);
489
			return 3;
490
		}
491
	}
492
	puts ("Unreachable code in ff_poly_factors_g1!");
493
	exit (0);
494
}
495

496
// Compute roots of poly of degree 2, or less
497
int ff_poly_roots_d2 (ff_t r[2], ff_t f[3], int d_f)
498
{
499
	ff_t t1,t2,D;
500
	
501
	switch (d_f) {
502
	case 0:  return 0;
503
	case 1: 
504
		if ( _ff_one(f[1]) ) { _ff_set_one(t1); } else { _ff_invert(t1,f[1]); }
505
		ff_negate(t1); _ff_mult(r[0],t1,f[0]);  return 1;
506
	case 2:
507
		_ff_square(t1,f[1]);
508
		_ff_mult(t2,f[2],f[0]);
509
		_ff_x2(t2);  _ff_x2(t2);
510
		_ff_subfrom(t1,t2);
511
		if ( ! ff_sqrt(&D,&t1) ) return 0;
512
		if ( _ff_one(f[2]) ) {					
513
			_ff_set(t1,_ff_half);
514
		} else {
515
			_ff_add(t2,f[2],f[2]);
516
			_ff_invert(t1,t2);
517
		}
518
		_ff_sub(t2,D,f[1]);
519
		_ff_mult(r[0],t1,t2);
520
		ff_negate(D);
521
		_ff_sub(t2,D,f[1]);
522
		_ff_mult(r[1],t1,t2);
523
		return 2;
524
	}
525
}
526

527

528
/*
529
	ff_poly_g1_3tor computes the size of the 3-torsion subgroup of y^2=f(x) = x^3+ax+b.
530
	If the 3-torsion is trivial, it will instead attempt to determine the 3-torsion subgroup of the twist, and if it is non-trivial
531
	will return a negative value whose absolute value is the size of the 3-torsion subgroup in the twist.
532

533
        This means that a return value of 1 indicates that neither the curve nor its twist have non-trivial 3-torsion.
534
	In this situation, if p=1mod3, then a_p=0mod3 and the group order must be congruent to 2 mod 3.
535

536
	If the curve is singular (discriminant zero), 0 is returned.
537
*/
538
int ff_poly_g1_3tor (ff_t f[4])
539
{
540
	ff_t div[5],r[4],g[4],t0,t1, t2, d;
541
	int j, k, tor;
542
	
543
#if FF_WORDS > 1
544
	err_printf ("ff_poly_g1_3tor only supports single word finite fields\n"); exit (0);
545
#endif
546

547
	if ( _ff_p == 3 ) { if ( _ff_zero(f[1]) ) return 0; else return 1; }					// no curve y^2=x^3+ax+b over F_3 has 3 torsion
548
	
549
	_ff_square(t1,f[0]); _ff_set_ui(t2,27); _ff_mult(t0,t1,t2);							// t0 = 27f0^2
550
	_ff_square(t2,f[1]); _ff_mult(t1,t2,f[1]); _ff_x2(t1); _ff_x2(t1);						// t1 = 4f1^3, t2 = f1^2
551
	_ff_add(d, t0, t1);														// d = 4f1^3+27f0^2 = -discriminant of f
552
	if ( _ff_zero(d) ) return 0;
553
	_ff_add(t0,_ff_third, _ff_third); _ff_x2(t0); _ff_square(t1,t0);						// t0 = (4/3)^2 = 16/9
554
	ff_mult(d,d,t0);														// d^2 = 256/81*D_f^2 = -D/3 where D is discriminant of the 3-div poly
555

556
	ff_negate(t2);
557
	ff_mult(div[0],_ff_third,t2);	
558
	_ff_add(div[1],f[0],f[0]); _ff_x2(div[1]);
559
	_ff_add(div[2],f[1],f[1]);
560
	_ff_set_zero(div[3]);
561
	_ff_set_one(div[4]);
562
	// 3-division poly is x^4 + 2f1x^2 +4f0x - f1^2/3
563

564
	// handle p=2mod3 ourselves since we can easily save some time.  note that in this case, either both the curve and its twist have non-trivial 3-torsion, or neither do.
565
	// we also know that the 3-rank is 1 and there can be at most one root of the 3-division poly (2 pts of order 3).
566
	if ( ! _ff_p1mod3 ) {
567
		if ( _ff_zero(div[1]) ) {												// biquadratic case is handled quickly by ff_poly_roots
568
			k = _ff_poly_roots_d4(r,div,&d);
569
			return ( k ? 3 : 1);
570
		} else {
571
			ff_depressed_cubic_resolvent(&t0,g,div);
572
			k = _ff_poly_roots_d3(r,g,&d);
573
			if ( k != 1 ) { printf ("p=%ld is 2mod3 but cubic resolvent of 3div poly has k=%d!=1 roots!\n  f=", _ff_p, k); ff_poly_print(f,3); exit(0); }
574
		}
575
		_ff_sub(t1,t0,r[0]);
576
		return ( ff_residue(t1) ? 3 : 1);
577
	}
578
	
579
	// Now handle p=1mod3 by computing the roots of the 3-division polynomial (we actually only need to know the number of roots and the value of one of them)
580
	k = _ff_poly_roots_d4(r,div,&d);
581
	if ( ! k ) return 1;
582
	_ff_square(t1,r[0]);  _ff_addto(t1,f[1]); _ff_mult(t2,t1,r[0]); _ff_addto(t2,f[0]);		// t2 = f(r[0])
583
	if ( _ff_zero(t2) ) { printf ("impossible - non-id point (%lu,0) with 2-tor and 3-tor!\n", _ff_get_ui(r[j]));  ff_poly_print(f,3);  ff_poly_print(div,4); exit(0); }
584
	
585
	// We now rely on the fact that the 3-division poly of the curve and its twist are intimately related.  They have the same factorization pattern,
586
	// and either all the roots of the 3-division poly correspond to points on the curve, or none of them do, so we only need to check one.
587
	// In the latter case, it must be that all the roots of the 3-division poly of the twist correspond to points on the twist.
588
	if ( ff_residue(t2) ) return ( k==1?3:9); else return (k==1?-3:-9);
589
}
590

591
#if SMALLJAC_GENUS==2
592
int ff_poly_g2_3tor(ff_t f[6])
593
{
594
	ff_t g[POLY_G2TOR3_DEGREE+1], h[POLY_G2TOR3_DEGREE], r[POLY_G2TOR3_DEGREE];
595
	int k, d_h, d_r;
596
	
597
	ff_poly_g2tor3_modpoly(g,f);
598
	
599
	// first make sure modular poly is square free
600
	ff_poly_derivative (h,&d_h,g,POLY_G2TOR3_DEGREE);
601
	ff_poly_gcd (r,&d_r,g,POLY_G2TOR3_DEGREE,h,d_h);
602
	if ( d_r > 0 ) return 0;
603
	
604
	k = ff_poly_roots(g,40);
605
	if ( ! k ) return 1;
606
	if ( k ==2 ||k==5 ||k==8 ) return 3;
607
	return 0;
608
}
609
#endif
610

611
/*
612
	Given a point (x,y) on y^2=f(x)=x^3+f1x+f0, replaces (x,y) with a point (u,v) s.t. (u,v) composed with itself yields (x,y) or its inverse (we don't distinguish these cases!)
613
	Returns 1 for success, 0 if no such point exists.
614
*/
615
int ff_poly_g1_halve (ff_t *x, ff_t *y, ff_t f[4])
616
{
617
	ff_t g[5], r[4];
618
	register ff_t t0, t1, t2;
619
	int i, k;
620
	
621
	// construct g(z) = z^4 - 2(f1+3x^2)z^2 - 8y^2z + f1^2 - 3x(y^2+f1x + 3f0).  If (u,v)*(u,v)=(x,y) then g(u-x)=0 (note translation by x to kill degree 3 coeff of g).
622
	_ff_set_one(g[4]);  _ff_set_zero(g[3]);
623
	_ff_square(t0,*x);  _ff_add(t1,t0,t0); _ff_addto(t1,t0);  _ff_addto(t1,f[1]); _ff_x2(t1); _ff_neg(g[2],t1);			// g[2] = -2(f1+3x^2)
624
	_ff_add(t0,*y,*y); _ff_square(t1,t0); _ff_x2(t1); _ff_neg(g[1],t1);										// g[1] = -8y^2
625
	_ff_square(t0,*y); _ff_mult(t1,f[1],*x); _ff_addto(t0,t1); _ff_add(t1,f[0],f[0]); _ff_addto(t1,f[0]); _ff_addto(t0,t1);	// t0 = y^2 + f1x + 3f0
626
	_ff_add(t1,*x,*x); _ff_addto(t1,*x); _ff_mult(t2,t0,t1); _ff_square(t1,f[1]);  _ff_sub(g[0],t1,t2);				// g[0] = f1^2 - 3x(y^2+f1x+3f0)
627
//printf ("(%d,%d) havling poly: ", _ff_get_ui(*x), _ff_get_ui(*y)); ff_poly_print(g,4);
628
	
629
	k  = ff_poly_roots_d4 (r, g);
630
//printf ("%d roots\n", k);
631
	for ( i = 0 ; i < k ; i++ ) {
632
		_ff_add(t2,r[i],*x);
633
		_ff_square(t0,t2);  _ff_addto(t0,f[1]);  _ff_mult(t1,t0,t2); _ff_add(g[1],t1,f[0]);						// g[1] = f(r[i]+x)
634
		if ( ff_sqrt(g,g+1) ) break;
635
	}
636
	if ( i == k ) return 0;
637
//printf ("found (%d,%d)^2 = (%d,%d)\n", _ff_get_ui(t2), _ff_get_ui(g[0]), _ff_get_ui(*x), _ff_get_ui(*y));
638
	_ff_set(*x,t2); _ff_set(*y,*g);
639
	return 1;
640
}
641

642

643
/*
644
	Returns the size of the 2-Sylow subgroup of the elliptic curve y^2=f(x)=x^3+f1x+f0, provided it is cyclic.
645
	Otherwise the return value is -1, which indicates that Z/2Z x Z/2Z is a subgroup (and the group order is divisible by 4)
646
        (we could compute the entire 2-Sylow in this case, but it would be much more time consuming)
647
*/
648
int ff_poly_g1_2Sylow (ff_t f[4])
649
{
650
	ff_t r[3];
651
	ff_t x, y;
652
	int n;
653
	
654
	n = ff_poly_roots_d3(r,f);
655
	if ( ! n ) return 1;
656
	if ( n > 1) return 0;
657
	
658
	_ff_set(x,r[0]);  _ff_set_zero(y);
659
	for ( n = 2 ; ff_poly_g1_halve (&x, &y, f) ; n<<= 1 );
660
	return n;
661
}
662

663

664
/*
665
	Computes the order and rank of the 4-torsion subgroup of the elliptic curve y^2=f(x)=x^3+f1x+f0. 
666
	Set o to the order and returns d to indicate the group Z/dZ x Z/eZ where d*e = o, d divides e (and may be 1)
667

668
	If flag8 is set, also check for Z/8Z (for a random curve with full Galois image in GL(2,Z/8Z), these occurs with probability 1/8).
669
	We could also check for Z/2ZxZ/8Z, or even Z/4ZxZ/8Z or Z/8ZxZ/8Z (these occur with prob. 3/64, 3/512 and 1/1536 resp);
670
	
671
	The latter two are rare enough not to be worth the trouble, but Z/2xZ/8Z might be worth doing.  Note, however, that Z/2xZ/4Z
672
	contains 4 points of order 4 (two pairs of inverses) and we would need to compute two of these.
673
*/
674
int ff_poly_g1_4tor (int *o, ff_t f[4], int flag8)
675
{
676
	ff_t r[3];
677
	ff_t u, v, v2;
678
	register ff_t t0,t1,t2;
679
	register int i, n;
680
	
681
	n = ff_poly_roots_d3(r,f);
682
	for ( i = 0 ; i < n ; i++ ) {
683
		_ff_square(t0,r[i]); 	_ff_add(t1,t0,t0); _ff_addto(t1,t0); _ff_add(u,t1,f[1]);			// u = 3x^2+a where (x,0) is a 2-torsion point (x=r[i] was a root of f)
684
		if ( ff_sqrt(&v,&u) ) {													// v is a root of the translated halving poly
685
			_ff_add(t2,r[i],v);													// t2=x+v is a root of the halving poly
686
			_ff_square(t0,t2);  _ff_addto(t0,f[1]);  _ff_mult(t1,t0,t2); _ff_add(u,t1,f[0]);		// u = f(t2)
687
			if ( ff_sqrt(&v,&u) ) break;											// if successful, (t2,v) is a point of order 4
688
			_ff_sub(t2,r[i],v);													// t2=x-u is also a root of the halving poly
689
			_ff_square(t0,t2);  _ff_addto(t0,f[1]);  _ff_mult(t1,t0,t2); _ff_add(u,t1,f[0]);		// u = f(t2)
690
			if ( ff_sqrt(&v,&u) ) break;											// if successful, (t2,v) is a point of order 4
691
		}
692
	}
693
	if ( i == n ) { *o = n+1;  return (n==3?2:1); }									// no pts of order 4, so 4-torsion subgroup = 2-torsion subgroup
694
//printf ("%ld: found point(%ld,%ld) with order 4 (i=%d,n=%d) on curve y^2 = ", _ff_p, _ff_get_ui(t2), _ff_get_ui(v), i, n); ff_poly_print(f,3);
695
	if ( n==1 ) {																// rank 1 case, with a point of order 4
696
		if ( ! flag8 ) { *o = 4;  return 1; }
697
//printf("%ld: flag8 set (Z/4Z), attempting to halve order 4 point (%ld,%ld) on curve y^2 = ", _ff_p, _ff_get_ui(t2), _ff_get_ui(v)); ff_poly_print(f,3);
698
		_ff_set(u,t2);
699
		*o = ( ff_poly_g1_halve(&u,&v,f) ? 8 : 4 );									// check for a point of order 8 (this takes us outside the 4-torsion subgroup)
700
//if ( *o == 8 ) puts ("Succeeded"); else puts ("Failed");
701
		return 1;
702
	}
703
	if ( i > 0 ) { *o = 8; return 2; }	
704
		
705
	// We are here if the 2-rank is 2 and we succesfully halved the first point of order 2 that we tried.  We need to check one more to distinguish Z/2Z x Z/4Z from Z/4Z x Z/4Z.
706
	// Don't change v or t2 from above, we may need them below
707
	i = 1;
708
	_ff_square(t0,r[i]); 	_ff_add(t1,t0,t0); _ff_addto(t1,t0); _ff_add(u,t1,f[1]);			// u = 3x^2+a where (x,0) is a 2-torsion point (x=r[0] was a root of f)
709
	if ( ff_sqrt(&v2,&u) ) {													// v2 is a root of the translated halving poly
710
		_ff_add(t1,r[i],v2);													// t1is a root of the halving poly
711
		_ff_square(t0,t1);  _ff_addto(t0,f[1]);  ff_mult(t1,t1,t0); _ff_addto(t1,f[0]);		// t1 = f(t1)
712
		if ( ff_residue(t1) ) { *o = 16; return 4; }
713
	}
714
	*o = 8;
715
	return 2;
716
}
717

718
/*
719
	Computes the roots of f(x)=x^4+ax^2+bx+c, solving by radicals.
720
	Handles singular f, will return repeated roots correctly.
721

722
	Currently r is required.  We could easily extend to make it optional and only return a count.
723
	The parameter pd is optional.  If specified it points to sqrt(-D/3) (used for 3-torsion).
724
*/
725
int _ff_poly_roots_d4 (ff_t r[4], ff_t f[5], ff_t *pd)
726
{
727
	ff_t g[4], h[3], s[3], x, y, t;
728
	register ff_t t0,t1,t2,w1,w2;
729
	register int i,k,res;
730

731
	if ( _ff_zero(f[1]) ) {
732
		// f = x^4+f2x^2+f0 = g(x^2) where g is monic quadratic
733
		_ff_set_one(g[2]);  _ff_set(g[1],f[2]);  _ff_set(g[0],f[0]);
734
		k = ff_poly_roots_d2(s,g,2);
735
		if ( ! k ) return 0;
736
		if ( ff_sqrt(r,s) ) {
737
			_ff_neg(r[1],r[0]);
738
			// could optimize for double root here, but why bother
739
			if ( ff_sqrt(r+2,s+1) ) { _ff_neg(r[3],r[2]); return 4; }
740
			return 2;
741
		}
742
		if ( ff_sqrt(r,s+1) ) { _ff_neg(r[1],r[0]); return 2; }
743
		return 0;
744
	}
745
	ff_depressed_cubic_resolvent(&t,g,f);
746
	k = _ff_poly_roots_d3(r,g,pd);				// put the roots of g into r for now, they will get replaced by roots of f below.
747
	if ( k ) {
748
		res = 0;
749
		for ( i = 0 ; i < k ; i++ ) {
750
//			ff_poly_eval (&y, g, 3, r+i);
751
//			if ( ! _ff_zero(y) ) { printf ("(%d roots) %ld is not a root of ", k, _ff_get_ui(r[i]));  ff_poly_print(g,3); exit(0); }
752
			_ff_sub(x,r[i],t);
753
			ff_negate(x);
754
			if ( ! ff_sqrt(s+i,&x) ) break;
755
			if ( ++res == 2 ) break;
756
		}
757
		if ( res==2 ) {
758
			_ff_mult(t1,s[0],s[1]);
759
			ff_negate(t1);
760
			_ff_invert(t0,t1);
761
			_ff_mult(s[2],t0,f[1]);
762
			_ff_square(t1,s[2]);
763
			_ff_sub(x,r[2],t);
764
			ff_negate(x);
765
			if ( ! _ff_equal(t1,x) ) { printf("%ld: division failed s[0]=%d, s[1]=%d, d=%d\n", _ff_p, _ff_get_ui(s[0]), _ff_get_ui(s[1]), _ff_get_ui(t0));  ff_poly_print(f,4); ff_poly_print(g,3); exit(0); }
766
			_ff_add(t1,s[0],s[1]);  _ff_addto(t1,s[2]); ff_mult(t1,t1,_ff_half);
767
			_ff_set(r[0],t1);
768
			ff_negate(s[0]); ff_negate(s[2]);
769
			_ff_add(t1,s[0],s[1]);  _ff_addto(t1,s[2]); ff_mult(t1,t1,_ff_half);
770
			_ff_set(r[1],t1);
771
			ff_negate(s[0]); ff_negate(s[1]);
772
			_ff_add(t1,s[0],s[1]);  _ff_addto(t1,s[2]); ff_mult(t1,t1,_ff_half);
773
			_ff_set(r[2],t1);
774
			ff_negate(s[0]); ff_negate(s[2]);
775
			_ff_add(t1,s[0],s[1]);  _ff_addto(t1,s[2]); ff_mult(t1,t1,_ff_half);
776
			_ff_set(r[3],t1);
777
			return 4;
778
		} else if ( k==1 && res==1 ) {
779
			_ff_set_one(h[2]);
780
			_ff_set(h[1],s[0]);
781
			_ff_square(t1,s[0]);						// t1 = s^2
782
			_ff_add(w1,t1,f[2]);
783
			ff_mult(w1,w1,_ff_half);					// w1 = (s^2+f2)/2
784
			_ff_square(t0,f[2]);
785
			_ff_add(t2,f[0], f[0]); _ff_x2(t2);			// t2 = 4f0
786
			_ff_subfrom(t0,t2);						// t0 = f2^2-4f0
787
			_ff_add(t2,f[2],f[2]);					// t2 = 2f2
788
			_ff_add(w2,t1,t2);						// w2 = s^2+2f2
789
			ff_mult(w2,w2,t1);						// w2 = s^4+2f2s^2
790
			_ff_addto(w2,t0);
791
			ff_mult(w2,w2,s[0]);						// w2 = s(s^4+2f2s^2+f2^2-4f0)				
792
			_ff_invert(t0,f[1]);
793
			ff_mult(t1,_ff_half,t0);
794
			ff_mult(w2,w2,t1);
795
			_ff_sub(h[0],w1,w2);
796
			if ( ff_poly_roots_d2(r,h,2) != 2 ) {
797
				ff_negate(h[1]);
798
				_ff_add(h[0],w1,w2);
799
				if ( ff_poly_roots_d2(r,h,2) != 2 ) { printf("%ld: fail 2-1-1 split with s=%ld\n", _ff_p, _ff_get_ui(s[0]));  ff_poly_print(f,4);  ff_poly_print(g,3);  ff_poly_print(h,2); exit(0); }
800
			}
801
			return 2;
802
		} else {
803
			return 0;
804
		}
805
	} else {
806
		// this is the annoying case, we know we have a 1,3 split, but getting the value of the root is expensive
807
		// we need sqrt(-z) in F_p[z]/h(z) where h(z) is the undepressed resolvent
808
		// so we want sqrt(t-z) in F_p[z]/g(z) which is sqrt(z+t) in F_p[z]/k(z) where k(z)=-g(-z)=z^3+g1z-g0z.  In terms of z^3-rz-s, r=-g1 and s=g0
809
		_ff_neg(t1,g[1]);
810
		if ( ff3_trsqrt_zpa_mod_rs(&x,t,t1,g[0]) ) res++; else { printf ("%ld: impossible failure of ff3_trsqrt_zpa_mod_rs in ff_poly_roots_d4!", _ff_p); ff_poly_print(f,4); ff_poly_print(g,3); exit(0); }
811
		ff_mult(x,x,_ff_half);
812
		ff_poly_eval(&y,f,4,&x);
813
		if ( ! _ff_zero(y) ) ff_negate(x);
814
		_ff_set(r[0],x);
815
		return 1;
816
	}
817
	puts ("Unreachable code in ff_poly_factors_g1!");
818
	exit (0);
819
}
820

821
/*
822
	Hard-wired code for computing the roots of f(x)=x^4+ax^2+bx+c.  This is used
823
	primarily for testing for 3-torsion by computing roots of the 3-division poly (made monic).
824
	If f is singular, it only returns distinct roots.
825

826
	It is generally very fast, although when f splits completely more work is required.
827
*/
828
int ff_old_poly_roots_d4 (ff_t r[4], ff_t f[5])
829
{
830
	_ff_t_declare g[5], h[5], A[5], t, a;
831
	int i,j,k,d_g, d_h, d_A;
832

833
#if FF_WORDS > 1
834
	err_printf ("ff_poly_roots_d4 only implemented for single word fields\n");  exit (0);
835
#endif
836
	ff_poly_xn_mod_d4 (g, _ff_p, f);		// compute x^p mod f
837
	_ff_set_one(t);
838
	_ff_subfrom (g[1],t);				// x^p-x mod f
839
	for ( d_g = 3 ; d_g >= 0 && _ff_zero(g[d_g]) ; d_g-- );
840
	ff_poly_gcd (h, &d_h, g, d_g, f, 4);
841
	switch (d_h) {
842
	case 0: return 0;
843
	case 1:
844
	case 2: return ff_poly_roots_d2 (r, h, d_h);
845
	case 3:
846
	case 4:
847
		ff_poly_monic (A, &d_A, h, d_h);
848
		i = j = k = 0;
849
		_ff_set_zero(t);
850
		do {
851
			// if A was just made cubic, it may need to be translated to make x^2 coeff zero
852
			// if this is done, we remember the translation so we can shift roots back at the end
853
			if ( d_A == 3 && ! _ff_zero(A[2]) ) { j = i; ff_depress_cubic(&t, A); }
854
			if ( k ) ff_random(&a);		// use a=0 first time through
855
			if ( d_A == 4 ) {
856
				if ( !k ) {
857
					ff_poly_xn_mod_d4 (g, (_ff_p-1)/2, A);
858
				} else {
859
					ff_poly_xpan_mod_d4 (g, a, (_ff_p-1)/2, A);
860
				}
861
				_ff_dec (g[0]);
862
				for ( d_g = 3 ; d_g >= 0 && _ff_zero(g[d_g]) ; d_g-- );
863
			} else {
864
				if ( !k ) {
865
					ff3_zn_mod (g, (_ff_p-1)/2, A);
866
				} else {
867
					ff_poly_xpan_mod_d3 (g, a, (_ff_p-1)/2, A);
868
				}
869
				_ff_dec (g[0]);
870
				for ( d_g = 2 ; d_g >= 0 && _ff_zero(g[d_g]) ; d_g-- );
871
			}
872
			ff_poly_gcd (h, &d_h, g, d_g, A, d_A);
873
			if ( d_h > 0 && d_h < d_A ) {
874
				ff_poly_div (A, &d_A, g, &d_g, A, d_A, h, d_h);
875
				if ( d_g != -1 ) { err_printf ("inexact poly division in ff_poly_roots_d4!\n");  exit(0); }
876
				if ( d_h < 3 ) {
877
					i += ff_poly_roots_d2 (r+i,h,d_h);
878
					ff_poly_copy (h,&d_h,A,d_A);
879
					ff_poly_monic (A,&d_A,h,d_h);
880
				} else {
881
					i += ff_poly_roots_d2 (r+i,A,d_A);
882
					ff_poly_monic (A,&d_A,h,d_h);
883
				}
884
			}
885
			k++;
886
		} while ( d_A > 2 );
887
		i += ff_poly_roots_d2 (r+i,A,d_A);
888
		// shift any roots found after translation occurred (if it did)
889
		if ( ! _ff_zero(t) ) while ( j < i ) { _ff_subfrom(r[j],t); j++; }
890
		return i;
891
	}
892
	return d_h;
893
}
894

895

896
void ff_poly_xn_mod_d4 (ff_t g[4], unsigned long n, ff_t f[5])
897
{
898
	_ff_t_declare_reg t1,t2,t3,nf0,f02,f12,f22,w1,w2;
899
	register unsigned long j, m;
900
	register int i;
901

902
	for ( i = 0 ; i < 4 ; i++ ) _ff_set_zero(g[i]);
903
	if ( !n ) { _ff_set_one(g[0]);  return; }
904
	i = _asm_highbit(n);
905
	if ( i&1 ) i--;
906
	m = 3UL<<i;
907
	j = (n&m)>>i;
908
	_ff_set_one(g[j]);
909
	m >>= 2;  i -= 2;
910
	if ( m > 1 ) {
911
		_ff_neg(nf0,f[0]);
912
		// these won't get used if no digits of n are 3, we could check for this...
913
		_ff_mult(f02,f[0],f[2]);
914
		_ff_mult(f12,f[1],f[2]);
915
		_ff_square(f22,f[2]);
916
	}
917
	while (m) {
918
		ff_poly_square_mod_d4 (g,g,f);
919
		ff_poly_square_mod_d4 (g,g,f);
920
		j = ((n&m)>>i);
921
		switch (j) {
922
		case 1:	// 3M+2A
923
			_ff_set(t3,g[3]);
924
			_ff_set(g[3],g[2]);
925
			_ff_mult(w2,f[2],t3);
926
			_ff_sub(g[2],g[1],w2);
927
			_ff_mult(w2,f[1],t3);
928
			_ff_sub(g[1],g[0],w2);
929
			_ff_mult(g[0],t3,nf0);
930
			break;
931
		case 2: // 6M+3A
932
			_ff_set(t3,g[3]);
933
			_ff_set(t2,g[2]);
934
			_ff_mult(w1,f[2],t3);
935
			_ff_sub(g[3],g[1],w1);
936
			_ff_mult(w1,f[2],t2);
937
			_ff_mult(w2,f[1],t3);
938
			_ff_addto(w2,w1);
939
			_ff_sub(g[2],g[0],w2);
940
			_ff_mult(w1,t3,nf0);
941
			_ff_mult(w2,f[1],t2);
942
			_ff_sub(g[1],w1,w2);
943
			_ff_mult(g[0],t2,nf0);
944
			break;
945
		case 3:	// 9M+8A
946
			_ff_set(t3,g[3]);
947
			_ff_set(t2,g[2]);
948
			_ff_set(t1,g[1]);
949
			_ff_mult(w1,t2,f[2]);
950
			_ff_mult(w2,t3,f[1]);
951
			_ff_addto(w1,w2);
952
			_ff_sub(g[3],g[0],w1);
953
			_ff_sub(w2,f22,f[0]);
954
			_ff_mult(w1,t3,w2);
955
			_ff_mult(w2,t1,f[2]);
956
			_ff_subfrom(w1,w2);
957
			_ff_mult(w2,t2,f[1]);
958
			_ff_sub(g[2],w1,w2);
959
			_ff_mult(w1,t1,f[1]);
960
			_ff_mult(w2,t2,f[0]);
961
			_ff_addto(w1,w2);
962
			_ff_mult(w2,t3,f12);
963
			_ff_sub(g[1],w2,w1);
964
			_ff_mult(w1,t3,f02);
965
			_ff_mult(w2,t1,f[0]);
966
			_ff_sub(g[0],w1,w2);			
967
		}
968
		m >>= 2;  i -= 2;
969
	}
970
}
971

972
// Computes (x+a)^n modulo x^2-f0 (note the sign).  Assumes n < 2^63
973
void ff_poly_xpan_mod_d2 (ff_t g[2], ff_t a, unsigned long n, ff_t f[1])
974
{
975
	_ff_t_declare_reg t0,t1,t2,t3,s0,s1;
976
	register unsigned long m;
977
	int i, j;
978

979
	if ( ! n ) { _ff_set_zero(g[1]); _ff_set_one(g[0]); return;}
980
	_ff_set_one(g[1]);  _ff_set(g[0],a);
981
	if ( n==1 ) return;
982
	i = _asm_highbit(n)-1;
983
	m = 1UL<<i;
984
	_ff_add(s0,a,a);
985
	_ff_add(s1,f[0],f[0]);
986
	while (m) {
987
		j = ((n&m)>>i);
988
		if ( j ) {
989
			// square g and multiply by (x+a) to get (g0^2+g1^2f0+a*2g0g1)*x + a(g0^2+g1^2f0) + 2g0g1f0
990
			_ff_square(t0,g[0]);				// t0=g0^2
991
			_ff_square(t1,g[1]);				// t1=g1^2
992
			_ff_mult(t2,f[0],t1);				// t2 = g1^2f0
993
			_ff_add(t1,t0,t2);				// t1 = g0^2+g1^2f0
994
			_ff_mult(t0,g[0],g[1]);			// t0 = g0g1
995
			_ff_mult(t2,a,t1);				// t2 = a(g0^2+g1^2f0)
996
			_ff_mult(t3,s1,t0);				// t3 = 2f0g0g1
997
			_ff_add(g[0],t2,t3);				// g0 = a(g0^2+g1^2f0) + 2g0g1f0
998
			_ff_mult(t2,t0,s0);				// t2 = 2ag0g1
999
			_ff_add(g[1],t1,t2);				// g1 = g0^2+g1^2f0+a*2g0g1			
1000
			// 7M + 3A
1001
		} else {
1002
			// square g mod f to get  2g0g1*x + (g0^2+g1^2f0)
1003
			_ff_square(t0, g[0]);  _ff_square(t1, g[1]);
1004
			_ff_mult(t2,g[0],g[1]);
1005
			_ff_add(g[1],t2,t2);
1006
			_ff_mult(t2,t1,f[0]);
1007
			_ff_add(g[0],t0,t2);
1008
			// 4M + 2A
1009
		}
1010
		m >>= 1;  i -= 1;
1011
	}
1012
}
1013

1014
// Computes (x+a)^n modulo a cubic of the form x^3+f1x+f0.  Assumes n < 2^63
1015
void ff_poly_xpan_mod_d3 (ff_t g[3], ff_t a, unsigned long n, ff_t f[4])
1016
{
1017
	_ff_t_declare_reg w1,w2,t2;
1018
	register unsigned long m;
1019
	register int i, j;
1020
	
1021
	if ( ! n ) { _ff_set_zero(g[1]); _ff_set_one(g[0]); return;}
1022
	_ff_set_zero(g[2]); _ff_set_one(g[1]);  _ff_set(g[0],a);
1023
	if ( n==1 ) return;
1024
	i = _asm_highbit(n)-1;
1025
	m = 1UL<<i;
1026
	while (m) {
1027
		ff_poly_square_mod_d3 (g,g,f);
1028
		j = ((n&m)>>i);
1029
		if ( j ) {	// 5M+4A
1030
			_ff_set(t2,g[2]);
1031
			_ff_mult(w1,t2,a);
1032
			_ff_add(g[2],g[1],w1);
1033
			_ff_mult(w1,g[1],a);
1034
			_ff_mult(w2,t2,f[1]);
1035
			_ff_subfrom(w1,w2);
1036
			_ff_add(g[1],g[0],w1);
1037
			_ff_mult(w1,g[0],a);
1038
			_ff_mult(w2,t2,f[0]);
1039
			_ff_sub(g[0],w1,w2);
1040
		}
1041
		m >>= 1;  i -= 1;
1042
	}
1043
}
1044

1045
// squares a quadratic (arbitrary coeff) modulo a cubic of the form x^3+f1x+f0
1046
// no initialization or validation is done, overlap is ok. 
1047
void ff_poly_square_mod_d3 (ff_t w[3], ff_t u[3], ff_t f[4])
1048
{
1049
	_ff_t_declare_reg t0,t1,s1,s2, w1, w2;
1050

1051
	_ff_mult(t0,u[2],f[0]);
1052
	_ff_mult(t1,u[2],f[1]);
1053
	_ff_mult(s1,t0,u[2]);
1054
	_ff_add(s2,u[1],u[1]);
1055
	
1056
	_ff_add(w1,u[0],u[0]);
1057
	_ff_subfrom(w1,t1);
1058
	_ff_mult(w2,w1,u[2]);
1059
	_ff_square(w1,u[1]);
1060
	_ff_add(w[2],w1,w2);			// w2 = u1^2+(2u0-f1u2)u2 = 2u0u2+u1^2 - f1u2^2
1061
	
1062
	_ff_sub(w1,u[0],t1);
1063
	_ff_mult(w2,w1,s2);
1064
	_ff_sub(u[1],w2,s1);			// w1 = 2u1(u0-f1u2)-(f0u2)u2) = 2u0u2+u1^2 - f1u2^2
1065
	
1066
	_ff_square(w1,u[0]);
1067
	_ff_mult(w2,s2,t0);
1068
	_ff_sub(u[0],w1,w2);			// w0 = u0^2-(2u1)(f0u2)
1069
	 // 8M+7A
1070
}
1071

1072
// Computes (x+a)^n modulo a quartic of the form x^4+f2x^2+f1x+f0.  Assumes n < 2^63
1073
void ff_poly_xpan_mod_d4 (ff_t g[4], ff_t a, unsigned long n, ff_t f[5])
1074
{
1075
	_ff_t_declare_reg w1,w2,t3;
1076
	ff_t h[4];
1077
	register unsigned long m;
1078
	int i, j;
1079
	
1080
	if ( ! n ) { _ff_set_zero(g[1]); _ff_set_one(g[0]); return; }
1081
	_ff_set_zero(g[3]);  _ff_set_zero(g[2]); _ff_set_one(g[1]);  _ff_set(g[0],a);
1082
	if ( n==1 ) return;
1083
	i = _asm_highbit(n)-1;
1084
	m = 1UL<<i;
1085
	while (m) {
1086
		ff_poly_square_mod_d4 (g,g,f);
1087
		j = ((n&m)>>i);
1088
		if ( j ) {
1089
			_ff_set(t3,g[3]);
1090
			_ff_mult(w1,t3,a);
1091
			_ff_add(g[3],g[2],w1);
1092
			_ff_mult(w1,g[2],a);
1093
			_ff_mult(w2,t3,f[2]);
1094
			_ff_subfrom(w1,w2);
1095
			_ff_add(g[2],g[1],w1);
1096
			_ff_mult(w1,g[1],a);
1097
			_ff_mult(w2,t3,f[1]);
1098
			_ff_subfrom(w1,w2);
1099
			_ff_add(g[1],g[0],w1);
1100
			_ff_mult(w1,g[0],a);
1101
			_ff_mult(w2,t3,f[0]);
1102
			_ff_sub(g[0],w1,w2);
1103
			// 7M+6A
1104
		}
1105
		m >>= 1;  i -= 1;
1106
	}
1107
}
1108

1109
// squares a cubic (arbitrary coeff) modulo a quartic of the form x^4+ax^2+bx+c
1110
// no initialization or validation is done, overlap of u,w is OK.  Uses 18M+21A
1111
void ff_poly_square_mod_d4 (ff_t w[4], ff_t u[4], ff_t f[5])
1112
{
1113
	_ff_t_declare_reg t0,t1,t2,s1,s2,s3,s4,s5,w1,w2,w3;
1114
	
1115
	// compute t_i = u_3*f_i
1116
	_ff_mult(t0,u[3],f[0]);  _ff_mult(t1,u[3],f[1]);  _ff_mult(t2,u[3],f[2]);
1117
	_ff_mult(s1,u[2],f[2]);			// s1=u2f2
1118
	_ff_square(s2,u[2]);				// s2=u2^2
1119
	_ff_mult(s3,u[3],t0);			// s3=u3^2f0	(not reused, but need to save since u[3] may get clobbered)
1120
	_ff_mult(s4,u[2],t0);
1121
	_ff_x2(s4);					// s4 = 2u2u3f0 (not reused, but need to save since u[2] may get clobbered)
1122
	_ff_mult(s5,u[1],u[3]);
1123
	_ff_x2(s5);					// s5 = 2u1u3 (not reused, but need to save since u[1],u[3] may get clobbered)
1124

1125
	_ff_sub(w1,u[0],s1);
1126
	_ff_x2(w1);
1127
	_ff_subfrom(w1,t1);
1128
	_ff_mult(w2,w1,u[3]);
1129
	_ff_mult(w1,u[1],u[2]);
1130
	_ff_x2(w1);
1131
	_ff_add(w[3],w1,w2);			// w3 = (2(u0-u2f2)-u3f2)u3+2u1u2 = 2*u0*u3 + 2*u1*u2 - 2*u2*u3*f2 - u3^2*f1
1132
	// note u[3] is now (potentially) clobbered
1133

1134
	_ff_sub(w2,u[1],t2);
1135
	_ff_square(w1,w2);
1136
	_ff_sub(w2,u[0],t1);
1137
	_ff_x2(w2);
1138
	_ff_subfrom(w2,s1);
1139
	_ff_mult(w3,u[2],w2);
1140
	_ff_addto(w1,w3);
1141
	_ff_sub(w[2],w1,s3);			// w2 = (u1-u3f2)^2+u2(2(u0-u3f1)-u2f2)-u3(u3f0) = 2*u0*u2 + u1^2 - 2*u1*u3*f2 - u2^2*f2 - 2*u2*u3*f1 - u3^2*f0 + u3^2*f2^2
1142
	// note u[2] is now (potentially) clobbered
1143
	
1144
	_ff_sub(w2,u[0],t1);
1145
	_ff_mult(w1,u[1],w2);
1146
	_ff_x2(w1);
1147
	_ff_mult(w2,s2,f[1]);
1148
	_ff_subfrom(w1,w2);
1149
	_ff_subfrom(w1,s4);
1150
	_ff_mult(w2,t1,t2);
1151
	_ff_add(w[1],w1,w2);			// w1 = 2u1(u0-u3f1)-u2^2f1-2u2(u3f0)+(u3f1)(u3f2) = 2*u0*u1 - 2*u1*u3*f1 - u2^2*f1 - 2*u2*u3*f0 + u3^2*f1*f2
1152
	// note u[1] is now (potentially) clobbered
1153

1154
	_ff_square(w1,u[0]);
1155
	_ff_add(w2,s5,s2);
1156
	_ff_mult(w3,w2,f[0]);
1157
	_ff_subfrom(w1,w3);
1158
	_ff_mult(w2,t0,t2);
1159
	_ff_add(w[0],w1,w2);			// w0 = u0^2-f0(2u1u3+u2^2)+(u3f0)(u3f2) = u0^2 - 2*u1*u3*f0 - u2^2*f0 + u3^2*f0*f2
1160
	 // 18M+21A
1161
}
1162

1163

1164
/*
1165

1166
The code below has all been superseded by more efficient routines above or in ffext.c
1167
Keep it around in case we need it for debugging.
1168

1169
// squares a quadratic (arbitrary coeff) modulo a cubic of the form x^3+f1x+f0
1170
// no initialization or validation is done, overlap is ok.  Uses 10M+9A
1171
void old_ff_poly_square_mod_d3 (ff_t w[3], ff_t u[3], ff_t f[4])
1172
{
1173
	_ff_t_declare_reg t1,t3, t4, w0, w1, w2;
1174

1175
	_ff_mult(t1,u[0],u[2]);
1176
	_ff_add(w2,t1,t1);
1177
	_ff_square(t1,u[1]);
1178
	_ff_addto(w2,t1);
1179
	_ff_square(t4,u[2]);				// t4 = u2^2
1180
	_ff_mult(t1,f[1],t4);
1181
	_ff_subfrom(w2,t1);				// w2 = 2u0u2+u1^2 - f1u2^2
1182
	_ff_mult(t1,u[0],u[1]);
1183
	_ff_add(w1,t1,t1);				
1184
	_ff_mult(t1,f[0],t4);
1185
	_ff_subfrom(w1,t1);
1186
	_ff_mult(t1,u[1],u[2]);
1187
	_ff_add(t3,t1,t1);				// t3 = 2u1u2
1188
	_ff_mult(t1,f[1],t3);
1189
	_ff_subfrom(w1,t1);				// w1 = 2u0u1 - f0u2^2 - 2f1u1u2
1190
	_ff_square(w0,u[0]);
1191
	_ff_mult(t1,f[0],t3);
1192
	_ff_sub(w[0],w0,t1);				// w0 = u0^2 - 2u1u2f0
1193
	_ff_set(w[1],w1);
1194
	_ff_set(w[2],w2);
1195
}
1196

1197
// Computes (x+a)^n modulo a cubic of the form x^3+f1x+f0.  Assumes n < 2^63
1198
void ff_poly_xpan_mod_d3 (ff_t g[3], ff_t a, unsigned long n, ff_t f[5])
1199
{
1200
	_ff_t_declare p[12], *q;
1201
	_ff_t_declare_reg t1,a2, ca;
1202
	register unsigned long m;
1203
	int i, d;
1204
	
1205
	memset (p, 0, sizeof(p));
1206
	q = p;
1207
	_ff_set_one(q[0]);		// (x+a)^0 = 1
1208
	q += 3;
1209
	_ff_set_one(q[1]);
1210
	_ff_set(q[0],a);		// (x+a)^1 = x + a
1211
	q += 3;
1212
	_ff_set_one(q[2]);
1213
	_ff_add(ca,a,a);
1214
	_ff_set(q[1],ca);
1215
	_ff_square(a2,a);
1216
	_ff_set(q[0],a2);		// (x+a)^2 = x^2 + 2ax + a^2
1217
	q += 3;
1218
	_ff_addto(ca,a);
1219
	_ff_set(q[2],ca);
1220
	_ff_mult(t1,ca,a);
1221
	_ff_sub(q[1],t1,f[1]);
1222
	_ff_mult(t1,a2,a);
1223
	_ff_sub(q[0],t1,f[0]);	// (x+a)^3 = 3ax^2+(3a^2-f1)x + (a^3-f0)
1224

1225
	// could move up to exp8 but we'll settle for a window size of 2 bits for now
1226
	ff_poly_exp4_mod_d3 (g, p, n, f);
1227
}
1228

1229

1230
// Computes x^n modulo a cubic of the form x^3+ax+b.  Assumes n < 2^63
1231
void ff_poly_xn_mod_d3 (ff_t g[3], unsigned long n, ff_t f[4])
1232
{
1233
	_ff_t_declare p[24], *q;
1234
	_ff_t_declare_reg t0, t1, t3;
1235
	register unsigned long m;
1236
	int i, d;
1237
	
1238
	memset (p, 0, sizeof(p));
1239
	q = p;
1240
	_ff_set_one(q[0]);		// x^0 = 1
1241
	q += 3;
1242
	_ff_set_one(q[1]);		// x^1 = x
1243
	q += 3;
1244
	_ff_set_one(q[2]);		// x^2 = x^2
1245
	q += 3;
1246
	_ff_neg(t1,f[1]);
1247
	_ff_neg(t0,f[0]);
1248
	_ff_set(q[1],t1);
1249
	_ff_set(q[0],t0);		// x^3 = -f1x-f0
1250
	q += 3;
1251
	_ff_set(q[2],t1);
1252
	_ff_set(q[1],t0);		// x^4 = -f1x^2-f0x
1253
	q += 3;
1254
	_ff_set(q[2],t0);
1255
	_ff_square(t3,f[1]);
1256
	_ff_set(q[1],t3);		// t3 = f1^2
1257
	_ff_mult(t1,f[0],f[1]);
1258
	_ff_set(q[0],t1);		// x^5 = -f0x^2+f1x^2+f0f1
1259
	q += 3;
1260
	_ff_set(q[2],t3);
1261
	_ff_add(t0,t1,t1);		// t0 = 2f0f1
1262
	_ff_set(q[1],t0);
1263
	_ff_square(t1,f[0]);		// t1 = f0^2
1264
	_ff_set(q[0],t1);		// x^6 = f1^2x^2+2f0f1x+f0^2
1265
	q += 3;
1266
	_ff_set(q[2],t0);
1267
	_ff_mult(t0,f[1],t3);
1268
	_ff_sub(q[1],t1,t0);
1269
	_ff_mult(t0,f[0],t3);
1270
	_ff_neg(q[0],t0);		// x^7 = 2f0f1x^2 + (f0^2-f1^3)x - f0f1^2
1271
	
1272
	ff_poly_exp8_mod_d3 (g, p, n, f);
1273
}
1274

1275

1276
void ff_poly_exp4_mod_d3 (ff_t g[3], ff_t p[12], unsigned long n, ff_t f[4])
1277
{
1278
	register ff_t *q;
1279
	register unsigned long m;
1280
	int i, d;
1281
	
1282
	for ( m = (0x3UL<<60), i = 60 ; ! (n&m) ; m >>= 2, i-=2 );
1283
	q = p + 3*((n&m)>>i);
1284
	_ff_set (g[2],q[2]);  _ff_set(g[1],q[1]);  _ff_set(g[0],q[0]);
1285
	m >>= 2;  i -= 2;
1286
	while (m) {
1287
		ff_poly_square_mod_d3 (g,g,f);
1288
		ff_poly_square_mod_d3 (g,g,f);
1289
		q = p + 3*((n&m)>>i);
1290
		ff_poly_mult_mod_d3 (g, g, q, f);
1291
		m >>= 2;  i -= 2;
1292
	}
1293
}
1294

1295
void ff_poly_exp8_mod_d3 (ff_t g[3], ff_t p[24], unsigned long n, ff_t f[4])
1296
{
1297
	register ff_t *q;
1298
	register unsigned long m;
1299
	int i, d;
1300
	
1301
	for ( m = (0x7UL<<60), i = 60 ; ! (n&m) ; m >>= 3, i-=3 );
1302
	q = p + 3*((n&m)>>i);
1303
	_ff_set (g[2],q[2]);  _ff_set(g[1],q[1]);  _ff_set(g[0],q[0]);
1304

1305
	m >>= 3;  i -= 3;
1306
	while (m) {
1307
		ff_poly_square_mod_d3 (g,g,f);
1308
		ff_poly_square_mod_d3 (g,g,f);
1309
		ff_poly_square_mod_d3 (g,g,f);
1310
		q = p + 3*((n&m)>>i);
1311
		ff_poly_mult_mod_d3 (g, g, q, f);
1312
		m >>= 3;  i -= 3;
1313
	}
1314
}
1315

1316

1317
// multiplies two quadratics (arbitrary coeff) modulo a cubic of the form x^3+ax+b
1318
// no initialization or validation is done, overlap of u,v,w is ok.  Uses 10M+11A
1319
void ff_poly_mult_mod_d3 (ff_t w[3], ff_t u[3], ff_t v[3], ff_t f[4])
1320
{
1321
//	_ff_t_declare_reg t1, t3, t4, w0, w1, w2;
1322
	_ff_t_declare_reg t0, t1, t2, s01, s02, s12, w0, w1;
1323
	
1324
	_ff_mult(t0,u[0],v[0]);  _ff_mult(t1,u[1],v[1]);  _ff_mult(t2,u[2],v[2]);  
1325
	_ff_add(w1,u[0],u[1]); _ff_add(w0,v[0],v[1]); _ff_mult(s01,w1,w0); _ff_subfrom(s01,t0);  _ff_subfrom(s01,t1);
1326
	_ff_add(w1,u[0],u[2]); _ff_add(w0,v[0],v[2]); _ff_mult(s02,w1,w0); _ff_subfrom(s02,t0);  _ff_subfrom(s02,t2);
1327
	_ff_add(w1,u[1],u[2]); _ff_add(w0,v[1],v[2]); _ff_mult(s12,w1,w0); _ff_subfrom(s12,t1);  _ff_subfrom(s12,t2);
1328
	
1329
	_ff_add(w1,s02,t1);
1330
	_ff_mult(w0,f[1],t2);
1331
	_ff_sub(w[2],w1,w0);			// w2 = s02 + t1 - t2f1 = u2v0+u1v1+ u0v2 - f1u2v2
1332
	
1333
	_ff_mult(w1,f[0],t2);
1334
	_ff_mult(w0,f[1],s12);
1335
	_ff_addto(w0,w1);
1336
	_ff_sub(w[1],s01,w0);			// w1= s01 - t2f0 - s12f1= u1v0+u0v1-f0u2v2-f1(u1v2+u2v1)
1337
	
1338
	_ff_mult (w0, f[0],s12);
1339
	_ff_sub(w[0],t0,w0);				// w0 = t0 - s12f0 = u0v0 - (u1v2+u2v1)f0
1340
	return;
1341
}
1342

1343

1344
*/
1345

1346
/*
1347
// multiplies two cubics (arbitrary coeff) modulo a quartic of the form x^4+ax^2+bx+c
1348
// no initialization or validation is done, overlap of u,v,w is ok.  Uses 23M+38A
1349
void ff_poly_mult_mod_d4 (ff_t w[4], ff_t u[4], ff_t v[4], ff_t f[5])
1350
{
1351
	_ff_t_declare_reg t0, t1, t2, t3, s01,s02,s03,s12,s13,s23,w0, w1;		// don't need so many variables
1352

1353
	// compute t_i = u_i*v_i
1354
	_ff_mult(t0,u[0],v[0]); _ff_mult(t1,u[1],v[1]); _ff_mult(t2,u[2],v[2]); _ff_mult(t3,u[3],v[3]);
1355
	
1356
	// compute s_ij = u_i*v_j + u_j*v_i
1357
	_ff_add(w0,u[0],u[3]); _ff_add(w1,v[0],v[3]); _ff_mult(s03,w0,w1); _ff_subfrom(s03,t0);  _ff_subfrom(s03,t3);
1358
	_ff_add(w0,u[0],u[2]); _ff_add(w1,v[0],v[2]); _ff_mult(s02,w0,w1); _ff_subfrom(s02,t0);  _ff_subfrom(s02,t2);
1359
	_ff_add(w0,u[0],u[1]); _ff_add(w1,v[0],v[1]); _ff_mult(s01,w0,w1); _ff_subfrom(s01,t0);  _ff_subfrom(s01,t1);
1360
	_ff_add(w0,u[1],u[3]); _ff_add(w1,v[1],v[3]); _ff_mult(s13,w0,w1); _ff_subfrom(s13,t1);  _ff_subfrom(s13,t3);
1361
	_ff_add(w0,u[1],u[2]); _ff_add(w1,v[1],v[2]); _ff_mult(s12,w0,w1); _ff_subfrom(s12,t1);  _ff_subfrom(s12,t2);
1362
	_ff_add(w0,u[2],u[3]); _ff_add(w1,v[2],v[3]); _ff_mult(s23,w0,w1); _ff_subfrom(s23,t2);  _ff_subfrom(s23,t3);
1363
	
1364
	// s13 is special, add t2 to it
1365
	_ff_addto(s13,t2);
1366
	
1367
	_ff_mult(w0,s23,f[2]);
1368
	_ff_mult(w1,t3,f[1]);
1369
	_ff_addto(w0,w1);
1370
	_ff_add(w1,s03,s12);
1371
	_ff_sub(w[3],w1,w0);			// w3 = s03 + s12 - s23f2 - t3f1 = u0v3 + u1v2 + u2v1 - u2v3f2 + u3v0 - u3v2f2 - u3v3f1
1372

1373
	_ff_mult(w1,s13,f[2]);
1374
	_ff_mult(w0,s23,f[1]);
1375
	_ff_addto(w0,w1);
1376
	_ff_square(w1,f[2]);
1377
	_ff_subfrom(w1,f[0]);
1378
	ff_mult(w1,w1,t3);
1379
	_ff_addto(w1,s02);
1380
	_ff_addto(w1,t1);
1381
	_ff_sub(w[2],w1,w0);			// w2 = s02 + t1 - s13f2 - s23f1 + t3(f2^2-f0) = u0v2 + u1v1 - u1v3f2 + u2v0 - u2v2f2 - u2v3f1 - u3v1f2 - u3v2f1 - u3v3f0 + u3v3f2^2
1382
	
1383
	_ff_mult(w1,s13,f[1]);
1384
	_ff_mult(w0,s23,f[0]);
1385
	_ff_addto(w0,w1);
1386
	_ff_mult(w1,f[1],f[2]);
1387
	ff_mult(w1,w1,t3);
1388
	_ff_addto(w1,s01);
1389
	_ff_sub(w[1],w1,w0);			// w1 = s01 - s13f1 - s23f0 + t3f1f2  = u0v1 + u1v0 - u1v3f1 - u2v2f1 - u2v3f0 - u3v1f1 - u3v2f0 + u3v3f1f2
1390

1391
	_ff_mult(w0,s13,f[0]);
1392
	_ff_mult(w1,f[0],f[2]);
1393
	ff_mult(w1,w1,t3);
1394
	_ff_addto(w1,t0);
1395
	_ff_sub(w[0],w1,w0);			// w0 = t0 - s13f0 + t3f0f2 = u0v0 - u1v3f0 - u2v2f0 - u3v1f0 + u3v3f0f2
1396
}
1397

1398

1399
// squares a cubic (arbitrary coeff) modulo a quartic of the form x^4+ax^2+bx+c
1400
// no initialization or validation is done, overlap of u,w is OK.  Uses 23M+21A
1401
void old_ff_poly_square_mod_d4 (ff_t w[4], ff_t u[4], ff_t f[5])
1402
{
1403
	_ff_t_declare_reg t0,t1,t2,t3,s03,s02,s01,s13,s12,s23,w0,w1;
1404

1405
	// compute t_i = u_i^2
1406
	_ff_square(t0,u[0]);	_ff_square(t1,u[1]);	_ff_square(t2,u[2]);	_ff_square(t3,u[3]);
1407
	
1408
	// compute s_ij = u_i*u_j
1409
	_ff_mult(s13,u[1],u[3]);  _ff_mult(s23,u[2],u[3]);
1410
	
1411
	_ff_mult(w1,u[0],u[3]);
1412
	_ff_mult(w0,u[1],u[2]);
1413
	_ff_addto(w1,w0);
1414
	_ff_mult(w0,s23,f[2]);
1415
	_ff_subfrom(w1,w0);
1416
	_ff_x2(w1);
1417
	_ff_mult(w0,t3,f[1]);
1418
	_ff_sub(w[3],w1,w0);			// w3 = 2(s03 + s12 - s23f2) - t3f1 = 2*u0*u3 + 2*u1*u2 - 2*u2*u3*f2 - u3^2*f1
1419
	
1420
	_ff_mult(w0,s13,f[2]);
1421
	_ff_mult(w1,s23,f[1]);
1422
	_ff_addto(w0,w1);
1423
	_ff_mult(w1,u[0],u[2]);
1424
	_ff_subfrom(w1,w0);
1425
	_ff_x2(w1);
1426
	_ff_addto(w1,t1);
1427
	_ff_square(w0,f[2]);
1428
	_ff_subfrom(w0,f[0]);
1429
	ff_mult(w0,w0,t3);
1430
	_ff_addto(w1,w0);
1431
	_ff_mult(w0,t2,f[2]);
1432
	_ff_sub(w[2],w1,w0);			// w2 = 2(s02 - s12f2 - s23f1) + t1 - t2f2 + t3(f2^2-f0) = 2*u0*u2 + u1^2 - 2*u1*u3*f2 - u2^2*f2 - 2*u2*u3*f1 - u3^2*f0 + u3^2*f2^2
1433
	
1434
	_ff_mult(w0,s23,f[0]);
1435
	_ff_mult(w1,u[0],u[1]);
1436
	_ff_subfrom(w1,w0);
1437
	_ff_x2(w1);
1438
	_ff_mult(w0,f[1],f[2]);
1439
	ff_mult(w0,w0,t3);
1440
	_ff_addto(w1,w0);
1441
	_ff_add(w0,s13,s13);
1442
	_ff_addto(w0,t2);
1443
	ff_mult(w0,w0,f[1]);
1444
	_ff_sub(w[1],w1,w0);			// w1 = 2(s01 - s23f0) - (t2 + 2s13)f1 + t3f1f2 = 2*u0*u1 - 2*u1*u3*f1 - u2^2*f1 - 2*u2*u3*f0 + u3^2*f1*f2
1445
	
1446
	_ff_mult(w0,t3,f[2]);
1447
	_ff_add(w1,s13,s13);
1448
	_ff_addto(w1,t2);
1449
	_ff_subfrom(w0,w1);
1450
	_ff_mult(w1,w0,f[0]);
1451
	_ff_add(w[0],w1,t0);			// w0 = t0 + (t3f2 - 2s13 - t2)f0 = u0^2 - 2*u1*u3*f0 - u2^2*f0 + u3^2*f0*f2
1452
}
1453
*/
1454
/*
1455
void old_ff_poly_xpan_mod_d4 (ff_t g[4], ff_t a, unsigned long n, ff_t f[5])
1456
{
1457
	_ff_t_declare p[16], *q;
1458
	_ff_t_declare_reg a2, ca;
1459
	register unsigned long m;
1460
	int i, d;
1461
	
1462
	memset (p, 0, sizeof(p));
1463
	q = p;
1464
	_ff_set_one(q[0]);		// (x+a)^0 = 1
1465
	q += 4;
1466
	_ff_set_one(q[1]);
1467
	_ff_set(q[0],a);		// (x+a)^1 = x + a
1468
	q += 4;
1469
	_ff_set_one(q[2]);
1470
	_ff_add(ca,a,a);
1471
	_ff_set(q[1],ca);
1472
	_ff_square(a2,a);
1473
	_ff_set(q[0],a2);		// (x+a)^2 = x^2 + 2ax + a^2
1474
	q += 4;
1475
	_ff_set_one(q[3]);		
1476
	_ff_addto(ca,a);
1477
	_ff_set(q[2],ca);
1478
	_ff_mult(q[1],ca,a);
1479
	_ff_mult(q[0],a2,a);		// (x+a)^3 = x^3 + 3ax^2+3a^2x+a^3
1480

1481
	// could move up to exp8 but we'll settle for a window size of 2 bits for now
1482
	ff_poly_exp4_mod_d4 (g, p, n, f);
1483
}
1484

1485

1486
void ff_poly_exp4_mod_d4 (ff_t g[4], ff_t p[16], unsigned long n, ff_t f[5])
1487
{
1488
	register ff_t *q;
1489
	register unsigned long m;
1490
	int i, d;
1491
	
1492
	for ( m = (0x3UL<<60), i = 60 ; ! (n&m) ; m >>= 2, i-=2 );
1493
	q = p + 4*((n&m)>>i);
1494
	_ff_set(g[3],q[3]);  _ff_set (g[2],q[2]);  _ff_set(g[1],q[1]);  _ff_set(g[0],q[0]);
1495
	m >>= 2;  i -= 2;
1496
	while (m) {
1497
		ff_poly_square_mod_d4 (g,g,f);
1498
		ff_poly_square_mod_d4 (g,g,f);
1499
		q = p + 4*((n&m)>>i);
1500
		ff_poly_mult_mod_d4 (g, g, q, f);
1501
		m >>= 2;  i -= 2;
1502
	}
1503
}
1504

1505
void ff_poly_exp8_mod_d4 (ff_t g[4], ff_t p[32], unsigned long n, ff_t f[5])
1506
{
1507
	register ff_t *q;
1508
	register unsigned long m;
1509
	int i, d;
1510
	
1511
	for ( m = (0x7UL<<60), i = 60 ; ! (n&m) ; m >>= 3, i-=3 );
1512
	q = p + 4*((n&m)>>i);
1513
	_ff_set(g[3],q[3]);  _ff_set (g[2],q[2]);  _ff_set(g[1],q[1]);  _ff_set(g[0],q[0]);
1514
	m >>= 3;  i -= 3;
1515
	while (m) {
1516
		ff_poly_square_mod_d4 (g,g,f);
1517
		ff_poly_square_mod_d4 (g,g,f);
1518
		ff_poly_square_mod_d4 (g,g,f);
1519
		q = p + 4*((n&m)>>i);
1520
		ff_poly_mult_mod_d4 (g, g, q, f);
1521
		m >>= 3;  i -= 3;
1522
	}
1523
}
1524
*/
1525

1526
/*
1527
// Computes x^n modulo a quartic of the form x^4+f2x^2+f1x+f0.  Assumes n < 2^63
1528
void old_ff_poly_xn_mod_d4 (ff_t g[4], unsigned long n, ff_t f[5])
1529
{
1530
	_ff_t_declare p[32], *q;
1531
	_ff_t_declare_reg t0, t1, t2, t3;
1532
	register unsigned long m;
1533
	int i, d;
1534
	
1535
	memset (p, 0, sizeof(p));
1536
	q = p;
1537
	_ff_set_one(q[0]);		// x^0 = 1
1538
	q += 4;
1539
	_ff_set_one(q[1]);		// x^1 = x
1540
	q += 4;
1541
	_ff_set_one(q[2]);		// x^2 = x^2
1542
	q += 4;
1543
	_ff_set_one(q[3]);		// x^3 = x^3
1544
	q += 4;
1545
	_ff_neg(t2,f[2]);
1546
	_ff_neg(t1,f[1]);
1547
	_ff_neg(t0,f[0]);
1548
	_ff_set(q[2],t2);
1549
	_ff_set(q[1],t1);
1550
	_ff_set(q[0],t0);		// x^4 = -f2x^2 - f1x-f0
1551
	q += 4;
1552
	_ff_set(q[3],t2);
1553
	_ff_set(q[2],t1);
1554
	_ff_set(q[1],t0);		// x^5 = -f2x^3 - f1x^2 - f0x
1555
	q += 4;
1556
	_ff_set(q[3],t1);
1557
	_ff_square(t3,f[2]);
1558
	_ff_subfrom(t3,f[0]);
1559
	_ff_set(q[2],t3);
1560
	_ff_mult(t1,f[1],f[2]);
1561
	_ff_set(q[1],t1);
1562
	_ff_mult(t0,f[0],f[2]);
1563
	_ff_set(q[0],t0);		// x^6 = -f1x^3 + (f2^2-f0)x^2 + f1f2x^2 + f0f2
1564
	q += 4;
1565
	_ff_set(q[3],t3);
1566
	_ff_add(q[2],t1,t1);
1567
	_ff_square(t2,f[1]);
1568
	_ff_add(q[1],t0,t2);
1569
	_ff_mult(q[0],f[0],f[1]);	// x^7 = (f2^2-f0)x^3 + 2f1f2x^2 + (f0f2+f1^2)x + f0f1
1570
	
1571
	ff_poly_exp8_mod_d4 (g, p, n, f);
1572
}
1573
*/
1574

1575
Product

Resources

Company