CoCalc -- ffext.c

Harvard Workshop: Distribution of modular symbols and L-values
code / alex / psage / psage / libs / smalljac / ffext.c
²⁴¹⁸¹⁸ views
1
#include <stdlib.h>
2
#include <stdio.h>
3
#include "ff.h"
4
#include "ffext.h"
5
#include "ffpoly.h"		// used to find irreducible cubic
6

7
/*
8
    Copyright 2008 Andrew V. Sutherland
9

10
    This file is part of smalljac.
11

12
    smalljac is free software: you can redistribute it and/or modify
13
    it under the terms of the GNU General Public License as published by
14
    the Free Software Foundation, either version 2 of the License, or
15
    (at your option) any later version.
16

17
    smalljac is distributed in the hope that it will be useful,
18
    but WITHOUT ANY WARRANTY; without even the implied warranty of
19
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
    GNU General Public License for more details.
21

22
    You should have received a copy of the GNU General Public License
23
    along with smalljac.  If not, see <http://www.gnu.org/licenses/>.
24
*/
25

26
static ff_t _ff3_f[4];									// irred minimal poly of z - polynomial basis {1,z,z^2}  of the form x^3-s or x^3-x-s
27
static ff_t _ff3_nr_exp;								// z^(m(p^2+p+1)), a primitive 2^e-th root of unity in F_p^3 (which necessarily lies in F_p)
28

29
ff_t _ff3_zp[3];										// if p=1mod3 this is just a multiple of z
30
ff_t _ff3_z2p[3];									// z^{2p} cached because it used by ff3_exp_p
31
int _ff3_trace_z2;									// trace of z^2 is 0 for p=1mod and 2 o.w., note that trace of z is always 0, store this as in int rather than an ff_t
32

33
static ff_t _ff2_3Sylow_tab[42][2][2];					// These values depend on s (the quadratic non-residue passed in) and are not reused, having a static table is simple a convenience
34
ff_t _ff2_cbrt_unity[2];
35
int _ff2_cbrt_setup;
36

37
void ff_ext_setup(void) { _ff3_f[3] = 0; _ff2_cbrt_setup = 0;  }	// don't zero everything, just enough to detect uninitialized cases
38

39
// standard 4-ary exponentiation (fixed 2-bit window)
40
void ff2_exp_ui_s (ff_t o[2], ff_t a[2], unsigned long e, ff_t s)
41
{
42
	register int i;
43
	register unsigned long j, m;
44
	ff_t b[4][2], c[2];
45
	
46
//printf ("exp(%ldz+%ld, %ld)\n", _ff_get_ui(a[1]), _ff_get_ui(a[0]), e);
47
	switch (e) {
48
	case 0:  ff2_set_one (o);  return;
49
	case 1:  ff2_set(o,a); return;
50
	case 2:  ff2_square_s(o,a,s); return;
51
	}
52
	i = _asm_highbit(e);
53
	if ( i&1 ) i--;
54
	m = 3UL<<i;
55
	ff2_set (b[1], a);
56
	ff2_square_s (b[2],b[1],s);
57
	ff2_mult_s(b[3],b[2],b[1],s);
58
	ff2_set (c, b[(m&e)>>i]);
59
	for ( m>>=2,i-=2 ; m ; m>>=2, i-=2 ) {
60
		ff2_square_s(c,c,s);  ff2_square_s(c,c,s);
61
		j = (m&e)>>i;
62
		if ( j ) ff2_mult_s(c,c,b[j],s);
63
	}
64
	ff2_set (o, c);
65
//printf("expresult=%ldz+%ld\n", _ff_get_ui(o[1]), _ff_get_ui(o[0]));
66
}
67

68

69

70
/*
71
    sqrt algorithm over F_p^2, reduces problem to two sqrts in F_p
72

73
    We are given an element of the form a1z+a0 where z^2=s=_ff_2g (a non-residue in F_p)
74
    If a1 is zero, we just compute sqrt(a0) in F_p, and if it doesn't exist, sqrt(a0)=sqrt(a0/s)*sqrt(s).
75

76
    We now assume a1!=0.  If (b1z+b0)^2 = (a1z+a0) then
77

78
           b0^2+sb1^2 = a0  and  2b0b1=a1
79

80
    and we know that both b0 and b1 are non-zero.  We then obtain
81

82
           b0^4 - a0b0^2 + a1^2s/4 = 0 =>  b0^2 = (a0 +/- sqrt(N(a)))/2
83
 
84
    and similarly
85

86
          sb1^4 - a0b1^2 + a1^2/4 = 0 => b1^2 = (a0 -/+ sqrt(N(a)))/(2s)
87

88
    Note that b0^2*b1^2 = a1^2/4 is a QR, but 1/s is not a QR, so exactly one of (-a0+sqrt(N(a)))/2 and (-a0-sqrt(N(a)))/2 is a QR
89

90
*/
91
int ff2_sqrt (ff_t o[2], ff_t a[2])
92
{
93
	ff_t x, y, t;
94
	
95
	ff_setup_2g();
96
	if ( _ff_zero(a[1]) ) {
97
		if ( ff_invsqrt(&t, a, 1) ) {
98
			_ff_mult (o[0], t, a[0]);
99
			_ff_set_zero(o[1]);
100
		} else {
101
			_ff_mult (o[1],t,a[0]);
102
			_ff_set_zero(o[0]);
103
		}
104
		return 1;
105
	}
106
	ff2_norm(&t,a);
107
	if ( ! ff_sqrt (&x, &t) ) return 0;		// a is a QR in F_p^2 iff N(a) is a QR in F_p
108
	_ff_add (t,a[0],x);
109
	_ff_div2(y,t);
110
	if ( ! ff_invsqrt (&t, &y, 1) ) {	
111
		_ff_mult(x,y,t);					// if r=sqrt(y) is not in F_p then b1 = r/sqrt(s)=r/z, we have r=y*tz, so b1=y*t
112
		ff_mult(t,t,_ff_2g);				// b0=a1/(2b1)=a1/(2yt)=a1z/(2ytz)=a1z/(2r)=a1zr/(2y)=a1ytz^2/(2y)=a1ts/2
113
		ff_mult(t,t,a[1]);
114
		_ff_div2(y,t);					// got b0
115
	} else {
116
		ff_mult(y,y,t);					// got b0
117
		ff_mult(t,t,a[1]);				// b1=a1/(2b0)=a1/(2yt)=a1t/2
118
		_ff_div2(x,t);					// got b1
119
	}
120
check:
121
#if ! FF_FAST
122
{
123
	ff_t b[2];
124
	_ff_set(b[0],y); _ff_set(b[1],x);
125
	ff2_square(b,b);
126
	if ( ! ff2_equal(b,a) ) { printf ("ff2_sqrt failed, (%lu+%luz)^2 != %lu+%luz\n",_ff_get_ui(b[0]), _ff_get_ui(b[1]), _ff_get_ui(a[0]), _ff_get_ui(a[1])); exit (0); }
127
	}
128
#endif
129
	_ff_set(o[1],x);
130
	_ff_set(o[0],y);
131
	return 1;
132
}
133

134
void _ff2_setup_cbrt(void)
135
{
136
	register int i;
137
	register unsigned long n;
138
	ff_t t0,t1;
139
	ff_t r[2],s[2],t[2],w[2];
140
	
141
	if ( _ff2_cbrt_setup ) return;
142
	ff_setup_2g();
143
	if ( _ff_p1mod3 || _ff_p==3 ) { puts ("_ff2_setup_cbrt called when p!= 2mod 3, this should never happen.");  exit (0); }
144
	_ff2_cbrt_setup = 1;
145
	// Note that p+1 = 3^e*m where m is not divisible by 3, and we have p-1 not div by 3
146

147
	if ( _ff_p3_e == 1 ) {
148
		// life is slightly easer in this situation, since (-1+sqrt(-3))/2 generates the Sylow 3-subgroup of F_p^2
149
		// unfortunately we still need to compute sqrt(-3) in the standard basis (-3 is not necessarily _ff_2g)
150
		_ff_set_i(t0,-3);
151
		if ( ff_sqrt_ext (&t1,&t0) ) { printf ("Impossible, -3 is a QR mod %ld with p=2mod3\n", _ff_p);  exit (0); }
152
		_ff_div2(_ff2_3Sylow_tab[0][0][0], _ff_negone);
153
		_ff_div2(_ff2_3Sylow_tab[0][0][1], t1);
154
		ff2_square (_ff2_3Sylow_tab[0][1],_ff2_3Sylow_tab[0][0]);
155
		ff2_set (_ff2_cbrt_unity,_ff2_3Sylow_tab[0][0]);
156
//printf ("cube root of unity is %ldz+%ld, 2g=%d\n", _ff_get_ui(_ff2_cbrt_unity[1]), _ff_get_ui(_ff2_cbrt_unity[0]), _ff_get_ui(_ff_2g));
157
		return;
158
	}
159

160
	// To find a cubic non-residue, it suffices to find r^(m*3^(e-1)) not in F_p.   (when e is 1 this is just r^m)
161
	// To get into the 3-Sylow, we need to exponentiate by (p-1)m = (p+1)(m-1) + 2m(3^(e-1)-1), so we exponentiate by m-1 first.
162
	n = (_ff_p+1)/(3*_ff_p3_m)-1;					// n = 3^(e-1)-1
163
	_ff_set_one(t[1]); _ff_set_one(t[0]);
164
	for(;;) { 
165
		ff2_exp_ui (r, t, _ff_p3_m-1);				// r = t^(m-1)
166
		ff2_mult (s, r, t);						// s = t^m
167
		if ( ! _ff_zero(s[1]) ) {					// if s is in F_p, t is a cubic residue
168
			ff2_exp_ui (w,s,n);					// w = t^(m*(3^(e-1)-1))
169
			_ff_mult(t0,w[0],s[1]); _ff_mult(t1,w[1],s[0]);
170
			_ff_addto(t1,t0);					// t1 = z coeff of t^(m*3^(e-1)) = t^((p+1)/3)
171
			if ( ! _ff_zero(t1) ) break;			// t is  a cubic residue iff t^((p+1)/3) is in F_p iff t1 is zero
172
		}
173
next:	_ff_inc(t[0]);
174
		if ( _ff_zero(t[0]) ) { _ff_inc(t[1]); _ff_set_one(t[0]); }
175
	}
176
	// t is a non CR, r=t^(m-1), s=t^(m*(3^e-2))
177
	ff2_norm(&t0,r);									// t0 = t^((p+1)(m-1))
178
	ff2_mult(s,s,w);									// s = t^(3^(e-1)m)
179
	ff2_square(w,w);									// w = t^(2*(3^(e-1)-1)m)
180
	ff2_mult(s,s,w);									// s = t^((3^(e-1)+2(3^(e-1)-1))m) = t^((3^e-2)m)
181
	ff2_scalar_mult(r,t0,s);								// r = t^((p+1)(m-1)+(3^e-2)m) = t^((p-1)m) generates the 3-Sylow of F_p^2	
182
	ff2_set (_ff2_3Sylow_tab[0][0], r);
183
	ff2_square (_ff2_3Sylow_tab[0][1],_ff2_3Sylow_tab[0][0]);
184
	for ( i = 1 ; i < _ff_p3_e ; i++ ) {
185
		ff2_mult(_ff2_3Sylow_tab[i][0],_ff2_3Sylow_tab[i-1][0],_ff2_3Sylow_tab[i-1][1]);		// tab[i][0] = tab[i-1][0]^3
186
		ff2_square(_ff2_3Sylow_tab[i][1], _ff2_3Sylow_tab[i][0]);							// tab[i][1] = tab[i][0]^2
187
	}
188
	ff2_set (_ff2_cbrt_unity,_ff2_3Sylow_tab[_ff_p3_e-1][0]);
189
//printf ("cube root of unity is %ldz+%ld, 2g=%d\n", _ff_get_ui(_ff2_cbrt_unity[1]), _ff_get_ui(_ff2_cbrt_unity[0]), _ff_get_ui(_ff_2g));
190
}
191

192

193
// computes a^{-1/3} for a in the 3-Sylow subgroup, returns 0 if not a quadratic residue
194
// uses precomputed table of 2e powers of the 3-Sylow generator to reduce running time by a factor of 2 over standard Tonelli-Shanks (still O(e^2)).
195
int ff2_3Sylow_invcbrt (ff_t o[2], ff_t a[2])
196
{
197
	ff_t b[2], q[2], q1[2], t[2], w1[2], w2[2];
198
	register int i, j, k;
199
	
200
	ff2_setup_cbrt();
201

202
	// handle easy cases without setting up
203
	if ( _ff_one(a[0]) ) { _ff_set_one(o[0]);  return 1; }		// use 1 as the cube root of 1
204
	if ( _ff_p3_e == 1 ) return 0;
205
	
206
	// set w1 and w2 to the two elements of order 3 in the 3-Sylow (i.e. the two non-trivial cube roots of unity)
207
	ff2_set (w1, _ff2_3Sylow_tab[_ff_p3_e-1][0]);
208
	ff2_set (w2, _ff2_3Sylow_tab[_ff_p3_e-1][1]);
209
	ff2_set_one (t);
210
			
211
	ff2_set (b,a);
212
	do {
213
		ff2_set (q, b);
214
		for ( i = 1 ; i < _ff_p3_e+1 ; i++ ) {		// s<e+1 is just a safety check in case a isn't in the 3-Sylow, this could be removed
215
			j=1;
216
			if ( ff2_equal(q,w1) ) break;
217
			j=0;
218
			if ( ff2_equal(q,w2) ) break;
219
			ff2_set(q1,q);
220
			ff2_square (q,q);  ff2_mult(q,q,q1);
221
		}
222
		k = _ff_p3_e-i;
223
#if ! FF_FAST
224
		if ( k < 0 ) { printf ("Unexpected result: k<0 in ff2_3Sylow_invsqrt?!  a = %luz+%lu, p = %lu\n", _ff_get_ui(a[1]), _ff_get_ui(a[0]), _ff_p);  exit (1); }
225
#endif
226
		if ( k <= 0 ) return 0;
227
		ff2_mult (b, b, _ff2_3Sylow_tab[k][j]);		// the product of all the elements S[k] we multiply into b here is b^{-1}, since we terminate with b=1
228
		ff2_mult (t, t, _ff2_3Sylow_tab[k-1][j]);		// multiply t by S[k-1]=cbrt(S[k]), the product of all these will be b^{-1/3}
229
	} while ( ! ff2_one(b) );
230
#if ! FF_FAST
231
	ff2_square (b, t); ff2_mult(b,b,t);
232
	ff2_mult(b,b,a);
233
	if ( ! ff2_one(b) ) { printf ("ff2_3Sylow_invcbrt failed, %(lu+%lu)^3 *  (%luz+%lu) != 1\n", _ff_get_ui(t[1]), _ff_get_ui(t[0]), _ff_get_ui(a[1]), _ff_get_ui(a[0])); exit (0); }
234
#endif
235
	ff2_set(o,t);
236
	return 1;
237
}
238

239
void _ff3_setup (void)
240
{
241
	_ff_set_one(_ff3_f[3]);  _ff_set_zero(_ff3_f[2]);
242
	if ( _ff_p1mod3 ) {
243
		ff_setup_3g();
244
		ff3_set_zero(_ff3_zp);
245
		ff_exp_ui(_ff3_zp+1,&_ff_3g,(_ff_p-1)/3);
246
		_ff_set_zero(_ff3_f[1]);
247
		_ff_neg (_ff3_f[0], _ff_3g);
248
		_ff3_trace_z2 = 0;
249
	} else {
250
		_ff_set_one(_ff3_f[1]);
251
		ff_negate (_ff3_f[1]);
252
		_ff_set(_ff3_f[0],_ff3_f[1]);
253
		for(;;) {
254
			if ( ! ff_poly_roots_d3 (0, _ff3_f) ) break;
255
			_ff_dec(_ff3_f[0]);
256
		}
257
		_ff_neg (_ff_3g, _ff3_f[0]);				// note we use _ff_3g here even though the 3-Sylow is trivial
258
		ff3_zn_mod (_ff3_zp, _ff_p, _ff3_f);
259
		_ff3_trace_z2 = 2;
260
	}
261
	ff3_square (_ff3_z2p, _ff3_zp);
262
	// Note that the norm of z is the product of the roots of f, which is -f[0] = __ff_3g
263
	// the trace of z is -f[2] = 0, trace of z^2 is 0 for p=1mod3 and 2 o.w.
264
printf ("ff3 poly = z^3+%ldz+%ld\n", _ff_get_ui(_ff3_f[1]), _ff_get_ui(_ff3_f[0]));
265
}
266

267
static inline void ff3_setup(void) { if ( ! _ff3_f[3] ) _ff3_setup(); }
268

269
// overlap ok
270
void ff3_mult (ff_t o[3], ff_t a[3], ff_t b[3])
271
{
272
	register ff_t s1,s2,s3,t0,t1,t2,w1,w2,w3;
273
	
274
	_ff_add(t1,a[0],a[1]);   _ff_add(t2,b[0],b[1]);  _ff_mult(s1,t1,t2);
275
	_ff_add(t1,a[0],a[2]);   _ff_add(t2,b[0],b[2]);  _ff_mult(s2,t1,t2);
276
	_ff_add(t1,a[1],a[2]);   _ff_add(t2,b[1],b[2]);  _ff_mult(s3,t1,t2);
277
	_ff_mult(t0,a[0],b[0]);  _ff_mult(t1,a[1],b[1]); _ff_mult(t2,a[2],b[2]);
278
	_ff_sub(w1,s3,t1);  _ff_subfrom(w1,t2);		// w1 = (a1+a2)(b1+b2)-a1b1-a2b2 = a1b2+b2b1
279
	_ff_mult(w3,w1,_ff_3g);					// w3 = s(a1b2+b2b1)
280
	_ff_add(o[0],t0,w3);						// o[0] = a0b0 + (a1b2+b2b1)s
281
	_ff_mult(w2,t2,_ff_3g);					// w2 = a2b2s
282
	_ff_sub(w3,s1,t0);  _ff_subfrom(w3,t1);		// w3 = (a0+a1)(b0+b1)-a0b0-a1b1 = a0b1+b1a0
283
	_ff_add(o[1],w2,w3);					// o[1] = a0b1+b1a0+a2b2s
284
	_ff_sub(w3,s2,t0);  _ff_subfrom(w3,t2); 		// w3 = (a0+a2)(b0+b2)-a0b0-a2b2 = a0b2+b2a0
285
	_ff_add(o[2],w3,t1);						// o[2] = a0b2+b2a0+a1b1
286
	if ( ! _ff_p1mod3 ) {						// in this case z^3=z+s so we need to add z[(a1b2+b2b1)+a2b2z]
287
		_ff_addto(o[1],w1);					// o[1] = a0b1+b1a0+a2b2s + a1b2+b2b1
288
		_ff_addto(o[2],t2);					// o[2] = a0b2+b2a0+a1b1 + a2b2
289
	}
290
	// 8M + 15A (17A)
291
}
292

293
// multiplies modulo (z^3-rz-s), overlap ok (duplicates code in ff3_mult above, then adjusts for r)
294
void _ff3_mult_mod_rs (ff_t o[3], ff_t a[3], ff_t b[3], ff_t r, ff_t s)
295
{
296
	register ff_t s1,s2,s3,t0,t1,t2,w1,w2,w3;
297
	
298
	_ff_add(t1,a[0],a[1]);   _ff_add(t2,b[0],b[1]);  _ff_mult(s1,t1,t2);
299
	_ff_add(t1,a[0],a[2]);   _ff_add(t2,b[0],b[2]);  _ff_mult(s2,t1,t2);
300
	_ff_add(t1,a[1],a[2]);   _ff_add(t2,b[1],b[2]);  _ff_mult(s3,t1,t2);
301
	_ff_mult(t0,a[0],b[0]);  _ff_mult(t1,a[1],b[1]); _ff_mult(t2,a[2],b[2]);
302
	_ff_sub(w1,s3,t1);  _ff_subfrom(w1,t2);		// w1 = (a1+a2)(b1+b2)-a1b1-a2b2 = a1b2+b2b1
303
	_ff_mult(w3,w1,s);						// w3 = s(a1b2+b2b1)
304
	_ff_add(o[0],t0,w3);						// o[0] = a0b0 + (a1b2+b2b1)s
305
	_ff_mult(w2,t2,s);						// w2 = a2b2s
306
	_ff_sub(w3,s1,t0);  _ff_subfrom(w3,t1);		// w3 = (a0+a1)(b0+b1)-a0b0-a1b1 = a0b1+b1a0
307
	_ff_add(o[1],w2,w3);					// o[1] = a0b1+b1a0+a2b2s
308
	_ff_sub(w3,s2,t0);  _ff_subfrom(w3,t2); 		// w3 = (a0+a2)(b0+b2)-a0b0-a2b2 = a0b2+b2a0
309
	_ff_add(o[2],w3,t1);						// o[2] = a0b2+b2a0+a1b1
310
	// z^3=rz+s so we need to add rz[(a1b2+b2b1)+a2b2z] - don't optimize for r=0 or 1 here, we assume general case applies
311
	_ff_mult(t0,r,w1);
312
	_ff_addto(o[1],t0);						// o[1] = a0b1+b1a0+a2b2s + r(a1b2+b2b1)
313
	_ff_mult(t1,r,t2);
314
	_ff_addto(o[2],t1);						// o[2] = a0b2+b2a0+a1b1 + ra2b2
315
	// 10M + 17A
316
}
317

318
// squares mod x^3-s
319
static void inline _ff3_square_mod_0s (ff_t o[3], ff_t a[3], ff_t s)
320
{
321
	register ff_t s1,s2,t0,t1,t2,w1,w2,w3;
322

323
	_ff_add(s1,a[1],a[1]);					// 2a1
324
	_ff_mult(w1,s1,a[0]);					// 2a0a1
325
	_ff_mult(s2,_ff_3g,a[2]);					// a2s
326
	_ff_mult(w2,s1,s2);						// 2a1a2s
327
	_ff_mult(t2,s2,a[2]);					// a2^2s
328
	_ff_square(t0,a[0]);  _ff_square(t1,a[1]);	// a0^2, a1^2
329
	_ff_mult(w3,a[0],a[2]);					// a0a2
330
	_ff_x2(w3);							// 2a0a2
331
	_ff_add(o[0],t0,w2);						// a0^2 + 2a1a2s
332
	_ff_add(o[1],w1,t2);						// 2a0a1 + a2^2s
333
	_ff_add(o[2],w3,t1);						// 2a0a2 + a1^2
334
	// 7M + 5A
335
}
336

337
// squares mod x^3-x-s
338
static void inline _ff3_square_mod_1s (ff_t o[3], ff_t a[3], ff_t s)
339
{
340
	register ff_t s1,s2,t0,t1,t2,w1,w2,w3;
341

342
	_ff_add(s1,a[0],a[2]);					// a0+a2
343
	_ff_square(w1,s1);						// a0^2+2a0a2+a2^2
344
	_ff_add(s2,a[1],a[1]);					// 2a1
345
	_ff_mult(w3,s1,s2);						// 2a0a1+2a1a2
346
	_ff_mult(w2,a[2],s);						// a2s
347
	ff_mult(s2,s2,w2);						// 2a1a2s
348
	_ff_mult(t2,w2,a[2]);					// a2^2s
349
	_ff_square(t0,a[0]); _ff_square(t1,a[1]);		// a0^2, a1^2
350
	_ff_subfrom(w1,t0);						// 2a0a2 + a2^2
351
	_ff_add(o[0],t0,s2);						// a0^2+2a1a2s
352
	_ff_add(o[1],w3,t2);						// 2a0a1+2a1a2+a2^2s
353
	_ff_add(o[2],w1,t1);						// 2a0a2 + a2^2 + a1^2
354
	// 7M + 6A
355
}
356

357
// squares mod x^3-rx-s
358
static void inline _ff3_square_mod_rs (ff_t o[3], ff_t a[3], ff_t r, ff_t s)
359
{
360
	register ff_t s1,s2,t0,t1,t2,w1,w2,w3;
361

362
	_ff_mult(w1,a[2],r);						// a2r
363
	_ff_add(s1,a[0],w1);					// a0+a2r
364
	_ff_add(s2,a[1],a[1]);					// 2a1
365
	_ff_mult(w3,s1,s2);						// 2a0a1+2a1a2r
366
	_ff_mult(w2,a[2],s);						// a2s
367
	ff_mult(s2,s2,w2);						// 2a1a2s
368
	_ff_mult(t2,w2,a[2]);					// a2^2s
369
	_ff_square(t0,a[0]); _ff_square(t1,a[1]);		// a0^2, a1^2
370
	_ff_addto(s1,a[0]);						// 2a0+a2r
371
	_ff_mult(w1,s1,a[2]);					// 2a0a2+a2^2r
372
	_ff_add(o[0],t0,s2);						// a0^2+2a1a2s
373
	_ff_add(o[1],w3,t2);						// 2a0a1+2a1a2+a2^2s
374
	_ff_add(o[2],w1,t1);						// 2a0a2 + a2^2r + a1^2
375
	// 8M + 6A
376
}
377

378
// overlap ok
379
void ff3_square (ff_t o[3], ff_t a[3])
380
{
381
	if ( _ff_p1mod3 ) {
382
		_ff3_square_mod_0s (o, a, _ff_3g);
383
	} else {
384
		_ff3_square_mod_1s (o, a, _ff_3g);
385
	}
386
	// 7M + 5A (6A)
387
}
388

389
// change code below if and when we add degree 2 ext ops
390
void ff2_poly_eval (ff_t o[2], ff_t f[], int d, ff_t x[2])
391
{
392
	ff_t t[2], y[2];
393
	register int i;
394

395
	if ( d < 0 ) { ff2_set_zero(o); return; }
396
	ff2_set_zero(y);
397
	_ff_set (y[0], f[d]);
398
	ff2_set(t,x);
399
	for ( i = d-1 ; i >= 0 ; i-- ) { ff2_mult(y,y,t);  _ff_addto(y[0], f[i]); }
400
	ff2_set(o,y);
401
	return;
402
}
403

404

405
void ff3_poly_eval (ff_t o[3], ff_t f[], int d, ff_t x[3])
406
{
407
	ff_t t[3], y[3];
408
	register int i;
409

410
	ff_setup_3g();
411
	if ( d < 0 ) { ff3_set_zero (o); return; }
412
	ff3_set_zero (y);
413
	_ff_set (y[0], f[d]);
414
	ff3_set (t,x);
415
	for ( i = d-1 ; i >= 0 ; i-- ) { ff3_mult(y,y,t);  _ff_addto(y[0],f[i]); }
416
	ff3_set(o,y);
417
	return;
418
}
419

420
// computes a^p = a[0]+a[1]z^p+a[2]z^{2p} using 2M or 6M+4A, overlap ok
421
void ff3_exp_p (ff_t o[3], ff_t a[3])
422
{
423
	ff_t t1[3], t2[3];
424
	
425
	if ( _ff_p1mod3 ) {
426
		// if p=1mod3 we know z^p is a multiple of z and z^2p is a multiple of z^2
427
		_ff_set(o[0],a[0]);
428
		ff_mult(o[1],a[1],_ff3_zp[1]);
429
		ff_mult(o[2],a[2],_ff3_z2p[2]);
430
	} else {
431
		ff3_scalar_mult(t1,a[1],_ff3_zp);
432
		ff3_scalar_mult(t2,a[2],_ff3_z2p);
433
		ff3_add(t1,t1,t2);
434
		_ff_addto(t1[0],a[0]);
435
		ff3_set(o,t1);
436
	}
437
}
438

439

440
// computes z^p mod f=z^3-az-b, optimized for a=1 case
441
// standard 4-ary exponentiation (fixed 2-bit window)
442
void ff3_zn_mod (ff_t o[3], unsigned long n, ff_t f[4])			// only looks at f[0] and f[1], implicitly assumes f[2]=0 and f[3]=1
443
{
444
	register int i;
445
	register unsigned long j, m, e;
446
	register ff_t a, b, w1,w2, w3;
447
	ff_t t[3];
448

449
	e=n;
450
	i = _asm_highbit(e);
451
	if ( i&1 ) i--;
452
	m = 3UL<<i;
453
	j = (m&e)>>i;
454
	_ff_neg(a,f[1]);
455
	_ff_neg(b,f[0]);
456
//printf("a=%ld, f[1]=%ld, b=%ld, f[0]=%ld\n", _ff_get_ui(a), _ff_get_ui(f[1]), _ff_get_ui(b), _ff_get_ui(f[0]));
457
	switch(j){
458
	case 1:
459
		_ff_set_zero(t[0]);  _ff_set_one(t[1]);  _ff_set_zero(t[2]);		// t = z
460
		break;
461
	case 2:
462
		_ff_set_zero(t[0]);  _ff_set_zero(t[1]);  _ff_set_one(t[2]);		// t = z^2
463
		break;
464
	case 3:
465
		_ff_set(t[0],b);  _ff_set(t[1],a);  _ff_set_zero(t[2]);			// t = z^3=az+b
466
		break;
467
	}
468
//printf("j=%d, %ldz^2+%ldz+%ld!\n", j, _ff_get_ui(t[2]), _ff_get_ui(t[1]), _ff_get_ui(t[0]));	
469
	if ( _ff_one(a) ) {
470
		for ( m>>=2,i-=2 ; m ; m>>=2, i-=2 ) {
471
			_ff3_square_mod_1s(t,t,b);  _ff3_square_mod_1s(t,t,b);
472
//printf("i=%d, %ldz^2+%ldz+%ld!\n", i, _ff_get_ui(t[2]), _ff_get_ui(t[1]), _ff_get_ui(t[0]));	
473
			j = (m&e)>>i;
474
			switch(j) {
475
			case 1:
476
				_ff_mult(w1,t[2],b);
477
				_ff_set(w2,t[1]);
478
				_ff_add(t[1],t[0],t[2]);
479
				_ff_set(t[2],w2);
480
				_ff_set(t[0],w1);
481
				break;
482
			case 2:
483
				_ff_mult(w1,t[2],b);
484
				_ff_addto(t[2],t[0]);
485
				_ff_mult(w2,t[1],b);
486
				_ff_addto(t[1],w1);
487
				_ff_set(t[0],w2);
488
				break;
489
			case 3:
490
				_ff_mult(w1,t[2],b);
491
				_ff_mult(w2,t[1],b);
492
				_ff_addto(w2,t[0]);
493
				_ff_add(w3,t[0],t[2]);
494
				_ff_mult(t[0],w3,b);
495
				_ff_set(w3,t[2]);
496
				_ff_add(t[2],t[1],w1);
497
				_ff_add(t[1],w3,w2);
498
				break;
499
			}
500
			// 1.5M+1.75A on average, every 4 bits
501
		}
502
	} else {		
503
		for ( m>>=2,i-=2 ; m ; m>>=2, i-=2 ) {
504
			_ff3_square_mod_rs(t,t,a,b);  _ff3_square_mod_rs(t,t,a,b);
505
//printf("i=%d, %ldz^2+%ldz+%ld!\n", i, _ff_get_ui(t[2]), _ff_get_ui(t[1]), _ff_get_ui(t[0]));	
506
			j = (m&e)>>i;
507
			switch(j) {
508
			case 1:
509
				_ff_mult(w1,t[2],b);
510
				_ff_mult(w2,t[2],a);
511
				_ff_set(t[2],t[1]);
512
				_ff_add(t[1],w2,t[0]);
513
				_ff_set(t[0],w1);
514
				break;
515
			case 2:
516
				_ff_mult(w1,t[2],b);
517
				_ff_mult(w2,t[2],a);
518
				_ff_add(t[2],t[0],w2);
519
				_ff_mult(w2,t[1],a);
520
				_ff_mult(t[0],t[1],b);
521
				_ff_add(t[1],w1,w2);
522
				break;
523
			case 3:
524
				_ff_mult(w1,t[2],b);
525
				_ff_mult(w2,t[2],a);
526
				_ff_add(w3,t[0],w2);
527
				_ff_mult(t[0],w3,b);
528
				_ff_mult(w2,w3,a);
529
				_ff_mult(w3,t[1],b);
530
				_ff_mult(t[2],t[1],a);
531
				_ff_add(t[1],w2,w3);
532
				_ff_addto(t[2],w1);
533
				break;
534
			}
535
			// 3M+1.5A on average, every 4 bits
536
//printf("j=%d, %ldz^2+%ldz+%ld!\n", j, _ff_get_ui(t[2]), _ff_get_ui(t[1]), _ff_get_ui(t[0]));	
537
		}
538
	}
539
	ff3_set (o, t);
540
}
541

542

543
// standard 4-ary exponentiation (fixed 2-bit window)
544
void ff3_exp_ui (ff_t o[3], ff_t a[3], unsigned long e)
545
{
546
	register int i;
547
	register unsigned long j, m;
548
	ff_t b[4][3], c[3];
549
	
550
//printf ("exp(%ldz^2+%ldz+%ld, %ld)\n", _ff_get_ui(a[2]), _ff_get_ui(a[1]), _ff_get_ui(a[0]), e);
551
	ff_setup_3g();
552
	switch (e) {
553
	case 0:  ff3_set_one (o);  return;
554
	case 1:  ff3_set(o,a); return;
555
	case 2:  ff3_square(o,a); return;
556
	}
557
	i = _asm_highbit(e);
558
	if ( i&1 ) i--;
559
	m = 3UL<<i;
560
	ff3_set (b[1], a);
561
	ff3_square (b[2],b[1]);
562
	ff3_mult(b[3],b[2],b[1]);
563
	ff3_set (c, b[(m&e)>>i]);
564
	for ( m>>=2,i-=2 ; m ; m>>=2, i-=2 ) {
565
		ff3_square(c,c);  ff3_square(c,c);
566
		j = (m&e)>>i;
567
		if ( j ) ff3_mult(c,c,b[j]);
568
	}
569
	ff3_set (o, c);
570
//printf("expresult=%ldz^2+%ldz+%ld\n", _ff_get_ui(o[2]), _ff_get_ui(o[1]), _ff_get_ui(o[0]));
571
}
572

573
// standard 4-ary exponentiation (fixed 2-bit window)
574
void ff3_exp_ui_rs (ff_t o[3], ff_t a[3], unsigned long e, ff_t r, ff_t s)
575
{
576
	register int i;
577
	register unsigned long j, m;
578
	ff_t b[4][3], c[3];
579
	
580
//printf ("exp(%ldz^2+%ldz+%ld, %ld)\n", _ff_get_ui(a[2]), _ff_get_ui(a[1]), _ff_get_ui(a[0]), e);
581
	ff_setup_3g();
582
	switch (e) {
583
	case 0:  ff3_set_one (o);  return;
584
	case 1:  ff3_set(o,a); return;
585
	case 2:  _ff3_square_mod_rs (o,a,r,s); return;
586
	}
587
	i = _asm_highbit(e);
588
	if ( i&1 ) i--;
589
	m = 3UL<<i;
590
	ff3_set (b[1], a);
591
	_ff3_square_mod_rs (b[2],b[1],r,s);
592
	_ff3_mult_mod_rs(b[3],b[2],b[1],r,s);
593
	ff3_set (c, b[(m&e)>>i]);
594
	for ( m>>=2,i-=2 ; m ; m>>=2, i-=2 ) {
595
		_ff3_square_mod_rs(c,c,r,s);  _ff3_square_mod_rs(c,c,r,s);
596
		j = (m&e)>>i;
597
		if ( j ) _ff3_mult_mod_rs(c,c,b[j],r,s);
598
	}
599
	ff3_set (o, c);
600
//printf("expresult=%ldz^2+%ldz+%ld\n", _ff_get_ui(o[2]), _ff_get_ui(o[1]), _ff_get_ui(o[0]));
601
}
602

603
/*
604
   This algorithm is deterministic (given 2Sylow generator in F_p) - caller should flip coin to randomize roots
605
*/
606
int ff3_sqrt (ff_t o[3], ff_t a[3])
607
{
608
	ff_t u[3], v[3], w[3], x[3];
609
	ff_t b, c, d;
610

611
	if ( ff3_zero(a) ) { ff3_set_zero (o);  return 1; }
612
	if ( ff3_one(a) ) { ff3_set (o,a);  return 1; }
613

614
	ff3_setup();
615

616
//printf("sqrt(%ld+%ldz+%ldz^2)\n", _ff_get_ui(a[0]), _ff_get_ui(a[1]), _ff_get_ui(a[2]));
617

618
	/*
619
		We need to compute a^n and a^(n+1)/2, where n is the odd part of p^3-1, which is m(p^2+p+1), where m=2k+1 is the odd part of p-1
620
		The element a^n=N(a)^m is in the 2-sylow subgroup, and is also in F_p (F_p and F_p^3 have the same 2-Sylow subgroup).
621
		
622
	        We can compute N(a) very efficiently, and then exponentiate by k in F_p, followed by a square and a multiply to obtain N(a)^(2k+1)=a^n.
623
	        We use a standard F_p Tonelli-Shanks algorithm to compute the inverse of the square root of a^n, if it exists, and if it doesn't we detect this
624
		for less than the cost of a Legendre computation.
625
	
626
		Once we have a^(-n/2) in hand, we then compute a^((n+1)/2) to obtain a^(1/2).
627
	
628
		(n+1)/2 = ((2k+1)(p^2+p+1)+1)/2 = k(p^2+p+1) + p(p+1)/2 + 1
629
	
630
		We already have a^(k(p^2+p+1)) from above, so we need only compute a^((p+1)/2) followed by a Frobenius exponentiation by p, and two mults.
631
	*/
632
	ff3_norm(&b,a);						// b = N(a) = a^(p^2+p+1)
633
	ff_exp_ui(&c,&b,(_ff_p2_m-1)/2);			// c = N(a)^k (save for later)
634
	_ff_square(d,c);						// d = N(a)^(2k)
635
	ff_mult(b,b,d);							// b = N(a)^(2k+1) = N(a)^m is in the 2-Sylow
636
	if ( ! ff_2Sylow_invsqrt(&d,&b,0) ) return 0;	// d = 1/sqrt(b) = a^{-n/2)
637
	ff3_exp_ui(x,a,(_ff_p+1)/2);				// x = a^((p+1)/2)
638
	ff3_exp_p(x,x);							// x = a^(p(p+1)/2)
639
	ff3_mult (x, x, a);						// x = a^(p(p+1)/2+1)
640
	_ff_mult (b, d, c);						// b = a^(k(p^2+p+1)-n/2)		(do F_p mults before scalar_mult)
641
	ff3_scalar_mult (x, b, x);					// x = a^(k(p^2+p+1)+p(p+1)/2+1-n/2) = a^((n+1)/2-n/2) = a^(1/2)
642
//printf ("a^(1/2) = %ldz^2+%ldz+%ld\n", _ff_get_ui(x[2]),  _ff_get_ui(x[1]),  _ff_get_ui(x[0]));
643
#if ! FF_FAST
644
	ff3_square (w, x);
645
	if ( ! ff3_equal(w,a) ) { printf ("ff3_sqrt failed, (%luz^2+%luz+%lu)^2 = ", _ff_get_ui(x[2]), _ff_get_ui(x[1]), _ff_get_ui(x[0]));
646
					    printf ("%luz^^2+%luz+%luz != ",   _ff_get_ui(w[2]), _ff_get_ui(w[1]), _ff_get_ui(w[0]));
647
				            printf ("%luz^2+%luz+%luz\n",   _ff_get_ui(a[2]), _ff_get_ui(a[1]), _ff_get_ui(a[0])); exit (0); }
648
#endif
649
	ff3_set (o, x); 
650
	return 1;
651
}
652

653
// computes (z+a)*v in F_p[z]/(z^3-rz-s), overlap ok
654
static inline void _ff3_mult_zpa_mod_rs (ff_t o[3], ff_t v[3], ff_t a, ff_t r, ff_t s)
655
{
656
	register ff_t t1,t2,w1,w2;
657
	
658
	_ff_set(t2,v[2]);
659
	_ff_mult(w1,t2,a);
660
	_ff_add(o[2],v[1],w1);
661
	_ff_mult(w1,v[1],a);
662
	_ff_mult(w2,t2,r);
663
	_ff_addto(w1,w2);
664
	_ff_add(o[1],v[0],w1);
665
	_ff_mult(w1,v[0],a);
666
	_ff_mult(w2,t2,s);
667
	_ff_add(o[0],w1,w2);
668
	// 5M+4A
669
}
670

671

672
// Computes tr(sqrt(z)) in F_p^3=F_p[z]/(z^3-rz-s).  This is a support function for factoring quartics.
673
int ff3_trsqrt_zpa_mod_rs (ff_t o[1], ff_t a, ff_t r, ff_t s)
674
{
675
	ff_t u[3], v[3], w[3], x[3], f[4];
676
	ff_t b, c, d, t1, t2;
677

678
//printf("Computing tr(sqrt(z+a)) for F_p[z]/(z^3-rz-s) with p=%ld, a=%ld, r=%ld, s=%ld\n", _ff_p, _ff_get_ui(a), _ff_get_ui(r), _ff_get_ui(s));	
679
	/*
680
		We use effectively the same algorithm as ff3_sqrt to compute sqrt(z), except we work in the user supplied basis
681
	        and our input is (z+a) which simplifies a few things.
682
	*/
683
	// N(z+a) = (z+a)(z+a)^p(z+a)^(p^2) = (z+a)(z^p+a)(z^(p^2)+a) = N(z)+a*tr(z^p*z)+a^2tr(z)+a^3 = s-ar+a^3  (since N(z)=s, tr(z)=0 and tr(z*z^p)=-r)
684
	_ff_mult(t1,a,r);						// t1=ar
685
	_ff_sub(b,s,t1);						// b=s-ar
686
	_ff_square(t2,a); ff_mult(t1,t2,a);			// t1=a^3
687
	_ff_addto(b,t1);						// b=s-ar+a^3=N(z+a)
688
//printf("N(z+a)=%ld\n", _ff_get_ui(b));
689
	ff_exp_ui(&c,&b,(_ff_p2_m-1)/2);			// c = N(z+a)^k (save for later)
690
	_ff_square(d,c);						// d = N(z+a)^(2k)
691
	ff_mult(b,b,d);							// b = N(z+a)^(2k+1) = N(z)^m is in the 2-Sylow
692
//printf ("b=N(z+a)^m=%ld\n", _ff_get_ui(b));
693
	if ( ! ff_2Sylow_invsqrt(&d,&b,0) ) return 0;	// d = 1/sqrt(b) = z^{-m(p^2+p+1)/2)
694
//printf("d=sqrt(b)=%ld\n", _ff_get_ui(d));
695

696
	// set modulus for ff3 mults
697
	_ff_set_one(f[3]);  _ff_set_zero(f[2]);
698
	_ff_neg(f[1],r);  _ff_neg(f[0],s);
699
	// combine computation of (z+a)^((p+1)/2) and z^p
700
	ff_poly_xpan_mod_d3 (w,a,(_ff_p-1)/2,f);		// w = (z+a)^((p-1)/2)
701
	_ff3_mult_zpa_mod_rs (x,w,a,r,s);			// x = (z+a)^((p+1)/2)
702
//printf ("(z+%ld)^((p+1)/2) = %ldz^2+%ldz+%ld\n", _ff_get_ui(a), _ff_get_ui(x[2]), _ff_get_ui(x[1]), _ff_get_ui(x[0]));
703
	_ff3_mult_mod_rs(u,w,x,r,s);				// u = (z+a)^p mod f
704
	_ff_subfrom(u[0],a);						// u = (z+a)^p - a = z^p+a^p-a = z^p
705
//printf("z^p = %ldz^2+%ldz+%ld\n", _ff_get_ui(u[2]), _ff_get_ui(u[1]), _ff_get_ui(u[0]));
706

707
	// x^p = x0+x1*z^p+x2*z^(2p) = x0+x1*u+x2*u^2
708
	ff3_scalar_mult(w,x[1],u);				// w = x1u
709
	_ff3_square_mod_rs(v,u,r,s);				// v = u^2
710
	ff3_scalar_mult(v,x[2],v);					// v = x2u^2
711
	_ff_set(t1,x[0]);
712
	ff3_add(x,w,v);							// x = x1u+x2u^2
713
	_ff_addto(x[0],t1);						// x = x0+x1u+x2u^2 = (z^((p+1)/2))^p
714
	_ff3_mult_zpa_mod_rs (x,x,a,r,s);			// x = z^(p(p+1)/2+1)
715
	_ff_mult (b, d, c);						// b = z^(k(p^2+p+1)-n/2)		(do F_p mults before scalar_mult)  recall that n=m(p^2+p+1) where m is odd part of p-1
716
	ff3_scalar_mult (x, b, x);					// x = z^(k(p^2+p+1)+p(p+1)/2+1-n/2) = z^((n+1)/2-n/2) = z^(1/2)
717
//printf ("z^(1/2) = %ldz^2+%ldz+%ld\n", _ff_get_ui(x[2]),  _ff_get_ui(x[1]),  _ff_get_ui(x[0]));
718
#if ! FF_FAST
719
	_ff3_square_mod_rs (w, x,r,s);
720
	if ( !_ff_zero(w[0])||!_ff_one(w[1])||!_ff_zero(w[2]) ) { printf ("ff3_sqrt failed, (%luz^2+%luz+%lu)^2 = ", _ff_get_ui(x[2]), _ff_get_ui(x[1]), _ff_get_ui(x[0]));
721
					    printf ("%luz^^2+%luz+%luz != z",   _ff_get_ui(w[2]), _ff_get_ui(w[1]), _ff_get_ui(w[0])); }
722
#endif
723
	// now compute tr(x)=x0*tr(1)+x1*tr(z)+x2*(tr(z^2)) = 3x0 + 0 + 2rx2
724
	_ff_mult(t2,r,x[2]);
725
	_ff_add(t1,t2,t2);						// t1=2rx2
726
	_ff_add(t2,x[0],x[0]);
727
	_ff_addto(t2,x[0]);						// t2=3x0
728
	_ff_add(o[0],t1,t2);						// o=tr(x)
729
//printf("tr(z^(1/2)) = %ld\n", _ff_get_ui(o[0]));
730
	return 1;
731
}
732

733
Product

Resources

Company