Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Download
241818 views
1
#include <stdlib.h>
2
#include <stdio.h>
3
#include "ff.h"
4
#include "ffext.h"
5
#include "ffpoly.h" // used to find irreducible cubic
6
7
/*
8
Copyright 2008 Andrew V. Sutherland
9
10
This file is part of smalljac.
11
12
smalljac is free software: you can redistribute it and/or modify
13
it under the terms of the GNU General Public License as published by
14
the Free Software Foundation, either version 2 of the License, or
15
(at your option) any later version.
16
17
smalljac is distributed in the hope that it will be useful,
18
but WITHOUT ANY WARRANTY; without even the implied warranty of
19
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20
GNU General Public License for more details.
21
22
You should have received a copy of the GNU General Public License
23
along with smalljac. If not, see <http://www.gnu.org/licenses/>.
24
*/
25
26
static ff_t _ff3_f[4]; // irred minimal poly of z - polynomial basis {1,z,z^2} of the form x^3-s or x^3-x-s
27
static ff_t _ff3_nr_exp; // z^(m(p^2+p+1)), a primitive 2^e-th root of unity in F_p^3 (which necessarily lies in F_p)
28
29
ff_t _ff3_zp[3]; // if p=1mod3 this is just a multiple of z
30
ff_t _ff3_z2p[3]; // z^{2p} cached because it used by ff3_exp_p
31
int _ff3_trace_z2; // trace of z^2 is 0 for p=1mod and 2 o.w., note that trace of z is always 0, store this as in int rather than an ff_t
32
33
static ff_t _ff2_3Sylow_tab[42][2][2]; // These values depend on s (the quadratic non-residue passed in) and are not reused, having a static table is simple a convenience
34
ff_t _ff2_cbrt_unity[2];
35
int _ff2_cbrt_setup;
36
37
void ff_ext_setup(void) { _ff3_f[3] = 0; _ff2_cbrt_setup = 0; } // don't zero everything, just enough to detect uninitialized cases
38
39
// standard 4-ary exponentiation (fixed 2-bit window)
40
void ff2_exp_ui_s (ff_t o[2], ff_t a[2], unsigned long e, ff_t s)
41
{
42
register int i;
43
register unsigned long j, m;
44
ff_t b[4][2], c[2];
45
46
//printf ("exp(%ldz+%ld, %ld)\n", _ff_get_ui(a[1]), _ff_get_ui(a[0]), e);
47
switch (e) {
48
case 0: ff2_set_one (o); return;
49
case 1: ff2_set(o,a); return;
50
case 2: ff2_square_s(o,a,s); return;
51
}
52
i = _asm_highbit(e);
53
if ( i&1 ) i--;
54
m = 3UL<<i;
55
ff2_set (b[1], a);
56
ff2_square_s (b[2],b[1],s);
57
ff2_mult_s(b[3],b[2],b[1],s);
58
ff2_set (c, b[(m&e)>>i]);
59
for ( m>>=2,i-=2 ; m ; m>>=2, i-=2 ) {
60
ff2_square_s(c,c,s); ff2_square_s(c,c,s);
61
j = (m&e)>>i;
62
if ( j ) ff2_mult_s(c,c,b[j],s);
63
}
64
ff2_set (o, c);
65
//printf("expresult=%ldz+%ld\n", _ff_get_ui(o[1]), _ff_get_ui(o[0]));
66
}
67
68
69
70
/*
71
sqrt algorithm over F_p^2, reduces problem to two sqrts in F_p
72
73
We are given an element of the form a1z+a0 where z^2=s=_ff_2g (a non-residue in F_p)
74
If a1 is zero, we just compute sqrt(a0) in F_p, and if it doesn't exist, sqrt(a0)=sqrt(a0/s)*sqrt(s).
75
76
We now assume a1!=0. If (b1z+b0)^2 = (a1z+a0) then
77
78
b0^2+sb1^2 = a0 and 2b0b1=a1
79
80
and we know that both b0 and b1 are non-zero. We then obtain
81
82
b0^4 - a0b0^2 + a1^2s/4 = 0 => b0^2 = (a0 +/- sqrt(N(a)))/2
83
84
and similarly
85
86
sb1^4 - a0b1^2 + a1^2/4 = 0 => b1^2 = (a0 -/+ sqrt(N(a)))/(2s)
87
88
Note that b0^2*b1^2 = a1^2/4 is a QR, but 1/s is not a QR, so exactly one of (-a0+sqrt(N(a)))/2 and (-a0-sqrt(N(a)))/2 is a QR
89
90
*/
91
int ff2_sqrt (ff_t o[2], ff_t a[2])
92
{
93
ff_t x, y, t;
94
95
ff_setup_2g();
96
if ( _ff_zero(a[1]) ) {
97
if ( ff_invsqrt(&t, a, 1) ) {
98
_ff_mult (o[0], t, a[0]);
99
_ff_set_zero(o[1]);
100
} else {
101
_ff_mult (o[1],t,a[0]);
102
_ff_set_zero(o[0]);
103
}
104
return 1;
105
}
106
ff2_norm(&t,a);
107
if ( ! ff_sqrt (&x, &t) ) return 0; // a is a QR in F_p^2 iff N(a) is a QR in F_p
108
_ff_add (t,a[0],x);
109
_ff_div2(y,t);
110
if ( ! ff_invsqrt (&t, &y, 1) ) {
111
_ff_mult(x,y,t); // if r=sqrt(y) is not in F_p then b1 = r/sqrt(s)=r/z, we have r=y*tz, so b1=y*t
112
ff_mult(t,t,_ff_2g); // b0=a1/(2b1)=a1/(2yt)=a1z/(2ytz)=a1z/(2r)=a1zr/(2y)=a1ytz^2/(2y)=a1ts/2
113
ff_mult(t,t,a[1]);
114
_ff_div2(y,t); // got b0
115
} else {
116
ff_mult(y,y,t); // got b0
117
ff_mult(t,t,a[1]); // b1=a1/(2b0)=a1/(2yt)=a1t/2
118
_ff_div2(x,t); // got b1
119
}
120
check:
121
#if ! FF_FAST
122
{
123
ff_t b[2];
124
_ff_set(b[0],y); _ff_set(b[1],x);
125
ff2_square(b,b);
126
if ( ! ff2_equal(b,a) ) { printf ("ff2_sqrt failed, (%lu+%luz)^2 != %lu+%luz\n",_ff_get_ui(b[0]), _ff_get_ui(b[1]), _ff_get_ui(a[0]), _ff_get_ui(a[1])); exit (0); }
127
}
128
#endif
129
_ff_set(o[1],x);
130
_ff_set(o[0],y);
131
return 1;
132
}
133
134
void _ff2_setup_cbrt(void)
135
{
136
register int i;
137
register unsigned long n;
138
ff_t t0,t1;
139
ff_t r[2],s[2],t[2],w[2];
140
141
if ( _ff2_cbrt_setup ) return;
142
ff_setup_2g();
143
if ( _ff_p1mod3 || _ff_p==3 ) { puts ("_ff2_setup_cbrt called when p!= 2mod 3, this should never happen."); exit (0); }
144
_ff2_cbrt_setup = 1;
145
// Note that p+1 = 3^e*m where m is not divisible by 3, and we have p-1 not div by 3
146
147
if ( _ff_p3_e == 1 ) {
148
// life is slightly easer in this situation, since (-1+sqrt(-3))/2 generates the Sylow 3-subgroup of F_p^2
149
// unfortunately we still need to compute sqrt(-3) in the standard basis (-3 is not necessarily _ff_2g)
150
_ff_set_i(t0,-3);
151
if ( ff_sqrt_ext (&t1,&t0) ) { printf ("Impossible, -3 is a QR mod %ld with p=2mod3\n", _ff_p); exit (0); }
152
_ff_div2(_ff2_3Sylow_tab[0][0][0], _ff_negone);
153
_ff_div2(_ff2_3Sylow_tab[0][0][1], t1);
154
ff2_square (_ff2_3Sylow_tab[0][1],_ff2_3Sylow_tab[0][0]);
155
ff2_set (_ff2_cbrt_unity,_ff2_3Sylow_tab[0][0]);
156
//printf ("cube root of unity is %ldz+%ld, 2g=%d\n", _ff_get_ui(_ff2_cbrt_unity[1]), _ff_get_ui(_ff2_cbrt_unity[0]), _ff_get_ui(_ff_2g));
157
return;
158
}
159
160
// To find a cubic non-residue, it suffices to find r^(m*3^(e-1)) not in F_p. (when e is 1 this is just r^m)
161
// To get into the 3-Sylow, we need to exponentiate by (p-1)m = (p+1)(m-1) + 2m(3^(e-1)-1), so we exponentiate by m-1 first.
162
n = (_ff_p+1)/(3*_ff_p3_m)-1; // n = 3^(e-1)-1
163
_ff_set_one(t[1]); _ff_set_one(t[0]);
164
for(;;) {
165
ff2_exp_ui (r, t, _ff_p3_m-1); // r = t^(m-1)
166
ff2_mult (s, r, t); // s = t^m
167
if ( ! _ff_zero(s[1]) ) { // if s is in F_p, t is a cubic residue
168
ff2_exp_ui (w,s,n); // w = t^(m*(3^(e-1)-1))
169
_ff_mult(t0,w[0],s[1]); _ff_mult(t1,w[1],s[0]);
170
_ff_addto(t1,t0); // t1 = z coeff of t^(m*3^(e-1)) = t^((p+1)/3)
171
if ( ! _ff_zero(t1) ) break; // t is a cubic residue iff t^((p+1)/3) is in F_p iff t1 is zero
172
}
173
next: _ff_inc(t[0]);
174
if ( _ff_zero(t[0]) ) { _ff_inc(t[1]); _ff_set_one(t[0]); }
175
}
176
// t is a non CR, r=t^(m-1), s=t^(m*(3^e-2))
177
ff2_norm(&t0,r); // t0 = t^((p+1)(m-1))
178
ff2_mult(s,s,w); // s = t^(3^(e-1)m)
179
ff2_square(w,w); // w = t^(2*(3^(e-1)-1)m)
180
ff2_mult(s,s,w); // s = t^((3^(e-1)+2(3^(e-1)-1))m) = t^((3^e-2)m)
181
ff2_scalar_mult(r,t0,s); // r = t^((p+1)(m-1)+(3^e-2)m) = t^((p-1)m) generates the 3-Sylow of F_p^2
182
ff2_set (_ff2_3Sylow_tab[0][0], r);
183
ff2_square (_ff2_3Sylow_tab[0][1],_ff2_3Sylow_tab[0][0]);
184
for ( i = 1 ; i < _ff_p3_e ; i++ ) {
185
ff2_mult(_ff2_3Sylow_tab[i][0],_ff2_3Sylow_tab[i-1][0],_ff2_3Sylow_tab[i-1][1]); // tab[i][0] = tab[i-1][0]^3
186
ff2_square(_ff2_3Sylow_tab[i][1], _ff2_3Sylow_tab[i][0]); // tab[i][1] = tab[i][0]^2
187
}
188
ff2_set (_ff2_cbrt_unity,_ff2_3Sylow_tab[_ff_p3_e-1][0]);
189
//printf ("cube root of unity is %ldz+%ld, 2g=%d\n", _ff_get_ui(_ff2_cbrt_unity[1]), _ff_get_ui(_ff2_cbrt_unity[0]), _ff_get_ui(_ff_2g));
190
}
191
192
193
// computes a^{-1/3} for a in the 3-Sylow subgroup, returns 0 if not a quadratic residue
194
// uses precomputed table of 2e powers of the 3-Sylow generator to reduce running time by a factor of 2 over standard Tonelli-Shanks (still O(e^2)).
195
int ff2_3Sylow_invcbrt (ff_t o[2], ff_t a[2])
196
{
197
ff_t b[2], q[2], q1[2], t[2], w1[2], w2[2];
198
register int i, j, k;
199
200
ff2_setup_cbrt();
201
202
// handle easy cases without setting up
203
if ( _ff_one(a[0]) ) { _ff_set_one(o[0]); return 1; } // use 1 as the cube root of 1
204
if ( _ff_p3_e == 1 ) return 0;
205
206
// set w1 and w2 to the two elements of order 3 in the 3-Sylow (i.e. the two non-trivial cube roots of unity)
207
ff2_set (w1, _ff2_3Sylow_tab[_ff_p3_e-1][0]);
208
ff2_set (w2, _ff2_3Sylow_tab[_ff_p3_e-1][1]);
209
ff2_set_one (t);
210
211
ff2_set (b,a);
212
do {
213
ff2_set (q, b);
214
for ( i = 1 ; i < _ff_p3_e+1 ; i++ ) { // s<e+1 is just a safety check in case a isn't in the 3-Sylow, this could be removed
215
j=1;
216
if ( ff2_equal(q,w1) ) break;
217
j=0;
218
if ( ff2_equal(q,w2) ) break;
219
ff2_set(q1,q);
220
ff2_square (q,q); ff2_mult(q,q,q1);
221
}
222
k = _ff_p3_e-i;
223
#if ! FF_FAST
224
if ( k < 0 ) { printf ("Unexpected result: k<0 in ff2_3Sylow_invsqrt?! a = %luz+%lu, p = %lu\n", _ff_get_ui(a[1]), _ff_get_ui(a[0]), _ff_p); exit (1); }
225
#endif
226
if ( k <= 0 ) return 0;
227
ff2_mult (b, b, _ff2_3Sylow_tab[k][j]); // the product of all the elements S[k] we multiply into b here is b^{-1}, since we terminate with b=1
228
ff2_mult (t, t, _ff2_3Sylow_tab[k-1][j]); // multiply t by S[k-1]=cbrt(S[k]), the product of all these will be b^{-1/3}
229
} while ( ! ff2_one(b) );
230
#if ! FF_FAST
231
ff2_square (b, t); ff2_mult(b,b,t);
232
ff2_mult(b,b,a);
233
if ( ! ff2_one(b) ) { printf ("ff2_3Sylow_invcbrt failed, %(lu+%lu)^3 * (%luz+%lu) != 1\n", _ff_get_ui(t[1]), _ff_get_ui(t[0]), _ff_get_ui(a[1]), _ff_get_ui(a[0])); exit (0); }
234
#endif
235
ff2_set(o,t);
236
return 1;
237
}
238
239
void _ff3_setup (void)
240
{
241
_ff_set_one(_ff3_f[3]); _ff_set_zero(_ff3_f[2]);
242
if ( _ff_p1mod3 ) {
243
ff_setup_3g();
244
ff3_set_zero(_ff3_zp);
245
ff_exp_ui(_ff3_zp+1,&_ff_3g,(_ff_p-1)/3);
246
_ff_set_zero(_ff3_f[1]);
247
_ff_neg (_ff3_f[0], _ff_3g);
248
_ff3_trace_z2 = 0;
249
} else {
250
_ff_set_one(_ff3_f[1]);
251
ff_negate (_ff3_f[1]);
252
_ff_set(_ff3_f[0],_ff3_f[1]);
253
for(;;) {
254
if ( ! ff_poly_roots_d3 (0, _ff3_f) ) break;
255
_ff_dec(_ff3_f[0]);
256
}
257
_ff_neg (_ff_3g, _ff3_f[0]); // note we use _ff_3g here even though the 3-Sylow is trivial
258
ff3_zn_mod (_ff3_zp, _ff_p, _ff3_f);
259
_ff3_trace_z2 = 2;
260
}
261
ff3_square (_ff3_z2p, _ff3_zp);
262
// Note that the norm of z is the product of the roots of f, which is -f[0] = __ff_3g
263
// the trace of z is -f[2] = 0, trace of z^2 is 0 for p=1mod3 and 2 o.w.
264
printf ("ff3 poly = z^3+%ldz+%ld\n", _ff_get_ui(_ff3_f[1]), _ff_get_ui(_ff3_f[0]));
265
}
266
267
static inline void ff3_setup(void) { if ( ! _ff3_f[3] ) _ff3_setup(); }
268
269
// overlap ok
270
void ff3_mult (ff_t o[3], ff_t a[3], ff_t b[3])
271
{
272
register ff_t s1,s2,s3,t0,t1,t2,w1,w2,w3;
273
274
_ff_add(t1,a[0],a[1]); _ff_add(t2,b[0],b[1]); _ff_mult(s1,t1,t2);
275
_ff_add(t1,a[0],a[2]); _ff_add(t2,b[0],b[2]); _ff_mult(s2,t1,t2);
276
_ff_add(t1,a[1],a[2]); _ff_add(t2,b[1],b[2]); _ff_mult(s3,t1,t2);
277
_ff_mult(t0,a[0],b[0]); _ff_mult(t1,a[1],b[1]); _ff_mult(t2,a[2],b[2]);
278
_ff_sub(w1,s3,t1); _ff_subfrom(w1,t2); // w1 = (a1+a2)(b1+b2)-a1b1-a2b2 = a1b2+b2b1
279
_ff_mult(w3,w1,_ff_3g); // w3 = s(a1b2+b2b1)
280
_ff_add(o[0],t0,w3); // o[0] = a0b0 + (a1b2+b2b1)s
281
_ff_mult(w2,t2,_ff_3g); // w2 = a2b2s
282
_ff_sub(w3,s1,t0); _ff_subfrom(w3,t1); // w3 = (a0+a1)(b0+b1)-a0b0-a1b1 = a0b1+b1a0
283
_ff_add(o[1],w2,w3); // o[1] = a0b1+b1a0+a2b2s
284
_ff_sub(w3,s2,t0); _ff_subfrom(w3,t2); // w3 = (a0+a2)(b0+b2)-a0b0-a2b2 = a0b2+b2a0
285
_ff_add(o[2],w3,t1); // o[2] = a0b2+b2a0+a1b1
286
if ( ! _ff_p1mod3 ) { // in this case z^3=z+s so we need to add z[(a1b2+b2b1)+a2b2z]
287
_ff_addto(o[1],w1); // o[1] = a0b1+b1a0+a2b2s + a1b2+b2b1
288
_ff_addto(o[2],t2); // o[2] = a0b2+b2a0+a1b1 + a2b2
289
}
290
// 8M + 15A (17A)
291
}
292
293
// multiplies modulo (z^3-rz-s), overlap ok (duplicates code in ff3_mult above, then adjusts for r)
294
void _ff3_mult_mod_rs (ff_t o[3], ff_t a[3], ff_t b[3], ff_t r, ff_t s)
295
{
296
register ff_t s1,s2,s3,t0,t1,t2,w1,w2,w3;
297
298
_ff_add(t1,a[0],a[1]); _ff_add(t2,b[0],b[1]); _ff_mult(s1,t1,t2);
299
_ff_add(t1,a[0],a[2]); _ff_add(t2,b[0],b[2]); _ff_mult(s2,t1,t2);
300
_ff_add(t1,a[1],a[2]); _ff_add(t2,b[1],b[2]); _ff_mult(s3,t1,t2);
301
_ff_mult(t0,a[0],b[0]); _ff_mult(t1,a[1],b[1]); _ff_mult(t2,a[2],b[2]);
302
_ff_sub(w1,s3,t1); _ff_subfrom(w1,t2); // w1 = (a1+a2)(b1+b2)-a1b1-a2b2 = a1b2+b2b1
303
_ff_mult(w3,w1,s); // w3 = s(a1b2+b2b1)
304
_ff_add(o[0],t0,w3); // o[0] = a0b0 + (a1b2+b2b1)s
305
_ff_mult(w2,t2,s); // w2 = a2b2s
306
_ff_sub(w3,s1,t0); _ff_subfrom(w3,t1); // w3 = (a0+a1)(b0+b1)-a0b0-a1b1 = a0b1+b1a0
307
_ff_add(o[1],w2,w3); // o[1] = a0b1+b1a0+a2b2s
308
_ff_sub(w3,s2,t0); _ff_subfrom(w3,t2); // w3 = (a0+a2)(b0+b2)-a0b0-a2b2 = a0b2+b2a0
309
_ff_add(o[2],w3,t1); // o[2] = a0b2+b2a0+a1b1
310
// z^3=rz+s so we need to add rz[(a1b2+b2b1)+a2b2z] - don't optimize for r=0 or 1 here, we assume general case applies
311
_ff_mult(t0,r,w1);
312
_ff_addto(o[1],t0); // o[1] = a0b1+b1a0+a2b2s + r(a1b2+b2b1)
313
_ff_mult(t1,r,t2);
314
_ff_addto(o[2],t1); // o[2] = a0b2+b2a0+a1b1 + ra2b2
315
// 10M + 17A
316
}
317
318
// squares mod x^3-s
319
static void inline _ff3_square_mod_0s (ff_t o[3], ff_t a[3], ff_t s)
320
{
321
register ff_t s1,s2,t0,t1,t2,w1,w2,w3;
322
323
_ff_add(s1,a[1],a[1]); // 2a1
324
_ff_mult(w1,s1,a[0]); // 2a0a1
325
_ff_mult(s2,_ff_3g,a[2]); // a2s
326
_ff_mult(w2,s1,s2); // 2a1a2s
327
_ff_mult(t2,s2,a[2]); // a2^2s
328
_ff_square(t0,a[0]); _ff_square(t1,a[1]); // a0^2, a1^2
329
_ff_mult(w3,a[0],a[2]); // a0a2
330
_ff_x2(w3); // 2a0a2
331
_ff_add(o[0],t0,w2); // a0^2 + 2a1a2s
332
_ff_add(o[1],w1,t2); // 2a0a1 + a2^2s
333
_ff_add(o[2],w3,t1); // 2a0a2 + a1^2
334
// 7M + 5A
335
}
336
337
// squares mod x^3-x-s
338
static void inline _ff3_square_mod_1s (ff_t o[3], ff_t a[3], ff_t s)
339
{
340
register ff_t s1,s2,t0,t1,t2,w1,w2,w3;
341
342
_ff_add(s1,a[0],a[2]); // a0+a2
343
_ff_square(w1,s1); // a0^2+2a0a2+a2^2
344
_ff_add(s2,a[1],a[1]); // 2a1
345
_ff_mult(w3,s1,s2); // 2a0a1+2a1a2
346
_ff_mult(w2,a[2],s); // a2s
347
ff_mult(s2,s2,w2); // 2a1a2s
348
_ff_mult(t2,w2,a[2]); // a2^2s
349
_ff_square(t0,a[0]); _ff_square(t1,a[1]); // a0^2, a1^2
350
_ff_subfrom(w1,t0); // 2a0a2 + a2^2
351
_ff_add(o[0],t0,s2); // a0^2+2a1a2s
352
_ff_add(o[1],w3,t2); // 2a0a1+2a1a2+a2^2s
353
_ff_add(o[2],w1,t1); // 2a0a2 + a2^2 + a1^2
354
// 7M + 6A
355
}
356
357
// squares mod x^3-rx-s
358
static void inline _ff3_square_mod_rs (ff_t o[3], ff_t a[3], ff_t r, ff_t s)
359
{
360
register ff_t s1,s2,t0,t1,t2,w1,w2,w3;
361
362
_ff_mult(w1,a[2],r); // a2r
363
_ff_add(s1,a[0],w1); // a0+a2r
364
_ff_add(s2,a[1],a[1]); // 2a1
365
_ff_mult(w3,s1,s2); // 2a0a1+2a1a2r
366
_ff_mult(w2,a[2],s); // a2s
367
ff_mult(s2,s2,w2); // 2a1a2s
368
_ff_mult(t2,w2,a[2]); // a2^2s
369
_ff_square(t0,a[0]); _ff_square(t1,a[1]); // a0^2, a1^2
370
_ff_addto(s1,a[0]); // 2a0+a2r
371
_ff_mult(w1,s1,a[2]); // 2a0a2+a2^2r
372
_ff_add(o[0],t0,s2); // a0^2+2a1a2s
373
_ff_add(o[1],w3,t2); // 2a0a1+2a1a2+a2^2s
374
_ff_add(o[2],w1,t1); // 2a0a2 + a2^2r + a1^2
375
// 8M + 6A
376
}
377
378
// overlap ok
379
void ff3_square (ff_t o[3], ff_t a[3])
380
{
381
if ( _ff_p1mod3 ) {
382
_ff3_square_mod_0s (o, a, _ff_3g);
383
} else {
384
_ff3_square_mod_1s (o, a, _ff_3g);
385
}
386
// 7M + 5A (6A)
387
}
388
389
// change code below if and when we add degree 2 ext ops
390
void ff2_poly_eval (ff_t o[2], ff_t f[], int d, ff_t x[2])
391
{
392
ff_t t[2], y[2];
393
register int i;
394
395
if ( d < 0 ) { ff2_set_zero(o); return; }
396
ff2_set_zero(y);
397
_ff_set (y[0], f[d]);
398
ff2_set(t,x);
399
for ( i = d-1 ; i >= 0 ; i-- ) { ff2_mult(y,y,t); _ff_addto(y[0], f[i]); }
400
ff2_set(o,y);
401
return;
402
}
403
404
405
void ff3_poly_eval (ff_t o[3], ff_t f[], int d, ff_t x[3])
406
{
407
ff_t t[3], y[3];
408
register int i;
409
410
ff_setup_3g();
411
if ( d < 0 ) { ff3_set_zero (o); return; }
412
ff3_set_zero (y);
413
_ff_set (y[0], f[d]);
414
ff3_set (t,x);
415
for ( i = d-1 ; i >= 0 ; i-- ) { ff3_mult(y,y,t); _ff_addto(y[0],f[i]); }
416
ff3_set(o,y);
417
return;
418
}
419
420
// computes a^p = a[0]+a[1]z^p+a[2]z^{2p} using 2M or 6M+4A, overlap ok
421
void ff3_exp_p (ff_t o[3], ff_t a[3])
422
{
423
ff_t t1[3], t2[3];
424
425
if ( _ff_p1mod3 ) {
426
// if p=1mod3 we know z^p is a multiple of z and z^2p is a multiple of z^2
427
_ff_set(o[0],a[0]);
428
ff_mult(o[1],a[1],_ff3_zp[1]);
429
ff_mult(o[2],a[2],_ff3_z2p[2]);
430
} else {
431
ff3_scalar_mult(t1,a[1],_ff3_zp);
432
ff3_scalar_mult(t2,a[2],_ff3_z2p);
433
ff3_add(t1,t1,t2);
434
_ff_addto(t1[0],a[0]);
435
ff3_set(o,t1);
436
}
437
}
438
439
440
// computes z^p mod f=z^3-az-b, optimized for a=1 case
441
// standard 4-ary exponentiation (fixed 2-bit window)
442
void ff3_zn_mod (ff_t o[3], unsigned long n, ff_t f[4]) // only looks at f[0] and f[1], implicitly assumes f[2]=0 and f[3]=1
443
{
444
register int i;
445
register unsigned long j, m, e;
446
register ff_t a, b, w1,w2, w3;
447
ff_t t[3];
448
449
e=n;
450
i = _asm_highbit(e);
451
if ( i&1 ) i--;
452
m = 3UL<<i;
453
j = (m&e)>>i;
454
_ff_neg(a,f[1]);
455
_ff_neg(b,f[0]);
456
//printf("a=%ld, f[1]=%ld, b=%ld, f[0]=%ld\n", _ff_get_ui(a), _ff_get_ui(f[1]), _ff_get_ui(b), _ff_get_ui(f[0]));
457
switch(j){
458
case 1:
459
_ff_set_zero(t[0]); _ff_set_one(t[1]); _ff_set_zero(t[2]); // t = z
460
break;
461
case 2:
462
_ff_set_zero(t[0]); _ff_set_zero(t[1]); _ff_set_one(t[2]); // t = z^2
463
break;
464
case 3:
465
_ff_set(t[0],b); _ff_set(t[1],a); _ff_set_zero(t[2]); // t = z^3=az+b
466
break;
467
}
468
//printf("j=%d, %ldz^2+%ldz+%ld!\n", j, _ff_get_ui(t[2]), _ff_get_ui(t[1]), _ff_get_ui(t[0]));
469
if ( _ff_one(a) ) {
470
for ( m>>=2,i-=2 ; m ; m>>=2, i-=2 ) {
471
_ff3_square_mod_1s(t,t,b); _ff3_square_mod_1s(t,t,b);
472
//printf("i=%d, %ldz^2+%ldz+%ld!\n", i, _ff_get_ui(t[2]), _ff_get_ui(t[1]), _ff_get_ui(t[0]));
473
j = (m&e)>>i;
474
switch(j) {
475
case 1:
476
_ff_mult(w1,t[2],b);
477
_ff_set(w2,t[1]);
478
_ff_add(t[1],t[0],t[2]);
479
_ff_set(t[2],w2);
480
_ff_set(t[0],w1);
481
break;
482
case 2:
483
_ff_mult(w1,t[2],b);
484
_ff_addto(t[2],t[0]);
485
_ff_mult(w2,t[1],b);
486
_ff_addto(t[1],w1);
487
_ff_set(t[0],w2);
488
break;
489
case 3:
490
_ff_mult(w1,t[2],b);
491
_ff_mult(w2,t[1],b);
492
_ff_addto(w2,t[0]);
493
_ff_add(w3,t[0],t[2]);
494
_ff_mult(t[0],w3,b);
495
_ff_set(w3,t[2]);
496
_ff_add(t[2],t[1],w1);
497
_ff_add(t[1],w3,w2);
498
break;
499
}
500
// 1.5M+1.75A on average, every 4 bits
501
}
502
} else {
503
for ( m>>=2,i-=2 ; m ; m>>=2, i-=2 ) {
504
_ff3_square_mod_rs(t,t,a,b); _ff3_square_mod_rs(t,t,a,b);
505
//printf("i=%d, %ldz^2+%ldz+%ld!\n", i, _ff_get_ui(t[2]), _ff_get_ui(t[1]), _ff_get_ui(t[0]));
506
j = (m&e)>>i;
507
switch(j) {
508
case 1:
509
_ff_mult(w1,t[2],b);
510
_ff_mult(w2,t[2],a);
511
_ff_set(t[2],t[1]);
512
_ff_add(t[1],w2,t[0]);
513
_ff_set(t[0],w1);
514
break;
515
case 2:
516
_ff_mult(w1,t[2],b);
517
_ff_mult(w2,t[2],a);
518
_ff_add(t[2],t[0],w2);
519
_ff_mult(w2,t[1],a);
520
_ff_mult(t[0],t[1],b);
521
_ff_add(t[1],w1,w2);
522
break;
523
case 3:
524
_ff_mult(w1,t[2],b);
525
_ff_mult(w2,t[2],a);
526
_ff_add(w3,t[0],w2);
527
_ff_mult(t[0],w3,b);
528
_ff_mult(w2,w3,a);
529
_ff_mult(w3,t[1],b);
530
_ff_mult(t[2],t[1],a);
531
_ff_add(t[1],w2,w3);
532
_ff_addto(t[2],w1);
533
break;
534
}
535
// 3M+1.5A on average, every 4 bits
536
//printf("j=%d, %ldz^2+%ldz+%ld!\n", j, _ff_get_ui(t[2]), _ff_get_ui(t[1]), _ff_get_ui(t[0]));
537
}
538
}
539
ff3_set (o, t);
540
}
541
542
543
// standard 4-ary exponentiation (fixed 2-bit window)
544
void ff3_exp_ui (ff_t o[3], ff_t a[3], unsigned long e)
545
{
546
register int i;
547
register unsigned long j, m;
548
ff_t b[4][3], c[3];
549
550
//printf ("exp(%ldz^2+%ldz+%ld, %ld)\n", _ff_get_ui(a[2]), _ff_get_ui(a[1]), _ff_get_ui(a[0]), e);
551
ff_setup_3g();
552
switch (e) {
553
case 0: ff3_set_one (o); return;
554
case 1: ff3_set(o,a); return;
555
case 2: ff3_square(o,a); return;
556
}
557
i = _asm_highbit(e);
558
if ( i&1 ) i--;
559
m = 3UL<<i;
560
ff3_set (b[1], a);
561
ff3_square (b[2],b[1]);
562
ff3_mult(b[3],b[2],b[1]);
563
ff3_set (c, b[(m&e)>>i]);
564
for ( m>>=2,i-=2 ; m ; m>>=2, i-=2 ) {
565
ff3_square(c,c); ff3_square(c,c);
566
j = (m&e)>>i;
567
if ( j ) ff3_mult(c,c,b[j]);
568
}
569
ff3_set (o, c);
570
//printf("expresult=%ldz^2+%ldz+%ld\n", _ff_get_ui(o[2]), _ff_get_ui(o[1]), _ff_get_ui(o[0]));
571
}
572
573
// standard 4-ary exponentiation (fixed 2-bit window)
574
void ff3_exp_ui_rs (ff_t o[3], ff_t a[3], unsigned long e, ff_t r, ff_t s)
575
{
576
register int i;
577
register unsigned long j, m;
578
ff_t b[4][3], c[3];
579
580
//printf ("exp(%ldz^2+%ldz+%ld, %ld)\n", _ff_get_ui(a[2]), _ff_get_ui(a[1]), _ff_get_ui(a[0]), e);
581
ff_setup_3g();
582
switch (e) {
583
case 0: ff3_set_one (o); return;
584
case 1: ff3_set(o,a); return;
585
case 2: _ff3_square_mod_rs (o,a,r,s); return;
586
}
587
i = _asm_highbit(e);
588
if ( i&1 ) i--;
589
m = 3UL<<i;
590
ff3_set (b[1], a);
591
_ff3_square_mod_rs (b[2],b[1],r,s);
592
_ff3_mult_mod_rs(b[3],b[2],b[1],r,s);
593
ff3_set (c, b[(m&e)>>i]);
594
for ( m>>=2,i-=2 ; m ; m>>=2, i-=2 ) {
595
_ff3_square_mod_rs(c,c,r,s); _ff3_square_mod_rs(c,c,r,s);
596
j = (m&e)>>i;
597
if ( j ) _ff3_mult_mod_rs(c,c,b[j],r,s);
598
}
599
ff3_set (o, c);
600
//printf("expresult=%ldz^2+%ldz+%ld\n", _ff_get_ui(o[2]), _ff_get_ui(o[1]), _ff_get_ui(o[0]));
601
}
602
603
/*
604
This algorithm is deterministic (given 2Sylow generator in F_p) - caller should flip coin to randomize roots
605
*/
606
int ff3_sqrt (ff_t o[3], ff_t a[3])
607
{
608
ff_t u[3], v[3], w[3], x[3];
609
ff_t b, c, d;
610
611
if ( ff3_zero(a) ) { ff3_set_zero (o); return 1; }
612
if ( ff3_one(a) ) { ff3_set (o,a); return 1; }
613
614
ff3_setup();
615
616
//printf("sqrt(%ld+%ldz+%ldz^2)\n", _ff_get_ui(a[0]), _ff_get_ui(a[1]), _ff_get_ui(a[2]));
617
618
/*
619
We need to compute a^n and a^(n+1)/2, where n is the odd part of p^3-1, which is m(p^2+p+1), where m=2k+1 is the odd part of p-1
620
The element a^n=N(a)^m is in the 2-sylow subgroup, and is also in F_p (F_p and F_p^3 have the same 2-Sylow subgroup).
621
622
We can compute N(a) very efficiently, and then exponentiate by k in F_p, followed by a square and a multiply to obtain N(a)^(2k+1)=a^n.
623
We use a standard F_p Tonelli-Shanks algorithm to compute the inverse of the square root of a^n, if it exists, and if it doesn't we detect this
624
for less than the cost of a Legendre computation.
625
626
Once we have a^(-n/2) in hand, we then compute a^((n+1)/2) to obtain a^(1/2).
627
628
(n+1)/2 = ((2k+1)(p^2+p+1)+1)/2 = k(p^2+p+1) + p(p+1)/2 + 1
629
630
We already have a^(k(p^2+p+1)) from above, so we need only compute a^((p+1)/2) followed by a Frobenius exponentiation by p, and two mults.
631
*/
632
ff3_norm(&b,a); // b = N(a) = a^(p^2+p+1)
633
ff_exp_ui(&c,&b,(_ff_p2_m-1)/2); // c = N(a)^k (save for later)
634
_ff_square(d,c); // d = N(a)^(2k)
635
ff_mult(b,b,d); // b = N(a)^(2k+1) = N(a)^m is in the 2-Sylow
636
if ( ! ff_2Sylow_invsqrt(&d,&b,0) ) return 0; // d = 1/sqrt(b) = a^{-n/2)
637
ff3_exp_ui(x,a,(_ff_p+1)/2); // x = a^((p+1)/2)
638
ff3_exp_p(x,x); // x = a^(p(p+1)/2)
639
ff3_mult (x, x, a); // x = a^(p(p+1)/2+1)
640
_ff_mult (b, d, c); // b = a^(k(p^2+p+1)-n/2) (do F_p mults before scalar_mult)
641
ff3_scalar_mult (x, b, x); // x = a^(k(p^2+p+1)+p(p+1)/2+1-n/2) = a^((n+1)/2-n/2) = a^(1/2)
642
//printf ("a^(1/2) = %ldz^2+%ldz+%ld\n", _ff_get_ui(x[2]), _ff_get_ui(x[1]), _ff_get_ui(x[0]));
643
#if ! FF_FAST
644
ff3_square (w, x);
645
if ( ! ff3_equal(w,a) ) { printf ("ff3_sqrt failed, (%luz^2+%luz+%lu)^2 = ", _ff_get_ui(x[2]), _ff_get_ui(x[1]), _ff_get_ui(x[0]));
646
printf ("%luz^^2+%luz+%luz != ", _ff_get_ui(w[2]), _ff_get_ui(w[1]), _ff_get_ui(w[0]));
647
printf ("%luz^2+%luz+%luz\n", _ff_get_ui(a[2]), _ff_get_ui(a[1]), _ff_get_ui(a[0])); exit (0); }
648
#endif
649
ff3_set (o, x);
650
return 1;
651
}
652
653
// computes (z+a)*v in F_p[z]/(z^3-rz-s), overlap ok
654
static inline void _ff3_mult_zpa_mod_rs (ff_t o[3], ff_t v[3], ff_t a, ff_t r, ff_t s)
655
{
656
register ff_t t1,t2,w1,w2;
657
658
_ff_set(t2,v[2]);
659
_ff_mult(w1,t2,a);
660
_ff_add(o[2],v[1],w1);
661
_ff_mult(w1,v[1],a);
662
_ff_mult(w2,t2,r);
663
_ff_addto(w1,w2);
664
_ff_add(o[1],v[0],w1);
665
_ff_mult(w1,v[0],a);
666
_ff_mult(w2,t2,s);
667
_ff_add(o[0],w1,w2);
668
// 5M+4A
669
}
670
671
672
// Computes tr(sqrt(z)) in F_p^3=F_p[z]/(z^3-rz-s). This is a support function for factoring quartics.
673
int ff3_trsqrt_zpa_mod_rs (ff_t o[1], ff_t a, ff_t r, ff_t s)
674
{
675
ff_t u[3], v[3], w[3], x[3], f[4];
676
ff_t b, c, d, t1, t2;
677
678
//printf("Computing tr(sqrt(z+a)) for F_p[z]/(z^3-rz-s) with p=%ld, a=%ld, r=%ld, s=%ld\n", _ff_p, _ff_get_ui(a), _ff_get_ui(r), _ff_get_ui(s));
679
/*
680
We use effectively the same algorithm as ff3_sqrt to compute sqrt(z), except we work in the user supplied basis
681
and our input is (z+a) which simplifies a few things.
682
*/
683
// N(z+a) = (z+a)(z+a)^p(z+a)^(p^2) = (z+a)(z^p+a)(z^(p^2)+a) = N(z)+a*tr(z^p*z)+a^2tr(z)+a^3 = s-ar+a^3 (since N(z)=s, tr(z)=0 and tr(z*z^p)=-r)
684
_ff_mult(t1,a,r); // t1=ar
685
_ff_sub(b,s,t1); // b=s-ar
686
_ff_square(t2,a); ff_mult(t1,t2,a); // t1=a^3
687
_ff_addto(b,t1); // b=s-ar+a^3=N(z+a)
688
//printf("N(z+a)=%ld\n", _ff_get_ui(b));
689
ff_exp_ui(&c,&b,(_ff_p2_m-1)/2); // c = N(z+a)^k (save for later)
690
_ff_square(d,c); // d = N(z+a)^(2k)
691
ff_mult(b,b,d); // b = N(z+a)^(2k+1) = N(z)^m is in the 2-Sylow
692
//printf ("b=N(z+a)^m=%ld\n", _ff_get_ui(b));
693
if ( ! ff_2Sylow_invsqrt(&d,&b,0) ) return 0; // d = 1/sqrt(b) = z^{-m(p^2+p+1)/2)
694
//printf("d=sqrt(b)=%ld\n", _ff_get_ui(d));
695
696
// set modulus for ff3 mults
697
_ff_set_one(f[3]); _ff_set_zero(f[2]);
698
_ff_neg(f[1],r); _ff_neg(f[0],s);
699
// combine computation of (z+a)^((p+1)/2) and z^p
700
ff_poly_xpan_mod_d3 (w,a,(_ff_p-1)/2,f); // w = (z+a)^((p-1)/2)
701
_ff3_mult_zpa_mod_rs (x,w,a,r,s); // x = (z+a)^((p+1)/2)
702
//printf ("(z+%ld)^((p+1)/2) = %ldz^2+%ldz+%ld\n", _ff_get_ui(a), _ff_get_ui(x[2]), _ff_get_ui(x[1]), _ff_get_ui(x[0]));
703
_ff3_mult_mod_rs(u,w,x,r,s); // u = (z+a)^p mod f
704
_ff_subfrom(u[0],a); // u = (z+a)^p - a = z^p+a^p-a = z^p
705
//printf("z^p = %ldz^2+%ldz+%ld\n", _ff_get_ui(u[2]), _ff_get_ui(u[1]), _ff_get_ui(u[0]));
706
707
// x^p = x0+x1*z^p+x2*z^(2p) = x0+x1*u+x2*u^2
708
ff3_scalar_mult(w,x[1],u); // w = x1u
709
_ff3_square_mod_rs(v,u,r,s); // v = u^2
710
ff3_scalar_mult(v,x[2],v); // v = x2u^2
711
_ff_set(t1,x[0]);
712
ff3_add(x,w,v); // x = x1u+x2u^2
713
_ff_addto(x[0],t1); // x = x0+x1u+x2u^2 = (z^((p+1)/2))^p
714
_ff3_mult_zpa_mod_rs (x,x,a,r,s); // x = z^(p(p+1)/2+1)
715
_ff_mult (b, d, c); // b = z^(k(p^2+p+1)-n/2) (do F_p mults before scalar_mult) recall that n=m(p^2+p+1) where m is odd part of p-1
716
ff3_scalar_mult (x, b, x); // x = z^(k(p^2+p+1)+p(p+1)/2+1-n/2) = z^((n+1)/2-n/2) = z^(1/2)
717
//printf ("z^(1/2) = %ldz^2+%ldz+%ld\n", _ff_get_ui(x[2]), _ff_get_ui(x[1]), _ff_get_ui(x[0]));
718
#if ! FF_FAST
719
_ff3_square_mod_rs (w, x,r,s);
720
if ( !_ff_zero(w[0])||!_ff_one(w[1])||!_ff_zero(w[2]) ) { printf ("ff3_sqrt failed, (%luz^2+%luz+%lu)^2 = ", _ff_get_ui(x[2]), _ff_get_ui(x[1]), _ff_get_ui(x[0]));
721
printf ("%luz^^2+%luz+%luz != z", _ff_get_ui(w[2]), _ff_get_ui(w[1]), _ff_get_ui(w[0])); }
722
#endif
723
// now compute tr(x)=x0*tr(1)+x1*tr(z)+x2*(tr(z^2)) = 3x0 + 0 + 2rx2
724
_ff_mult(t2,r,x[2]);
725
_ff_add(t1,t2,t2); // t1=2rx2
726
_ff_add(t2,x[0],x[0]);
727
_ff_addto(t2,x[0]); // t2=3x0
728
_ff_add(o[0],t1,t2); // o=tr(x)
729
//printf("tr(z^(1/2)) = %ld\n", _ff_get_ui(o[0]));
730
return 1;
731
}
732
733