Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Download
241818 views
1
#include <stdlib.h>
2
#include <stdio.h>
3
#include "ff.h"
4
5
/*
6
Copyright 2007-2008 Andrew V. Sutherland
7
8
This file is part of smalljac.
9
10
smalljac is free software: you can redistribute it and/or modify
11
it under the terms of the GNU General Public License as published by
12
the Free Software Foundation, either version 2 of the License, or
13
(at your option) any later version.
14
15
smalljac is distributed in the hope that it will be useful,
16
but WITHOUT ANY WARRANTY; without even the implied warranty of
17
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18
GNU General Public License for more details.
19
20
You should have received a copy of the GNU General Public License
21
along with smalljac. If not, see <http://www.gnu.org/licenses/>.
22
*/
23
24
25
ff_t _ff_t1;
26
ff_t _ff_p;
27
ff_t _ff_2g; // generator of Sylow 2-subgroup and necessarily a quadratic non-residue
28
ff_t _ff_2gi; // inverse of _ff_2g
29
ff_t _ff_3g; // generator of Sylow 3-
30
ff_t _ff_negone;
31
ff_t _ff_half;
32
ff_t _ff_third;
33
static unsigned long mod256itab[256];
34
static ff_t __ff_mont_itab[FF_ITAB_SIZE]; // private copy for single modulus environment
35
36
static ff_t _ff_2exp; // 2^(p+3)/4-1 set only for p=5mod8, used for fast sqrts
37
unsigned long _ff_p2_m; // odd part of p-1, p=2^p2_e*p2_m+1
38
int _ff_p2_e; // power of 2 dividing p-1
39
unsigned long _ff_p3_m; // p=3^p3_e*p3_m+1 when p=1mod3, p=3^p3_3*p3_m-1 when p=2mod3
40
int _ff_p3_m1mod3; // true if p3_m is 1 mod 3
41
int _ff_p3_e; // power of 3 dividing p-1 when p=1mod3, power of 3 dividing p+1 when p=2mod3
42
ff_t _ff_2Sylow_tab[64][2]; // 2Sylow_tab[i][0]=g^(2^i) for 0<=i<=e-1, where g generates the 2-Sylow subgroup.
43
// 2Sylow_tab[i][1] = 2Sylow_tab[i][0]*2Sylow_tab[i+1][0]
44
static ff_t _ff_3Sylow_tab[42][2]; // 3Sylow_tab[i][0]=g^(3^i) for 0<=i<=e-1, where g generates the 3-Sylow subgroup (when it is non-trivial)
45
// 3Sylow_tab[i][1] = 3Sylow_tab[i][0]^2
46
ff_t _ff_cbrt_unity;
47
int _ff_sqrt_setup;
48
int _ff_cbrt_setup;
49
50
int _ff_p1mod3;
51
52
ff_t *_ff_mont_itab = __ff_mont_itab; // my be repointed to mange multiple moduli
53
ff_t _ff_mont_R;
54
ff_t _ff_mont_R2;
55
unsigned long _ff_mont_pni;
56
57
58
void ff_montgomery_setup (int ring);
59
int ff_ui_legendre (unsigned long a, unsigned long b);
60
61
void ff_ext_setup(void);
62
63
// nr35_tab[p%35] is either 0 or a non-residue mod p for p=1mod4 (if p is 2 or 3 mod 5, then 5 is a non-residue mod p; if p is 3, 5, or 6 mod 7, then 7 is a non-residue mod p).
64
// nric35_tab[p%35] is, the coefficient k s.t. kp+1=0 mod ff_nr35_tab[p%35]
65
int _ff_nr35_tab[35] = { 0, 0, 5, 5, 0, 7, 7, 5, 5, 0, 7, 0, 5, 5, 0, 0, 0, 5, 5, 7, 7, 0, 5, 5, 7, 0, 7, 5, 5, 0, 0, 7, 5, 5, 7};
66
int _ff_nric35_tab[35] = { 0, 0, 2, 3, 0, 4, 1, 2, 3, 0, 2, 0, 2, 3, 0, 0, 0, 2, 3, 4, 1, 0, 2, 3, 2, 0, 4, 2, 3, 0, 0, 2, 2, 3, 1};
67
68
69
// Note that it is possible to use non-prime p, and we don't want to waste time checking primality anyway.
70
// We assume that either the caller has checked the primality of p, or is aware that it is working over a ring
71
void ff_setup_ui (unsigned long p)
72
{
73
static int init = 0;
74
unsigned long n;
75
ff_t t;
76
77
#if FF_WORDS != 1
78
err_printf ("ff_setup_ui only supports single-word Montgomery or native representation\n");
79
exit (0);
80
#endif
81
#if FF_MONTGOMERY
82
if ( ! init ) { for ( n = 1 ; n < 256 ; n+= 2 ) mod256itab[n] = ff_ui_inverse(n,256); init = 1; } // This is only done once
83
#endif
84
85
if ( p && _ff_p == p ) return;
86
if ( p > (1UL<<FF_BITS) ) { printf ("ff_setup: p=%lu is too large, FF_BITS = %d\n", p, FF_BITS); exit (0); }
87
88
// Don't deal with char 2
89
if ( p < 3 ) { printf ("ff_setup: p=%lu must be an odd prime\n", p); exit (0); }
90
91
_ff_p = p;
92
#if FF_MONTGOMERY
93
ff_montgomery_setup (0);
94
#endif
95
_ff_p2_m = (_ff_p-1)/2;
96
_ff_set_ui (_ff_half, _ff_p2_m+1);
97
_ff_set_zero(_ff_negone); _ff_dec(_ff_negone);
98
for ( _ff_p2_e = 1 ; !(_ff_p2_m&0x1) ; _ff_p2_m>>=1, _ff_p2_e++ );
99
_ff_sqrt_setup = _ff_cbrt_setup = 0;
100
101
_ff_p1mod3 = ((p%3)==1);
102
n = (_ff_p1mod3? (2*p+1UL)/3UL : (p==3?0:(p+1UL)/3UL));
103
_ff_set_ui(_ff_third,n); // note this is set to zero for p=3
104
if ( _ff_p1mod3 ) {
105
_ff_p3_m = (_ff_p-1)/3;
106
for ( _ff_p3_e = 1 ; !(_ff_p3_m%3) ; _ff_p3_m /= 3, _ff_p3_e++ );
107
_ff_p3_m1mod3 = ((_ff_p3_m%3)==1);
108
} else {
109
// this code is meaningless if p=3, but we won't bother checking this because these values won't be used in this case
110
_ff_p3_m = (_ff_p+1)/3;
111
for ( _ff_p3_e = 1 ; !(_ff_p3_m%3) ; _ff_p3_m /= 3, _ff_p3_e++ );
112
_ff_p3_m1mod3 = ((_ff_p3_m%3)==1);
113
}
114
115
_ff_2exp = _ff_2g = _ff_3g = 0; // will get set when needed
116
ff_ext_setup();
117
return;
118
}
119
120
// macros to handle setting ff_t types in raw mode -- needed to handle multiples of p and montgomery values
121
#if FF_WORDS == 1
122
#define _ff_raw_set_mpz(z,X) ((z) = mpz_get_ui(X))
123
#define _ff_raw_get_mpz(Z,x) (mpz_set_ui(Z,x),Z)
124
#endif
125
126
127
#if FF_WORDS == 1 && FF_MONTGOMERY
128
129
void ff_montgomery_setup (int ring)
130
{
131
register unsigned long p, t;
132
register int i;
133
134
// precompute -p^{-1} mod B (B is either 2^32 or 2^64), R = B mod p and R^2 mod p
135
// Use Jebelean's trick for computing p^{-1} mod B (see HECHECC p. 190 remark 10.4 (ii))
136
p = _ff_p;
137
t = mod256itab[p&0xFF];
138
t = (2*t + (-(p*t*t)))&0xFFFF;
139
t = (2*t + (-(p*t*t)))&0xFFFFFFFF;
140
#if FF_HALF_WORDS == 1
141
_ff_mont_pni = (-t)&0xFFFFFFFF;
142
t = (1UL<<FF_MONTGOMERY_RBITS)%p;
143
_ff_mont_R = (ff_t)t;
144
t = (t*t)%p;
145
_ff_mont_R2 = (ff_t)t;
146
#else
147
t = (2*t + (-(p*t*t)))&0xFFFFFFFFFFFFFFFF;
148
_ff_mont_pni = -t;
149
_ff_mont_R = (1UL<<(FF_MONTGOMERY_RBITS-1))%p;
150
_ff_mont_R += _ff_mont_R;
151
if ( _ff_mont_R > p ) _ff_mont_R -= p;
152
153
// to avoid depending on multi-precision arithmetic, we square R mod p by doubling it RBITS times (R = 2^RBITS)
154
// probably no slower than a modular reduction anyway
155
_ff_mont_R2 = _ff_mont_R;
156
for ( i = 0 ; i < FF_MONTGOMERY_RBITS ; i++ ) {
157
_ff_mont_R2 += _ff_mont_R2;
158
if ( _ff_mont_R2 > p ) _ff_mont_R2 -= p;
159
}
160
#endif
161
if ( ring ) return;
162
t = 1;
163
for ( i = 0 ; i <= 3*FF_MONTGOMERY_RBITS ; i++ ) {
164
_ff_mont_itab[i] = t;
165
t += t; // note that _ff_p < 2^(ULONG_BITS-1) so overflow can't occur
166
if ( t >= _ff_p ) t -= _ff_p;
167
}
168
// printf ("Montgomery setup p = %lu, -p^{-1} mod b = %lu, R mod p = %lu, R^2 mod p = %lu\n", _ff_p, _ff_mont_pni, _ff_mont_R, _ff_mont_R2);
169
}
170
171
// This is a streamlined version of Algorithm 11.12 in HECHECC p. 208
172
// The input is in Montgomery representation: x = [a] = aR mod p
173
// This functions computes v = [a^{-1}] = a^{-1}R mod p = x^{-1}R^2 mod p
174
unsigned long ff_montgomery1_invert (unsigned long x)
175
{
176
register unsigned long r, s, t, v;
177
register int k;
178
179
if ( _ff_zero(x) ) { printf ("ff_invert: attempt to invert 0,\n");exit(0); return 0; }
180
r = x; s = 1; t = _ff_p; v = 0; k = 0;
181
while ( r > 0 ) {
182
if ( !(t&0x1UL) ) {
183
t >>= 1; s <<= 1;
184
} else if ( !(r&0x1UL) ) {
185
r >>= 1; v <<= 1;
186
} else if ( t > r ) {
187
t = (t-r)>>1; v += s; s <<= 1;
188
} else {
189
r = (r-t)>>1; s += v; v <<= 1;
190
}
191
k++;
192
// if ( k > 2*_ff_pbits ) { printf ("ff_invert: k = %d > 2*pbits! raw input %lu\n", k, x); exit (0); }
193
}
194
if ( v >= _ff_p ) v -= _ff_p;
195
v = _ff_p - v; // v cannot be zero if x is invertible (which we assume to be the case)
196
// This accomplishes steps 10-12 of Alg 11.12 in one multiplication
197
v = ff_montgomery1_mult (v, _ff_mont_itab[3*FF_MONTGOMERY_RBITS-k]); // Montgomery lookup table holds powers 2^k to save time
198
#if ! FF_FAST
199
if ( ff_montgomery1_mult (v,x) != _ff_mont_R ) {printf ("ff_montgomery1_invert failed, %lu*%lu != 1, raw input: %lu raw output %lu\n", _ff_get_ui(x), _ff_get_ui(v), x, v); exit (0); }
200
#endif
201
return v;
202
}
203
204
#endif
205
206
// Algorithm 11.15 of [HECHECC], P. 209 - note that z and x can overlap
207
void ff_parallel_invert (ff_t z[], ff_t x[], unsigned n)
208
{
209
ff_t c[FF_MAX_PARALLEL_INVERTS];
210
register ff_t u, v;
211
register unsigned i;
212
213
if ( ! n ) return;
214
if ( n > FF_MAX_PARALLEL_INVERTS ) { printf ("Exceeded FF_MAX_INVERTS, %d > %d\n", n, FF_MAX_PARALLEL_INVERTS); exit (0); }
215
216
_ff_set (c[0], x[0]);
217
for ( i = 1 ; i < n ; i++ ) _ff_mult (c[i], c[i-1], x[i]);
218
_ff_invert (u, c[n-1]);
219
for ( i = n-1 ; i > 0 ; i-- ) {
220
_ff_mult (v, c[i-1], u);
221
_ff_mult (u, u, x[i]);
222
_ff_set (z[i], v);
223
}
224
_ff_set (z[0], u);
225
}
226
227
static inline void ff_setup_2exp(void) { ff_t t; if ( _ff_2exp ) return; _ff_set_one(t); _ff_x2(t); ff_exp_ui (&_ff_2exp,&t,(_ff_p+3)/4-1); }
228
229
230
void ff_setup_2g (void)
231
{
232
ff_t t;
233
register int i,n;
234
235
if ( _ff_2g ) return;
236
237
// When p=3 mod 4 life is very simple, since -1 then generates the 2-Sylow subgroup
238
if ( _ff_p2_e == 1 ) { _ff_set(_ff_2g,_ff_negone); _ff_set(_ff_2gi,_ff_negone); _ff_set(_ff_2Sylow_tab[0][0],_ff_negone); return; }
239
240
// hardwire a few quick tests for non-residues to speed things up. This should catch all but 1/32 of the primes, on average.
241
_ff_set_zero(t);
242
if ( ! _ff_p1mod3 ) { _ff_set_i (t,-3); goto SylowSetup; } // if p is 2 mod 3 then -3 is a non-residue
243
if ( (_ff_p&0x7UL) == 5 ) { _ff_set_ui (t, 2); goto SylowSetup; } // if p is 5 mod 8 then 2 is a non-residue
244
n = _ff_nr35_tab[_ff_p%35];
245
if ( n ) { _ff_set_ui (t, n); goto SylowSetup; }
246
// We only reach this prob 1/32, and the expected cost of the computation below is less than 32 modular reductions
247
// Note that legendre is faster here than computing square roots because n is small
248
for ( n = 11 ; ; n++ ) if ( ff_ui_legendre (n, _ff_p) < 0 ) break;
249
_ff_set_ui(t,n);
250
251
SylowSetup:
252
253
ff_exp_ui (&_ff_2g, &t, _ff_p2_m);
254
_ff_set(_ff_2Sylow_tab[0][0], _ff_2g);
255
_ff_set(_ff_2gi,_ff_2g);
256
// compute the inverse of _ff_2g as we square up, this is faster than calling _ff_inverse
257
for ( i = 1 ; i < _ff_p2_e-1 ; i++ ) { _ff_square(_ff_2Sylow_tab[i][0],_ff_2Sylow_tab[i-1][0]); ff_mult(_ff_2gi,_ff_2gi,_ff_2Sylow_tab[i][0]); }
258
_ff_set(_ff_2Sylow_tab[i][0],_ff_negone);
259
ff_negate(_ff_2gi);
260
for ( i = 0 ; i < _ff_p2_e-1 ; i++ ) _ff_mult(_ff_2Sylow_tab[i][1], _ff_2Sylow_tab[i][0], _ff_2Sylow_tab[i+1][0]);
261
_ff_set(_ff_2Sylow_tab[i][1],_ff_negone);
262
}
263
264
/*
265
Extended sqrt algorithm, based on Tonelli-Shanks, compute sthe square root in F_p^2 if necessary.
266
The return value is 0 if the root is in F_p^2, in which case o=a^{-1/2}/z where F_p^2=F_p[z]/(z^2-_ff_2g).
267
268
This algorithm is deterministic (given precomputed 2-Sylow generator) - caller should flip coin to randomize choice of sqrt.
269
*/
270
int ff_invsqrt (ff_t o[1], ff_t a[1], int ext)
271
{
272
ff_t b, x, y;
273
int sts;
274
275
if ( _ff_zero(a[0]) ) { ff_setup_2g(); _ff_set_zero (o[0]); return 1; } // need to handle zero separately (make sure setup_2g gets called though)
276
277
ff_exp_ui (&x, a, (_ff_p2_m-1)/2); // x = a^{(m-1)/2}
278
ff_mult (b, a[0], x);
279
ff_mult (b, b, x); // b = a^m is in the 2-Sylow subgroup
280
sts = ff_2Sylow_invsqrt(&y,&b, ext);
281
if ( ! ext && ! sts ) return 0;
282
ff_mult (x, x, y); // x = a^{(m-1)/2} * b^{-1/2} = a^{(m-1)/2} * a^{-m/2} = a^{-1/2}
283
#if ! FF_FAST
284
_ff_square (y, x);
285
if ( ! sts ) ff_mult(y,y,_ff_2g);
286
ff_mult(y,y,a[0]);
287
if ( ! _ff_one(y) ) {
288
printf ("p=%lu, a=%lu, b=%lu, 2g=%lu, 2gi=%lu\n", _ff_p, _ff_get_ui(a[0]), _ff_get_ui(b), _ff_get_ui(_ff_2g), _ff_get_ui(_ff_2gi));
289
if ( sts ) {
290
printf ("ff_sqrt failed, %lu^2 != 1/%lu\n", _ff_get_ui(x), _ff_get_ui(a[0])); exit (0);
291
} else {
292
printf ("ff_sqrt failed, %lu^2*%lu != 1/%lu\n", _ff_get_ui(x), _ff_get_ui(_ff_2g), _ff_get_ui(a[0])); exit (0);
293
}
294
}
295
#endif
296
_ff_set (o[0], x);
297
return sts;
298
}
299
300
/*
301
Computes a^{-1/2} for a in the 2-Sylow subgroup, computing in F_p^2 if necessary.
302
Returns 0 if the root is in F_p^2, in which case o=a^{-1/2}/z as above.
303
304
Uses precomputed table of 2e powers of the 2-Sylow generator to reduce running time by a factor of 4 over standard Tonelli-Shanks (still O(e^2)).
305
Assumes _ff_p2_e > 2 and a has order at least 4 (easy cases handle inline by ff.h)
306
*/
307
int _ff_2Sylow_invsqrt (ff_t o[1], ff_t a[1], int ext)
308
{
309
register ff_t b, q, q1, q2, t, w1, w2;
310
register int i, j, k, s;
311
int sts;
312
313
// set w1 and w2 to the two elements of order 4 in the 2-Sylow subgroup (note we know e > 2 and a has order at least 4 since it is not 1 or -1)
314
_ff_set (w1, _ff_2Sylow_tab[_ff_p2_e-2][0]);
315
_ff_set (w2, _ff_2Sylow_tab[_ff_p2_e-2][1]);
316
_ff_set_one (t);
317
_ff_set (b,a[0]);
318
sts = 1;
319
for(;;) {
320
_ff_set (q, b);
321
for ( s = 2 ; s<_ff_p2_e+1 ; s++ ) { // s<_ff_p2_e+1 is an unnecessary safety check (in case a isn't in the 2-Sylow)
322
j=1;
323
if ( _ff_equal(q,w1) ) break;
324
j=0;
325
if ( _ff_equal(q,w2) ) break;
326
ff_square (q,q);
327
}
328
k = _ff_p2_e-s;
329
#if ! FF_FAST
330
if ( k < 0 ) { printf ("Unexpected result: k<0 in ff_2Sylow_invsqrt?! a = %lu, p = %lu\n", _ff_get_ui(a[0]), _ff_p); exit (1); }
331
#endif
332
if ( !k ) { // this can only happen the first time through
333
if ( ! ext ) return 0;
334
sts=0;
335
if ( j ) {
336
ff_mult(b,b,_ff_2gi); // clear the low bit rather than adding
337
ff_mult(t,t,_ff_2gi); // setting sts=0 implicitly multiplied t by s^{1/2}, so multiplying by s^{-1} gives s^{-1/2}
338
} else {
339
ff_mult(b,b,_ff_2g); // sts=0 multiplied t by s^{1/2}, so we don't need to do anything to t
340
}
341
} else {
342
ff_mult (b, b, _ff_2Sylow_tab[k][j]); // the product of all the elements S[k] we multiply into b here is b^{-1}, since we terminate with b=1
343
ff_mult (t, t, _ff_2Sylow_tab[k-1][j]); // multiply t by S[k-1]=sqrt(S[k]), the product of all these will be b^{-1/2}
344
}
345
if ( _ff_equal(b,_ff_negone) ) { ff_mult(t, t, w1); break; }
346
if ( _ff_one(b) ) break;
347
ff_mult(q2,q2,_ff_2Sylow_tab[_ff_p2_e-4][j]);
348
if ( _ff_one(q2) ) continue;
349
if ( _ff_equal(q2,_ff_negone) ) {
350
_ff_mult(b,b,_ff_2Sylow_tab[k+3][0]);
351
_ff_mult(t,t,_ff_2Sylow_tab[k+2][0]);
352
} else {
353
if ( _ff_equal(q2,w1) ) j = 1; else j = 0;
354
_ff_mult(b,b,_ff_2Sylow_tab[k+2][j]);
355
_ff_mult(t,t,_ff_2Sylow_tab[k+1][j]);
356
}
357
if ( _ff_equal(b,_ff_negone) ) { ff_mult(t, t, w1); break; }
358
if ( _ff_one(b) ) break;
359
}
360
check:
361
#if ! FF_FAST
362
_ff_square (b, t);
363
if ( ! sts ) ff_mult(b,b,_ff_2g);
364
ff_mult(b,b,a[0]);
365
if ( ! _ff_one(b) ) {
366
printf ("a=%lu, p=%lu\n", _ff_get_ui(a[0]), _ff_p);
367
if ( sts ) {
368
printf ("ff_2Sylow_invsqrt failed, %lu^2 * %lu != 1\n", _ff_get_ui(t), _ff_get_ui(a[0]));
369
} else {
370
printf ("ff_2Sylow_invsqrt failed, %lu^2 * %lu * %lu != 1\n", _ff_get_ui(t), _ff_get_ui(_ff_2g), _ff_get_ui(a[0]));
371
}
372
exit (0);
373
}
374
#endif
375
_ff_set(o[0],t);
376
return sts;
377
}
378
379
// Recursive discrete log algorithm - not currently used as it is slower than ff_2Sylow_invsqrt for p < 2^64
380
// Given a and b in the 2 Sylow subgroup, returns a nonnegative integer k < 2^e s.t. _a^k = b, or -1 if no such k exists
381
int ff_2Sylow_dlog (ff_t a, ff_t b, int e)
382
{
383
register ff_t x0, x1,y;
384
register int d, i, k0, k1;
385
ff_t t;
386
//printf ("2Sylow_dlog(%ld,%ld,%d)\n", _ff_get_ui(a), _ff_get_ui(b), e);
387
if ( _ff_one(b) ) return 0;
388
if ( _ff_equal(b,_ff_negone) ) return (1<<(e-1)); // non-generic optimization specific to fields
389
if ( e < 2 ) return -1;
390
if ( e == 2 ) {
391
if ( _ff_equal (b,a) ) return 1;
392
_ff_mult(y,a,b);
393
if ( _ff_one(y) ) return 3;
394
return -1;
395
}
396
d = e/2;
397
// compute x0=a^(2^(e-d)), y=b^(2^(e-d)), x1 = a^(2^d)
398
_ff_set(x0,a); _ff_set(y,b);
399
for ( i = 0 ; i < e-d ; i++ ) {
400
if ( i == d ) _ff_set(x1,x0);
401
ff_square(x0,x0);
402
ff_square(y,y);
403
}
404
if ( i == d ) _ff_set(x1,x0);
405
k0 = ff_2Sylow_dlog (x0,y,d);
406
//printf ("k0=%d\n", k0);
407
if ( k0 < 0 ) return -1;
408
k1 = (1<<e)-k0;
409
ff_exp_ui(&t, &a, k1);
410
_ff_mult(y,b,t);
411
k1 = ff_2Sylow_dlog(x1,y,e-d);
412
//printf("k1=%d\n", k1);
413
if ( k1 < 0 ) return -1;
414
return (1<<d)*k1+k0;
415
}
416
417
void _ff_setup_3g (void)
418
{
419
register int i;
420
register unsigned long n;
421
ff_t r,s,t;
422
423
if ( _ff_3g || ! _ff_p1mod3 ) return;
424
_ff_cbrt_setup = 1;
425
n = (_ff_p-1)/(3*_ff_p3_m); // p-1 = 3*m*n
426
427
// we could use cubic reciprocity here
428
_ff_set_one(t);
429
_ff_x2(t);
430
for(;;) {
431
ff_exp_ui (&r,&t, _ff_p3_m); // exponentiation into 3-Sylow group first
432
if ( _ff_one(r) ) { _ff_inc(t); continue; }
433
if ( _ff_p3_e == 1 ) break;
434
ff_exp_ui (&s, &r, n); // s = t^((p-1)/3)
435
if ( ! _ff_one(s) ) break; // if s is not 1 then r is a generator of the 3-Sylow (and a cubic nonresidue)
436
_ff_inc(t);
437
}
438
_ff_set (_ff_3g, r);
439
_ff_set (_ff_3Sylow_tab[0][0], r);
440
_ff_square (_ff_3Sylow_tab[0][1],_ff_3Sylow_tab[0][0]);
441
for ( i = 1 ; i < _ff_p3_e ; i++ ) {
442
_ff_mult(_ff_3Sylow_tab[i][0],_ff_3Sylow_tab[i-1][0],_ff_3Sylow_tab[i-1][1]); // tab[i][0] = tab[i-1][0]^3
443
_ff_square(_ff_3Sylow_tab[i][1], _ff_3Sylow_tab[i][0]);
444
}
445
_ff_set(_ff_cbrt_unity,_ff_3Sylow_tab[_ff_p3_e-1][0]);
446
//printf("cbrt_unity = %d\n", _ff_get_ui(_ff_cbrt_unity));
447
}
448
449
/*
450
Tonelli-Shanks cuberoot algorithm
451
452
This algorithm is deterministic (given precomputed 3-Sylow generator) - caller should flip coin to randomize choice of sqrt.
453
*/
454
int ff_cbrt (ff_t o[1], ff_t a[1])
455
{
456
ff_t b, x, y;
457
458
ff_setup_3g(); // always do this first to make sure _ff_cbrt_unity is set for caller
459
if ( _ff_zero(a[0]) ) { _ff_set_zero (o[0]); return 1; }
460
461
if ( ! _ff_p1mod3 ) {
462
// every element of F_p is a cubic residue
463
if ( _ff_p == 3 ) {_ff_set(o[0],a[0]); return 1; }
464
ff_exp_ui (o, a, (2*_ff_p-1)/3); // o = a^((2p-1)/3) (note that (2p-1)/3 = 1/3 mod (p-1)
465
return 1;
466
}
467
468
if ( _ff_p3_m1mod3 ) {
469
ff_exp_ui (&x, a, (_ff_p3_m-1)/3); // x = a^((m-1))/3)
470
_ff_square (y, x); _ff_mult (b, x, y); // b = x^3 = a^(m-1)
471
ff_mult (b, b, a[0]); // b = a^m is in the 3-Sylow subgroup
472
} else {
473
ff_exp_ui (&x, a, (_ff_p3_m-2)/3); // x = a^(m-2)/3
474
_ff_square (y, x); _ff_mult (b, x, y); // b = x^3 = a^(m-2)
475
_ff_square (y,a[0]); ff_mult (b, b, y); // b = a^m is in the 3-Sylow subgroup
476
}
477
478
// Check if be is the 3-Sylow generator or its square--this happens with probability 2/3^e. For 2/3 of the primes this is 2/3, which makes it worth avoiding the call to ff_2Sylow_invcbrt
479
if ( _ff_equal(b,_ff_3Sylow_tab[0][0]) || _ff_equal(b,_ff_3Sylow_tab[0][1]) ) return 0;
480
if ( ! ff_3Sylow_invcbrt(&y,&b) ) return 0;
481
if ( _ff_p3_m1mod3 ) { ff_square(x,x); ff_square(y,y); } // when m is 1mod3 we want to x=a^((2m-2)/3) and y=a^(-2m/3)
482
ff_mult (x, x, a[0]); ff_mult(x,x,y); // either x = a^(2m-2)/3 *a* a^(-2m/3) = a^(1/3) or x=a^(m-2)/3*a*a^(-m/3) = a^(1/3)
483
#if ! FF_FAST
484
_ff_square (y, x); ff_mult(y,y,x);
485
if ( ! _ff_equal(y,a[0]) ) { printf ("ff_cbrt failed, %lu^3 = %lu != %lu\n", _ff_get_ui(x), _ff_get_ui(y), _ff_get_ui(a[0])); exit (0); }
486
#endif
487
_ff_set (o[0], x);
488
return 1;
489
}
490
491
// computes a^{-1/3} for a in the 3-Sylow subgroup, returns 0 if not a quadratic residue
492
// uses precomputed table of 2e powers of the 3-Sylow generator to reduce running time by a factor of 2 over standard Tonelli-Shanks (still O(e^2)).
493
int ff_3Sylow_invcbrt (ff_t o[1], ff_t a[1])
494
{
495
register ff_t b, q, q1, t, w1, w2;
496
register int i, j, k, s;
497
498
ff_setup_3g(); // always do this first to make sure _ff_cbrt_unity is set for caller
499
// handle easy cases first
500
if ( _ff_one(a[0]) ) { _ff_set_one(o[0]); return 1; } // use 1 as the cube root of 1
501
if ( _ff_p3_e == 1 ) return 0;
502
503
// set w1 and w2 to the two elements of order 3 in the 3-Sylow (i.e. the two non-trivial cube roots of unity)
504
_ff_set (w1, _ff_3Sylow_tab[_ff_p3_e-1][0]);
505
_ff_set (w2, _ff_3Sylow_tab[_ff_p3_e-1][1]);
506
_ff_set_one (t);
507
_ff_set (b,a[0]);
508
do {
509
_ff_set (q, b);
510
for ( s = 1 ; s < _ff_p3_e+1 ; s++ ) { // s<e+1 is just a safety check in case a isn't in the 3-Sylow, this could be removed
511
j=1;
512
if ( _ff_equal(q,w1) ) break;
513
j=0;
514
if ( _ff_equal(q,w2) ) break;
515
_ff_set(q1,q);
516
ff_square (q,q); ff_mult(q,q,q1);
517
}
518
k = _ff_p3_e-s;
519
#if ! FF_FAST
520
if ( k < 0 ) { printf ("Unexpected result: k<0 in ff_3Sylow_invsqrt?! a = %lu, p = %lu\n", _ff_get_ui(a[0]), _ff_p); exit (1); }
521
#endif
522
if ( k <= 0 ) return 0;
523
ff_mult (b, b, _ff_3Sylow_tab[k][j]); // the product of all the elements S[k] we multiply into b here is b^{-1}, since we terminate with b=1
524
ff_mult (t, t, _ff_3Sylow_tab[k-1][j]); // multiply t by S[k-1]=cbrt(S[k]), the product of all these will be b^{-1/3}
525
} while ( !_ff_one(b) );
526
#if ! FF_FAST
527
_ff_square (b, t); ff_mult(b,b,t);
528
ff_mult(b,b,a[0]);
529
if ( ! _ff_one(b) ) { printf ("ff_3Sylow_invcbrt failed, %lu^3 * %lu != 1\n", _ff_get_ui(t), _ff_get_ui(a[0])); exit (0); }
530
#endif
531
_ff_set(o[0],t);
532
return 1;
533
}
534
535
536
// standard 4-ary exponentiation (fixed 2-bit window)
537
void ff_exp_ui (ff_t o[1], ff_t a[1], unsigned long e)
538
{
539
register int i, j;
540
register ff_t c;
541
register unsigned long m;
542
ff_t b[4];
543
544
if ( ! e ) { _ff_set_one (o[0]); return; }
545
// if ( _ff_one(e) ) { _ff_set (o[0], a[0]); return; }
546
// avoid tests to optimize for e <3 or a==0,1,-1
547
i = _asm_highbit(e);
548
if ( i&1 ) i--;
549
m = 3UL<<i;
550
_ff_set (b[1], a[0]);
551
_ff_square (b[2],b[1]);
552
_ff_mult(b[3],b[2],b[1]);
553
_ff_set (c, b[(m&e)>>i]);
554
for ( m>>=2,i-=2 ; m ; m>>=2, i-=2 ) {
555
ff_square(c,c); ff_square(c,c);
556
j = (m&e)>>i;
557
if ( j ) ff_mult(c,c,b[j]);
558
}
559
//printf ("%ld^%ld=%ld mod %ld\n", _ff_get_ui(a[0]),e,_ff_get_ui(c),_ff_p);
560
_ff_set (o[0], c);
561
}
562
563
564
// This function is copied from mpzutil.c to make ff.c/ff.h/asm.h self-contained
565
unsigned long ff_ui_inverse (unsigned long a, unsigned long m)
566
{
567
register unsigned long q, r, r0, r1;
568
register long t, t0, t1;
569
570
if ( a >= m ) a = a%m;
571
if ( a == 0 ) return 0;
572
573
t0 = 0; t1 = 1;
574
r0 = m; r1 = a;
575
while ( r1 > 0 ) {
576
q = r0 / r1;
577
r = r0 - q*r1;
578
r0 = r1; r1 = r;
579
t = t0 - q*t1;
580
t0 = t1; t1 = t;
581
}
582
if ( r0 > 1 ) return 0;
583
if ( t0 < 0 ) return m - ((unsigned long)(-t0));
584
return (unsigned long)t0;
585
}
586
587
/*
588
Algorithm 1.4.10, p. 29 of [CANT] trimmed down to just handle a >= 0, b > 0 odd
589
590
This algorithm is used to find a non-residue for use by ff_sqrt.
591
592
NOTE: this is slower than both ff_fast_sqrt and ff_sqrt (after setup) due to the
593
high cost of modular reductions (as opposed to Montgomery arithmetic). It does have
594
asymptotically superior "bit" complexity, O(lg^2(n)), but this doesn't mean much when
595
the cost of multiplying two unsigned longs is effectively a constant.
596
*/
597
int ff_ui_legendre (unsigned long a, unsigned long b)
598
{
599
register unsigned long r;
600
register int k, v;
601
602
#if ! FF_FAST
603
if ( ! (b&0x1) ) { printf ("ff_ui_Legendre, b = %lu must be odd!\n", b); exit (0); }
604
#endif
605
k = 1;
606
while ( a ) {
607
for ( v = 0 ; ! (a&0x1) ; v++ ) a >>= 1;
608
if ( v&0x1 ) if ( (b&0x7) ==3 || (b&0x7) ==5 ) k = -k;
609
if ( (a&0x3) == 3 && (b&0x3) == 3 ) k = -k;
610
r = a; a = b % r; b = r;
611
}
612
return ( b == 1 ? k : 0 );
613
}
614
615
/*
616
Fast square-root algorithm using a single exponentiation, requires p = 3 mod 4 or p = 5 mod 8.
617
Relies on Thm 8.1 on p. 107 of Bressoud "Factorization and Primality Testing".
618
Returns 1 if successful, 0 if a is not a quadratic residue.
619
620
Not currently used, ff_sqrt is effectively just as fast.
621
622
Currently only supported in single word implementations. Overlap is OK.
623
*/
624
625
int ff_fast_sqrt (ff_t o[1], ff_t a[1])
626
{
627
register ff_t i;
628
ff_t temp;
629
register int k;
630
631
#if FF_WORDS > 1
632
return 0;
633
#endif
634
if ( _ff_zero(a[0]) ) { _ff_set_zero(o[0]); return 1; }
635
if ( (_ff_p&3)==3 ) {
636
ff_exp_ui (&temp, a, (_ff_p+1)>>2);
637
} else if ((_ff_p&7)==5 ) {
638
ff_exp_ui (&temp, a, (_ff_p+3)>>3);
639
_ff_square (i,temp);
640
if ( _ff_equal (i, a[0]) ) {
641
_ff_set(o[0], temp);
642
return 1;
643
}
644
ff_setup_2exp();
645
ff_mult(temp, temp, _ff_2exp);
646
} else {
647
return 0;
648
}
649
_ff_square(i,temp);
650
if ( ! _ff_equal(i,a[0]) ) return 0;
651
_ff_set(o[0],temp);
652
return 1;
653
}
654
655