#include <stdlib.h>1#include <stdio.h>2#include "ff.h"3#include "ffext.h"4#include "ffpoly.h" // used to find irreducible cubic56/*7Copyright 2008 Andrew V. Sutherland89This file is part of smalljac.1011smalljac is free software: you can redistribute it and/or modify12it under the terms of the GNU General Public License as published by13the Free Software Foundation, either version 2 of the License, or14(at your option) any later version.1516smalljac is distributed in the hope that it will be useful,17but WITHOUT ANY WARRANTY; without even the implied warranty of18MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the19GNU General Public License for more details.2021You should have received a copy of the GNU General Public License22along with smalljac. If not, see <http://www.gnu.org/licenses/>.23*/2425static ff_t _ff3_f[4]; // irred minimal poly of z - polynomial basis {1,z,z^2} of the form x^3-s or x^3-x-s26static ff_t _ff3_nr_exp; // z^(m(p^2+p+1)), a primitive 2^e-th root of unity in F_p^3 (which necessarily lies in F_p)2728ff_t _ff3_zp[3]; // if p=1mod3 this is just a multiple of z29ff_t _ff3_z2p[3]; // z^{2p} cached because it used by ff3_exp_p30int _ff3_trace_z2; // trace of z^2 is 0 for p=1mod and 2 o.w., note that trace of z is always 0, store this as in int rather than an ff_t3132static ff_t _ff2_3Sylow_tab[42][2][2]; // These values depend on s (the quadratic non-residue passed in) and are not reused, having a static table is simple a convenience33ff_t _ff2_cbrt_unity[2];34int _ff2_cbrt_setup;3536void ff_ext_setup(void) { _ff3_f[3] = 0; _ff2_cbrt_setup = 0; } // don't zero everything, just enough to detect uninitialized cases3738// standard 4-ary exponentiation (fixed 2-bit window)39void ff2_exp_ui_s (ff_t o[2], ff_t a[2], unsigned long e, ff_t s)40{41register int i;42register unsigned long j, m;43ff_t b[4][2], c[2];4445//printf ("exp(%ldz+%ld, %ld)\n", _ff_get_ui(a[1]), _ff_get_ui(a[0]), e);46switch (e) {47case 0: ff2_set_one (o); return;48case 1: ff2_set(o,a); return;49case 2: ff2_square_s(o,a,s); return;50}51i = _asm_highbit(e);52if ( i&1 ) i--;53m = 3UL<<i;54ff2_set (b[1], a);55ff2_square_s (b[2],b[1],s);56ff2_mult_s(b[3],b[2],b[1],s);57ff2_set (c, b[(m&e)>>i]);58for ( m>>=2,i-=2 ; m ; m>>=2, i-=2 ) {59ff2_square_s(c,c,s); ff2_square_s(c,c,s);60j = (m&e)>>i;61if ( j ) ff2_mult_s(c,c,b[j],s);62}63ff2_set (o, c);64//printf("expresult=%ldz+%ld\n", _ff_get_ui(o[1]), _ff_get_ui(o[0]));65}66676869/*70sqrt algorithm over F_p^2, reduces problem to two sqrts in F_p7172We are given an element of the form a1z+a0 where z^2=s=_ff_2g (a non-residue in F_p)73If a1 is zero, we just compute sqrt(a0) in F_p, and if it doesn't exist, sqrt(a0)=sqrt(a0/s)*sqrt(s).7475We now assume a1!=0. If (b1z+b0)^2 = (a1z+a0) then7677b0^2+sb1^2 = a0 and 2b0b1=a17879and we know that both b0 and b1 are non-zero. We then obtain8081b0^4 - a0b0^2 + a1^2s/4 = 0 => b0^2 = (a0 +/- sqrt(N(a)))/28283and similarly8485sb1^4 - a0b1^2 + a1^2/4 = 0 => b1^2 = (a0 -/+ sqrt(N(a)))/(2s)8687Note that b0^2*b1^2 = a1^2/4 is a QR, but 1/s is not a QR, so exactly one of (-a0+sqrt(N(a)))/2 and (-a0-sqrt(N(a)))/2 is a QR8889*/90int ff2_sqrt (ff_t o[2], ff_t a[2])91{92ff_t x, y, t;9394ff_setup_2g();95if ( _ff_zero(a[1]) ) {96if ( ff_invsqrt(&t, a, 1) ) {97_ff_mult (o[0], t, a[0]);98_ff_set_zero(o[1]);99} else {100_ff_mult (o[1],t,a[0]);101_ff_set_zero(o[0]);102}103return 1;104}105ff2_norm(&t,a);106if ( ! ff_sqrt (&x, &t) ) return 0; // a is a QR in F_p^2 iff N(a) is a QR in F_p107_ff_add (t,a[0],x);108_ff_div2(y,t);109if ( ! ff_invsqrt (&t, &y, 1) ) {110_ff_mult(x,y,t); // if r=sqrt(y) is not in F_p then b1 = r/sqrt(s)=r/z, we have r=y*tz, so b1=y*t111ff_mult(t,t,_ff_2g); // b0=a1/(2b1)=a1/(2yt)=a1z/(2ytz)=a1z/(2r)=a1zr/(2y)=a1ytz^2/(2y)=a1ts/2112ff_mult(t,t,a[1]);113_ff_div2(y,t); // got b0114} else {115ff_mult(y,y,t); // got b0116ff_mult(t,t,a[1]); // b1=a1/(2b0)=a1/(2yt)=a1t/2117_ff_div2(x,t); // got b1118}119check:120#if ! FF_FAST121{122ff_t b[2];123_ff_set(b[0],y); _ff_set(b[1],x);124ff2_square(b,b);125if ( ! ff2_equal(b,a) ) { printf ("ff2_sqrt failed, (%lu+%luz)^2 != %lu+%luz\n",_ff_get_ui(b[0]), _ff_get_ui(b[1]), _ff_get_ui(a[0]), _ff_get_ui(a[1])); exit (0); }126}127#endif128_ff_set(o[1],x);129_ff_set(o[0],y);130return 1;131}132133void _ff2_setup_cbrt(void)134{135register int i;136register unsigned long n;137ff_t t0,t1;138ff_t r[2],s[2],t[2],w[2];139140if ( _ff2_cbrt_setup ) return;141ff_setup_2g();142if ( _ff_p1mod3 || _ff_p==3 ) { puts ("_ff2_setup_cbrt called when p!= 2mod 3, this should never happen."); exit (0); }143_ff2_cbrt_setup = 1;144// Note that p+1 = 3^e*m where m is not divisible by 3, and we have p-1 not div by 3145146if ( _ff_p3_e == 1 ) {147// life is slightly easer in this situation, since (-1+sqrt(-3))/2 generates the Sylow 3-subgroup of F_p^2148// unfortunately we still need to compute sqrt(-3) in the standard basis (-3 is not necessarily _ff_2g)149_ff_set_i(t0,-3);150if ( ff_sqrt_ext (&t1,&t0) ) { printf ("Impossible, -3 is a QR mod %ld with p=2mod3\n", _ff_p); exit (0); }151_ff_div2(_ff2_3Sylow_tab[0][0][0], _ff_negone);152_ff_div2(_ff2_3Sylow_tab[0][0][1], t1);153ff2_square (_ff2_3Sylow_tab[0][1],_ff2_3Sylow_tab[0][0]);154ff2_set (_ff2_cbrt_unity,_ff2_3Sylow_tab[0][0]);155//printf ("cube root of unity is %ldz+%ld, 2g=%d\n", _ff_get_ui(_ff2_cbrt_unity[1]), _ff_get_ui(_ff2_cbrt_unity[0]), _ff_get_ui(_ff_2g));156return;157}158159// To find a cubic non-residue, it suffices to find r^(m*3^(e-1)) not in F_p. (when e is 1 this is just r^m)160// To get into the 3-Sylow, we need to exponentiate by (p-1)m = (p+1)(m-1) + 2m(3^(e-1)-1), so we exponentiate by m-1 first.161n = (_ff_p+1)/(3*_ff_p3_m)-1; // n = 3^(e-1)-1162_ff_set_one(t[1]); _ff_set_one(t[0]);163for(;;) {164ff2_exp_ui (r, t, _ff_p3_m-1); // r = t^(m-1)165ff2_mult (s, r, t); // s = t^m166if ( ! _ff_zero(s[1]) ) { // if s is in F_p, t is a cubic residue167ff2_exp_ui (w,s,n); // w = t^(m*(3^(e-1)-1))168_ff_mult(t0,w[0],s[1]); _ff_mult(t1,w[1],s[0]);169_ff_addto(t1,t0); // t1 = z coeff of t^(m*3^(e-1)) = t^((p+1)/3)170if ( ! _ff_zero(t1) ) break; // t is a cubic residue iff t^((p+1)/3) is in F_p iff t1 is zero171}172next: _ff_inc(t[0]);173if ( _ff_zero(t[0]) ) { _ff_inc(t[1]); _ff_set_one(t[0]); }174}175// t is a non CR, r=t^(m-1), s=t^(m*(3^e-2))176ff2_norm(&t0,r); // t0 = t^((p+1)(m-1))177ff2_mult(s,s,w); // s = t^(3^(e-1)m)178ff2_square(w,w); // w = t^(2*(3^(e-1)-1)m)179ff2_mult(s,s,w); // s = t^((3^(e-1)+2(3^(e-1)-1))m) = t^((3^e-2)m)180ff2_scalar_mult(r,t0,s); // r = t^((p+1)(m-1)+(3^e-2)m) = t^((p-1)m) generates the 3-Sylow of F_p^2181ff2_set (_ff2_3Sylow_tab[0][0], r);182ff2_square (_ff2_3Sylow_tab[0][1],_ff2_3Sylow_tab[0][0]);183for ( i = 1 ; i < _ff_p3_e ; i++ ) {184ff2_mult(_ff2_3Sylow_tab[i][0],_ff2_3Sylow_tab[i-1][0],_ff2_3Sylow_tab[i-1][1]); // tab[i][0] = tab[i-1][0]^3185ff2_square(_ff2_3Sylow_tab[i][1], _ff2_3Sylow_tab[i][0]); // tab[i][1] = tab[i][0]^2186}187ff2_set (_ff2_cbrt_unity,_ff2_3Sylow_tab[_ff_p3_e-1][0]);188//printf ("cube root of unity is %ldz+%ld, 2g=%d\n", _ff_get_ui(_ff2_cbrt_unity[1]), _ff_get_ui(_ff2_cbrt_unity[0]), _ff_get_ui(_ff_2g));189}190191192// computes a^{-1/3} for a in the 3-Sylow subgroup, returns 0 if not a quadratic residue193// uses precomputed table of 2e powers of the 3-Sylow generator to reduce running time by a factor of 2 over standard Tonelli-Shanks (still O(e^2)).194int ff2_3Sylow_invcbrt (ff_t o[2], ff_t a[2])195{196ff_t b[2], q[2], q1[2], t[2], w1[2], w2[2];197register int i, j, k;198199ff2_setup_cbrt();200201// handle easy cases without setting up202if ( _ff_one(a[0]) ) { _ff_set_one(o[0]); return 1; } // use 1 as the cube root of 1203if ( _ff_p3_e == 1 ) return 0;204205// set w1 and w2 to the two elements of order 3 in the 3-Sylow (i.e. the two non-trivial cube roots of unity)206ff2_set (w1, _ff2_3Sylow_tab[_ff_p3_e-1][0]);207ff2_set (w2, _ff2_3Sylow_tab[_ff_p3_e-1][1]);208ff2_set_one (t);209210ff2_set (b,a);211do {212ff2_set (q, b);213for ( i = 1 ; i < _ff_p3_e+1 ; i++ ) { // s<e+1 is just a safety check in case a isn't in the 3-Sylow, this could be removed214j=1;215if ( ff2_equal(q,w1) ) break;216j=0;217if ( ff2_equal(q,w2) ) break;218ff2_set(q1,q);219ff2_square (q,q); ff2_mult(q,q,q1);220}221k = _ff_p3_e-i;222#if ! FF_FAST223if ( k < 0 ) { printf ("Unexpected result: k<0 in ff2_3Sylow_invsqrt?! a = %luz+%lu, p = %lu\n", _ff_get_ui(a[1]), _ff_get_ui(a[0]), _ff_p); exit (1); }224#endif225if ( k <= 0 ) return 0;226ff2_mult (b, b, _ff2_3Sylow_tab[k][j]); // the product of all the elements S[k] we multiply into b here is b^{-1}, since we terminate with b=1227ff2_mult (t, t, _ff2_3Sylow_tab[k-1][j]); // multiply t by S[k-1]=cbrt(S[k]), the product of all these will be b^{-1/3}228} while ( ! ff2_one(b) );229#if ! FF_FAST230ff2_square (b, t); ff2_mult(b,b,t);231ff2_mult(b,b,a);232if ( ! ff2_one(b) ) { printf ("ff2_3Sylow_invcbrt failed, %(lu+%lu)^3 * (%luz+%lu) != 1\n", _ff_get_ui(t[1]), _ff_get_ui(t[0]), _ff_get_ui(a[1]), _ff_get_ui(a[0])); exit (0); }233#endif234ff2_set(o,t);235return 1;236}237238void _ff3_setup (void)239{240_ff_set_one(_ff3_f[3]); _ff_set_zero(_ff3_f[2]);241if ( _ff_p1mod3 ) {242ff_setup_3g();243ff3_set_zero(_ff3_zp);244ff_exp_ui(_ff3_zp+1,&_ff_3g,(_ff_p-1)/3);245_ff_set_zero(_ff3_f[1]);246_ff_neg (_ff3_f[0], _ff_3g);247_ff3_trace_z2 = 0;248} else {249_ff_set_one(_ff3_f[1]);250ff_negate (_ff3_f[1]);251_ff_set(_ff3_f[0],_ff3_f[1]);252for(;;) {253if ( ! ff_poly_roots_d3 (0, _ff3_f) ) break;254_ff_dec(_ff3_f[0]);255}256_ff_neg (_ff_3g, _ff3_f[0]); // note we use _ff_3g here even though the 3-Sylow is trivial257ff3_zn_mod (_ff3_zp, _ff_p, _ff3_f);258_ff3_trace_z2 = 2;259}260ff3_square (_ff3_z2p, _ff3_zp);261// Note that the norm of z is the product of the roots of f, which is -f[0] = __ff_3g262// the trace of z is -f[2] = 0, trace of z^2 is 0 for p=1mod3 and 2 o.w.263printf ("ff3 poly = z^3+%ldz+%ld\n", _ff_get_ui(_ff3_f[1]), _ff_get_ui(_ff3_f[0]));264}265266static inline void ff3_setup(void) { if ( ! _ff3_f[3] ) _ff3_setup(); }267268// overlap ok269void ff3_mult (ff_t o[3], ff_t a[3], ff_t b[3])270{271register ff_t s1,s2,s3,t0,t1,t2,w1,w2,w3;272273_ff_add(t1,a[0],a[1]); _ff_add(t2,b[0],b[1]); _ff_mult(s1,t1,t2);274_ff_add(t1,a[0],a[2]); _ff_add(t2,b[0],b[2]); _ff_mult(s2,t1,t2);275_ff_add(t1,a[1],a[2]); _ff_add(t2,b[1],b[2]); _ff_mult(s3,t1,t2);276_ff_mult(t0,a[0],b[0]); _ff_mult(t1,a[1],b[1]); _ff_mult(t2,a[2],b[2]);277_ff_sub(w1,s3,t1); _ff_subfrom(w1,t2); // w1 = (a1+a2)(b1+b2)-a1b1-a2b2 = a1b2+b2b1278_ff_mult(w3,w1,_ff_3g); // w3 = s(a1b2+b2b1)279_ff_add(o[0],t0,w3); // o[0] = a0b0 + (a1b2+b2b1)s280_ff_mult(w2,t2,_ff_3g); // w2 = a2b2s281_ff_sub(w3,s1,t0); _ff_subfrom(w3,t1); // w3 = (a0+a1)(b0+b1)-a0b0-a1b1 = a0b1+b1a0282_ff_add(o[1],w2,w3); // o[1] = a0b1+b1a0+a2b2s283_ff_sub(w3,s2,t0); _ff_subfrom(w3,t2); // w3 = (a0+a2)(b0+b2)-a0b0-a2b2 = a0b2+b2a0284_ff_add(o[2],w3,t1); // o[2] = a0b2+b2a0+a1b1285if ( ! _ff_p1mod3 ) { // in this case z^3=z+s so we need to add z[(a1b2+b2b1)+a2b2z]286_ff_addto(o[1],w1); // o[1] = a0b1+b1a0+a2b2s + a1b2+b2b1287_ff_addto(o[2],t2); // o[2] = a0b2+b2a0+a1b1 + a2b2288}289// 8M + 15A (17A)290}291292// multiplies modulo (z^3-rz-s), overlap ok (duplicates code in ff3_mult above, then adjusts for r)293void _ff3_mult_mod_rs (ff_t o[3], ff_t a[3], ff_t b[3], ff_t r, ff_t s)294{295register ff_t s1,s2,s3,t0,t1,t2,w1,w2,w3;296297_ff_add(t1,a[0],a[1]); _ff_add(t2,b[0],b[1]); _ff_mult(s1,t1,t2);298_ff_add(t1,a[0],a[2]); _ff_add(t2,b[0],b[2]); _ff_mult(s2,t1,t2);299_ff_add(t1,a[1],a[2]); _ff_add(t2,b[1],b[2]); _ff_mult(s3,t1,t2);300_ff_mult(t0,a[0],b[0]); _ff_mult(t1,a[1],b[1]); _ff_mult(t2,a[2],b[2]);301_ff_sub(w1,s3,t1); _ff_subfrom(w1,t2); // w1 = (a1+a2)(b1+b2)-a1b1-a2b2 = a1b2+b2b1302_ff_mult(w3,w1,s); // w3 = s(a1b2+b2b1)303_ff_add(o[0],t0,w3); // o[0] = a0b0 + (a1b2+b2b1)s304_ff_mult(w2,t2,s); // w2 = a2b2s305_ff_sub(w3,s1,t0); _ff_subfrom(w3,t1); // w3 = (a0+a1)(b0+b1)-a0b0-a1b1 = a0b1+b1a0306_ff_add(o[1],w2,w3); // o[1] = a0b1+b1a0+a2b2s307_ff_sub(w3,s2,t0); _ff_subfrom(w3,t2); // w3 = (a0+a2)(b0+b2)-a0b0-a2b2 = a0b2+b2a0308_ff_add(o[2],w3,t1); // o[2] = a0b2+b2a0+a1b1309// z^3=rz+s so we need to add rz[(a1b2+b2b1)+a2b2z] - don't optimize for r=0 or 1 here, we assume general case applies310_ff_mult(t0,r,w1);311_ff_addto(o[1],t0); // o[1] = a0b1+b1a0+a2b2s + r(a1b2+b2b1)312_ff_mult(t1,r,t2);313_ff_addto(o[2],t1); // o[2] = a0b2+b2a0+a1b1 + ra2b2314// 10M + 17A315}316317// squares mod x^3-s318static void inline _ff3_square_mod_0s (ff_t o[3], ff_t a[3], ff_t s)319{320register ff_t s1,s2,t0,t1,t2,w1,w2,w3;321322_ff_add(s1,a[1],a[1]); // 2a1323_ff_mult(w1,s1,a[0]); // 2a0a1324_ff_mult(s2,_ff_3g,a[2]); // a2s325_ff_mult(w2,s1,s2); // 2a1a2s326_ff_mult(t2,s2,a[2]); // a2^2s327_ff_square(t0,a[0]); _ff_square(t1,a[1]); // a0^2, a1^2328_ff_mult(w3,a[0],a[2]); // a0a2329_ff_x2(w3); // 2a0a2330_ff_add(o[0],t0,w2); // a0^2 + 2a1a2s331_ff_add(o[1],w1,t2); // 2a0a1 + a2^2s332_ff_add(o[2],w3,t1); // 2a0a2 + a1^2333// 7M + 5A334}335336// squares mod x^3-x-s337static void inline _ff3_square_mod_1s (ff_t o[3], ff_t a[3], ff_t s)338{339register ff_t s1,s2,t0,t1,t2,w1,w2,w3;340341_ff_add(s1,a[0],a[2]); // a0+a2342_ff_square(w1,s1); // a0^2+2a0a2+a2^2343_ff_add(s2,a[1],a[1]); // 2a1344_ff_mult(w3,s1,s2); // 2a0a1+2a1a2345_ff_mult(w2,a[2],s); // a2s346ff_mult(s2,s2,w2); // 2a1a2s347_ff_mult(t2,w2,a[2]); // a2^2s348_ff_square(t0,a[0]); _ff_square(t1,a[1]); // a0^2, a1^2349_ff_subfrom(w1,t0); // 2a0a2 + a2^2350_ff_add(o[0],t0,s2); // a0^2+2a1a2s351_ff_add(o[1],w3,t2); // 2a0a1+2a1a2+a2^2s352_ff_add(o[2],w1,t1); // 2a0a2 + a2^2 + a1^2353// 7M + 6A354}355356// squares mod x^3-rx-s357static void inline _ff3_square_mod_rs (ff_t o[3], ff_t a[3], ff_t r, ff_t s)358{359register ff_t s1,s2,t0,t1,t2,w1,w2,w3;360361_ff_mult(w1,a[2],r); // a2r362_ff_add(s1,a[0],w1); // a0+a2r363_ff_add(s2,a[1],a[1]); // 2a1364_ff_mult(w3,s1,s2); // 2a0a1+2a1a2r365_ff_mult(w2,a[2],s); // a2s366ff_mult(s2,s2,w2); // 2a1a2s367_ff_mult(t2,w2,a[2]); // a2^2s368_ff_square(t0,a[0]); _ff_square(t1,a[1]); // a0^2, a1^2369_ff_addto(s1,a[0]); // 2a0+a2r370_ff_mult(w1,s1,a[2]); // 2a0a2+a2^2r371_ff_add(o[0],t0,s2); // a0^2+2a1a2s372_ff_add(o[1],w3,t2); // 2a0a1+2a1a2+a2^2s373_ff_add(o[2],w1,t1); // 2a0a2 + a2^2r + a1^2374// 8M + 6A375}376377// overlap ok378void ff3_square (ff_t o[3], ff_t a[3])379{380if ( _ff_p1mod3 ) {381_ff3_square_mod_0s (o, a, _ff_3g);382} else {383_ff3_square_mod_1s (o, a, _ff_3g);384}385// 7M + 5A (6A)386}387388// change code below if and when we add degree 2 ext ops389void ff2_poly_eval (ff_t o[2], ff_t f[], int d, ff_t x[2])390{391ff_t t[2], y[2];392register int i;393394if ( d < 0 ) { ff2_set_zero(o); return; }395ff2_set_zero(y);396_ff_set (y[0], f[d]);397ff2_set(t,x);398for ( i = d-1 ; i >= 0 ; i-- ) { ff2_mult(y,y,t); _ff_addto(y[0], f[i]); }399ff2_set(o,y);400return;401}402403404void ff3_poly_eval (ff_t o[3], ff_t f[], int d, ff_t x[3])405{406ff_t t[3], y[3];407register int i;408409ff_setup_3g();410if ( d < 0 ) { ff3_set_zero (o); return; }411ff3_set_zero (y);412_ff_set (y[0], f[d]);413ff3_set (t,x);414for ( i = d-1 ; i >= 0 ; i-- ) { ff3_mult(y,y,t); _ff_addto(y[0],f[i]); }415ff3_set(o,y);416return;417}418419// computes a^p = a[0]+a[1]z^p+a[2]z^{2p} using 2M or 6M+4A, overlap ok420void ff3_exp_p (ff_t o[3], ff_t a[3])421{422ff_t t1[3], t2[3];423424if ( _ff_p1mod3 ) {425// if p=1mod3 we know z^p is a multiple of z and z^2p is a multiple of z^2426_ff_set(o[0],a[0]);427ff_mult(o[1],a[1],_ff3_zp[1]);428ff_mult(o[2],a[2],_ff3_z2p[2]);429} else {430ff3_scalar_mult(t1,a[1],_ff3_zp);431ff3_scalar_mult(t2,a[2],_ff3_z2p);432ff3_add(t1,t1,t2);433_ff_addto(t1[0],a[0]);434ff3_set(o,t1);435}436}437438439// computes z^p mod f=z^3-az-b, optimized for a=1 case440// standard 4-ary exponentiation (fixed 2-bit window)441void ff3_zn_mod (ff_t o[3], unsigned long n, ff_t f[4]) // only looks at f[0] and f[1], implicitly assumes f[2]=0 and f[3]=1442{443register int i;444register unsigned long j, m, e;445register ff_t a, b, w1,w2, w3;446ff_t t[3];447448e=n;449i = _asm_highbit(e);450if ( i&1 ) i--;451m = 3UL<<i;452j = (m&e)>>i;453_ff_neg(a,f[1]);454_ff_neg(b,f[0]);455//printf("a=%ld, f[1]=%ld, b=%ld, f[0]=%ld\n", _ff_get_ui(a), _ff_get_ui(f[1]), _ff_get_ui(b), _ff_get_ui(f[0]));456switch(j){457case 1:458_ff_set_zero(t[0]); _ff_set_one(t[1]); _ff_set_zero(t[2]); // t = z459break;460case 2:461_ff_set_zero(t[0]); _ff_set_zero(t[1]); _ff_set_one(t[2]); // t = z^2462break;463case 3:464_ff_set(t[0],b); _ff_set(t[1],a); _ff_set_zero(t[2]); // t = z^3=az+b465break;466}467//printf("j=%d, %ldz^2+%ldz+%ld!\n", j, _ff_get_ui(t[2]), _ff_get_ui(t[1]), _ff_get_ui(t[0]));468if ( _ff_one(a) ) {469for ( m>>=2,i-=2 ; m ; m>>=2, i-=2 ) {470_ff3_square_mod_1s(t,t,b); _ff3_square_mod_1s(t,t,b);471//printf("i=%d, %ldz^2+%ldz+%ld!\n", i, _ff_get_ui(t[2]), _ff_get_ui(t[1]), _ff_get_ui(t[0]));472j = (m&e)>>i;473switch(j) {474case 1:475_ff_mult(w1,t[2],b);476_ff_set(w2,t[1]);477_ff_add(t[1],t[0],t[2]);478_ff_set(t[2],w2);479_ff_set(t[0],w1);480break;481case 2:482_ff_mult(w1,t[2],b);483_ff_addto(t[2],t[0]);484_ff_mult(w2,t[1],b);485_ff_addto(t[1],w1);486_ff_set(t[0],w2);487break;488case 3:489_ff_mult(w1,t[2],b);490_ff_mult(w2,t[1],b);491_ff_addto(w2,t[0]);492_ff_add(w3,t[0],t[2]);493_ff_mult(t[0],w3,b);494_ff_set(w3,t[2]);495_ff_add(t[2],t[1],w1);496_ff_add(t[1],w3,w2);497break;498}499// 1.5M+1.75A on average, every 4 bits500}501} else {502for ( m>>=2,i-=2 ; m ; m>>=2, i-=2 ) {503_ff3_square_mod_rs(t,t,a,b); _ff3_square_mod_rs(t,t,a,b);504//printf("i=%d, %ldz^2+%ldz+%ld!\n", i, _ff_get_ui(t[2]), _ff_get_ui(t[1]), _ff_get_ui(t[0]));505j = (m&e)>>i;506switch(j) {507case 1:508_ff_mult(w1,t[2],b);509_ff_mult(w2,t[2],a);510_ff_set(t[2],t[1]);511_ff_add(t[1],w2,t[0]);512_ff_set(t[0],w1);513break;514case 2:515_ff_mult(w1,t[2],b);516_ff_mult(w2,t[2],a);517_ff_add(t[2],t[0],w2);518_ff_mult(w2,t[1],a);519_ff_mult(t[0],t[1],b);520_ff_add(t[1],w1,w2);521break;522case 3:523_ff_mult(w1,t[2],b);524_ff_mult(w2,t[2],a);525_ff_add(w3,t[0],w2);526_ff_mult(t[0],w3,b);527_ff_mult(w2,w3,a);528_ff_mult(w3,t[1],b);529_ff_mult(t[2],t[1],a);530_ff_add(t[1],w2,w3);531_ff_addto(t[2],w1);532break;533}534// 3M+1.5A on average, every 4 bits535//printf("j=%d, %ldz^2+%ldz+%ld!\n", j, _ff_get_ui(t[2]), _ff_get_ui(t[1]), _ff_get_ui(t[0]));536}537}538ff3_set (o, t);539}540541542// standard 4-ary exponentiation (fixed 2-bit window)543void ff3_exp_ui (ff_t o[3], ff_t a[3], unsigned long e)544{545register int i;546register unsigned long j, m;547ff_t b[4][3], c[3];548549//printf ("exp(%ldz^2+%ldz+%ld, %ld)\n", _ff_get_ui(a[2]), _ff_get_ui(a[1]), _ff_get_ui(a[0]), e);550ff_setup_3g();551switch (e) {552case 0: ff3_set_one (o); return;553case 1: ff3_set(o,a); return;554case 2: ff3_square(o,a); return;555}556i = _asm_highbit(e);557if ( i&1 ) i--;558m = 3UL<<i;559ff3_set (b[1], a);560ff3_square (b[2],b[1]);561ff3_mult(b[3],b[2],b[1]);562ff3_set (c, b[(m&e)>>i]);563for ( m>>=2,i-=2 ; m ; m>>=2, i-=2 ) {564ff3_square(c,c); ff3_square(c,c);565j = (m&e)>>i;566if ( j ) ff3_mult(c,c,b[j]);567}568ff3_set (o, c);569//printf("expresult=%ldz^2+%ldz+%ld\n", _ff_get_ui(o[2]), _ff_get_ui(o[1]), _ff_get_ui(o[0]));570}571572// standard 4-ary exponentiation (fixed 2-bit window)573void ff3_exp_ui_rs (ff_t o[3], ff_t a[3], unsigned long e, ff_t r, ff_t s)574{575register int i;576register unsigned long j, m;577ff_t b[4][3], c[3];578579//printf ("exp(%ldz^2+%ldz+%ld, %ld)\n", _ff_get_ui(a[2]), _ff_get_ui(a[1]), _ff_get_ui(a[0]), e);580ff_setup_3g();581switch (e) {582case 0: ff3_set_one (o); return;583case 1: ff3_set(o,a); return;584case 2: _ff3_square_mod_rs (o,a,r,s); return;585}586i = _asm_highbit(e);587if ( i&1 ) i--;588m = 3UL<<i;589ff3_set (b[1], a);590_ff3_square_mod_rs (b[2],b[1],r,s);591_ff3_mult_mod_rs(b[3],b[2],b[1],r,s);592ff3_set (c, b[(m&e)>>i]);593for ( m>>=2,i-=2 ; m ; m>>=2, i-=2 ) {594_ff3_square_mod_rs(c,c,r,s); _ff3_square_mod_rs(c,c,r,s);595j = (m&e)>>i;596if ( j ) _ff3_mult_mod_rs(c,c,b[j],r,s);597}598ff3_set (o, c);599//printf("expresult=%ldz^2+%ldz+%ld\n", _ff_get_ui(o[2]), _ff_get_ui(o[1]), _ff_get_ui(o[0]));600}601602/*603This algorithm is deterministic (given 2Sylow generator in F_p) - caller should flip coin to randomize roots604*/605int ff3_sqrt (ff_t o[3], ff_t a[3])606{607ff_t u[3], v[3], w[3], x[3];608ff_t b, c, d;609610if ( ff3_zero(a) ) { ff3_set_zero (o); return 1; }611if ( ff3_one(a) ) { ff3_set (o,a); return 1; }612613ff3_setup();614615//printf("sqrt(%ld+%ldz+%ldz^2)\n", _ff_get_ui(a[0]), _ff_get_ui(a[1]), _ff_get_ui(a[2]));616617/*618We need to compute a^n and a^(n+1)/2, where n is the odd part of p^3-1, which is m(p^2+p+1), where m=2k+1 is the odd part of p-1619The element a^n=N(a)^m is in the 2-sylow subgroup, and is also in F_p (F_p and F_p^3 have the same 2-Sylow subgroup).620621We can compute N(a) very efficiently, and then exponentiate by k in F_p, followed by a square and a multiply to obtain N(a)^(2k+1)=a^n.622We use a standard F_p Tonelli-Shanks algorithm to compute the inverse of the square root of a^n, if it exists, and if it doesn't we detect this623for less than the cost of a Legendre computation.624625Once we have a^(-n/2) in hand, we then compute a^((n+1)/2) to obtain a^(1/2).626627(n+1)/2 = ((2k+1)(p^2+p+1)+1)/2 = k(p^2+p+1) + p(p+1)/2 + 1628629We already have a^(k(p^2+p+1)) from above, so we need only compute a^((p+1)/2) followed by a Frobenius exponentiation by p, and two mults.630*/631ff3_norm(&b,a); // b = N(a) = a^(p^2+p+1)632ff_exp_ui(&c,&b,(_ff_p2_m-1)/2); // c = N(a)^k (save for later)633_ff_square(d,c); // d = N(a)^(2k)634ff_mult(b,b,d); // b = N(a)^(2k+1) = N(a)^m is in the 2-Sylow635if ( ! ff_2Sylow_invsqrt(&d,&b,0) ) return 0; // d = 1/sqrt(b) = a^{-n/2)636ff3_exp_ui(x,a,(_ff_p+1)/2); // x = a^((p+1)/2)637ff3_exp_p(x,x); // x = a^(p(p+1)/2)638ff3_mult (x, x, a); // x = a^(p(p+1)/2+1)639_ff_mult (b, d, c); // b = a^(k(p^2+p+1)-n/2) (do F_p mults before scalar_mult)640ff3_scalar_mult (x, b, x); // x = a^(k(p^2+p+1)+p(p+1)/2+1-n/2) = a^((n+1)/2-n/2) = a^(1/2)641//printf ("a^(1/2) = %ldz^2+%ldz+%ld\n", _ff_get_ui(x[2]), _ff_get_ui(x[1]), _ff_get_ui(x[0]));642#if ! FF_FAST643ff3_square (w, x);644if ( ! ff3_equal(w,a) ) { printf ("ff3_sqrt failed, (%luz^2+%luz+%lu)^2 = ", _ff_get_ui(x[2]), _ff_get_ui(x[1]), _ff_get_ui(x[0]));645printf ("%luz^^2+%luz+%luz != ", _ff_get_ui(w[2]), _ff_get_ui(w[1]), _ff_get_ui(w[0]));646printf ("%luz^2+%luz+%luz\n", _ff_get_ui(a[2]), _ff_get_ui(a[1]), _ff_get_ui(a[0])); exit (0); }647#endif648ff3_set (o, x);649return 1;650}651652// computes (z+a)*v in F_p[z]/(z^3-rz-s), overlap ok653static inline void _ff3_mult_zpa_mod_rs (ff_t o[3], ff_t v[3], ff_t a, ff_t r, ff_t s)654{655register ff_t t1,t2,w1,w2;656657_ff_set(t2,v[2]);658_ff_mult(w1,t2,a);659_ff_add(o[2],v[1],w1);660_ff_mult(w1,v[1],a);661_ff_mult(w2,t2,r);662_ff_addto(w1,w2);663_ff_add(o[1],v[0],w1);664_ff_mult(w1,v[0],a);665_ff_mult(w2,t2,s);666_ff_add(o[0],w1,w2);667// 5M+4A668}669670671// Computes tr(sqrt(z)) in F_p^3=F_p[z]/(z^3-rz-s). This is a support function for factoring quartics.672int ff3_trsqrt_zpa_mod_rs (ff_t o[1], ff_t a, ff_t r, ff_t s)673{674ff_t u[3], v[3], w[3], x[3], f[4];675ff_t b, c, d, t1, t2;676677//printf("Computing tr(sqrt(z+a)) for F_p[z]/(z^3-rz-s) with p=%ld, a=%ld, r=%ld, s=%ld\n", _ff_p, _ff_get_ui(a), _ff_get_ui(r), _ff_get_ui(s));678/*679We use effectively the same algorithm as ff3_sqrt to compute sqrt(z), except we work in the user supplied basis680and our input is (z+a) which simplifies a few things.681*/682// N(z+a) = (z+a)(z+a)^p(z+a)^(p^2) = (z+a)(z^p+a)(z^(p^2)+a) = N(z)+a*tr(z^p*z)+a^2tr(z)+a^3 = s-ar+a^3 (since N(z)=s, tr(z)=0 and tr(z*z^p)=-r)683_ff_mult(t1,a,r); // t1=ar684_ff_sub(b,s,t1); // b=s-ar685_ff_square(t2,a); ff_mult(t1,t2,a); // t1=a^3686_ff_addto(b,t1); // b=s-ar+a^3=N(z+a)687//printf("N(z+a)=%ld\n", _ff_get_ui(b));688ff_exp_ui(&c,&b,(_ff_p2_m-1)/2); // c = N(z+a)^k (save for later)689_ff_square(d,c); // d = N(z+a)^(2k)690ff_mult(b,b,d); // b = N(z+a)^(2k+1) = N(z)^m is in the 2-Sylow691//printf ("b=N(z+a)^m=%ld\n", _ff_get_ui(b));692if ( ! ff_2Sylow_invsqrt(&d,&b,0) ) return 0; // d = 1/sqrt(b) = z^{-m(p^2+p+1)/2)693//printf("d=sqrt(b)=%ld\n", _ff_get_ui(d));694695// set modulus for ff3 mults696_ff_set_one(f[3]); _ff_set_zero(f[2]);697_ff_neg(f[1],r); _ff_neg(f[0],s);698// combine computation of (z+a)^((p+1)/2) and z^p699ff_poly_xpan_mod_d3 (w,a,(_ff_p-1)/2,f); // w = (z+a)^((p-1)/2)700_ff3_mult_zpa_mod_rs (x,w,a,r,s); // x = (z+a)^((p+1)/2)701//printf ("(z+%ld)^((p+1)/2) = %ldz^2+%ldz+%ld\n", _ff_get_ui(a), _ff_get_ui(x[2]), _ff_get_ui(x[1]), _ff_get_ui(x[0]));702_ff3_mult_mod_rs(u,w,x,r,s); // u = (z+a)^p mod f703_ff_subfrom(u[0],a); // u = (z+a)^p - a = z^p+a^p-a = z^p704//printf("z^p = %ldz^2+%ldz+%ld\n", _ff_get_ui(u[2]), _ff_get_ui(u[1]), _ff_get_ui(u[0]));705706// x^p = x0+x1*z^p+x2*z^(2p) = x0+x1*u+x2*u^2707ff3_scalar_mult(w,x[1],u); // w = x1u708_ff3_square_mod_rs(v,u,r,s); // v = u^2709ff3_scalar_mult(v,x[2],v); // v = x2u^2710_ff_set(t1,x[0]);711ff3_add(x,w,v); // x = x1u+x2u^2712_ff_addto(x[0],t1); // x = x0+x1u+x2u^2 = (z^((p+1)/2))^p713_ff3_mult_zpa_mod_rs (x,x,a,r,s); // x = z^(p(p+1)/2+1)714_ff_mult (b, d, c); // b = z^(k(p^2+p+1)-n/2) (do F_p mults before scalar_mult) recall that n=m(p^2+p+1) where m is odd part of p-1715ff3_scalar_mult (x, b, x); // x = z^(k(p^2+p+1)+p(p+1)/2+1-n/2) = z^((n+1)/2-n/2) = z^(1/2)716//printf ("z^(1/2) = %ldz^2+%ldz+%ld\n", _ff_get_ui(x[2]), _ff_get_ui(x[1]), _ff_get_ui(x[0]));717#if ! FF_FAST718_ff3_square_mod_rs (w, x,r,s);719if ( !_ff_zero(w[0])||!_ff_one(w[1])||!_ff_zero(w[2]) ) { printf ("ff3_sqrt failed, (%luz^2+%luz+%lu)^2 = ", _ff_get_ui(x[2]), _ff_get_ui(x[1]), _ff_get_ui(x[0]));720printf ("%luz^^2+%luz+%luz != z", _ff_get_ui(w[2]), _ff_get_ui(w[1]), _ff_get_ui(w[0])); }721#endif722// now compute tr(x)=x0*tr(1)+x1*tr(z)+x2*(tr(z^2)) = 3x0 + 0 + 2rx2723_ff_mult(t2,r,x[2]);724_ff_add(t1,t2,t2); // t1=2rx2725_ff_add(t2,x[0],x[0]);726_ff_addto(t2,x[0]); // t2=3x0727_ff_add(o[0],t1,t2); // o=tr(x)728//printf("tr(z^(1/2)) = %ld\n", _ff_get_ui(o[0]));729return 1;730}731732733