#include <stdlib.h>1#include <stdio.h>2#include "ffext.h"3#include "ffpoly.h"4#include "cstd.h"56/*7Copyright 2007-2008 Andrew V. Sutherland89This file is part of smalljac.1011smalljac is free software: you can redistribute it and/or modify12it under the terms of the GNU General Public License as published by13the Free Software Foundation, either version 2 of the License, or14(at your option) any later version.1516smalljac is distributed in the hope that it will be useful,17but WITHOUT ANY WARRANTY; without even the implied warranty of18MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the19GNU General Public License for more details.2021You should have received a copy of the GNU General Public License22along with smalljac. If not, see <http://www.gnu.org/licenses/>.23*/2425/*26This module contains performance-oriented code for operations on polynomials of low degree,27and support functions for computations on genus 1 and genus 2 curves.2829It is assumed here that the ff_t type does not require initialization (no support for GMP's mpz type),30and in a number of cases it is further assumed that the ff_t type fits in an unsigned long (this assumption is verified when made).31*/323334void ff_poly_xpan_mod_d2 (ff_t g[2], ff_t a, unsigned long n, ff_t f[1]);35void ff_poly_xn_mod_d3 (ff_t g[3], unsigned long n, ff_t f[4]);36void ff_poly_xpan_mod_d3 (ff_t g[2], ff_t a, unsigned long n, ff_t f[4]);37void ff_poly_mult_mod_d3 (ff_t w[3], ff_t u[3], ff_t v[3], ff_t f[4]);38void ff_poly_square_mod_d3 (ff_t w[3], ff_t u[3], ff_t f[4]);39void ff_poly_exp4_mod_d3 (ff_t g[3], ff_t p[12], unsigned long n, ff_t f[4]);40void ff_poly_exp8_mod_d3 (ff_t g[3], ff_t p[24], unsigned long n, ff_t f[4]);4142void ff_poly_xn_mod_d4 (ff_t g[4], unsigned long n, ff_t f[5]);43void ff_poly_xpan_mod_d4 (ff_t g[3], ff_t a, unsigned long n, ff_t f[5]);44void ff_poly_mult_mod_d4 (ff_t w[4], ff_t u[4], ff_t v[4], ff_t f[5]);45void ff_poly_square_mod_d4 (ff_t w[4], ff_t u[4], ff_t f[5]);46void ff_poly_exp4_mod_d4 (ff_t g[4], ff_t p[16], unsigned long n, ff_t f[5]);47void ff_poly_exp8_mod_d4 (ff_t g[4], ff_t p[32], unsigned long n, ff_t f[5]);484950// assumes f monic, char > 3, sets t to translation value: depressed cubic is f(x-t)51void ff_depress_cubic (ff_t t[1], ff_t f[4])52{53_ff_t_declare_reg t1, t2, f1;5455_ff_mult(t1,_ff_third,f[2]); // t1 = f2/356_ff_set(t[0],t1);57_ff_mult(f1,t1,f[2]); // f1 = f2^2/358_ff_square(t2,t1); ff_mult(t2,t2,t1); _ff_x2(t2); ff_mult(t1,t1,f[1]); // t2 = f2^3/27, t1 = f1f2/359_ff_addto(f[0],t2); _ff_subfrom(f[0],t1);60_ff_subfrom(f[1],f1);61_ff_set_zero(f[2]);62// 5M+3A63}6465// computes cubic resolvent of f=x^4+f2x^2+f1x+f0 and depresses it66void ff_depressed_cubic_resolvent (ff_t t[1], ff_t g[4], ff_t f[5])67{68_ff_t_declare_reg t0,t1,t2;6970_ff_set_one(g[3]);71_ff_add(t1,f[2],f[2]);72_ff_neg(g[2],t1); // g2 = -2f273_ff_add(t1,f[0],f[0]); _ff_x2(t1);74_ff_square(t2,f[2]);75_ff_sub(g[1],t2,t1); // g1 = f2^2-4f076_ff_square(g[0],f[1]); // g0 = f1^277ff_depress_cubic (t,g);78// total 7M+5A79}808182// assumes f monic, char > 3, sets t to translation value: depressed cubic is f(x-t)83void ff_depress_quartic (ff_t t[1], ff_t f[5])84{85_ff_t_declare_reg t0,t1, t2, t3, t4, w1, w2;8687_ff_mult(t1,f[3],_ff_half); // t1 = f3/288ff_mult(t0,t1,_ff_half); // t0 = f3/489_ff_set(t[0],t0);90_ff_square(t2,t0); _ff_square(t4,t3); // t2 = f3^2/16, t4=f3^4/25691_ff_add(t3,t4,t4); _ff_addto(t3,t4); // t3 = 3f3^4/25692_ff_mult(w1,f[2],t2); // w1 = f2f3^2/1693_ff_mult(w2,f[1],t0); // w2 = f1f3/494_ff_subfrom(w1,w2);95_ff_subfrom(w1,t3);96_ff_addto(f[0],w1); // new f0 = f0 + f2f3^2/16 - f1f3/4 - 3f^4/25697_ff_mult(t3,t0,t2); // t3 = f3^2/898_ff_mult(w1,t3,f[3]);99_ff_mult(w2,f[2],t1);100_ff_subfrom(w1,w2);101_ff_addto(f[1],w1); // new f1 = f1 + f3^3/8 - f2f3/2102_ff_add(w1,t3,t3); _ff_addto(w1,t3); // w1 = 3f3^2/8103_ff_subfrom(f[2],w1); // new f2 = f2-3f3^2/8104_ff_set_zero(f[3]);105// 9M+10A106}107108// computes the gcd of f and g where f=x^3+ax+b109void ff_poly_gcd_g1 (ff_t h[4], int *d_h, ff_t f[4], ff_t g[4], int d_g)110{111_ff_t_declare_reg t1,t2;112ff_t r[4], s[4];113int d_r, d_s;114115if ( d_g == 3 ) {116_ff_neg(r[2],g[2]);117_ff_mult(t1,f[1],g[3]);118_ff_sub(r[1],t1,g[1]);119_ff_mult(t1,f[0],g[3]);120_ff_sub(f[0],t1,g[0]);121for ( d_r = 2 ; d_r >= 0 && _ff_zero(r[d_r]) ; d_r-- );122} else {123ff_poly_copy (r,&d_r,g,d_g);124}125if ( d_r < 0 ) { ff_poly_copy (h, d_h, f, 3); return; }126if ( ! d_r ) { _ff_set_one(h[0]); *d_h = 0; return; }127_ff_neg(s[2],r[d_r-1]);128_ff_mult(t1,f[1],r[d_r]);129if ( d_r > 1 ) { _ff_sub(s[1],t1,r[d_r-2]); } else { _ff_set(s[1],t1); }130_ff_mult(s[0],f[0],r[d_r]);131for ( d_s = 2 ; d_s >= 0 && _ff_zero(s[d_s]) ; d_s-- );132if ( d_s == 2 && d_r == 2) {133_ff_mult(t1,s[1],r[2]);134_ff_mult(t2,r[1],s[2]);135_ff_sub(s[1],t1,t2);136_ff_mult(t1,s[0],r[2]);137_ff_mult(t2,r[0],s[2]);138_ff_sub(s[0],t1,t2);139for ( d_s = 1 ; d_s >= 0 && _ff_zero(s[d_s]) ; d_s-- );140}141if ( d_s < 0 ) { ff_poly_copy(h,d_h,r,d_r); return; }142if ( ! d_s ) { _ff_set_one(h[0]); *d_h = 0; return; }143if ( d_r == 2 && d_s == 1 ) {144_ff_mult(t1,r[1],s[1]);145_ff_mult(t2,s[0],r[2]);146_ff_sub(r[1],t1,t2);147ff_mult(r[0],r[0],s[1]);148for ( d_r = 1 ; d_r >= 0 && _ff_zero(r[d_r]) ; d_r-- );149if ( d_r < 0 ) { ff_poly_copy (h, d_h, s, d_s); return; }150if ( ! d_r ) { _ff_set_one(h[0]); *d_h = 0; return; }151}152if ( d_s == 2 && d_r == 1 ) {153_ff_mult(t1,s[1],r[1]);154_ff_mult(t2,r[0],s[2]);155_ff_sub(s[1],t1,t2);156ff_mult(s[0],s[0],r[1]);157for ( d_s = 1 ; d_s >= 0 && _ff_zero(s[d_s]) ; d_s-- );158if ( d_s < 0 ) { ff_poly_copy (h, d_h, r, d_r); return; }159if ( ! d_s ) { _ff_set_one(h[0]); *d_h = 0; return; }160}161// we must have d_r==d_s==1 at this point162_ff_mult(t1,r[0],s[1]);163_ff_mult(t2,s[0],r[1]);164_ff_sub(s[0],t2,t1);165if ( _ff_zero(s[0]) ) { ff_poly_copy(h,d_h,r,d_r); return; }166_ff_set_one(h[0]); *d_h = 0;167// worst case 13M+7A and some copies168return;169}170171/*172Standard root finding algorithm specialized to degree 3, note that roots may be null if only the count is required173Returns the number of roots found. This function is superseded by ff_poly_roots_d3 below which solves the cubic with radicals.174*/175int ff_old_poly_roots_d3 (ff_t roots[3], ff_t f[4])176{177_ff_t_declare g[3], h[3], hh[3], a;178_ff_t_declare_reg t0, t1, t2;179int i, j, d_g, d_h, d_hh;180181#if FF_WORDS > 1182err_printf ("ff_poly_factors_g1 only implemented for single word fields\n"); exit (0);183#endif184185// this is wasteful if the caller already knows the discriminant!186_ff_square(t0,f[0]); _ff_set_ui(t1,27); _ff_mult(t2,t0,t1); // t2 = 27f1^2187_ff_square(t0,f[1]); _ff_mult(t1,t0,f[1]); _ff_x2(t1); _ff_x2(t1); // t1 = 4f0^3188_ff_addto(t1,t2);189if ( _ff_zero(t1) ) { // D=0 => curve has a repeated root190if ( roots ) {191if ( _ff_zero(f[1]) ) { // if f[1]=0 then f[0]=0 and we have a triple root at 0192_ff_set_zero(roots[0]); _ff_set_zero(roots[1]); _ff_set_zero(roots[2]);193} else { // otherwise return the distinct root is 3b/a and the double root is (-3b/(2a)194_ff_invert(t0,f[1]); // 1/a195_ff_add(t1,f[0],f[0]); _ff_addto(t1,f[0]); // 3b196_ff_mult(roots[0],t0,t1); // 3b/a197_ff_neg(t0,roots[0]);198_ff_mult(roots[1],t0,_ff_half); // -3b/(2a)199_ff_set(roots[2],roots[1]);200}201}202return 3;203}204205// compute x^p mod f206ff3_zn_mod (g, _ff_p, f); // g = x^p mod f207_ff_dec(g[1]); // g = x^p-x mod f208for ( d_g = 2 ; d_g >= 0 && _ff_zero(g[d_g]) ; d_g-- );209ff_poly_gcd_red (h, &d_h, f, 3, g, d_g);210switch (d_h) {211case 0:212return 0;213case 1:214if ( roots ) {215_ff_invert(t1,h[1]);216_ff_mult(t2,t1,h[0]);217_ff_neg(roots[0], t2);218}219return 1;220case 2:221err_printf ("Impossible, non-singular cubic with 2 roots in ff_old_poly_roots_d3: "); ff_poly_print (f,3); ff_poly_print(h,d_h); exit (0);222case 3:223if ( ! roots ) return 3;224// We know there are three roots, and separate the roots using the standard probabilistic algorithm225// Pick a random a and compute g=(x+a)^((p-1)/2). Half of the elements in F_p* will be roots of g,226// and with probability 3/4 gcd(f,g) will have degree 1 or 2 allowing us to pick out a root227// As a minor optimization, we pick a=0 as our first "random" choice, since we can compute g faster in this case.228for ( i = 0 ; ; i++ ) {229if ( ! i ) {230ff3_zn_mod (g, (_ff_p-1)/2, f);231} else {232ff_random(&a);233ff_poly_xpan_mod_d3 (g, a, (_ff_p-1)/2, f);234}235_ff_dec (g[0]); // g = (x+a)^((p-1)/2) mod f236for ( d_g = 2 ; d_g >= 0 && _ff_zero(g[d_g]) ; d_g-- );237ff_poly_gcd_red (h, &d_h, f, 3, g, d_g);238if ( d_h > 0 && d_h < 3 ) break;239}240i = ff_poly_roots_d2(roots,h,d_h);241if ( i != d_h ) { err_printf ("poly_roots_d2 failed in ff_old_poly_roots_d3!"); exit (0); }242ff_poly_div (h, &d_h, g, &d_g, f, 3, h, d_h);243if ( d_g != -1 ) { err_printf ("inexact poly division in ff_old_poly_factors_g1!\n"); exit(0); }244j = ff_poly_roots_d2(roots+i,h,d_h);245if ( j != d_h ) { err_printf ("poly_roots_d2 failed in ff_old_poly_roots_d3!"); exit (0); }246if ( i+j != 3 ) { err_printf ("missing root in ff_old_poly_roots_d3!"); exit (0); }247return 3;248}249err_printf ("Unreachable code in ff_old_poly_factors_g1!\n");250exit (0);251}252253254255256/*257Finds the roots of a monic depreseed cubic x^3+ax+b over F_p.258If roots is NULL only the # of roots is determined, which can be significantly quicker (e.g. when D is not a QR)259The value d is optional. If non-null it must contain sqrt(-D/3) (this is used when computing 3-torsion)260261Assumes _ff_p fits in an unsigned long262*/263int _ff_poly_roots_d3 (ff_t roots[3], ff_t f[4], ff_t *pd)264{265_ff_t_declare g[3], h[2], w[2], v[2], vk[2], d, t, r, s;266_ff_t_declare_reg a, t0, t1, t2;267int i, k, sts;268269#if FF_WORDS > 1270err_printf ("ff_poly_factors_g1 only implemented for single word fields\n"); exit (0);271#endif272273// handle p=3 separately274if ( _ff_p == 3 ) {275if ( _ff_zero(f[0]) ) { if ( roots ) _ff_set_zero(roots[0]); if ( _ff_one(f[1]) ) return 1; if ( roots ) { _ff_set_one(roots[1]); _ff_set(roots[2],_ff_negone); } return 3; }276if ( _ff_zero(f[1]) ) { if ( roots ) { _ff_neg(roots[0],f[0]); _ff_neg(roots[0],f[0]); _ff_neg(roots[0],f[0]); } return 3; }277if ( _ff_one(f[1]) ) { if ( roots ) _ff_set(roots[0],f[0]); return 1; } else return 0;278}279280// f=x^3+ax281if ( _ff_zero(f[0]) ) {282_ff_neg(t,f[1]);283if ( ! roots ) { return (ff_residue(t)?3:1); }284_ff_set_zero(roots[0]);285if ( ! ff_sqrt(roots+1,&t) ) return 1;286_ff_neg(roots[2],roots[1]);287return 3;288}289290// f = x^3+b291if ( _ff_zero(f[1]) ) {292_ff_neg(t,f[0]);293if ( _ff_p1mod3 ) {294if ( ! ff_cbrt(&d,&t) ) return 0;295if ( roots ) { _ff_set(roots[0],d); _ff_mult(roots[1], roots[0], _ff_cbrt_unity); _ff_mult(roots[2], roots[1], _ff_cbrt_unity); }296return 3;297} else {298if ( roots ) if ( ! ff_cbrt(roots,&t) ) { printf ("Impossible, ff_cbrt failed on input %ld when p=%ld is 2 mod 3\n", _ff_get_ui(t), _ff_p); exit(0); }299return 1;300}301}302303if ( ! pd ) {304_ff_square(t0,f[0]); _ff_set_ui(t1,27); _ff_mult(t2,t0,t1); // t2 = 27f1^2305_ff_square(t0,f[1]); _ff_mult(t1,t0,f[1]); _ff_x2(t1); _ff_x2(t1); // t1 = 4f0^3306_ff_add(t0,t1,t2); _ff_mult(t,t0,_ff_third); // t = -D/3 = (27f1^2+4f0^3)/3307if ( _ff_zero(t) ) { // t=0 => D=0 => curve is singular308if ( roots ) { // distinct root is 3b/a (we know a!=0), and (-3b/(2a) is a double root).309_ff_invert(t0,f[1]); // 1/a310_ff_add(t1,f[0],f[0]); _ff_addto(t1,f[0]); // 3b311_ff_mult(roots[0],t0,t1); // 3b/a312_ff_neg(t0,roots[0]);313_ff_mult(roots[1],t0,_ff_half); // -3b/(2a)314_ff_set(roots[2],roots[1]);315}316return 3;317}318sts = ff_sqrt_ext(&d,&t); // use extended sqrt so we get a solution in F_p^2 if necessary319} else {320_ff_set(d,*pd);321}322_ff_mult(t2,_ff_half,_ff_third); // t2=1/6 (used later)323_ff_mult(t0, _ff_half, f[0]);324_ff_neg(a,t0); // a = -f[0]/2 (used later)325if ( _ff_p1mod3 ) {326if ( ! sts ) { // p=1mod3 => -1/3 is a QR => (t not a QR => D not a QR => f has an even # of factors (by Stickleberger), so 1 root327if ( ! roots ) return 1; // if all the caller wants is the number of roots, we are done328_ff_square(t0,t2); // 1/36329_ff_mult(s,t,t0); // s = -D/108 is a not a QR, we will now work in the field F_p^2 = F_p[z]/(z^2-s) to compute (z+a)^1/3330// compute (z+t1)^((p+1)*m) in F_p^2, this will be in the 3-Sylow subgroup of F_p^2, necessarily in F_p (here p=3^e*m+1, m not divisible by 3)331// in the process, we compute g=(z+1)^((m-i)/3) where i is congruent to m mod 3 and h=(z+a)^m332if ( _ff_p3_m1mod3 ) {333ff_poly_xpan_mod_d2(g,a,(_ff_p3_m-1)/3,&s); // g = (z+a)^((m-1)/3)334ff2_square_s(h,g,s); ff2_mult_s (h,h,g,s); // h = g^3 = (z+a)^(m-1)335ff2_mult_zpa_s(h,h,a,s); // h = g^3*(z+a) = (z+a)^m336} else {337ff_poly_xpan_mod_d2(g,a,(_ff_p3_m-2)/3,&s); // g = (z+a)^((m-2)/3)338ff2_square_s (h, g, s); ff2_mult_s (h, h, g, s); // h = g^3 = (z+a)^(m-2)339_ff_square(t0,a); _ff_add(w[1],a,a); _ff_add(w[0],t0,s); // w = (z+a)^2340ff2_mult_s (h,h,w,s); // h = g^3*(z+a)^2 = (z+a)^m341}342ff2_norm_s (&t,h,s); // norm(h)=((z+a)^(m*(p+1)) is in the 3-Sylow of F_p343if ( ! ff_3Sylow_invcbrt(&r,&t) ) { // (z+a) is not a cubic residue - this should be impossible344printf ("(z+%ld)^%ld = ", _ff_get_ui(a), _ff_p3_m); ff_poly_print(h,1);345printf ("norm(h)=%ld is not a cube in F_%ld\n", _ff_get_ui(t), _ff_p);346printf ("z+%ld is not a CR in F_p[z]/(z^2-%ld)\n", _ff_get_ui(a), _ff_get_ui(s));347ff_poly_print(f,3);348exit (0);349}350// we now know (z+a)^-((p+1)m)/3351ff2_norm_s(&t,g,s); // t = norm(g) = g^(p+1)352ff2_exp_ui_s(h,h,(_ff_p+1)/(3*_ff_p3_m),s); // compute h^(3^(e-1))353if ( !_ff_p3_m1mod3 ) {354// We need to construct (z+a)^n where n = (2(p+1)m+1)/3 = 2(p+1)(m-2)/3 + 2(2p+1)/3 + 1 = 2(p+1)(m-2)/3 + 2(2*3^(e-1)*m+1) + 1355// We then have (z+a)^n = norm(g)^2*((h^(3^(e-1))^2*(z+a))^2*(z+a) (note that we also need to square r to get (z+a)^(-2(p+1)m/3) here)356ff_square(t,t); ff_square(r,r);357ff2_square_s(h,h,s); ff2_mult_zpa_s (h,h,a,s); ff2_square_s(h,h,s);358}// when m is 1 mod 3:359// We need to construct (z+a)^n where n = ((p+1)m+1)/3 = (p+1)(m-1)/3 + (p-1)/3 + 1 = (p+1)(m-1)/3 + 3^(e-1)*m + 1360// We then have (z+a)^n = norm(g)*h^(3^(e-1))*(z+a)361ff2_scalar_mult(g,t,h);362ff2_mult_zpa_s(g,g,a,s); // g = (z+a)^n363ff2_scalar_mult(g,r,g); // g = (z+a)^(1/3)364// We know that (g-f1/(3g)) is a root of f. To get others we multiply by cube roots of unity (which are conveniently in F_p).365// We use this to find the root in F_p (there must be exactly one, since we know f has 2 factors by Stickleberger)366// The multiple h of g yielding a root in F_p must have the property that -f1/3=norm(h), which we can test without inverting, and we then have tr(h) as our root367_ff_mult(t0,f[1],_ff_third);368ff_negate(t0);369for ( i = 0 ; i < 3 ; i++ ) { ff2_norm_s(&t,g,s); if ( _ff_equal(t,t0) ) break; ff2_scalar_mult(g,_ff_cbrt_unity,g); }370if ( i == 3 ) {371printf ("g=%ldz+%ld is a cbrt of (z+%ld) in F_p[z]/(z^2-%ld), but couldn't find an F_%ld root of 2-factor f = ", _ff_get_ui(g[1]), _ff_get_ui(g[0]),_ff_get_ui(a), _ff_get_ui(s), _ff_p);372ff_poly_print(f,3);373printf ("t0=%ld, cube root of unity is %ld\n", _ff_get_ui(t0), _ff_get_ui(_ff_cbrt_unity));374exit (0);375}376ff2_trace(roots,g);377return 1;378} else { // in this case have a sqrt of t and we know that f has 1 or 3 factors379ff_mult(d,d,t2); // d = sqrt(t)/6 = sqrt(f1^3/27+f0^2/4) (this is the square root in Cardano's formula for depressed cubics)380_ff_add(t,a,d); // t = -f0/2 + d (first cube to check)381if ( ! ff_cbrt(&r,&t) ) return 0;382_ff_sub(t,a,d); // t = -f0/2 - d (second cube to check - we really ought to be able to compute this from the first one)383if ( ! ff_cbrt(&s,&t) ) { printf("Impossible cube root failure in ff_poly_roots_d3 for p=%ld, f = ", _ff_p); ff_poly_print(f,3); exit(0); }384if ( ! roots ) return 3; // we know the roots exist but we don't need to find them385// We have to cycle through the choices of one of our cube roots to verify that we have the correct s and t, unforutnately this means we may need to evaluate f twice386_ff_add(t,r,s);387_ff_square(t0,t); _ff_addto(t0,f[1]); _ff_multadd(t1,t0,t,f[0]);388if ( ! _ff_zero(t1) ) {389ff_mult(s,s,_ff_cbrt_unity); // try second s390_ff_add(t,r,s);391_ff_square(t0,t); _ff_addto(t0,f[1]); _ff_multadd(t1,t0,t,f[0]);392if ( ! _ff_zero(t1) ) ff_mult(s,s,_ff_cbrt_unity); // must be the third s393}394_ff_add(roots[0],r,s); // third choice must work since first two didn't395_ff_square(t0,_ff_cbrt_unity); // could cache this in ff.c396_ff_mult(t1,r,_ff_cbrt_unity);397_ff_mult(t2,s,t0);398_ff_add(roots[1],t1,t2); // second root is wr+w^2s where w is a primitive cbrt of unity399_ff_mult(t1,r,t0);400_ff_mult(t2,s,_ff_cbrt_unity);401_ff_add(roots[2],t1,t2); // third root is wr^2+ws402return 3;403}404} else { // p=2mod3405if ( sts ) { // p=2mod3 => -1/3 not a QR => (-D/3 a QR => D not a QR => f has an even # of factors (by Stickleberger))406if ( ! roots ) return 1; // if we only need the # of factors, we are done.407ff_mult(d,d,t2); // d = sqrt(t)/6 = sqrt(f1^3/27+f0^2/4) (this is the square root in Cardano's formula for depressed cubics)408_ff_add(t,a,d); // t = -f0/2 + d (first cube)409ff_cbrt(&r,&t); // p is 2 mod 3 so we know there is exactly 1 cube root410_ff_sub(t,a,d); // t = -f0/2 - d (second cube)411ff_cbrt(&s,&t);412_ff_add(t,r,s); // there was no choice of cuberoots (and changing sign of d doesn't change r+s=t), so this has got to be the root413_ff_set(roots[0],t);414return 1;415} else { // in this case t = -D/3 is not a QR, D=d^2 is a QR, and we know that f has 1 or 3 factors416// for p=2mod3, handle roots=NULL separately, since we can be much more efficient417if ( ! roots ) {418_ff_square(t0,t2); // 1/36419_ff_mult(s,t,t0); // s = -D/108 is a not a QR, we will now work in the field F_p^2 = F_p[z]/(z^2-s)420ff_poly_xpan_mod_d2(g,a,(_ff_p+1)/3,&s); // (z+a)^((p+1)/3) is in F_p iff (z+a)^((p+1)(p-1)/3)=1 iff (z+a) is a cubic residue421return ( _ff_zero(g[1]) ? 3 : 0 );422}423424// in this case we will be slightly less efficient and work in the standard basis for F_p^2=F_p[z](z^2-s) because we425// want don't want to have to compute a new generator of the Sylow 3-subgroup in F_p^2 every time426_ff_set(v[0],a); _ff_mult(v[1],d,t2); // v is the element of F_p^2 (in the standard basis) whose cuberoot we need427// this code is slightly inefficient, but we are trying to compute h=v^((p+1)/3) as quickly as possible (if h is not in F_p, f is irreducible and we are done)428// while also saving values we will need to compute a cube root of v when f splits429if ( _ff_p3_m1mod3 ){ k=1; ff2_set(vk,v); }430else { k=2; ff2_square(vk,v); } // vk = v^k431ff2_exp_ui(g,v,(_ff_p3_m-k)/3); // g = v^((m-k)/3)432ff2_square(h,g); ff2_mult(h,h,g); // h = v^(m-k)433ff2_mult(h,h,vk); // h = v^m434if ( _ff_p3_e > 1 ) {435ff2_exp_ui(w,h,(_ff_p+1)/(3*_ff_p3_m)-1); // w = v^((3^(e-1)-1)m) = v^(((p+1)-3m)/3)436_ff_mult(t1,w[1],h[0]); _ff_mult(t2,w[0],h[1]); // compute z coefficient of w*h437_ff_add(t0,t1,t2);438if ( ! _ff_zero(t0) ) return 0; // if w*h=v^((p+1)/3) is not in F_p, then v^((p^2-1)/3)=h^(p-1) is not 1 and v is not a cube, and o.w. it is439} else {440if ( ! _ff_zero(h[1]) ) return 0; // here h=v^m=v^((p+1)/3) since e=1441}442// We now know v is a cube and there are three roots443if ( ! roots ) return 3;444ff2_norm(&t,g); // t = N(g)=g^(p+1)=v^((p+1)(m-k)/3) where k=m mod 3445ff2_scalar_mult(g,t,g); // g = v^((p+2)(m-k)/3)446// When e==1, life is pretty simple: we know v is a cube, and there is only one cube in the Sylow 3-subgroup, name 1, and its inverse cube root is 1447// It follows that we can assume v^((p-1)m/3)=v^((p-2)m/3)=1.448if ( _ff_p3_e == 1 ) {449if ( k==2 ) {450ff2_mult(g,g,h); // g = v^((p+2)(m-2)/3+m)=v^((pm+2m-2p-4+3m)/3)=v^((pm+2m-2(3m-1)-4+3m)/3)=v^(((p-1)m-2)/3)451} else { // note that e=1 and k=1 implies g=v^((p+1)(m-1)/3) = v^(((p-1)m-1)/3)452ff2_square(g,g); // g=v^((2(p-1)m-2)/3)453}454ff2_mult(g,g,v); // g = v^(((3-k)(p-1)m+1)/3) = v^(1/3}455ff2_trace(roots,g); // Every choice of cube root of v corresponds to a root of f, so we can just take the trace456ff2_setup_cbrt(); // we still need to get the cube root of unity (this is annoying)457ff2_mult(g,g,_ff2_cbrt_unity);458ff2_trace(roots+1,g);459ff2_mult(g,g,_ff2_cbrt_unity);460ff2_trace(roots+2,g);461return 3;462}463// now we have e>1464if ( k==2 ) { ff2_square(w,w); ff2_mult(w,w,h); } // w = v^((k*3^(e-1)-1)m)465ff2_mult(g,g,w); // g = v^(((p-1)m-k)/3) (note that (p+2)(m-k)/3 + (k*3^(e-1)-1)m = (pm+2m-kp-2k+k3^em-3m)/3466// = (pm-m-kp-2k+k(p+1))/3=((p-1)m-k)/3467ff2_square(h,g); ff2_mult(h,h,g); // h = v^((p-1)m-k)468ff2_mult(h,h,vk); // h = v^((p-1)m) is in the 3-Sylow subgroup, and it must be a cubic residue469if ( ! ff2_3Sylow_invcbrt(h,h) ) { // h = v^(-(p-1)m)/3) if e=1470printf ("Impossible situation in ff_poly_factors_g1 with p=%ld,m=%ld: v^((p-1)m)=(%ldz+%ld) is not a cube in 3-Sylow of F_p^2=F_p[z]/(z^2-%ld) ",471_ff_p, _ff_p3_m, _ff_get_ui(a), _ff_get_ui(h[1]), _ff_get_ui(h[0]), _ff_get_ui(s));472ff_poly_print(f,3);473exit (0);474}475if ( k==1 ) {476ff2_square(h,h); // h = v^(-2(p-1)m/3)477ff2_square(g,g); // g = v^(2((p-1)m-1)/3)478ff2_mult(g,g,v); // g = v^(2((p-1)m-1)/3+1) = v^((2(p-1)m+1)/3479} else {480ff2_mult(g,g,v); // g = v^((p-1)m-2)/3+1 = v^((p-1)m+1/3481}482ff2_mult(g,g,h); // g = v^(1/3)483ff2_trace(roots,g); // Every choice of cube root of v corresponds to a root of f, so we can just take the trace of each484ff2_mult(g,g,_ff2_cbrt_unity);485ff2_trace(roots+1,g);486ff2_mult(g,g,_ff2_cbrt_unity);487ff2_trace(roots+2,g);488return 3;489}490}491puts ("Unreachable code in ff_poly_factors_g1!");492exit (0);493}494495// Compute roots of poly of degree 2, or less496int ff_poly_roots_d2 (ff_t r[2], ff_t f[3], int d_f)497{498ff_t t1,t2,D;499500switch (d_f) {501case 0: return 0;502case 1:503if ( _ff_one(f[1]) ) { _ff_set_one(t1); } else { _ff_invert(t1,f[1]); }504ff_negate(t1); _ff_mult(r[0],t1,f[0]); return 1;505case 2:506_ff_square(t1,f[1]);507_ff_mult(t2,f[2],f[0]);508_ff_x2(t2); _ff_x2(t2);509_ff_subfrom(t1,t2);510if ( ! ff_sqrt(&D,&t1) ) return 0;511if ( _ff_one(f[2]) ) {512_ff_set(t1,_ff_half);513} else {514_ff_add(t2,f[2],f[2]);515_ff_invert(t1,t2);516}517_ff_sub(t2,D,f[1]);518_ff_mult(r[0],t1,t2);519ff_negate(D);520_ff_sub(t2,D,f[1]);521_ff_mult(r[1],t1,t2);522return 2;523}524}525526527/*528ff_poly_g1_3tor computes the size of the 3-torsion subgroup of y^2=f(x) = x^3+ax+b.529If the 3-torsion is trivial, it will instead attempt to determine the 3-torsion subgroup of the twist, and if it is non-trivial530will return a negative value whose absolute value is the size of the 3-torsion subgroup in the twist.531532This means that a return value of 1 indicates that neither the curve nor its twist have non-trivial 3-torsion.533In this situation, if p=1mod3, then a_p=0mod3 and the group order must be congruent to 2 mod 3.534535If the curve is singular (discriminant zero), 0 is returned.536*/537int ff_poly_g1_3tor (ff_t f[4])538{539ff_t div[5],r[4],g[4],t0,t1, t2, d;540int j, k, tor;541542#if FF_WORDS > 1543err_printf ("ff_poly_g1_3tor only supports single word finite fields\n"); exit (0);544#endif545546if ( _ff_p == 3 ) { if ( _ff_zero(f[1]) ) return 0; else return 1; } // no curve y^2=x^3+ax+b over F_3 has 3 torsion547548_ff_square(t1,f[0]); _ff_set_ui(t2,27); _ff_mult(t0,t1,t2); // t0 = 27f0^2549_ff_square(t2,f[1]); _ff_mult(t1,t2,f[1]); _ff_x2(t1); _ff_x2(t1); // t1 = 4f1^3, t2 = f1^2550_ff_add(d, t0, t1); // d = 4f1^3+27f0^2 = -discriminant of f551if ( _ff_zero(d) ) return 0;552_ff_add(t0,_ff_third, _ff_third); _ff_x2(t0); _ff_square(t1,t0); // t0 = (4/3)^2 = 16/9553ff_mult(d,d,t0); // d^2 = 256/81*D_f^2 = -D/3 where D is discriminant of the 3-div poly554555ff_negate(t2);556ff_mult(div[0],_ff_third,t2);557_ff_add(div[1],f[0],f[0]); _ff_x2(div[1]);558_ff_add(div[2],f[1],f[1]);559_ff_set_zero(div[3]);560_ff_set_one(div[4]);561// 3-division poly is x^4 + 2f1x^2 +4f0x - f1^2/3562563// handle p=2mod3 ourselves since we can easily save some time. note that in this case, either both the curve and its twist have non-trivial 3-torsion, or neither do.564// we also know that the 3-rank is 1 and there can be at most one root of the 3-division poly (2 pts of order 3).565if ( ! _ff_p1mod3 ) {566if ( _ff_zero(div[1]) ) { // biquadratic case is handled quickly by ff_poly_roots567k = _ff_poly_roots_d4(r,div,&d);568return ( k ? 3 : 1);569} else {570ff_depressed_cubic_resolvent(&t0,g,div);571k = _ff_poly_roots_d3(r,g,&d);572if ( k != 1 ) { printf ("p=%ld is 2mod3 but cubic resolvent of 3div poly has k=%d!=1 roots!\n f=", _ff_p, k); ff_poly_print(f,3); exit(0); }573}574_ff_sub(t1,t0,r[0]);575return ( ff_residue(t1) ? 3 : 1);576}577578// Now handle p=1mod3 by computing the roots of the 3-division polynomial (we actually only need to know the number of roots and the value of one of them)579k = _ff_poly_roots_d4(r,div,&d);580if ( ! k ) return 1;581_ff_square(t1,r[0]); _ff_addto(t1,f[1]); _ff_mult(t2,t1,r[0]); _ff_addto(t2,f[0]); // t2 = f(r[0])582if ( _ff_zero(t2) ) { printf ("impossible - non-id point (%lu,0) with 2-tor and 3-tor!\n", _ff_get_ui(r[j])); ff_poly_print(f,3); ff_poly_print(div,4); exit(0); }583584// We now rely on the fact that the 3-division poly of the curve and its twist are intimately related. They have the same factorization pattern,585// and either all the roots of the 3-division poly correspond to points on the curve, or none of them do, so we only need to check one.586// In the latter case, it must be that all the roots of the 3-division poly of the twist correspond to points on the twist.587if ( ff_residue(t2) ) return ( k==1?3:9); else return (k==1?-3:-9);588}589590#if SMALLJAC_GENUS==2591int ff_poly_g2_3tor(ff_t f[6])592{593ff_t g[POLY_G2TOR3_DEGREE+1], h[POLY_G2TOR3_DEGREE], r[POLY_G2TOR3_DEGREE];594int k, d_h, d_r;595596ff_poly_g2tor3_modpoly(g,f);597598// first make sure modular poly is square free599ff_poly_derivative (h,&d_h,g,POLY_G2TOR3_DEGREE);600ff_poly_gcd (r,&d_r,g,POLY_G2TOR3_DEGREE,h,d_h);601if ( d_r > 0 ) return 0;602603k = ff_poly_roots(g,40);604if ( ! k ) return 1;605if ( k ==2 ||k==5 ||k==8 ) return 3;606return 0;607}608#endif609610/*611Given a point (x,y) on y^2=f(x)=x^3+f1x+f0, replaces (x,y) with a point (u,v) s.t. (u,v) composed with itself yields (x,y) or its inverse (we don't distinguish these cases!)612Returns 1 for success, 0 if no such point exists.613*/614int ff_poly_g1_halve (ff_t *x, ff_t *y, ff_t f[4])615{616ff_t g[5], r[4];617register ff_t t0, t1, t2;618int i, k;619620// construct g(z) = z^4 - 2(f1+3x^2)z^2 - 8y^2z + f1^2 - 3x(y^2+f1x + 3f0). If (u,v)*(u,v)=(x,y) then g(u-x)=0 (note translation by x to kill degree 3 coeff of g).621_ff_set_one(g[4]); _ff_set_zero(g[3]);622_ff_square(t0,*x); _ff_add(t1,t0,t0); _ff_addto(t1,t0); _ff_addto(t1,f[1]); _ff_x2(t1); _ff_neg(g[2],t1); // g[2] = -2(f1+3x^2)623_ff_add(t0,*y,*y); _ff_square(t1,t0); _ff_x2(t1); _ff_neg(g[1],t1); // g[1] = -8y^2624_ff_square(t0,*y); _ff_mult(t1,f[1],*x); _ff_addto(t0,t1); _ff_add(t1,f[0],f[0]); _ff_addto(t1,f[0]); _ff_addto(t0,t1); // t0 = y^2 + f1x + 3f0625_ff_add(t1,*x,*x); _ff_addto(t1,*x); _ff_mult(t2,t0,t1); _ff_square(t1,f[1]); _ff_sub(g[0],t1,t2); // g[0] = f1^2 - 3x(y^2+f1x+3f0)626//printf ("(%d,%d) havling poly: ", _ff_get_ui(*x), _ff_get_ui(*y)); ff_poly_print(g,4);627628k = ff_poly_roots_d4 (r, g);629//printf ("%d roots\n", k);630for ( i = 0 ; i < k ; i++ ) {631_ff_add(t2,r[i],*x);632_ff_square(t0,t2); _ff_addto(t0,f[1]); _ff_mult(t1,t0,t2); _ff_add(g[1],t1,f[0]); // g[1] = f(r[i]+x)633if ( ff_sqrt(g,g+1) ) break;634}635if ( i == k ) return 0;636//printf ("found (%d,%d)^2 = (%d,%d)\n", _ff_get_ui(t2), _ff_get_ui(g[0]), _ff_get_ui(*x), _ff_get_ui(*y));637_ff_set(*x,t2); _ff_set(*y,*g);638return 1;639}640641642/*643Returns the size of the 2-Sylow subgroup of the elliptic curve y^2=f(x)=x^3+f1x+f0, provided it is cyclic.644Otherwise the return value is -1, which indicates that Z/2Z x Z/2Z is a subgroup (and the group order is divisible by 4)645(we could compute the entire 2-Sylow in this case, but it would be much more time consuming)646*/647int ff_poly_g1_2Sylow (ff_t f[4])648{649ff_t r[3];650ff_t x, y;651int n;652653n = ff_poly_roots_d3(r,f);654if ( ! n ) return 1;655if ( n > 1) return 0;656657_ff_set(x,r[0]); _ff_set_zero(y);658for ( n = 2 ; ff_poly_g1_halve (&x, &y, f) ; n<<= 1 );659return n;660}661662663/*664Computes the order and rank of the 4-torsion subgroup of the elliptic curve y^2=f(x)=x^3+f1x+f0.665Set o to the order and returns d to indicate the group Z/dZ x Z/eZ where d*e = o, d divides e (and may be 1)666667If flag8 is set, also check for Z/8Z (for a random curve with full Galois image in GL(2,Z/8Z), these occurs with probability 1/8).668We could also check for Z/2ZxZ/8Z, or even Z/4ZxZ/8Z or Z/8ZxZ/8Z (these occur with prob. 3/64, 3/512 and 1/1536 resp);669670The latter two are rare enough not to be worth the trouble, but Z/2xZ/8Z might be worth doing. Note, however, that Z/2xZ/4Z671contains 4 points of order 4 (two pairs of inverses) and we would need to compute two of these.672*/673int ff_poly_g1_4tor (int *o, ff_t f[4], int flag8)674{675ff_t r[3];676ff_t u, v, v2;677register ff_t t0,t1,t2;678register int i, n;679680n = ff_poly_roots_d3(r,f);681for ( i = 0 ; i < n ; i++ ) {682_ff_square(t0,r[i]); _ff_add(t1,t0,t0); _ff_addto(t1,t0); _ff_add(u,t1,f[1]); // u = 3x^2+a where (x,0) is a 2-torsion point (x=r[i] was a root of f)683if ( ff_sqrt(&v,&u) ) { // v is a root of the translated halving poly684_ff_add(t2,r[i],v); // t2=x+v is a root of the halving poly685_ff_square(t0,t2); _ff_addto(t0,f[1]); _ff_mult(t1,t0,t2); _ff_add(u,t1,f[0]); // u = f(t2)686if ( ff_sqrt(&v,&u) ) break; // if successful, (t2,v) is a point of order 4687_ff_sub(t2,r[i],v); // t2=x-u is also a root of the halving poly688_ff_square(t0,t2); _ff_addto(t0,f[1]); _ff_mult(t1,t0,t2); _ff_add(u,t1,f[0]); // u = f(t2)689if ( ff_sqrt(&v,&u) ) break; // if successful, (t2,v) is a point of order 4690}691}692if ( i == n ) { *o = n+1; return (n==3?2:1); } // no pts of order 4, so 4-torsion subgroup = 2-torsion subgroup693//printf ("%ld: found point(%ld,%ld) with order 4 (i=%d,n=%d) on curve y^2 = ", _ff_p, _ff_get_ui(t2), _ff_get_ui(v), i, n); ff_poly_print(f,3);694if ( n==1 ) { // rank 1 case, with a point of order 4695if ( ! flag8 ) { *o = 4; return 1; }696//printf("%ld: flag8 set (Z/4Z), attempting to halve order 4 point (%ld,%ld) on curve y^2 = ", _ff_p, _ff_get_ui(t2), _ff_get_ui(v)); ff_poly_print(f,3);697_ff_set(u,t2);698*o = ( ff_poly_g1_halve(&u,&v,f) ? 8 : 4 ); // check for a point of order 8 (this takes us outside the 4-torsion subgroup)699//if ( *o == 8 ) puts ("Succeeded"); else puts ("Failed");700return 1;701}702if ( i > 0 ) { *o = 8; return 2; }703704// We are here if the 2-rank is 2 and we succesfully halved the first point of order 2 that we tried. We need to check one more to distinguish Z/2Z x Z/4Z from Z/4Z x Z/4Z.705// Don't change v or t2 from above, we may need them below706i = 1;707_ff_square(t0,r[i]); _ff_add(t1,t0,t0); _ff_addto(t1,t0); _ff_add(u,t1,f[1]); // u = 3x^2+a where (x,0) is a 2-torsion point (x=r[0] was a root of f)708if ( ff_sqrt(&v2,&u) ) { // v2 is a root of the translated halving poly709_ff_add(t1,r[i],v2); // t1is a root of the halving poly710_ff_square(t0,t1); _ff_addto(t0,f[1]); ff_mult(t1,t1,t0); _ff_addto(t1,f[0]); // t1 = f(t1)711if ( ff_residue(t1) ) { *o = 16; return 4; }712}713*o = 8;714return 2;715}716717/*718Computes the roots of f(x)=x^4+ax^2+bx+c, solving by radicals.719Handles singular f, will return repeated roots correctly.720721Currently r is required. We could easily extend to make it optional and only return a count.722The parameter pd is optional. If specified it points to sqrt(-D/3) (used for 3-torsion).723*/724int _ff_poly_roots_d4 (ff_t r[4], ff_t f[5], ff_t *pd)725{726ff_t g[4], h[3], s[3], x, y, t;727register ff_t t0,t1,t2,w1,w2;728register int i,k,res;729730if ( _ff_zero(f[1]) ) {731// f = x^4+f2x^2+f0 = g(x^2) where g is monic quadratic732_ff_set_one(g[2]); _ff_set(g[1],f[2]); _ff_set(g[0],f[0]);733k = ff_poly_roots_d2(s,g,2);734if ( ! k ) return 0;735if ( ff_sqrt(r,s) ) {736_ff_neg(r[1],r[0]);737// could optimize for double root here, but why bother738if ( ff_sqrt(r+2,s+1) ) { _ff_neg(r[3],r[2]); return 4; }739return 2;740}741if ( ff_sqrt(r,s+1) ) { _ff_neg(r[1],r[0]); return 2; }742return 0;743}744ff_depressed_cubic_resolvent(&t,g,f);745k = _ff_poly_roots_d3(r,g,pd); // put the roots of g into r for now, they will get replaced by roots of f below.746if ( k ) {747res = 0;748for ( i = 0 ; i < k ; i++ ) {749// ff_poly_eval (&y, g, 3, r+i);750// if ( ! _ff_zero(y) ) { printf ("(%d roots) %ld is not a root of ", k, _ff_get_ui(r[i])); ff_poly_print(g,3); exit(0); }751_ff_sub(x,r[i],t);752ff_negate(x);753if ( ! ff_sqrt(s+i,&x) ) break;754if ( ++res == 2 ) break;755}756if ( res==2 ) {757_ff_mult(t1,s[0],s[1]);758ff_negate(t1);759_ff_invert(t0,t1);760_ff_mult(s[2],t0,f[1]);761_ff_square(t1,s[2]);762_ff_sub(x,r[2],t);763ff_negate(x);764if ( ! _ff_equal(t1,x) ) { printf("%ld: division failed s[0]=%d, s[1]=%d, d=%d\n", _ff_p, _ff_get_ui(s[0]), _ff_get_ui(s[1]), _ff_get_ui(t0)); ff_poly_print(f,4); ff_poly_print(g,3); exit(0); }765_ff_add(t1,s[0],s[1]); _ff_addto(t1,s[2]); ff_mult(t1,t1,_ff_half);766_ff_set(r[0],t1);767ff_negate(s[0]); ff_negate(s[2]);768_ff_add(t1,s[0],s[1]); _ff_addto(t1,s[2]); ff_mult(t1,t1,_ff_half);769_ff_set(r[1],t1);770ff_negate(s[0]); ff_negate(s[1]);771_ff_add(t1,s[0],s[1]); _ff_addto(t1,s[2]); ff_mult(t1,t1,_ff_half);772_ff_set(r[2],t1);773ff_negate(s[0]); ff_negate(s[2]);774_ff_add(t1,s[0],s[1]); _ff_addto(t1,s[2]); ff_mult(t1,t1,_ff_half);775_ff_set(r[3],t1);776return 4;777} else if ( k==1 && res==1 ) {778_ff_set_one(h[2]);779_ff_set(h[1],s[0]);780_ff_square(t1,s[0]); // t1 = s^2781_ff_add(w1,t1,f[2]);782ff_mult(w1,w1,_ff_half); // w1 = (s^2+f2)/2783_ff_square(t0,f[2]);784_ff_add(t2,f[0], f[0]); _ff_x2(t2); // t2 = 4f0785_ff_subfrom(t0,t2); // t0 = f2^2-4f0786_ff_add(t2,f[2],f[2]); // t2 = 2f2787_ff_add(w2,t1,t2); // w2 = s^2+2f2788ff_mult(w2,w2,t1); // w2 = s^4+2f2s^2789_ff_addto(w2,t0);790ff_mult(w2,w2,s[0]); // w2 = s(s^4+2f2s^2+f2^2-4f0)791_ff_invert(t0,f[1]);792ff_mult(t1,_ff_half,t0);793ff_mult(w2,w2,t1);794_ff_sub(h[0],w1,w2);795if ( ff_poly_roots_d2(r,h,2) != 2 ) {796ff_negate(h[1]);797_ff_add(h[0],w1,w2);798if ( ff_poly_roots_d2(r,h,2) != 2 ) { printf("%ld: fail 2-1-1 split with s=%ld\n", _ff_p, _ff_get_ui(s[0])); ff_poly_print(f,4); ff_poly_print(g,3); ff_poly_print(h,2); exit(0); }799}800return 2;801} else {802return 0;803}804} else {805// this is the annoying case, we know we have a 1,3 split, but getting the value of the root is expensive806// we need sqrt(-z) in F_p[z]/h(z) where h(z) is the undepressed resolvent807// so we want sqrt(t-z) in F_p[z]/g(z) which is sqrt(z+t) in F_p[z]/k(z) where k(z)=-g(-z)=z^3+g1z-g0z. In terms of z^3-rz-s, r=-g1 and s=g0808_ff_neg(t1,g[1]);809if ( ff3_trsqrt_zpa_mod_rs(&x,t,t1,g[0]) ) res++; else { printf ("%ld: impossible failure of ff3_trsqrt_zpa_mod_rs in ff_poly_roots_d4!", _ff_p); ff_poly_print(f,4); ff_poly_print(g,3); exit(0); }810ff_mult(x,x,_ff_half);811ff_poly_eval(&y,f,4,&x);812if ( ! _ff_zero(y) ) ff_negate(x);813_ff_set(r[0],x);814return 1;815}816puts ("Unreachable code in ff_poly_factors_g1!");817exit (0);818}819820/*821Hard-wired code for computing the roots of f(x)=x^4+ax^2+bx+c. This is used822primarily for testing for 3-torsion by computing roots of the 3-division poly (made monic).823If f is singular, it only returns distinct roots.824825It is generally very fast, although when f splits completely more work is required.826*/827int ff_old_poly_roots_d4 (ff_t r[4], ff_t f[5])828{829_ff_t_declare g[5], h[5], A[5], t, a;830int i,j,k,d_g, d_h, d_A;831832#if FF_WORDS > 1833err_printf ("ff_poly_roots_d4 only implemented for single word fields\n"); exit (0);834#endif835ff_poly_xn_mod_d4 (g, _ff_p, f); // compute x^p mod f836_ff_set_one(t);837_ff_subfrom (g[1],t); // x^p-x mod f838for ( d_g = 3 ; d_g >= 0 && _ff_zero(g[d_g]) ; d_g-- );839ff_poly_gcd (h, &d_h, g, d_g, f, 4);840switch (d_h) {841case 0: return 0;842case 1:843case 2: return ff_poly_roots_d2 (r, h, d_h);844case 3:845case 4:846ff_poly_monic (A, &d_A, h, d_h);847i = j = k = 0;848_ff_set_zero(t);849do {850// if A was just made cubic, it may need to be translated to make x^2 coeff zero851// if this is done, we remember the translation so we can shift roots back at the end852if ( d_A == 3 && ! _ff_zero(A[2]) ) { j = i; ff_depress_cubic(&t, A); }853if ( k ) ff_random(&a); // use a=0 first time through854if ( d_A == 4 ) {855if ( !k ) {856ff_poly_xn_mod_d4 (g, (_ff_p-1)/2, A);857} else {858ff_poly_xpan_mod_d4 (g, a, (_ff_p-1)/2, A);859}860_ff_dec (g[0]);861for ( d_g = 3 ; d_g >= 0 && _ff_zero(g[d_g]) ; d_g-- );862} else {863if ( !k ) {864ff3_zn_mod (g, (_ff_p-1)/2, A);865} else {866ff_poly_xpan_mod_d3 (g, a, (_ff_p-1)/2, A);867}868_ff_dec (g[0]);869for ( d_g = 2 ; d_g >= 0 && _ff_zero(g[d_g]) ; d_g-- );870}871ff_poly_gcd (h, &d_h, g, d_g, A, d_A);872if ( d_h > 0 && d_h < d_A ) {873ff_poly_div (A, &d_A, g, &d_g, A, d_A, h, d_h);874if ( d_g != -1 ) { err_printf ("inexact poly division in ff_poly_roots_d4!\n"); exit(0); }875if ( d_h < 3 ) {876i += ff_poly_roots_d2 (r+i,h,d_h);877ff_poly_copy (h,&d_h,A,d_A);878ff_poly_monic (A,&d_A,h,d_h);879} else {880i += ff_poly_roots_d2 (r+i,A,d_A);881ff_poly_monic (A,&d_A,h,d_h);882}883}884k++;885} while ( d_A > 2 );886i += ff_poly_roots_d2 (r+i,A,d_A);887// shift any roots found after translation occurred (if it did)888if ( ! _ff_zero(t) ) while ( j < i ) { _ff_subfrom(r[j],t); j++; }889return i;890}891return d_h;892}893894895void ff_poly_xn_mod_d4 (ff_t g[4], unsigned long n, ff_t f[5])896{897_ff_t_declare_reg t1,t2,t3,nf0,f02,f12,f22,w1,w2;898register unsigned long j, m;899register int i;900901for ( i = 0 ; i < 4 ; i++ ) _ff_set_zero(g[i]);902if ( !n ) { _ff_set_one(g[0]); return; }903i = _asm_highbit(n);904if ( i&1 ) i--;905m = 3UL<<i;906j = (n&m)>>i;907_ff_set_one(g[j]);908m >>= 2; i -= 2;909if ( m > 1 ) {910_ff_neg(nf0,f[0]);911// these won't get used if no digits of n are 3, we could check for this...912_ff_mult(f02,f[0],f[2]);913_ff_mult(f12,f[1],f[2]);914_ff_square(f22,f[2]);915}916while (m) {917ff_poly_square_mod_d4 (g,g,f);918ff_poly_square_mod_d4 (g,g,f);919j = ((n&m)>>i);920switch (j) {921case 1: // 3M+2A922_ff_set(t3,g[3]);923_ff_set(g[3],g[2]);924_ff_mult(w2,f[2],t3);925_ff_sub(g[2],g[1],w2);926_ff_mult(w2,f[1],t3);927_ff_sub(g[1],g[0],w2);928_ff_mult(g[0],t3,nf0);929break;930case 2: // 6M+3A931_ff_set(t3,g[3]);932_ff_set(t2,g[2]);933_ff_mult(w1,f[2],t3);934_ff_sub(g[3],g[1],w1);935_ff_mult(w1,f[2],t2);936_ff_mult(w2,f[1],t3);937_ff_addto(w2,w1);938_ff_sub(g[2],g[0],w2);939_ff_mult(w1,t3,nf0);940_ff_mult(w2,f[1],t2);941_ff_sub(g[1],w1,w2);942_ff_mult(g[0],t2,nf0);943break;944case 3: // 9M+8A945_ff_set(t3,g[3]);946_ff_set(t2,g[2]);947_ff_set(t1,g[1]);948_ff_mult(w1,t2,f[2]);949_ff_mult(w2,t3,f[1]);950_ff_addto(w1,w2);951_ff_sub(g[3],g[0],w1);952_ff_sub(w2,f22,f[0]);953_ff_mult(w1,t3,w2);954_ff_mult(w2,t1,f[2]);955_ff_subfrom(w1,w2);956_ff_mult(w2,t2,f[1]);957_ff_sub(g[2],w1,w2);958_ff_mult(w1,t1,f[1]);959_ff_mult(w2,t2,f[0]);960_ff_addto(w1,w2);961_ff_mult(w2,t3,f12);962_ff_sub(g[1],w2,w1);963_ff_mult(w1,t3,f02);964_ff_mult(w2,t1,f[0]);965_ff_sub(g[0],w1,w2);966}967m >>= 2; i -= 2;968}969}970971// Computes (x+a)^n modulo x^2-f0 (note the sign). Assumes n < 2^63972void ff_poly_xpan_mod_d2 (ff_t g[2], ff_t a, unsigned long n, ff_t f[1])973{974_ff_t_declare_reg t0,t1,t2,t3,s0,s1;975register unsigned long m;976int i, j;977978if ( ! n ) { _ff_set_zero(g[1]); _ff_set_one(g[0]); return;}979_ff_set_one(g[1]); _ff_set(g[0],a);980if ( n==1 ) return;981i = _asm_highbit(n)-1;982m = 1UL<<i;983_ff_add(s0,a,a);984_ff_add(s1,f[0],f[0]);985while (m) {986j = ((n&m)>>i);987if ( j ) {988// square g and multiply by (x+a) to get (g0^2+g1^2f0+a*2g0g1)*x + a(g0^2+g1^2f0) + 2g0g1f0989_ff_square(t0,g[0]); // t0=g0^2990_ff_square(t1,g[1]); // t1=g1^2991_ff_mult(t2,f[0],t1); // t2 = g1^2f0992_ff_add(t1,t0,t2); // t1 = g0^2+g1^2f0993_ff_mult(t0,g[0],g[1]); // t0 = g0g1994_ff_mult(t2,a,t1); // t2 = a(g0^2+g1^2f0)995_ff_mult(t3,s1,t0); // t3 = 2f0g0g1996_ff_add(g[0],t2,t3); // g0 = a(g0^2+g1^2f0) + 2g0g1f0997_ff_mult(t2,t0,s0); // t2 = 2ag0g1998_ff_add(g[1],t1,t2); // g1 = g0^2+g1^2f0+a*2g0g1999// 7M + 3A1000} else {1001// square g mod f to get 2g0g1*x + (g0^2+g1^2f0)1002_ff_square(t0, g[0]); _ff_square(t1, g[1]);1003_ff_mult(t2,g[0],g[1]);1004_ff_add(g[1],t2,t2);1005_ff_mult(t2,t1,f[0]);1006_ff_add(g[0],t0,t2);1007// 4M + 2A1008}1009m >>= 1; i -= 1;1010}1011}10121013// Computes (x+a)^n modulo a cubic of the form x^3+f1x+f0. Assumes n < 2^631014void ff_poly_xpan_mod_d3 (ff_t g[3], ff_t a, unsigned long n, ff_t f[4])1015{1016_ff_t_declare_reg w1,w2,t2;1017register unsigned long m;1018register int i, j;10191020if ( ! n ) { _ff_set_zero(g[1]); _ff_set_one(g[0]); return;}1021_ff_set_zero(g[2]); _ff_set_one(g[1]); _ff_set(g[0],a);1022if ( n==1 ) return;1023i = _asm_highbit(n)-1;1024m = 1UL<<i;1025while (m) {1026ff_poly_square_mod_d3 (g,g,f);1027j = ((n&m)>>i);1028if ( j ) { // 5M+4A1029_ff_set(t2,g[2]);1030_ff_mult(w1,t2,a);1031_ff_add(g[2],g[1],w1);1032_ff_mult(w1,g[1],a);1033_ff_mult(w2,t2,f[1]);1034_ff_subfrom(w1,w2);1035_ff_add(g[1],g[0],w1);1036_ff_mult(w1,g[0],a);1037_ff_mult(w2,t2,f[0]);1038_ff_sub(g[0],w1,w2);1039}1040m >>= 1; i -= 1;1041}1042}10431044// squares a quadratic (arbitrary coeff) modulo a cubic of the form x^3+f1x+f01045// no initialization or validation is done, overlap is ok.1046void ff_poly_square_mod_d3 (ff_t w[3], ff_t u[3], ff_t f[4])1047{1048_ff_t_declare_reg t0,t1,s1,s2, w1, w2;10491050_ff_mult(t0,u[2],f[0]);1051_ff_mult(t1,u[2],f[1]);1052_ff_mult(s1,t0,u[2]);1053_ff_add(s2,u[1],u[1]);10541055_ff_add(w1,u[0],u[0]);1056_ff_subfrom(w1,t1);1057_ff_mult(w2,w1,u[2]);1058_ff_square(w1,u[1]);1059_ff_add(w[2],w1,w2); // w2 = u1^2+(2u0-f1u2)u2 = 2u0u2+u1^2 - f1u2^210601061_ff_sub(w1,u[0],t1);1062_ff_mult(w2,w1,s2);1063_ff_sub(u[1],w2,s1); // w1 = 2u1(u0-f1u2)-(f0u2)u2) = 2u0u2+u1^2 - f1u2^210641065_ff_square(w1,u[0]);1066_ff_mult(w2,s2,t0);1067_ff_sub(u[0],w1,w2); // w0 = u0^2-(2u1)(f0u2)1068// 8M+7A1069}10701071// Computes (x+a)^n modulo a quartic of the form x^4+f2x^2+f1x+f0. Assumes n < 2^631072void ff_poly_xpan_mod_d4 (ff_t g[4], ff_t a, unsigned long n, ff_t f[5])1073{1074_ff_t_declare_reg w1,w2,t3;1075ff_t h[4];1076register unsigned long m;1077int i, j;10781079if ( ! n ) { _ff_set_zero(g[1]); _ff_set_one(g[0]); return; }1080_ff_set_zero(g[3]); _ff_set_zero(g[2]); _ff_set_one(g[1]); _ff_set(g[0],a);1081if ( n==1 ) return;1082i = _asm_highbit(n)-1;1083m = 1UL<<i;1084while (m) {1085ff_poly_square_mod_d4 (g,g,f);1086j = ((n&m)>>i);1087if ( j ) {1088_ff_set(t3,g[3]);1089_ff_mult(w1,t3,a);1090_ff_add(g[3],g[2],w1);1091_ff_mult(w1,g[2],a);1092_ff_mult(w2,t3,f[2]);1093_ff_subfrom(w1,w2);1094_ff_add(g[2],g[1],w1);1095_ff_mult(w1,g[1],a);1096_ff_mult(w2,t3,f[1]);1097_ff_subfrom(w1,w2);1098_ff_add(g[1],g[0],w1);1099_ff_mult(w1,g[0],a);1100_ff_mult(w2,t3,f[0]);1101_ff_sub(g[0],w1,w2);1102// 7M+6A1103}1104m >>= 1; i -= 1;1105}1106}11071108// squares a cubic (arbitrary coeff) modulo a quartic of the form x^4+ax^2+bx+c1109// no initialization or validation is done, overlap of u,w is OK. Uses 18M+21A1110void ff_poly_square_mod_d4 (ff_t w[4], ff_t u[4], ff_t f[5])1111{1112_ff_t_declare_reg t0,t1,t2,s1,s2,s3,s4,s5,w1,w2,w3;11131114// compute t_i = u_3*f_i1115_ff_mult(t0,u[3],f[0]); _ff_mult(t1,u[3],f[1]); _ff_mult(t2,u[3],f[2]);1116_ff_mult(s1,u[2],f[2]); // s1=u2f21117_ff_square(s2,u[2]); // s2=u2^21118_ff_mult(s3,u[3],t0); // s3=u3^2f0 (not reused, but need to save since u[3] may get clobbered)1119_ff_mult(s4,u[2],t0);1120_ff_x2(s4); // s4 = 2u2u3f0 (not reused, but need to save since u[2] may get clobbered)1121_ff_mult(s5,u[1],u[3]);1122_ff_x2(s5); // s5 = 2u1u3 (not reused, but need to save since u[1],u[3] may get clobbered)11231124_ff_sub(w1,u[0],s1);1125_ff_x2(w1);1126_ff_subfrom(w1,t1);1127_ff_mult(w2,w1,u[3]);1128_ff_mult(w1,u[1],u[2]);1129_ff_x2(w1);1130_ff_add(w[3],w1,w2); // w3 = (2(u0-u2f2)-u3f2)u3+2u1u2 = 2*u0*u3 + 2*u1*u2 - 2*u2*u3*f2 - u3^2*f11131// note u[3] is now (potentially) clobbered11321133_ff_sub(w2,u[1],t2);1134_ff_square(w1,w2);1135_ff_sub(w2,u[0],t1);1136_ff_x2(w2);1137_ff_subfrom(w2,s1);1138_ff_mult(w3,u[2],w2);1139_ff_addto(w1,w3);1140_ff_sub(w[2],w1,s3); // w2 = (u1-u3f2)^2+u2(2(u0-u3f1)-u2f2)-u3(u3f0) = 2*u0*u2 + u1^2 - 2*u1*u3*f2 - u2^2*f2 - 2*u2*u3*f1 - u3^2*f0 + u3^2*f2^21141// note u[2] is now (potentially) clobbered11421143_ff_sub(w2,u[0],t1);1144_ff_mult(w1,u[1],w2);1145_ff_x2(w1);1146_ff_mult(w2,s2,f[1]);1147_ff_subfrom(w1,w2);1148_ff_subfrom(w1,s4);1149_ff_mult(w2,t1,t2);1150_ff_add(w[1],w1,w2); // w1 = 2u1(u0-u3f1)-u2^2f1-2u2(u3f0)+(u3f1)(u3f2) = 2*u0*u1 - 2*u1*u3*f1 - u2^2*f1 - 2*u2*u3*f0 + u3^2*f1*f21151// note u[1] is now (potentially) clobbered11521153_ff_square(w1,u[0]);1154_ff_add(w2,s5,s2);1155_ff_mult(w3,w2,f[0]);1156_ff_subfrom(w1,w3);1157_ff_mult(w2,t0,t2);1158_ff_add(w[0],w1,w2); // w0 = u0^2-f0(2u1u3+u2^2)+(u3f0)(u3f2) = u0^2 - 2*u1*u3*f0 - u2^2*f0 + u3^2*f0*f21159// 18M+21A1160}116111621163/*11641165The code below has all been superseded by more efficient routines above or in ffext.c1166Keep it around in case we need it for debugging.11671168// squares a quadratic (arbitrary coeff) modulo a cubic of the form x^3+f1x+f01169// no initialization or validation is done, overlap is ok. Uses 10M+9A1170void old_ff_poly_square_mod_d3 (ff_t w[3], ff_t u[3], ff_t f[4])1171{1172_ff_t_declare_reg t1,t3, t4, w0, w1, w2;11731174_ff_mult(t1,u[0],u[2]);1175_ff_add(w2,t1,t1);1176_ff_square(t1,u[1]);1177_ff_addto(w2,t1);1178_ff_square(t4,u[2]); // t4 = u2^21179_ff_mult(t1,f[1],t4);1180_ff_subfrom(w2,t1); // w2 = 2u0u2+u1^2 - f1u2^21181_ff_mult(t1,u[0],u[1]);1182_ff_add(w1,t1,t1);1183_ff_mult(t1,f[0],t4);1184_ff_subfrom(w1,t1);1185_ff_mult(t1,u[1],u[2]);1186_ff_add(t3,t1,t1); // t3 = 2u1u21187_ff_mult(t1,f[1],t3);1188_ff_subfrom(w1,t1); // w1 = 2u0u1 - f0u2^2 - 2f1u1u21189_ff_square(w0,u[0]);1190_ff_mult(t1,f[0],t3);1191_ff_sub(w[0],w0,t1); // w0 = u0^2 - 2u1u2f01192_ff_set(w[1],w1);1193_ff_set(w[2],w2);1194}11951196// Computes (x+a)^n modulo a cubic of the form x^3+f1x+f0. Assumes n < 2^631197void ff_poly_xpan_mod_d3 (ff_t g[3], ff_t a, unsigned long n, ff_t f[5])1198{1199_ff_t_declare p[12], *q;1200_ff_t_declare_reg t1,a2, ca;1201register unsigned long m;1202int i, d;12031204memset (p, 0, sizeof(p));1205q = p;1206_ff_set_one(q[0]); // (x+a)^0 = 11207q += 3;1208_ff_set_one(q[1]);1209_ff_set(q[0],a); // (x+a)^1 = x + a1210q += 3;1211_ff_set_one(q[2]);1212_ff_add(ca,a,a);1213_ff_set(q[1],ca);1214_ff_square(a2,a);1215_ff_set(q[0],a2); // (x+a)^2 = x^2 + 2ax + a^21216q += 3;1217_ff_addto(ca,a);1218_ff_set(q[2],ca);1219_ff_mult(t1,ca,a);1220_ff_sub(q[1],t1,f[1]);1221_ff_mult(t1,a2,a);1222_ff_sub(q[0],t1,f[0]); // (x+a)^3 = 3ax^2+(3a^2-f1)x + (a^3-f0)12231224// could move up to exp8 but we'll settle for a window size of 2 bits for now1225ff_poly_exp4_mod_d3 (g, p, n, f);1226}122712281229// Computes x^n modulo a cubic of the form x^3+ax+b. Assumes n < 2^631230void ff_poly_xn_mod_d3 (ff_t g[3], unsigned long n, ff_t f[4])1231{1232_ff_t_declare p[24], *q;1233_ff_t_declare_reg t0, t1, t3;1234register unsigned long m;1235int i, d;12361237memset (p, 0, sizeof(p));1238q = p;1239_ff_set_one(q[0]); // x^0 = 11240q += 3;1241_ff_set_one(q[1]); // x^1 = x1242q += 3;1243_ff_set_one(q[2]); // x^2 = x^21244q += 3;1245_ff_neg(t1,f[1]);1246_ff_neg(t0,f[0]);1247_ff_set(q[1],t1);1248_ff_set(q[0],t0); // x^3 = -f1x-f01249q += 3;1250_ff_set(q[2],t1);1251_ff_set(q[1],t0); // x^4 = -f1x^2-f0x1252q += 3;1253_ff_set(q[2],t0);1254_ff_square(t3,f[1]);1255_ff_set(q[1],t3); // t3 = f1^21256_ff_mult(t1,f[0],f[1]);1257_ff_set(q[0],t1); // x^5 = -f0x^2+f1x^2+f0f11258q += 3;1259_ff_set(q[2],t3);1260_ff_add(t0,t1,t1); // t0 = 2f0f11261_ff_set(q[1],t0);1262_ff_square(t1,f[0]); // t1 = f0^21263_ff_set(q[0],t1); // x^6 = f1^2x^2+2f0f1x+f0^21264q += 3;1265_ff_set(q[2],t0);1266_ff_mult(t0,f[1],t3);1267_ff_sub(q[1],t1,t0);1268_ff_mult(t0,f[0],t3);1269_ff_neg(q[0],t0); // x^7 = 2f0f1x^2 + (f0^2-f1^3)x - f0f1^212701271ff_poly_exp8_mod_d3 (g, p, n, f);1272}127312741275void ff_poly_exp4_mod_d3 (ff_t g[3], ff_t p[12], unsigned long n, ff_t f[4])1276{1277register ff_t *q;1278register unsigned long m;1279int i, d;12801281for ( m = (0x3UL<<60), i = 60 ; ! (n&m) ; m >>= 2, i-=2 );1282q = p + 3*((n&m)>>i);1283_ff_set (g[2],q[2]); _ff_set(g[1],q[1]); _ff_set(g[0],q[0]);1284m >>= 2; i -= 2;1285while (m) {1286ff_poly_square_mod_d3 (g,g,f);1287ff_poly_square_mod_d3 (g,g,f);1288q = p + 3*((n&m)>>i);1289ff_poly_mult_mod_d3 (g, g, q, f);1290m >>= 2; i -= 2;1291}1292}12931294void ff_poly_exp8_mod_d3 (ff_t g[3], ff_t p[24], unsigned long n, ff_t f[4])1295{1296register ff_t *q;1297register unsigned long m;1298int i, d;12991300for ( m = (0x7UL<<60), i = 60 ; ! (n&m) ; m >>= 3, i-=3 );1301q = p + 3*((n&m)>>i);1302_ff_set (g[2],q[2]); _ff_set(g[1],q[1]); _ff_set(g[0],q[0]);13031304m >>= 3; i -= 3;1305while (m) {1306ff_poly_square_mod_d3 (g,g,f);1307ff_poly_square_mod_d3 (g,g,f);1308ff_poly_square_mod_d3 (g,g,f);1309q = p + 3*((n&m)>>i);1310ff_poly_mult_mod_d3 (g, g, q, f);1311m >>= 3; i -= 3;1312}1313}131413151316// multiplies two quadratics (arbitrary coeff) modulo a cubic of the form x^3+ax+b1317// no initialization or validation is done, overlap of u,v,w is ok. Uses 10M+11A1318void ff_poly_mult_mod_d3 (ff_t w[3], ff_t u[3], ff_t v[3], ff_t f[4])1319{1320// _ff_t_declare_reg t1, t3, t4, w0, w1, w2;1321_ff_t_declare_reg t0, t1, t2, s01, s02, s12, w0, w1;13221323_ff_mult(t0,u[0],v[0]); _ff_mult(t1,u[1],v[1]); _ff_mult(t2,u[2],v[2]);1324_ff_add(w1,u[0],u[1]); _ff_add(w0,v[0],v[1]); _ff_mult(s01,w1,w0); _ff_subfrom(s01,t0); _ff_subfrom(s01,t1);1325_ff_add(w1,u[0],u[2]); _ff_add(w0,v[0],v[2]); _ff_mult(s02,w1,w0); _ff_subfrom(s02,t0); _ff_subfrom(s02,t2);1326_ff_add(w1,u[1],u[2]); _ff_add(w0,v[1],v[2]); _ff_mult(s12,w1,w0); _ff_subfrom(s12,t1); _ff_subfrom(s12,t2);13271328_ff_add(w1,s02,t1);1329_ff_mult(w0,f[1],t2);1330_ff_sub(w[2],w1,w0); // w2 = s02 + t1 - t2f1 = u2v0+u1v1+ u0v2 - f1u2v213311332_ff_mult(w1,f[0],t2);1333_ff_mult(w0,f[1],s12);1334_ff_addto(w0,w1);1335_ff_sub(w[1],s01,w0); // w1= s01 - t2f0 - s12f1= u1v0+u0v1-f0u2v2-f1(u1v2+u2v1)13361337_ff_mult (w0, f[0],s12);1338_ff_sub(w[0],t0,w0); // w0 = t0 - s12f0 = u0v0 - (u1v2+u2v1)f01339return;1340}134113421343*/13441345/*1346// multiplies two cubics (arbitrary coeff) modulo a quartic of the form x^4+ax^2+bx+c1347// no initialization or validation is done, overlap of u,v,w is ok. Uses 23M+38A1348void ff_poly_mult_mod_d4 (ff_t w[4], ff_t u[4], ff_t v[4], ff_t f[5])1349{1350_ff_t_declare_reg t0, t1, t2, t3, s01,s02,s03,s12,s13,s23,w0, w1; // don't need so many variables13511352// compute t_i = u_i*v_i1353_ff_mult(t0,u[0],v[0]); _ff_mult(t1,u[1],v[1]); _ff_mult(t2,u[2],v[2]); _ff_mult(t3,u[3],v[3]);13541355// compute s_ij = u_i*v_j + u_j*v_i1356_ff_add(w0,u[0],u[3]); _ff_add(w1,v[0],v[3]); _ff_mult(s03,w0,w1); _ff_subfrom(s03,t0); _ff_subfrom(s03,t3);1357_ff_add(w0,u[0],u[2]); _ff_add(w1,v[0],v[2]); _ff_mult(s02,w0,w1); _ff_subfrom(s02,t0); _ff_subfrom(s02,t2);1358_ff_add(w0,u[0],u[1]); _ff_add(w1,v[0],v[1]); _ff_mult(s01,w0,w1); _ff_subfrom(s01,t0); _ff_subfrom(s01,t1);1359_ff_add(w0,u[1],u[3]); _ff_add(w1,v[1],v[3]); _ff_mult(s13,w0,w1); _ff_subfrom(s13,t1); _ff_subfrom(s13,t3);1360_ff_add(w0,u[1],u[2]); _ff_add(w1,v[1],v[2]); _ff_mult(s12,w0,w1); _ff_subfrom(s12,t1); _ff_subfrom(s12,t2);1361_ff_add(w0,u[2],u[3]); _ff_add(w1,v[2],v[3]); _ff_mult(s23,w0,w1); _ff_subfrom(s23,t2); _ff_subfrom(s23,t3);13621363// s13 is special, add t2 to it1364_ff_addto(s13,t2);13651366_ff_mult(w0,s23,f[2]);1367_ff_mult(w1,t3,f[1]);1368_ff_addto(w0,w1);1369_ff_add(w1,s03,s12);1370_ff_sub(w[3],w1,w0); // w3 = s03 + s12 - s23f2 - t3f1 = u0v3 + u1v2 + u2v1 - u2v3f2 + u3v0 - u3v2f2 - u3v3f113711372_ff_mult(w1,s13,f[2]);1373_ff_mult(w0,s23,f[1]);1374_ff_addto(w0,w1);1375_ff_square(w1,f[2]);1376_ff_subfrom(w1,f[0]);1377ff_mult(w1,w1,t3);1378_ff_addto(w1,s02);1379_ff_addto(w1,t1);1380_ff_sub(w[2],w1,w0); // w2 = s02 + t1 - s13f2 - s23f1 + t3(f2^2-f0) = u0v2 + u1v1 - u1v3f2 + u2v0 - u2v2f2 - u2v3f1 - u3v1f2 - u3v2f1 - u3v3f0 + u3v3f2^213811382_ff_mult(w1,s13,f[1]);1383_ff_mult(w0,s23,f[0]);1384_ff_addto(w0,w1);1385_ff_mult(w1,f[1],f[2]);1386ff_mult(w1,w1,t3);1387_ff_addto(w1,s01);1388_ff_sub(w[1],w1,w0); // w1 = s01 - s13f1 - s23f0 + t3f1f2 = u0v1 + u1v0 - u1v3f1 - u2v2f1 - u2v3f0 - u3v1f1 - u3v2f0 + u3v3f1f213891390_ff_mult(w0,s13,f[0]);1391_ff_mult(w1,f[0],f[2]);1392ff_mult(w1,w1,t3);1393_ff_addto(w1,t0);1394_ff_sub(w[0],w1,w0); // w0 = t0 - s13f0 + t3f0f2 = u0v0 - u1v3f0 - u2v2f0 - u3v1f0 + u3v3f0f21395}139613971398// squares a cubic (arbitrary coeff) modulo a quartic of the form x^4+ax^2+bx+c1399// no initialization or validation is done, overlap of u,w is OK. Uses 23M+21A1400void old_ff_poly_square_mod_d4 (ff_t w[4], ff_t u[4], ff_t f[5])1401{1402_ff_t_declare_reg t0,t1,t2,t3,s03,s02,s01,s13,s12,s23,w0,w1;14031404// compute t_i = u_i^21405_ff_square(t0,u[0]); _ff_square(t1,u[1]); _ff_square(t2,u[2]); _ff_square(t3,u[3]);14061407// compute s_ij = u_i*u_j1408_ff_mult(s13,u[1],u[3]); _ff_mult(s23,u[2],u[3]);14091410_ff_mult(w1,u[0],u[3]);1411_ff_mult(w0,u[1],u[2]);1412_ff_addto(w1,w0);1413_ff_mult(w0,s23,f[2]);1414_ff_subfrom(w1,w0);1415_ff_x2(w1);1416_ff_mult(w0,t3,f[1]);1417_ff_sub(w[3],w1,w0); // w3 = 2(s03 + s12 - s23f2) - t3f1 = 2*u0*u3 + 2*u1*u2 - 2*u2*u3*f2 - u3^2*f114181419_ff_mult(w0,s13,f[2]);1420_ff_mult(w1,s23,f[1]);1421_ff_addto(w0,w1);1422_ff_mult(w1,u[0],u[2]);1423_ff_subfrom(w1,w0);1424_ff_x2(w1);1425_ff_addto(w1,t1);1426_ff_square(w0,f[2]);1427_ff_subfrom(w0,f[0]);1428ff_mult(w0,w0,t3);1429_ff_addto(w1,w0);1430_ff_mult(w0,t2,f[2]);1431_ff_sub(w[2],w1,w0); // w2 = 2(s02 - s12f2 - s23f1) + t1 - t2f2 + t3(f2^2-f0) = 2*u0*u2 + u1^2 - 2*u1*u3*f2 - u2^2*f2 - 2*u2*u3*f1 - u3^2*f0 + u3^2*f2^214321433_ff_mult(w0,s23,f[0]);1434_ff_mult(w1,u[0],u[1]);1435_ff_subfrom(w1,w0);1436_ff_x2(w1);1437_ff_mult(w0,f[1],f[2]);1438ff_mult(w0,w0,t3);1439_ff_addto(w1,w0);1440_ff_add(w0,s13,s13);1441_ff_addto(w0,t2);1442ff_mult(w0,w0,f[1]);1443_ff_sub(w[1],w1,w0); // w1 = 2(s01 - s23f0) - (t2 + 2s13)f1 + t3f1f2 = 2*u0*u1 - 2*u1*u3*f1 - u2^2*f1 - 2*u2*u3*f0 + u3^2*f1*f214441445_ff_mult(w0,t3,f[2]);1446_ff_add(w1,s13,s13);1447_ff_addto(w1,t2);1448_ff_subfrom(w0,w1);1449_ff_mult(w1,w0,f[0]);1450_ff_add(w[0],w1,t0); // w0 = t0 + (t3f2 - 2s13 - t2)f0 = u0^2 - 2*u1*u3*f0 - u2^2*f0 + u3^2*f0*f21451}1452*/1453/*1454void old_ff_poly_xpan_mod_d4 (ff_t g[4], ff_t a, unsigned long n, ff_t f[5])1455{1456_ff_t_declare p[16], *q;1457_ff_t_declare_reg a2, ca;1458register unsigned long m;1459int i, d;14601461memset (p, 0, sizeof(p));1462q = p;1463_ff_set_one(q[0]); // (x+a)^0 = 11464q += 4;1465_ff_set_one(q[1]);1466_ff_set(q[0],a); // (x+a)^1 = x + a1467q += 4;1468_ff_set_one(q[2]);1469_ff_add(ca,a,a);1470_ff_set(q[1],ca);1471_ff_square(a2,a);1472_ff_set(q[0],a2); // (x+a)^2 = x^2 + 2ax + a^21473q += 4;1474_ff_set_one(q[3]);1475_ff_addto(ca,a);1476_ff_set(q[2],ca);1477_ff_mult(q[1],ca,a);1478_ff_mult(q[0],a2,a); // (x+a)^3 = x^3 + 3ax^2+3a^2x+a^314791480// could move up to exp8 but we'll settle for a window size of 2 bits for now1481ff_poly_exp4_mod_d4 (g, p, n, f);1482}148314841485void ff_poly_exp4_mod_d4 (ff_t g[4], ff_t p[16], unsigned long n, ff_t f[5])1486{1487register ff_t *q;1488register unsigned long m;1489int i, d;14901491for ( m = (0x3UL<<60), i = 60 ; ! (n&m) ; m >>= 2, i-=2 );1492q = p + 4*((n&m)>>i);1493_ff_set(g[3],q[3]); _ff_set (g[2],q[2]); _ff_set(g[1],q[1]); _ff_set(g[0],q[0]);1494m >>= 2; i -= 2;1495while (m) {1496ff_poly_square_mod_d4 (g,g,f);1497ff_poly_square_mod_d4 (g,g,f);1498q = p + 4*((n&m)>>i);1499ff_poly_mult_mod_d4 (g, g, q, f);1500m >>= 2; i -= 2;1501}1502}15031504void ff_poly_exp8_mod_d4 (ff_t g[4], ff_t p[32], unsigned long n, ff_t f[5])1505{1506register ff_t *q;1507register unsigned long m;1508int i, d;15091510for ( m = (0x7UL<<60), i = 60 ; ! (n&m) ; m >>= 3, i-=3 );1511q = p + 4*((n&m)>>i);1512_ff_set(g[3],q[3]); _ff_set (g[2],q[2]); _ff_set(g[1],q[1]); _ff_set(g[0],q[0]);1513m >>= 3; i -= 3;1514while (m) {1515ff_poly_square_mod_d4 (g,g,f);1516ff_poly_square_mod_d4 (g,g,f);1517ff_poly_square_mod_d4 (g,g,f);1518q = p + 4*((n&m)>>i);1519ff_poly_mult_mod_d4 (g, g, q, f);1520m >>= 3; i -= 3;1521}1522}1523*/15241525/*1526// Computes x^n modulo a quartic of the form x^4+f2x^2+f1x+f0. Assumes n < 2^631527void old_ff_poly_xn_mod_d4 (ff_t g[4], unsigned long n, ff_t f[5])1528{1529_ff_t_declare p[32], *q;1530_ff_t_declare_reg t0, t1, t2, t3;1531register unsigned long m;1532int i, d;15331534memset (p, 0, sizeof(p));1535q = p;1536_ff_set_one(q[0]); // x^0 = 11537q += 4;1538_ff_set_one(q[1]); // x^1 = x1539q += 4;1540_ff_set_one(q[2]); // x^2 = x^21541q += 4;1542_ff_set_one(q[3]); // x^3 = x^31543q += 4;1544_ff_neg(t2,f[2]);1545_ff_neg(t1,f[1]);1546_ff_neg(t0,f[0]);1547_ff_set(q[2],t2);1548_ff_set(q[1],t1);1549_ff_set(q[0],t0); // x^4 = -f2x^2 - f1x-f01550q += 4;1551_ff_set(q[3],t2);1552_ff_set(q[2],t1);1553_ff_set(q[1],t0); // x^5 = -f2x^3 - f1x^2 - f0x1554q += 4;1555_ff_set(q[3],t1);1556_ff_square(t3,f[2]);1557_ff_subfrom(t3,f[0]);1558_ff_set(q[2],t3);1559_ff_mult(t1,f[1],f[2]);1560_ff_set(q[1],t1);1561_ff_mult(t0,f[0],f[2]);1562_ff_set(q[0],t0); // x^6 = -f1x^3 + (f2^2-f0)x^2 + f1f2x^2 + f0f21563q += 4;1564_ff_set(q[3],t3);1565_ff_add(q[2],t1,t1);1566_ff_square(t2,f[1]);1567_ff_add(q[1],t0,t2);1568_ff_mult(q[0],f[0],f[1]); // x^7 = (f2^2-f0)x^3 + 2f1f2x^2 + (f0f2+f1^2)x + f0f115691570ff_poly_exp8_mod_d4 (g, p, n, f);1571}1572*/157315741575