#include <stdlib.h>1#include <stdio.h>2#include <math.h>3#include <memory.h>4#include "gmp.h"5#include "mpzutil.h"6#include "hecurve.h"7#include "ffwrapper.h"8#include "ffpoly.h"9#include "cstd.h"1011/*12Copyright 2008 Andrew V. Sutherland1314This file is part of smalljac.1516smalljac is free software: you can redistribute it and/or modify17it under the terms of the GNU General Public License as published by18the Free Software Foundation, either version 2 of the License, or19(at your option) any later version.2021smalljac is distributed in the hope that it will be useful,22but WITHOUT ANY WARRANTY; without even the implied warranty of23MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the24GNU General Public License for more details.2526You should have received a copy of the GNU General Public License27along with smalljac. If not, see <http://www.gnu.org/licenses/>.28*/2930/*31This module implements operations for genus 1 curves, both low level32point operations and functions for exponentiation computing the33group order, group structure, etc...3435These functions are heavily optimized and responsible for a more than a 2x36improvement in genus 1 performance from smalljac version 3 over version 2.3738We begin with basic point arithmetic functions (mostly inlined), the higher39level functions appear below. At the end of the module are a collection of functions40that were implement and test but are not used because they were found to be41sub-optimal in the current application.42*/434445long hecurve_g1_JC_pp_order (hec_jc_t *a, long p, long q, ff_t f1);46int hecurve_g1_dlog (hec_jc_t *a, hec_jc_t *b, long o, ff_t f1);47int hecurve_g1_bsgs_search (long *ord, hec_jc_t *b, long low, long high, int m, int a, ff_t f1);48long hecurve_g1_fastorder (ff_t x, ff_t y, long k, ff_t f1);49int hecurve_g1_test_exponent (long e, ff_t f[4]);50void hecurve_g1_p_reduce (hec_jc_t *b1, long *q1, hec_jc_t *b2, long *q2, long p, ff_t f1);5152unsigned long hecurve_expbits;53unsigned long hecurve_steps;54unsigned long hecurve_retries;555657/*58We use a reduced form of the Chudnovsky Jacobian representation (JC) which uses (x,y,z^2,z^3) to represent the affine point (x/z^2,y/z^3), but does not maintain z.59Not computing z saves 1 field multiplication when compsing an affine point with JC point (usually called mixed addition). Squaring (doubling) costs an extra multiplication60but two fewer additions and the performance is comparable (in fact, when tested on an AMD Athlon 64, JC doubling was slighly faster, not clear why).6162In terms of the Mumford representation used elsewhere, the point point (x,y,z^2,z^3) is u[1]=z, u[0]=-x/z^2, v[0]=y/x^3. (notice the sign on x)63When z is 1 this corresponds to u(t)=(t-x) and v(t)=y, so that x is a root of u and y=v(x).6465Negating y inverts an element, as in the affine rep, and 2-torsion elements have y=0.66Any element with z=0 (or z2 for reduced Chudnovsky) is considered the identity, regardless of the value of x and y.6768As we use multiplicative notation for group operations elsewhere, we generally speak of the elliptic curve group operation multiplicatively,69so we multiply, square, and exponentiate points, rather than adding, doubling, or multiplying by a scalar.7071In our operation counts we don't distinguish field multiplication and squaring. For prime fields where p fits in a machine word, they are effectively the same.72We do count additions, as these are not negligible--roughly speaking, 1M is about 2.5A. Field inversions are horribly expensive relative to Montgomery multiplication,73costing 40M or more (for p~2^30, say). Whenever possible, we do n field inversions in parallel for a cost of I+3(n-1)M, effectively 3M per inverted element, for n large.74*/7576// We trust the compiler to be smart about register allocation and don't hassle with macros (the speed difference appears to be negligible in testing)77// It would be nice to have C++ reference data types here...787980static inline int hecurve_g1_JC_id (hec_jc_t *p) { return _ff_zero(p->z2); }81static inline int hecurve_g1_JC_2tor (hec_jc_t *p) { return _ff_zero(p->y); }82static inline void hecurve_g1_JC_invert (hec_jc_t *p) { ff_negate(p->y); }8384static inline int hecurve_g1_JC_cmp (hec_jc_t *p1, hec_jc_t *p2)85{86register ff_t t0,t1;8788_ff_mult(t0,p1->x,p2->z2); _ff_mult(t1,p2->x,p1->z2);89if ( ! _ff_equal(t0,t1) ) return 0;90_ff_mult(t0,p1->y,p2->z3); _ff_mult(t1,p2->y,p1->z3);91return ( _ff_equal(t0,t1) ? 1 : -1 );92}9394// converts reduced Chudnovsky Jacobian to affine coords, given the inverse of z3, overlap ok95static inline void hecurve_g1_JC_to_A (ff_t *px, ff_t *py, hec_jc_t *p, ff_t z3inv)96{97register ff_t t0,t1;9899ff_mult(*py,p->y,z3inv);100_ff_mult(t0,p->z2,z3inv);101_ff_square(t1,t0);102ff_mult(*px,p->x,t1);103// 4M104}105106// squares (doubles) an affine point into reduced Chudnovsky Jacobian coords107static inline void hecurve_g1_2AJC (hec_jc_t *p, ff_t x, ff_t y, ff_t f1)108{109register ff_t t0,t1,t2,a,b;110111_ff_add(t0,y,y); _ff_mult(t1,t0,y); _ff_add(p->z2,t1,t1); _ff_mult(a,x,p->z2); // a=4xy^2, t1=2y^2, z2=4y^2112_ff_mult(p->z3,t0,p->z2); // z3=8y^3113_ff_square(t1,x); _ff_add(b,t1,t1); _ff_addto(b,t1); _ff_addto(b,f1); // b = 3x^2+f1*1^4114_ff_square(t0,b); _ff_add(t2,a,a); _ff_sub(p->x,t0,t2); // x = b^2-2a115_ff_sub(t1,a,p->x); _ff_mult(t2,t1,b); _ff_mult(t0, y,p->z3); _ff_sub(p->y,t2,t0); // y = b(a-x)-8y^4 (note we use the new x here and new z3=8y^3)116// 7M+9A117}118119// squares (doubles) an affine point into reduced Chudnovsky Jacobian coords and also sets az4 to f1*z3^4=16y^4 (cached value for squaring used below)120static inline void hecurve_g1_2AJC_cache (hec_jc_t *p, ff_t x, ff_t y, ff_t f1, ff_t *az4)121{122register ff_t t0,t1,t2,a,b;123124_ff_add(t0,y,y); _ff_mult(t1,t0,y); _ff_add(p->z2,t1,t1); _ff_mult(a,x,p->z2); // a=4xy^2, t1=2y^2, z2=4y^2125_ff_mult(p->z3,t0,p->z2); // z3=8y^3126_ff_square(t1,x); _ff_add(b,t1,t1); _ff_addto(b,t1); _ff_addto(b,f1); // b = 3x^2+f1*1^4127_ff_square(t0,b); _ff_add(t2,a,a); _ff_sub(p->x,t0,t2); // x = b^2-2a128_ff_sub(t1,a,p->x); _ff_mult(t2,t1,b); _ff_mult(t0, y,p->z3); _ff_sub(p->y,t2,t0); // y = b(a-x)-8y^4 (note we use the new x here and new z3=8y^3)129_ff_add(*az4,t0,t0);130// 7M+10A131}132133// squares a point p1 in reduced Chudnovsky Jacobian coords (p3 is the output, may be equal to p1)134// This code requires one more multiplication than doubling in standard Jacobian coordinates (but two fewer additions, which makes it a close call)135// Surprisingly, it is actually slightly faster than doubling in Jacobian coords when tested on an AMD Athlon 64 (YMMV).136static inline void hecurve_g1_2JC (hec_jc_t *p3, hec_jc_t *p1, ff_t f1)137{138register ff_t a, b, c, t0, t1, t2;139140_ff_square(t0,p1->x); _ff_add(t1,t0,t0); _ff_addto(t1,t0); // t1 = 3x^2141_ff_square(t2,p1->z2); _ff_mult(t0,f1,t2); _ff_add(b,t1,t0); // b = 3x^2+f1*z^4 (note that f1=a4 in 13.2.1.c of HECHECC p.282)142_ff_add(c,p1->y,p1->y); _ff_square(t2,c); _ff_mult(a,t2,p1->x); // a=4xy^2, c=2y, t2=4y^2143_ff_add(t0,a,a); _ff_square(t1,b); _ff_sub(p3->x,t1,t0); // x = b^2-2a144_ff_mult(p3->z2,p1->z2,t2); _ff_mult(t1,c,t2); _ff_mult(p3->z3,p1->z3,t1); // z2=4y^2z2, z3=8y^3z3, t1=8y^3145_ff_mult(c,t1,p1->y); // c = 8y^4146_ff_sub(t0,a,p3->x); _ff_mult(t2,t0,b); _ff_sub(p3->y,t2,c); // y = b(a-x)-c -- note we use the new x value here147// 11M+8A148}149150// same as above except the parameter az4 contains a_4*z1^4 = f1*z1^4 and is updated to hold a_4*z3^4 (cost 1M less, 1A more)151static inline void hecurve_g1_2JC_cache (hec_jc_t *p3, hec_jc_t *p1, ff_t *az4)152{153register ff_t a, b, c, t0, t1, t2;154155_ff_square(t0,p1->x); _ff_add(t1,t0,t0); _ff_addto(t1,t0); _ff_add(b,t1,*az4); // b = 3x^2+f1*z^4156_ff_add(c,p1->y,p1->y); _ff_square(t2,c); _ff_mult(a,t2,p1->x); // a=4xy^2, c=2y, t2=4y^2157_ff_add(t0,a,a); _ff_square(t1,b); _ff_sub(p3->x,t1,t0); // x = b^2-2a158_ff_mult(p3->z2,p1->z2,t2); _ff_mult(t1,c,t2); _ff_mult(p3->z3,p1->z3,t1); // z2=4y^2z2, z3=8y^3z3, t1=8y^3159_ff_mult(c,t1,p1->y); _ff_add(t0,c,c); ff_mult(*az4,*az4,t0); // c = 8y^4, az4 = az4*16y^4160_ff_sub(t0,a,p3->x); _ff_mult(t2,t0,b); _ff_sub(p3->y,t2,c); // y = b(a-x)-c -- note we use the new x value here161// 10M+9A162}163164// multiplies a non-identity point p1 in reduced Chudnovsky Jacobians coords by a (non-identity) affine point and puts result in p165// using the reduced Chudnosky form (no z, only z^2 and z^3) saves one field mult (this is faster than any of the alternatives in table 13.3 of HECHECC on p.284)166static inline void hecurve_g1_AJC (hec_jc_t *p, hec_jc_t *p1, ff_t x0, ff_t y0, ff_t f1)167{168register ff_t a, c, e, f, t0, t1, t2;169170if ( _ff_zero(p1->z2) ) { _ff_set(p->x,x0); _ff_set(p->y,y0); _ff_set_one(p->z2); _ff_set_one(p->z3); return; }171_ff_mult(a,x0,p1->z2); // a = x0z^2, b = x*1^2=x (z0=1)172_ff_sub(e,p1->x,a); // e = a-b173_ff_mult(c,y0,p1->z3); // c = y0z^3, d=y*1^3=y (z0=1)174_ff_sub(f,p1->y,c); // f = d-c175if ( _ff_zero(e) && _ff_zero(f) ) { hecurve_g1_2AJC (p, x0, y0, f1); return; } // must use doubling code here, but at least it's an affine double (7M+9A) (inverses are ok)176_ff_square(t0,e); _ff_mult(t1,t0,e); _ff_mult(t2,t0,a); // e2=e^2, t1=e^3, t2=ae^2177ff_mult(p->z2,p1->z2,t0); ff_mult(p->z3,p1->z3,t1); // z2 = 1*z2*e^2, z3=1*z3*e^3178_ff_square(t0,f); _ff_sub(p->x,t0,t1); _ff_add(t0,t2,t2); _ff_subfrom(p->x,t0); // x = f^2-e^3-2ae^2179_ff_sub(t0,t2,p->x); _ff_mult(t2,f,t0); _ff_mult(t0,t1,c); _ff_sub(p->y,t2,t0); // y = f(ae^2-x) - ce^3180// 10M+6A181}182183// multiplies p1 in reduced Chudnovsky Jacobian coords by p2 in reduced Chudnovsky Jacobian coords and puts the result in p1.184// Handles identity and will square (double) if needed. Cost is 13M+7A, one less than with standard Chudnovsky Jacobian coords.185static inline void hecurve_g1_JCJC (hec_jc_t *p1, hec_jc_t *p2, ff_t f1)186{187register ff_t a, b, c, d, e, f, t0, t1, t2;188189// we need to handle identity cases in general (although in some cases we might be able to rule this out)190if ( _ff_zero(p2->z2) ) return;191if ( _ff_zero(p1->z2) ) { *p1=*p2; return; }192_ff_mult(a,p1->x,p2->z2); _ff_mult(b,p2->x,p1->z2); _ff_sub(e,b,a); // a = x1z2^2, b = x2z1^2, e = a-b193_ff_mult(c,p1->y,p2->z3); _ff_mult(d,p2->y,p1->z3); _ff_sub(f,d,c); // c = y1z2^2, d = y2z1^3, f = d-c194if ( _ff_zero(e) && _ff_zero(f) ) { hecurve_g1_2JC (p1,p1,f1); return; } // must double if pts are equal (11M+8A), inverses will end up with z2=0, so we let them through195_ff_square(t1,e); ff_mult(t0,p1->z2,p2->z2); _ff_mult(p1->z2,t0,t1); // z^2 = z1^2z2^2e^2, t1=e^2196_ff_mult(t2,t1,e); ff_mult(t0,p1->z3,p2->z3); _ff_mult(p1->z3,t0,t2); // z^2 = z1^2z2^2e^2, t2=e^3197ff_mult(t1,t1,a); // t1=ae^2198_ff_square(t0,f); _ff_sub(p1->x,t0,t2); _ff_add(t0,t1,t1); _ff_subfrom(p1->x,t0); // x = f^2-e^3-2ae^2199_ff_sub(t0,t1,p1->x); _ff_mult(t1,f,t0); _ff_mult(t0,t2,c); _ff_sub(p1->y,t1,t0); // y = f(ae^2-x) - ce^3 -- note we use the new x here200// 13M+7A201}202203/*204Computes p=(x,y)^n where (x,y) !=1 is in affine coordinates and p is in Jacobian coordinates205It takes advantage of the fact that all additions are of the form J+A, requiring only 11M, doubling is done in J+J form, using 10M206*/207void hecurve_g1_AJC_exp_ui (hec_jc_t *p, ff_t x0, ff_t y0, unsigned long n, ff_t f1)208{209register unsigned long m;210unsigned long pbits, nbits;211register ff_t negy0;212int i;213214if ( n == 0 ) { _ff_set_zero(p->z2); return; }215if ( n == 1 ) { _ff_set(p->x,x0); _ff_set(p->y,y0); _ff_set_one(p->z2); _ff_set_one(p->z3); return; }216hecurve_g1_2AJC(p,x0,y0,f1);217if ( n == 2 ) return;218ui_NAF(&pbits,&nbits,n);219i = _asm_highbit(pbits)-2; // we know the top two bits of the NAF are 10220_ff_neg(negy0,y0);221m = (1UL<<i);222for ( ; m ; m >>= 1 ) {223hecurve_g1_2JC(p,p,f1); // 11M+8A224if ( m&pbits ) hecurve_g1_AJC(p,p,x0,y0,f1); // 10M+6A225if ( m&nbits ) hecurve_g1_AJC(p,p,x0,negy0,f1); // 10M+6A226}227}228229/*230Computes p=a^e where a!=1 is in reduced Jacobian Chudnovsky coords, and so is p. overlap is ok.231*/232void hecurve_g1_JC_exp_ui (hec_jc_t *p, hec_jc_t *a, unsigned long n, ff_t f1)233{234register unsigned long m;235unsigned long pbits, nbits;236hec_jc_t t, ai;237int i;238239if ( n == 0 ) { _ff_set_zero(p->z2); return; }240if ( n == 1 ) { *p=*a; return; }241hecurve_g1_2JC(&t,a,f1); // 11M+8A242243if ( n == 2 ) { *p = t; return; }244ui_NAF(&pbits,&nbits,n);245i = _asm_highbit(pbits)-2;246ai=*a; hecurve_g1_JC_invert(&ai);247m = (1UL<<i);248for ( ; m ; m >>= 1 ) {249hecurve_g1_2JC(&t,&t,f1); // 11M+8A250if ( m&pbits ) hecurve_g1_JCJC(&t,a,f1); // 13M+7A251if ( m&nbits ) hecurve_g1_JCJC(&t,&ai,f1); // 13M+7A252}253*p = t;254}255256// Computes b=a^e where a and b are both in affine coordinates257void hecurve_g1_exp_ui (ff_t u[HECURVE_GENUS+1], ff_t v[HECURVE_GENUS], ff_t u1[HECURVE_GENUS+1], ff_t v1[HECURVE_GENUS], unsigned long n, ff_t f[HECURVE_DEGREE+1])258{259ff_t zinv,x0;260hec_jc_t o[1];261262if ( _hecurve_is_identity(u1,v1) ) { _hecurve_set_identity(u,v); return; }263if ( ! _ff_one(u1[1]) ) { printf ("p=%ld, input to hecurve_g1_exp_ui most be in affine coords!\n",_ff_p); hecurve_print(u,v); exit(0); }264_ff_neg(x0,u1[0]);265hecurve_g1_AJC_exp_ui (o, x0, v1[0], n, f[1]);266if ( hecurve_g1_JC_id(o) ) { _hecurve_set_identity(u,v); return; }267ff_invert(zinv,o->z3);268hecurve_g1_JC_to_A(&x0,v,o,zinv);269_ff_neg(u[0],x0);270_ff_set_one(u[1]);271}272273// Combines precomputed values p[i]=p[0]^(2^i) to compute p[0]^n. Assumes all required powers are present: NAF potentially requires one more bit!274// CAUTION: if this is false, the resulting behavior is very strange and unpredictable. You have been warned.275void hecurve_g1_JC_exp_powers(hec_jc_t *o, hec_jc_t *p, unsigned long n, ff_t f1)276{277register unsigned long m;278unsigned long pbits, nbits;279hec_jc_t t;280int i;281282// handle small n quickly283switch (n) {284case 0: _ff_set_zero(o->z2); return;285case 1: *o=p[0]; return;286case 2: *o=p[1]; return;287case 3: *o=p[0]; hecurve_g1_JCJC(o,p+1,f1); return;288case 4: *o=p[2]; return;289case 5: *o=p[2]; hecurve_g1_JCJC(o,p,f1); return;290case 6: *o=p[2]; hecurve_g1_JCJC(o,p+1,f1); return;291case 7: *o=p[0]; hecurve_g1_JC_invert(o); hecurve_g1_JCJC(o,p+3,f1); return;292case 8: *o=p[3]; return;293}294ui_NAF(&pbits,&nbits,n);295i = _asm_highbit(pbits);296*o = p[i];297i-=2;298m = (1UL<<i);299for ( ; m ; m >>= 1, i-- ) {300if ( (m&pbits) ) hecurve_g1_JCJC(o,p+i,f1); // 13M+8A301if ( (m&nbits) ) { // 13M+10A (slightly more expensive)302hecurve_g1_JC_invert(o); hecurve_g1_JCJC(o,p+i,f1); hecurve_g1_JC_invert(o);303}304}305}306307// Sets p[i] = p[0]^(2^i) for i in [0,k]. Input is an affine pt not equal to the identity, output is in reduced Jacobian Chudnovsky coords.308void hecurve_g1_AJC_powers (hec_jc_t p[], ff_t x0, ff_t y0, int k, ff_t f1)309{310register hec_jc_t *e;311ff_t az4;312313_ff_set(p->x,x0); _ff_set(p->y,y0); _ff_set_one(p->z2); _ff_set_one(p->z3);314hecurve_g1_2AJC_cache (p+1, x0, y0, f1, &az4); // 7M + 10A315for ( e=p+k,p+=2 ; p <= e ; p++ ) hecurve_g1_2JC_cache(p,p-1,&az4); // 10M + 9A316}317318// Sets p[i] = p[0]^(2^i) for i in [1,k]. Input is an pt not equal to the identity, input and output is in reduced Jacobian Chudnovsky coords.319void hecurve_g1_JC_powers (hec_jc_t p[], int k, ff_t f1)320{321register hec_jc_t *e;322323// don't bother caching324for ( e=p+k,p++ ; p <= e ; p++ ) hecurve_g1_2JC(p,p-1,f1); // 11M + 8A325}326327// Sets s[i] = s[0]*(x0,y0)^i for i < n. Returns k<n if s[k] is the identity (in which case no further elements are computed), otherwise returns n.328int hecurve_g1_AJC_steps_1 (hec_jc_t s[], ff_t x0, ff_t y0, int n, ff_t f1)329{330register hec_jc_t *end;331332if ( hecurve_g1_JC_id(s) ) return 0;333end = s+n;334for ( s++ ; s < end ; s++ ) {335hecurve_g1_AJC (s,s-1,x0,y0,f1); // 10M + 6A336if ( hecurve_g1_JC_id(s) ) break;337}338hecurve_steps += n-(end-s);339return n-(end-s);340}341342// Sets s[-i] = s[0]*(x0,-y0)^i for i < n. Returns k<n if s[k] is the identity (in which case no further elements are computed), otherwise returns n.343// Used for downward stepping - NOTE THAT s SHOULD POINT TO THE LAST ENTRY IN THE ARRAY344int hecurve_g1_AJC_dsteps_1 (hec_jc_t *s, ff_t x0, ff_t y0, int n, ff_t f1)345{346register hec_jc_t *end;347register ff_t negy0;348349if ( hecurve_g1_JC_id(s) ) return 0;350end = s-n;351_ff_neg(negy0,y0);352for ( s-- ; s > end ; s-- ) {353hecurve_g1_AJC (s,s+1,x0,negy0,f1); // 10M + 6A354if ( hecurve_g1_JC_id(s) ) break;355}356hecurve_steps += n-(s-end);357return n-(s-end);358}359360// sets s[2*i+j] = s[0]*(x0,y0)^(i+j)*(x1,y1)^i for 2*i+j < n with j=0 or 1. Returns k<n if s[k] is the identity (in which case no further elements are computed), otherwise returns n.361int hecurve_g1_AJC_steps_2 (hec_jc_t s[], ff_t x0, ff_t y0, ff_t x1, ff_t y1, int n, ff_t f1)362{363register hec_jc_t *end;364register int i;365366if ( hecurve_g1_JC_id(s) ) return 0;367end = s+n;368for ( i=1,s++ ; s < end ; i++,s++ ) {369if ( i&1) {370hecurve_g1_AJC (s,s-1,x0,y0,f1); // 10M + 6A371} else {372hecurve_g1_AJC (s,s-1,x1,y1,f1); // 10M + 6A373}374if ( hecurve_g1_JC_id(s) ) break;375}376hecurve_steps += n-(end-s);377return n-(end-s);378}379380static inline int inv_mod3 (int e) { return ((e%3)==1?1:2); }381382/*383Fast group order computation for elliptic curves over F_p, designed for p<2^40, optimized for p~2^30.384Returns 1 if successful and sets *o to the group order and if pd is non-null, sets *pd to gcd(m,6), where the group structure is Z/mZ x Z/nZ with m dividing n (possibly m=1).385This information can be used to speed up group structure computations.386387Failure can only happen for p < 229, in which case 0 is returned and *o is a divisor of the group order highly likely equal to the group exponent (*pd is as also set, as above).388*/389long hecurve_g1_order (long *pd, ff_t f[4])390{391long r, d, e, E, min, max, low, high, exp, M;392hec_jc_t t[1];393ff_t g[4],x,y,*h;394int a,i,j,m,twist,s2known,tor3,tor4,flag8;395396/*397We compute the 3-torsion subgroup first, because this information may lead us to work in the twist.398The function ff_poly_g1_3tor() computes the 3-torsion subgroup of y^2=f(x), but if it is trivial, it will try to399determine the 3-torsion subgroup of the twist, using the factorization pattern of the 3-division polynomial (a quartic).400*/401h = f; twist = 0;402tor3 = ff_poly_g1_3tor(f);403if ( tor3<0 ) { ff_poly_twist(g,f,3); h = g; twist = 1; tor3=-tor3; } // negative return value indicates 3-torsion in the twist (but not in the primary)404d = ( tor3==9 ? 3 : 1); // d is a known divisor of |G|/lambda(|G|), i.e. it divides the group exponent (lambda(G)) and its square divides |G|.405406if ( _ff_p < HECURVE_G1_4TOR_MINP ) { // when p is small, only compute 2-torsion but not 4-torsion (the marginal benefit does not justify the cost)407i = ff_poly_roots_d3(0,h);408if ( i == 0 ) { // no roots means there is no 2-torsion and the group order must be odd409s2known = 1; tor4 = 1; // the flag s2known indicates we know the entire 2-Sylow subgroup410} else { // this means that once we factor it out, the remaining exponent is known to be odd411s2known = 0; tor4 = 2; // if there is one root, we have 2-torsion Z/2Z, otherwise there are 3 roots and 2-torsion Z/2Z x Z/2Z412if ( i > 1 ) { tor4 = 4; d *= 2; }413}414} else {415s2known = 0;416flag8 = ( _ff_p < HECURVE_G1_8TOR_MINP ? 0 : 1); // flag8 set if we also want to check whether the 2-Sylow subgroup is a cyclic group containing Z/8Z417d *= ff_poly_g1_4tor(&tor4,h,flag8); // compute the 4-torsion subgroup (and, optionally, check Z/8Z as well)418if ( flag8 ) {419if ( tor4 <= 4 ) s2known = 1; // In these cases we know the entire 2-Sylow subgoup420} else {421if ( tor4 < 4 || (tor4==4 && d==2) ) s2known = 1; // if the 4-torsion subgroup contains no elements order 4, it must be equal to the 2-Sylow subgroup.422}423}424e = tor4*tor3/d; // e is our known divisor of lambda(|G|)425E = d*e; // E is our known divisor of |G|426m = (s2known==1?2:1)*(tor3==1?3:1); // we know that gcd(|G|/tor4,m) = 1, which we can use to speed up the BSGS search427428a = ( tor3==1 && _ff_p1mod3 ? -1 : 0 ); // if neither the curve or its twist has 3-torsion and p=1mod3, we must have a_p=0 and group order 2 mod 3429// For any divisor x of |G| this means |G|/x must be equal to -1/x mod 3 (and also mod 6 if m=6)430r = (long)(2.0*sqrt(_ff_p));431exp = _ff_p+1;432min = exp-r;433max = exp+r;434low = _ui_ceil_ratio(min,E);435high = max/E;436if ( low > high ) { printf ("Error, no multiple of torsion derived e=%ld exists in interval [%ld,%ld] for p=%ld\n", e, min, max, _ff_p); exit(0); }437if ( pd ) { *pd = ( twist && tor3==9 ? d/3 : d ); if ( !((*pd)&3) ) *pd/=2; } // set *pd to reflect 2-torsion and 3-torsion information in y^2=f(x) (but not the twist)438439//printf("%ld: tor3=%d, tor4=%d, e=%d, d=%d ", _ff_p, tor3, tor4, e, d); ff_poly_print(h,3);440441// Note that for a non-CM curve the probability that hecurve_g1_bsgs succeeds on the first try is close to 1 and grows with p (say 99.9% for p~2^20)442for ( i = 0 ; i < HECURVE_G1_ORDER_RETRIES ; i++ ) {443if ( low==high ) {exp=low; break; }444if ( ! hecurve_random_point(&x,&y,h) ) continue; // this can fail, e.g. y^2=x^3+2x+2 over F_3 has no finite rational points445//printf ("%ld: e=%d, Random pt (%ld,%ld) on curve ", _ff_p, e, _ff_get_ui(x), _ff_get_ui(y)); ff_poly_print(h,3);446hecurve_g1_AJC_exp_ui (t, x, y, e, h[1]); if ( hecurve_g1_JC_id(t) ) continue;447// handle order 2 elements here so that bsgs search can assume |t| > 2448if ( hecurve_g1_JC_2tor(t) ) {449exp = 2;450//On paper, the code below should speed things up (it uses no inversions and fewer operations for small intervals), but in testing it actually slows things down slightly451//} else if ( high-low < HECURVE_G1_SHORT_INTERVAL ) {452// if ( hecurve_g1_bsgs_short (&exp, &t, low, high, h[1]) ) break;453} else {454if ( a ) { a = -inv_mod3(E); if ( m==6 && a==-2 ) a = -5; }455if ( hecurve_g1_bsgs_search (&exp, t, low, high, m, a, h[1]) ) break;456}457e *= exp; E *= exp;458low = _ui_ceil_ratio(min,E);459high = max/E;460// there are lot's of optimizations we could insert here, but none of them apply often enough to be worth doing, so keep it simple.461hecurve_retries++;462}463if ( i < HECURVE_G1_ORDER_RETRIES ) {464E *= exp;465if ( twist ) E = 2*(_ff_p+1)-E;466if ( E < min || E > max ) { printf ("Error, computed order %ld is not in Hasse-Weil interval [%ld,%ld] for p=%ld, h= ", E, min, max, _ff_p); ff_poly_print(h,3); exit(0); }467return E;468}469// For non-CM curves we will almost never reach this point.470471if ( e > 2*r ) { printf("Error, exponent %ld for p=%ld has unique multiple in [%ld,%ld] that was not detected.\n", e, _ff_p, min, max); exit(0); }472if ( e < sqrt(min) ) { printf ("Error, exponent %ld is impossibly small for p=%ld\n", e, _ff_p); exit(0); }473474/*475With very high probability (exponential in HECURVE_G1_ORDER_RETRIES), e is now the group exponent. For all but 21 primes (the largest of which is 547)476this uniquely determines the group order (this result is due to Cremona and Harley, or see Washington, "Elliptic Curves", Prop. 4.19).477In fact, taking our knowledge of 2-torsion and 3-torsion into account, the group order is uniquely determined in every case (AVS: to be written up).478Unfortunately, we can't prove that e is definitely the group exponent without computing the group structure.479480Instead, we apply results of Schoof and Mestre (see Washington, Prop. 4.18) which tells us that either the curve or its twist contains an element whose481order has a unique multiple in the Hasse interval provided p > 229. If one examines the exceptional cases for p<=229, one finds that if we also482consider the information provided by 2-torsion, in every case but two (y^2=x^3+2 over F_13 and F_19) we obtain a known divisor of the group order483with a unique multiple in the Hasse interval, and the two exceptional cases are addressed if we consider 3-torsion. (AVS: to be written up).484485In the two 3-torsion cases, we will have chosen h to be the twist with 3-torsion and should have succeeded above. If not we mustn't have computed486the full group exponent and we should try again (by recursively calling ourselves below)487488For the 2-torsion cases, the twist will also have 2-torsion, we just need to use this information when we try to find a unique multiple of the exponent489in the twist below. If we fail, it must mean we got unlucky and should try again.490491In summary, this yields a Las Vegas algorithm with provably correct output, for all odd p (for p=2 the curve y^2=f(x) is singular).492*/493494m = (d%2?1:2); // m is a known divisor of the twist group order based on the rank of the 2-torsion subgroup, note that we don't keep track of d anymore and reuse it below495496for ( M = e*_ui_ceil_ratio(min,e) ; M <= max ; M += e ) { d = M/e; if ( !(e%d) && !((_ff_p-1)%d) ) break; }497if ( M > max ) { printf ("Error: no multiple M of ambiguous e=%ld satisfies M/e | gcd(e,p-1) (p=%ld).\n", e, _ff_p); exit (0); }498if ( ! twist ) { ff_poly_twist(g,f,3); h = g; } else { h = f; }499500do {501r = 2*(_ff_p+1)-M;502// Attempt to prove that r is the order of the twist. We don't bother trying to be efficient here, we expect to succeed on the first try.503for ( i = 0 ; i < HECURVE_G1_ORDER_RETRIES ; i++ ) {504if ( ! hecurve_random_point(&x,&y,h) ) { printf("hecurve_random_point failed!"); exit (0); }505hecurve_g1_AJC_exp_ui (t, x, y, r, h[1]);506if ( ! hecurve_g1_JC_id(t) ) break; // can't be the right M507exp = m*hecurve_g1_fastorder(x,y,r,h[1]); // compute the order of our random point, times our 2-torsion derived info508low = exp*_ui_ceil_ratio(min,exp); // low is the first multiple of exp >= min509if ( low <= max && low+exp > max ) { return ( twist ? 2*(_ff_p+1)-M : M ); } // success510}511// try another candidate, if there is one (this can happen for small p)512for ( M+= e; M <= max ; M += e ) { d = M/e; if ( !(e%d) && !((_ff_p-1)%d) ) break; }513} while ( M <= max );514// Otherwise, try again -- we must have not gotten the full group exponent (this essentially never happens unless HECURVE_G1_ORDER_RETRIES is set quite low).515return hecurve_g1_order(pd,f);516}517518/*519Given the group order N, compute the isomorphism type of the group structure of an elliptic curve, of the form Z/n1Z x Z/n2Z with n1 dividing n2 (and also p-1), possibly n1=1.520The parameter d specifies gcd(6,n1), which can be derived from 2-torsion and 3-torsion info by hecurve_g1_order above (specify 0 if not known).521The rank of the group (1 or 2) is returned, and n[0]=n2 if n1=1, o.w. n[0]=n1 and n[1]=n2. Note that n2 is the group exponent.522523It is possible to compute the group invariants (n1 and n2) in probabilistic (Las Vegas) polynomial time using the Weil pairing via Miller's algorithm524(although this doesn't actually determine a basis, i.e. independent elements of order n1 and n2).525526We take a simpler generic approach which is very fast in the typical case. We may in the future want to invoke (refinements of) Miller's algorithm for527hard cases (large prime dividing p-1 whose square divides N). As it stands, the algorithm below has complexity O(q^(1/2)) where q is the largest prime dividing p-1528whose square divides N. This is O(N^(1/4)) in the worst case, but this rarely happens and in practice computing the group structure takes about 10% the529time it takes to compute the group order (for a non-CM curve). (It might be interesting to do a performance comparison wth Miller's algorithm at some point)530531We just return the group invariants here, and optimize for computing these as quickly as possible, but the algorithm below can easily be modified to return a basis532without changing the complexity (but the constant factors will be slightly worse). This is a Las Vegas algorithm, i.e. provably correct output and bounded expected running time.533*/534int hecurve_g1_group_structure (long n[2], long N, long d, ff_t f[4])535{536long m,n1,n2,e;537hec_jc_t b1, b2;538unsigned long p[MAX_UI_PP_FACTORS], q[MAX_UI_PP_FACTORS], h[MAX_UI_PP_FACTORS];539ff_t x,y;540long q0, q1, q2, r;541int i, j, k;542543/*544For each prime q|N, if q does not divide p-1 we know the q-Sylow subgroup is cyclic. Even when q does divide p-1, if q^2 does not divide N, we545again know the q-Sylow is cyclic (of prime order in this case). Applying just these 2 facts will often suffice to determine the group structure without546using any group operations, particularly if information in d is also applied (e.g. 2-torsion and 3-torsion info).547*/548k = ui_factor(p,h,ui_gcd(_ff_p-1,N));549for ( i = j = 0 ; i < k ; i++ ) { // make a list of primes p[i] dividing p-1 whose square divides N550if ( p[i]==2 && d && d%2 ) continue; // d=gcd(6,n1) not divisible by 2 implies 2-Sylow must be cyclic551if ( p[i]==3 && d && d%3 ) continue; // d=gcd(6,n1) not divisible by 3 implies 3-Sylow must be cyclic552if ( !(N%(p[i]*p[i])) ) p[j++] = p[i];553}554n1 = 1; n2 = N;555for ( i = 0 ; i < j ; i++ ) {556for ( q[i] = p[i]*p[i] ; !(N%(q[i]*p[i])) ; q[i] *= p[i] ); // determine power of p[i] dividing N557n2 /= q[i]; // remove q[i] from n2558}559//printf("%ld: N=%d ", _ff_p, N); ff_poly_print(f,3);560// Now determine the p[i]-Sylow subgroups for p[i] dividing p-1 and p[i]^2 dividing N (if there are none, we have no work to do).561for ( i = 0 ; i < j ; i++ ) {562if ( d ) for ( q0 = 1 ; ! (d%(p[i]*q0)) ; q0 *= p[i] ); else q0 = 1; // if d was specified, use d to compute q0, a known divisor of q1 (and q2), possibly 1563e = N/(q[i]*n1); // the p[i]-Sylow subgroup must lie in the image of the e-power map (we can take out the n1 we know)564for ( q1 = q2 = q0, k=0 ; q1*q2 < q[i] && k < 1000 ; k++ ) { // bound the number of retries as a safety valve, just in case the group order was invalid565if ( ! hecurve_random_point(&x,&y,f) ) continue; // get a random finite point566hecurve_g1_AJC_exp_ui (&b1, x, y, e, f[1]); // get an element of the p[i]-Sylow via the e-power map567r = hecurve_g1_JC_pp_order (&b1,p[i],q[i],f[1]); // compute its order, which will be a power of p[i]568if ( r <= q1 ) continue; // if the order of b1 is not strictly larger than q1, it isn't going to help us569if ( q2==q0 ) { b2=b1; q2 = r; continue; } // first time through we will just set q2570hecurve_g1_p_reduce (&b1, &r, &b2, &q2, p[i], f[1]); // reduce to a basis for <b1,b2>571q1 = ( r > q0 ? r : q0); // note that if r<q0, then q1 is not the order of b1, but we don't care572}573if ( k == 1000 ) { printf ("Group structure computation failed on %d-Sylow of size %d with group of order %d on curve over F_%d: ", p[i], q[i], N, _ff_p); ff_poly_print(f,3); exit (0); }574n1 *= q1; n2 *= q2;575}576if ( n1*n2 != N ) { printf ("bug in hecurve_g1_group_order, %d*%d != %d, F_%ld curve ", n1, n2, N, _ff_p); ff_poly_print(f,3); exit(0); } // sanity check577if ( n1 == 1 ) { n[0] = n2; return 1; }578n[0] = n1; n[1] = n2;579return 2;580}581582/*583Slow Monte Carlo algorithm to test the group exponent, use for debugging/testing only.584*/585int hecurve_g1_test_exponent (long e, ff_t f[4])586{587hec_jc_t t[1];588ff_t x,y;589long m, n;590int i;591592n = 1;593for ( i = 0 ; i < 100 ; i++ ) {594if ( ! hecurve_random_point(&x,&y,f) ) continue;595hecurve_g1_AJC_exp_ui(t,x,y,e,f[1]);596if ( ! hecurve_g1_JC_id(t) ) return 0;597m = hecurve_g1_fastorder (x,y,e,f[1]);598if ( n%m ) n = m*(n/ui_gcd(m,n));599}600return (n==e);601}602603/*604Given non-trivial elements b1 and b2 of the p-Sylow subgroup with |b1|=q1 and |b2|=q2 powers of p (p here is a divisor of the group order, not the characteristic of the field),605The function below computes a basis for <b1,b2> and updates b1,b2,q1, and q2 appropriately, with q1 < q2 (possibly q1=1 if <b1,b2> is cyclic).606607The complexity is O(log_p(q1)*(sqrt(p)+log(q2)).608AVS: the code below is a special case of a new algorithm for group structure computation (a simpler and faster version of Algorithm 9.2 in my thesis which also incorporates609a few ideas from Teske's Pohlig-Hellman paper). I am in the process of writing a paper which I will eventually reference here.610*/611void hecurve_g1_p_reduce (hec_jc_t *b1, long *q1, hec_jc_t *b2, long *q2, long p, ff_t f1)612{613hec_jc_t t[1],a1[1],a2[1];614long k;615int i;616617if ( *q1 > *q2 ) { t[0] = *b1; *b1 = *b2; *b2 = t[0]; k = *q1; *q1 = *q2; *q2 = k; } // swap inputs if needed so that q1 <= q2618619hecurve_g1_JC_exp_ui(a2,b2,*q2/p,f1); // a2=b2^(q2/p), <a2> is the subgroup of <b2> of order p620while (*q1>1) {621hecurve_g1_JC_exp_ui(a1,b1,(*q1)/p,f1); // a1=b1^(q1/p), <a1> is the subgroup of <b1> of order p622k = hecurve_g1_dlog (a2,a1,p,f1); // compute least k>0 s.t. a1=a2^k, if possible623if ( ! k ) break; // if we can't, then a1 and a2 are independent, hence so are b1 and b2, and we are done624hecurve_g1_JC_exp_ui(t,b2,(*q2)/(*q1)*k,f1); hecurve_g1_JC_invert(t); // compute t=b2^-(q2/q1*k)625hecurve_g1_JCJC (b1,t,f1); // replace b1 with b1*b2^-(q2/q1*k), this reduces the order of b1 to at most q1/p since626// (b1*b2^-(q2/q1*k))^(q1/p) = a1*a2^{-k} = 1, and we do not change <b1,b2> by doing this.627k = hecurve_g1_JC_pp_order(b1,p,*q1,f1); // recompute q1 (it could be less than q1/p)628if ( k >= *q1 ) { printf ("%ld: Error in hecurve_g1_p_reduce(%d,%d) element order %d not reduced\n", _ff_p, *q1, *q2, k); exit (0); } // sanity check629*q1 = k;630}631}632633634long hecurve_g1_JC_pp_order (hec_jc_t *a, long p, long q, ff_t f1)635{636register long r;637hec_jc_t t[1];638639if ( hecurve_g1_JC_id(a) ) return 1;640if ( p==q ) return p;641hecurve_g1_JC_exp_ui (t,a,p,f1);642if ( hecurve_g1_JC_id(t) ) return p;643for ( r = p*p ; r < q ; r *= p ) {644hecurve_g1_JC_exp_ui (t,t,p,f1);645if ( hecurve_g1_JC_id(t) ) return r;646}647return q;648}649650/*651Fast and simple small hash table lookup used by hecurve_g1_bsgs_search and also by hecurve_g1_dlog.652*/653654#if FF_HALF_WORDS == 1655#define BSGS_MAX_STEPS 256 // guaranteed to handle p<2^31 (single word ff_t), assuming 2 torsion is known, enlarge if needed but it's nice to fit in L1 cache656#else657#define BSGS_MAX_STEPS 1024 // will BSGS up to 2^40, assuming 2 torsion is known, and nearly as high for dlogs658#endif659#define BSGS_TABSIZE BSGS_MAX_STEPS // don't make this too big, it takes time to initialize it. A few collisions won't kill us.660#define BSGS_TABMASK ((unsigned long)(BSGS_TABSIZE-1))661662static hec_jc_t babys[BSGS_MAX_STEPS];663static hec_jc_t giants[BSGS_MAX_STEPS];664665static ff_t stepzs[2*BSGS_MAX_STEPS];666#if FF_HALF_WORDS == 1667static unsigned char hashtab[BSGS_TABSIZE];668#else669static unsigned short hashtab[BSGS_TABSIZE];670#endif671static struct tab_entry {672ff_t x;673short i;674short next;675} entries[BSGS_MAX_STEPS+1];676short nexttabentry;677678static inline void tab_clear() { memset(hashtab,0,sizeof(hashtab)); nexttabentry = 1; } // don't use entry 0679680// we require inserts to have unique x values, return -1 if unique, otherwise return index value for existing entry681static inline int tab_insert(ff_t x, short i)682{683register struct tab_entry *p;684register int n,h;685686h = x&BSGS_TABMASK;687n = hashtab[h];688if ( n ) {689p=entries+n;690for(;;) {691if ( _ff_equal(p->x,x) ) return p->i;692if ( ! p->next ) break;693p=entries+p->next;694}695}696p = entries+nexttabentry;697_ff_set(p->x,x);698p->i = i;699p->next = n;700hashtab[h] = nexttabentry++;701return -1;702}703704static inline int tab_lookup(ff_t x)705{706register struct tab_entry *p;707register int n;708709n = hashtab[x&BSGS_TABMASK];710if ( ! n ) return -1;711p=entries+n;712for(;;) {713if ( _ff_equal(p->x,x) ) return p->i;714if ( ! p->next ) return -1;715p=entries+p->next;716}717return -1;718}719720/*721Computes the discrete log of b wrt a given the order of a using a BSGS search. Optimized for small searches (shares table lookup code with hecurve_g1_bsgs_search).722Returns 0 if b is not an element of <a> (this is not assumed).723724Note that we are only called for prime values whose square divides the group order, so we only use O(N^(1/4)) steps in the worst case, and even this is very rare725since the prime must also divide p-1.726*/727int hecurve_g1_dlog (hec_jc_t *a, hec_jc_t *b, long o, ff_t f1)728{729hec_jc_t c[1];730long bsteps, giant, gstep, gsteps;731ff_t zinv[2], baby_x[1], baby_y[1], giant_x[1], giant_y[1];732ff_t t0, t1;733register int i, j;734735// hard-wire small o cases736if ( hecurve_g1_JC_id(a) ) return o;737switch ( o ) {738case 1: return 0;739case 2: return ( hecurve_g1_JC_cmp (a,b) ? 1 : 0 );740case 3: i = hecurve_g1_JC_cmp (a,b); return ( i ? (i>0?1:2) : 0 );741}742743//printf ("dlog(%d) a=(%d,%d,%d,%d) b=(%d,%d,%d,%d)\n", o, _ff_get_ui(a->x), _ff_get_ui(a->y), _ff_get_ui(a->z2), _ff_get_ui(a->z3), _ff_get_ui(b->x), _ff_get_ui(b->y), _ff_get_ui(b->z2), _ff_get_ui(b->z3));744745// For small searches we will just search the whole range so we only match once (using just one inversion). The only optimization we use is the usual one for inverses.746bsteps = (long)sqrt(o/2);747gstep = 2*bsteps+1;748gsteps = _ui_ceil_ratio(o,gstep);749if ( bsteps > BSGS_MAX_STEPS || gsteps > BSGS_MAX_STEPS ) { printf ("%ld: order %ld caused BSGS_MAX_STEPS to be exceeded in hecurve_g1_dlog\n", _ff_p, o); exit (0); }750//printf("bsteps=%d, gsteps=%d, giant step=%d\n", bsteps, gsteps, gstep);751752// Convert baby and giant step to affine coords for stepping753hecurve_g1_JC_exp_ui (c,a,gstep,f1);754hecurve_g1_JC_invert(c);755_ff_set(zinv[0],a->z3);756_ff_set(zinv[1],c->z3);757ff_parallel_invert (zinv, zinv, 2);758hecurve_g1_JC_to_A (baby_x, baby_y, a, zinv[0]);759hecurve_g1_JC_to_A (giant_x, giant_y, c, zinv[1]);760//printf ("baby step(a) = (%d,%d)\n", _ff_get_ui(baby_x[0]), _ff_get_ui(baby_y[0]));761//printf ("giant step(a^-%d) = (%d,%d)\n", gstep, _ff_get_ui(giant_x[0]), _ff_get_ui(giant_y[0]));762763// Step764babys[0] = *a; giants[0] = *b;765j = hecurve_g1_AJC_steps_1 (babys, baby_x[0], baby_y[0], bsteps, f1); // baby steps766if ( j < bsteps ) { printf ("%ld: baby step hit identity in dlog with o=%ld\n", _ff_p, o); exit(0); }767j = hecurve_g1_AJC_steps_1 (giants, giant_x[0], giant_y[0], gsteps, f1); // giant steps768if ( j < gsteps ) return j*gstep;769770// Convert to affine to get unique x coords771for ( i = j = 0 ; i < bsteps ; i++, j++ ) _ff_set(stepzs[j], babys[i].z2);772for ( i = 0 ; i < gsteps ; i++, j++ ) _ff_set(stepzs[j], giants[i].z2);773ff_parallel_invert(stepzs, stepzs, j); // we assume j < FF_MAX_PARALLEL_INVERTS here774for ( i = j = 0 ; i < bsteps ; i++,j++ ) ff_mult(babys[i].x, babys[i].x, stepzs[j]); // only invert x-coord here, we never need to invert y775for ( i = 0 ; i < gsteps ; i++,j++ ) ff_mult(giants[i].x, giants[i].x, stepzs[j]);776777/*778printf ("affine baby x's:\n", j);779for ( i = 0 ; i < bsteps ; i++ ) printf (" %d: %ld\n", i+1, _ff_get_ui(babys[i].x));780printf ("affine giant x's:\n", j);781for ( i = 0 ; i < gsteps ; i++ ) printf (" %ld: %ld\n", i*gstep, _ff_get_ui(giants[i].x));782*/783// Populate the table with babys. Inserts should never fail (baby steps can't be inverses provided bsteps < o/2)784tab_clear();785for ( i = 0 ; i < bsteps ; i++ ) if ( (j=tab_insert(babys[i].x,i)) >= 0 ) break;786if ( i < bsteps ) { printf("%ld: baby step insert failed in dlog\n", _ff_p); exit (0); }787788// Now match giant steps789for ( i = 0 ; i < gsteps ; i++ ) {790if ( (j=tab_lookup(giants[i].x)) >= 0 ) {791_ff_mult(t0,babys[j].y,giants[i].z3); _ff_mult(t1,giants[i].y,babys[j].z3); // normalize y values for comparison792if ( _ff_equal(t0,t1) ) return i*gstep+j+1;793return (i?i*gstep-j-1:o-j-1);794}795}796return 0;797}798799800/*801Support functions for hecurve_g1_bsgs_search...802*/803804double baby_stretch[8] = {0.0,1.0,2.0,1.5,3.0,0.0,3.0,6.0};805806// returns the (j+1)-th integer satisfying property determined by modcase.807static inline int baby_index(int j, int modcase)808{809register int k;810811k=j&1;812switch (modcase) {813case 1: return j+1; // all values814case 2: return 2*j+1; // odd values (bspan,gsteps and giants even)815case 3: return 3*(j-k)/2+1+k; // prime to 3 (bspan, gstep, and giants multiples of 3)816case 4: return 3*(j+1); // 0 mod 3 (bspan and gstep multiples of 3, giants a mod 3)817case 6: return 6*(j-k)/2+1+4*k; // prime to 6 (bspan, gstep, and giants multiples of 6)818case 7: return 6*(j+1); // 0 mod 6 (bspan and gstep multiples of 6, giants a mod 6)819}820}821822/*823sets exp to the unique multiple of k in [low,high] if there is one (and returns 1) or sets exp=k and returns 0824*/825static inline int set_exp (long *exp, long k, long low, long high)826{827register int i;828i = ui_multiples_in_range(k,low,high);829if ( ! i ) { printf ("no multiple of element order %ld in interval [%ld,%ld]\n", k, low, high); exit(0); }830if ( i == 1 ) { *exp = (high/k)*k; return 1; } else { *exp = k; return 0; }831}832833834/*835Uses BSGS (and fastorder) to compute the order k of the element b, given that some multiple of k lies in [low,high] (this is assumed, not verified).836The values m and a specify modular constraints on k as follows:837838m=1 no constraint839m=2 k is odd (use odd baby steps, even giant steps)840m=3 k is prime to 3 (use baby steps 1,2 mod 3, giant steps 0 mod 3)841m=3 (a!=0) k = a mod 3 (use baby steps 0 mod 3, giant steps congruent to a mod 3)842m=6 k is prime to 6 (use baby steps 1,5 mod 6, giant steps 0 mod 6)843m=6 (a!=0) k = a mod 6 (use baby steps 0 mod 6, giant steps congruent to a mod 6)844845The value modcase = m+(a?1:0) is used internally to uniquely identify these 6 situations.846Note that the giant steps are congruent to a mod m in every case.847848If the return value is 1, then exp is the unique multiuple of k in [low,high].849If the return value is 0, exp=k is the order of b. This almost always means there is more than one multiple of k in [low,high], but not necessarily.850*/851int hecurve_g1_bsgs_search (long *exp, hec_jc_t *b, long low, long high, int m,int a, ff_t f1)852{853hec_jc_t p[64];854hec_jc_t baby_steps[3], giant_steps[1];855ff_t zinv[4], baby_x[3], baby_y[3], giant_x[1], giant_y[1];856register ff_t t0, t1, t2;857register int i,j,k,bsteps,gsteps,dsteps,usteps;858int bspan,gstep,modcase;859long o, o1, o2, gbase, range;860861//printf ("%d: hecurve_g1_bsgs pt (%ld,%ld,%ld,%ld), low=%d, high=%d, m=%d, a=%d\n", _ff_p, _ff_get_ui(b->x), _ff_get_ui(b->y),_ff_get_ui(b->z2),_ff_get_ui(b->z3), low, high, m, a);862863range = high-low+1; // range is inclusive, we assume high>low, so range > 1864bsteps = (long)sqrt((double)(range) / (2.0*baby_stretch[m]));865if ( ! bsteps ) bsteps = 1;866if ( m>2 && !a && (bsteps&1) ) bsteps++; // make sure bsteps is even if we are covering steps prime to 3 or 6867modcase = m + (a?1:0);868bspan = bsteps * baby_stretch[modcase]; // effective baby coverage depends on modular contraints m and a869if ( m > 1 ) bspan = m * _ui_ceil_ratio(bspan,m); // make sure bspan is a multiple of modulus m (we can safely round up)870gstep= 2*bspan; // giant span is effectively 2*bspan+1 due to inverses, but we use 2*bspan so bspan divides gspan...871if ( m==1 ) gstep++; // ...except when m=1, where we can use 2*bspan+1.872873/*874Pick first giant step to be a multiple of as large a value of 2 as possible (and still lie in the interval [low,high]).875We then need to adjust to make it congruent to a mod m, but this changes at most 2 bits.876(An optimal approach might search for the value in [low,high] congruent to a mod m with minimal hamming weight, but this would likely take more time than it is worth).877878The overall savings is only a few percent (it cuts the cost of computing the first giant step by 1/6, but computing the first giant step is less than 1/4 the total cost).879However, it's an easy optimization, so there is no reason not to do it.880881This idea was suggested by Dan Bernstein.882*/883k = ui_lg_floor(range);884for (;;) {885o1 = (1L<<(k+1)); o = o1*_ui_ceil_ratio(low,o1);886if ( o > high ) break;887k++;888}889o1 = 1L<<k;890gbase = o1*_ui_ceil_ratio(low,o1); // gbase is now a multiple of a largish power of 2 in [low,high]891if ( m > 1 ) {892i = (gbase-a)%m;893gbase -= i; // gbase is now congruent to a mod m894if ( gbase < low ) gbase += m; // make sure we stay in [low,high]895if ( gbase > high ) gbase -= m;896}897for ( dsteps = (gbase-low)/gstep ; gbase - (dsteps-1)*gstep - bspan > low ; dsteps++ );898while ( gbase-(dsteps-1)*gstep <= 0 ) gbase += m; // make sure we don't step on zero or negative values899for ( usteps = (high-gbase)/gstep ; gbase + (usteps-1)*gstep + bspan < high ; usteps++ );900if ( dsteps+usteps+1 > BSGS_MAX_STEPS ) { printf ("Exceeded BSGS_MAX_STEPS=%d! p=%ld, low=%ld, high=%ld, m=%d, a=%d\n", BSGS_MAX_STEPS, _ff_p, low, high, m, a); exit (0); }901gsteps = dsteps+usteps-1; // dsteps and usteps both include starting step, so total is one less then the sum902903//printf ("%d: bsteps=%d, gsteps=%d(%d,%d), bspan=%d, gstep=%d, first giant = %ld\n", _ff_p, bsteps, gsteps, dsteps, usteps, bspan, gstep, gbase);904905// compute 2^k powers of b sufficient to compute b^gstep, and b^gbase906o = _ui_max(gstep,gbase);907k = ui_lg_floor(o);908p[0] = *b;909hecurve_g1_JC_powers (p, k+1, f1); // add 1 extra power for NAF910baby_steps[0] = *b; k = 1;911912// compute baby step sizes depending on the value of modcase913switch (modcase) { // if m=1, fist step=step size=1914case 1: break;915case 2:916case 3: hecurve_g1_2JC (baby_steps+1, baby_steps, f1); k = 2; break; // first step is 1, steps size is 1 or 1,2917case 4: hecurve_g1_2JC (baby_steps+1,baby_steps,f1); hecurve_g1_JCJC(baby_steps+1,baby_steps,f1); k = 2; break; // first step = step size is 3918case 6: hecurve_g1_2JC (baby_steps+1, baby_steps, f1);919hecurve_g1_2JC (baby_steps+2, baby_steps+1, f1); k = 3; break; // first step is 1, step size is 2 or 4920case 7: hecurve_g1_2JC (baby_steps+1,baby_steps,f1); hecurve_g1_JCJC(baby_steps+1,baby_steps,f1);921hecurve_g1_2JC(baby_steps+1,baby_steps+1,f1); k = 2; break; // first step = step size is 6922default:923printf("Invalid modcase value %d in hecure_g1_bsgs\n", m); exit (0);924}925926hecurve_g1_JC_exp_powers(giant_steps, p, gstep, f1);927hecurve_g1_JC_exp_powers(giants+dsteps-1, p, gbase, f1);928929// If any of our baby steps is the identity, we won't get very far, so we need to check this situation (it can happen). The giant steps are handled below.930if ( k > 1 && hecurve_g1_JC_id(baby_steps+1) ) return set_exp(exp,(a?m:2),low,high);931if ( k > 2 && hecurve_g1_JC_id(baby_steps+2) ) return set_exp(exp,4,low,high);932933// We need to convert the steps to affine coords before using them--do this with one field inversion.934for ( i = 0 ; i < k ; i++ ) _ff_set(zinv[i], baby_steps[i].z3);935j = k;936if ( ! hecurve_g1_JC_id(giant_steps) ) { _ff_set(zinv[j], giant_steps[0].z3); j++; } // giant step could be the identity937// invert all the z's938ff_parallel_invert (zinv, zinv, j);939for ( i = 0 ; i < k ; i++ ) hecurve_g1_JC_to_A (baby_x+i, baby_y+i, baby_steps+i, zinv[i]);940j=k;941if ( ! hecurve_g1_JC_id(giant_steps) ) { hecurve_g1_JC_to_A (giant_x, giant_y, giant_steps, zinv[j]); j++; }942943// Now handle giant step = identity (we needed to get the baby in affine coords for the fastorder computation first)944if ( hecurve_g1_JC_id(giant_steps) ) { o = hecurve_g1_fastorder (baby_x[0], baby_y[0], gstep, f1); return set_exp(exp,o,low,high); }945if ( hecurve_g1_JC_id(giants+dsteps-1) ) { o = hecurve_g1_fastorder (baby_x[0], baby_y[0], gbase, f1); return set_exp(exp,o,low,high); }946947/*948printf("affine baby step 0 (%ld,%ld)\n", _ff_get_ui(baby_x[0]),_ff_get_ui(baby_y[0]));949if (m>1) printf("affine baby step 1 (%ld,%ld)\n", _ff_get_ui(baby_x[1]),_ff_get_ui(baby_y[1]));950if (m==6) printf("affine baby step 2 (%ld,%ld)\n", _ff_get_ui(baby_x[2]),_ff_get_ui(baby_y[2]));951printf("affine giant step %d: (%ld,%ld)\n", gstep, _ff_get_ui(giant_x[0]),_ff_get_ui(giant_y[0]));952*/953954// Baby steps955babys[0] = ( a ? baby_steps[1] : baby_steps[0] ); // take first baby step956switch (modcase) {957case 1: j=hecurve_g1_AJC_steps_1 (babys, baby_x[0], baby_y[0], bsteps, f1); break; // step on everything (use step size 1)958case 2: j=hecurve_g1_AJC_steps_1 (babys, baby_x[1], baby_y[1], bsteps, f1); break; // step on odd powers (use step size 2)959case 3: j=hecurve_g1_AJC_steps_2 (babys, baby_x[0], baby_y[0], baby_x[1], baby_y[1], bsteps, f1); break; // step on powers prime to 3 (alternate step sizes 1 and 2: 1,2,4,5,7,8,...)960case 4: j=hecurve_g1_AJC_steps_1 (babys, baby_x[1], baby_y[1], bsteps, f1); break; // step on multiples of 3961case 6: j=hecurve_g1_AJC_steps_2 (babys, baby_x[2], baby_y[2], baby_x[1], baby_y[1], bsteps, f1); break; // step on powers prime to 6 (alternate step sizes 4 and 2: 1,5,7,11,13,17,...)962case 7: j=hecurve_g1_AJC_steps_1 (babys, baby_x[1], baby_y[1], bsteps, f1); break; // step on multiples of 6963}964if ( j < bsteps ) { // if a baby step is the identity, we know the order of the element965k = baby_index(j,modcase); // note that we rely on the order being relatively prime to 2,3,6 (per m)966if ( a ) k/=m; // need to adjust for common divisor of baby steps when a!=0967return set_exp(exp,k,low,high);968}969970// Giant steps971j = hecurve_g1_AJC_dsteps_1 (giants+dsteps-1, giant_x[0], giant_y[0], dsteps, f1); // downward steps first972if ( j < dsteps ) { // if a giant step hit the identity, it could be a multiple of the element order973// It might be faster to just continue in this situation, but it's rare in any event974o = hecurve_g1_fastorder (baby_x[0], baby_y[0], gbase-j*gstep, f1);975return set_exp(exp,o,low,high);976}977j = hecurve_g1_AJC_steps_1 (giants+dsteps-1, giant_x[0], giant_y[0], usteps, f1); // now upward steps978if ( j < usteps ) { // if a giant step hit the identity, it could be a multiple of the element order979// It might be faster to just continue in this situation, but it's rare in any event980o = hecurve_g1_fastorder (baby_x[0], baby_y[0], gbase+j*gstep, f1);981return set_exp(exp,o,low,high);982}983gbase -= (dsteps-1)*gstep; // reset gbase to match giants[0]984985// Convert to affine to get unique x coords986for ( i = j = 0 ; i < bsteps ; i++, j++ ) _ff_set(stepzs[j], babys[i].z2);987for ( i = 0 ; i < gsteps ; i++, j++ ) _ff_set(stepzs[j], giants[i].z2);988ff_parallel_invert(stepzs, stepzs, j); // we assume j < FF_MAX_PARALLEL_INVERTS here989for ( i = j = 0 ; i < bsteps ; i++,j++ ) ff_mult(babys[i].x, babys[i].x, stepzs[j]); // only invert x-coord here, we never need to invert y990for ( i = 0 ; i < gsteps ; i++,j++ ) ff_mult(giants[i].x, giants[i].x, stepzs[j]);991/*992printf ("affine baby x's:\n", j);993for ( i = 0 ; i < bsteps ; i++ ) printf (" %d(%d): %ld\n", i, baby_index(i,modcase), _ff_get_ui(babys[i].x));994printf ("affine giant x's:\n", j);995for ( i = 0 ; i < gsteps ; i++ ) printf (" %ld: %ld\n", gbase+i*gstep, _ff_get_ui(giants[i].x));996*/997// Populate the table with babys. Insert will fail if we try to insert the same x value twice - this can only happen if two babys are inverses (they can't be equal because we didn't hit the identity).998tab_clear();999for ( i = 0 ; i < bsteps ; i++ ) if ( (j=tab_insert(babys[i].x,i)) >= 0 ) break;1000/*1001If we encountered two baby steps with the same x value, they must be inverses, since otherwise we would have hit the identity in between.1002If m=1,2,3,6 we have stepped on every possible element order between the two, and if m=4,5 implies we have stepped on every multiple of a common divisor of the two,1003and hence on every possible value of the difference of their indices (which would be the identity if the steps were equal).1004*/1005if ( i < bsteps ) {1006k = baby_index(i,modcase)+baby_index(j,modcase);1007if ( a ) k/=m; // need to adjust for common divisor of baby steps when a != 01008return set_exp(exp,k,low,high);1009}10101011// Now match giant steps1012o1 = o2 = 0;1013for ( i = 0 ; i < gsteps ; i++ ) {1014if ( (j=tab_lookup(giants[i].x)) >= 0 ) {1015_ff_mult(t0,babys[j].y,giants[i].z3); _ff_mult(t1,giants[i].y,babys[j].z3); // normalize y values for comparison1016if ( _ff_zero(t0) ) { k = 2*baby_index(j,modcase); return set_exp(exp,k,low,high); } // handle 2-torsion case1017k = baby_index(j, modcase);1018if ( _ff_equal(t0,t1) ) k = -k;1019o = gbase+i*gstep+k;1020if ( o==o1 ) continue; // ignore duplicate matches, this can happen for a != 01021if ( o1 ) { o2 = o; break; }1022o1 = o;1023}1024}1025if ( ! o1 ) { printf ("%ld: Error, BSGS search of entire Hasse-Weil interval failed, low=%ld, high=%ld, m=%d\n", _ff_p, low, high, m); exit (0); }1026if ( ! o2 ) { *exp = o1; return 1; } // we know the o1 is the unique multiple of the element order in [low,high]1027k=i_abs(o2-o1); // otherwise, we know |o2-o1| is a multiple of the element order and there are at least two multiples in [low,high]1028*exp = (m==1 ? k : hecurve_g1_fastorder (baby_x[0], baby_y[0], k, f1) ); // if m is not 1 we don't know that o1 and o2 are adjacent multiples, but a fastorder computation will figure it out1029return 0;1030}10311032/*1033Compute the order of affine point (x,y) given that (x,y)^e=1 using the classical algorithm.10341035This algorithm is not particularly fast, but it doesn't need to be, as it is not invoked for most p.1036The average number of bits exponentiated per prime for a non-CM curve (fixed curve, varying p) is less than 1.1037*/1038long hecurve_g1_fastorder (ff_t x, ff_t y, long e, ff_t f1)1039{1040hec_jc_t b[MAX_UI_PP_FACTORS];1041unsigned long q, p[MAX_UI_PP_FACTORS], h[MAX_UI_PP_FACTORS];1042register long o;1043register int i, j, k, w;10441045w = ui_factor(p,h,e); // takes about 2 microseconds for e~2^32 (2.5MHz AMD-64), but e is generally much smaller1046if ( w == 1 && h[0] == 1 ) return e; // not hard when the exponent is prime10471048for ( i = 0 ; i < w ; i++ ) {1049for ( q=p[i],j=1 ; j < h[i] ; j++ ) q *= p[i];1050o = e/q;1051hecurve_g1_AJC_exp_ui (b+i, x, y, o, f1);1052}1053o = 1;1054for ( i = 0 ; i < w ; i++ ) {1055for ( j = 0 ; j < h[i]-1 ; j++ ) {1056if ( hecurve_g1_JC_id(b+i) ) break;1057hecurve_g1_JC_exp_ui (b+i, b+i, p[i], f1);1058}1059if ( ! hecurve_g1_JC_id (b+i) ) j++;1060for ( k = 0 ; k < j ; k++ ) o *= p[i];1061}1062//printf ("fastorder computed order of (%ld,%ld) is %ld from exponent %ld\n", _ff_get_ui(x), _ff_get_ui(y), o, e);1063return o;1064}10651066/*1067BSGS search hardwired for very short intervals (typically less than 50), which uses only one or two baby steps.1068These intervals can arise when the first order computation does not uniquely determine the exponent of the group (or when p is small).10691070We assume b does not have 2 torsion (hence its order is at least 3) and high > low.1071We rely on the existence of an exponent of b in [low,high] and do not always verify this (e.g., if high=low+1 and b^low != id then we conclude b^high == id)1072*/1073int hecurve_g1_bsgs_short (long *exp, hec_jc_t *b, long low, long high, ff_t f1)1074{1075long range;1076hec_jc_t b2[1],b5[1],g[1];1077register ff_t t0, t1;1078register long e;1079long o1,o2;1080int i;10811082range = high-low+1;1083switch (range) {1084case 2: // E no computation other than the g exponentiation, cost E1085hecurve_g1_JC_exp_ui(g, b, low, f1);1086*exp = ( hecurve_g1_JC_id(g) ? low : high );1087return 1;1088case 3: // E+4/3M (on average)1089hecurve_g1_JC_exp_ui(g, b, low+1, f1);1090if ( hecurve_g1_JC_id(g) ) { *exp = low+1; return 1; }1091// it must be the case that g = +/- b, we don't verify this but simply compare y values to distinguish1092_ff_mult(t0,b->y,g->z3); _ff_mult(t1,g->y,b->z3);1093*exp = ( _ff_equal (t0, t1) ? low : high );1094return 1;1095case 4: // E+1/4D+2M = E+5M+2A (roughly) (D indicates doubling(squaring) a point on the curve, which costs 11M+8A field operations)1096hecurve_g1_JC_exp_ui(g, b, low+1, f1);1097if ( hecurve_g1_JC_id(g) ) { *exp = low+1; return 1; }1098_ff_mult(t0,b->x,g->z2); _ff_mult(t1,g->x,b->z2);1099if ( _ff_equal (t0, t1) ) {1100_ff_mult(t0,b->y,g->z3); _ff_mult(t1,g->y,b->z3);1101if ( ! _ff_equal (t0, t1) ) { *exp = low+2; return 1; }1102// we know b^low = id, but it could also be that b^high = id if b has order 3, so we need to check this1103hecurve_g1_2JC (b2,b, f1);1104_ff_mult(t0,b->x,b2->z2); _ff_mult(t1,b2->x,b->z2); // given b != id, b^3==id iff b and b^2 have the same x coord1105if ( _ff_equal(t0,t1) ) { *exp = 3; return 0; }1106*exp = low;1107return 1;1108}1109*exp = high;1110return 1;1111case 5: // < E+4/5D+5M (on average)1112hecurve_g1_JC_exp_ui(g, b, low+2, f1);1113if ( hecurve_g1_JC_id(g) ) { *exp = low+2; return 1; }1114hecurve_g1_2JC (b2,b, f1);1115if ( hecurve_g1_JC_id(b2) ) { *exp = 2; return 0; }1116if ( hecurve_g1_JC_2tor(b2) ) { *exp = 4; return 0; }1117_ff_mult(t0,b->x,b2->z2); _ff_mult(t1,b2->x,b->z2);1118if ( _ff_equal(t0,t1) ) { *exp = 3; return 0; }1119// we now know |b|>4, so there is a unique multiple in the interval1120_ff_mult(t0,b->x,g->z2); _ff_mult(t1,g->x,b->z2);1121if ( _ff_equal (t0, t1) ) {1122_ff_mult(t0,b->y,g->z3); _ff_mult(t1,g->y,b->z3);1123*exp = ( _ff_equal(t0,t1) ? low+1 : low+3 );1124return 1;1125}1126_ff_mult(t0,b2->y,g->z3); _ff_mult(t1,g->y,b2->z3);1127*exp = ( _ff_equal (t0, t1) ? low : low+4 );1128return 1;1129}1130hecurve_g1_2JC (b2,b, f1);1131if ( hecurve_g1_JC_id(b2) ) { *exp = 2; return 0; }1132if ( hecurve_g1_JC_2tor (b2) ) { *exp = 4; return 0; }1133_ff_mult(t0,b->x,b2->z2); _ff_mult(t1,b2->x,b->z2);1134if ( _ff_equal(t0,t1) ) { *exp = 3; return 0; }1135hecurve_g1_2JC (b5,b2, f1);1136hecurve_g1_JCJC (b5,b, f1);1137if ( hecurve_g1_JC_id(b5) ) { *exp = 5; return 0; }1138// we know |b|>51139e = low+2;1140hecurve_g1_JC_exp_ui(g, b, e, f1);1141o1 = o2 = 0;1142while ( e < high+3 ) {1143if ( hecurve_g1_JC_id(g) ) { o2 = o1; o1 = e; goto next; }1144_ff_mult(t0,b->x,g->z2); _ff_mult(t1,g->x,b->z2);1145if ( _ff_equal (t0, t1) ) {1146_ff_mult(t0,b->y,g->z3); _ff_mult(t1,g->y,b->z3);1147i = ( _ff_equal(t0,t1) ? -1 : 1 );1148o2 = o1; o1 = e+i; goto next;1149}1150_ff_mult(t0,b2->x,g->z2); _ff_mult(t1,g->x,b2->z2);1151if ( _ff_equal (t0, t1) ) {1152_ff_mult(t0,b2->y,g->z3); _ff_mult(t1,g->y,b2->z3);1153i = ( _ff_equal(t0,t1) ? -2 : 2 );1154o2 = o1; o1 = e+i; goto next;1155}1156next: if ( o2 ) break;1157hecurve_g1_JCJC (g,b5,f1);1158e += 5;1159}1160if ( ! o1 ) { printf ("%ld: No match found in short interval [%ld,%ld]\n", _ff_p, low, high); exit (0); }1161if ( o2 ) { *exp = o1-o2; return 0; }1162*exp = o1;1163// total cost is roughly E + 2D + (1+(range/5-1))*A + (range/5)*4M (D indicates point doubling, A indicates point addition)1164return 1;1165}11661167/*1168The rest of this file contains code that is not currently used because it was found to to be suboptimal in testing (on an AMD Athlon) but1169may be useful on other platforms and/or other applications.1170*/1171#if 011721173/*1174Below are elliptic curve arithmetic functions for Jacobian coordinates. The reduced Chudnovsky coordinates1175are faster at everything except 2JC versus 2J is (11M+8A) vs (10M+10A), but this difference is negligible (1M~2.5A),1176and in testing it was found to be faster to use JC everywhere.1177*/11781179// converts Jacobian to affine coords, given the inverse of z1180static inline void hecurve_g1_J_to_A (ff_t *px, ff_t *py, hec_j_t *p, ff_t zinv)1181{1182register ff_t t0,t1;11831184_ff_square(t0,zinv);1185_ff_mult(*px,p->x,t0);1186_ff_mult(t1,t0,zinv);1187_ff_mult(*py,p->y,t1);1188// 4M1189}11901191// converts Jacobian to reduced Chudnovsky Jacobian coords1192static inline void hecurve_g1_J_to_JC (hec_jc_t *o, hec_j_t *p)1193{1194_ff_set(o->x,p->x);1195_ff_set(o->y,p->y);1196_ff_square(o->z2,p->z);1197_ff_mult(o->z3,o->z2,p->z);1198// 2M1199}12001201// squares an affine point into Jacobian coords1202static inline void hecurve_g1_2AJ (hec_j_t *p, ff_t x, ff_t y, ff_t f1)1203{1204register ff_t t0,t1,t2,t3,a,b;12051206_ff_add(p->z,y,y); _ff_mult(t1,p->z,y); _ff_add(t2,t1,t1); _ff_mult(a,x,t2); // z=2y, a=4xy^2, t1=2y^2, t2=4y^21207_ff_mult(t3,t1,t2); // t3=8y^41208_ff_square(t1,x); _ff_add(b,t1,t1); _ff_addto(b,t1); _ff_addto(b,f1); // b = 3x^2+f1*1^41209_ff_square(t0,b); _ff_add(t2,a,a); _ff_sub(p->x,t0,t2); // x = b^2-2a1210_ff_sub(t1,a,p->x); _ff_mult(t2,t1,b); _ff_sub(p->y,t2,t3); // y = b(a-x)-8y^4 (note we use the new x here)1211// 6M+9A1212}121312141215// squares a point p1 in Jacobian coords (p3 is the output, may be equal to p1)1216static inline void hecurve_g1_2J (hec_j_t *p3, hec_j_t *p1, ff_t f1)1217{1218register ff_t a, b, c, t0, t1, t2;12191220_ff_square(t0,p1->x); _ff_add(t1,t0,t0); _ff_addto(t1,t0); // t1 = 3x^21221_ff_square(t0,p1->z); _ff_square(t2,t0); _ff_mult(t0,f1,t2); _ff_add(b,t1,t0); // b = 3x^2+f1*z^4 (note that f1=a4 in 13.2.1.c of HECHECC p.282)1222_ff_add(c,p1->y,p1->y); ff_mult(p3->z,p1->z,c); // c=2y, z = 2yz1223_ff_mult(t2,c,p1->y); _ff_add(t0,t2,t2); _ff_mult(a,t0,p1->x); // a=4xy^2,t2=2y^21224_ff_add(t0,a,a); _ff_square(t1,b); _ff_sub(p3->x,t1,t0); // x = b^2-2a1225_ff_square(t0,t2); _ff_add(c,t0,t0); // c = 8y^41226_ff_sub(t0,a,p3->x); _ff_mult(t2,t0,b); _ff_sub(p3->y,t2,c); // y = b(a-x)-c -- note we use the new x value here1227// 10M+10A1228}122912301231// multiplies a point p in Jacobian coords by a (non-identity) affine point (p is an input and an output)1232static inline void hecurve_g1_AJ (hec_j_t *p, ff_t x0, ff_t y0, ff_t f1)1233{1234register ff_t a, c, e, f, t0, t1, t2;12351236if ( _ff_zero(p->z) ) { _ff_set(p->x,x0); _ff_set(p->y,y0); _ff_set_one(p->z); return; }1237_ff_square(t0,p->z); _ff_mult(a,x0,t0); // a = x0z^2, b = x*1^2=x (since z0=1), and t0=z^21238_ff_sub(e,p->x,a); // e = a-b1239_ff_mult(t1,t0,p->z); _ff_mult(c,y0,t1); // c = y0z^3, d=y*1^3=y1240_ff_sub(f,p->y,c); // f = d-c1241if ( _ff_zero(e) && _ff_zero(f) ) { hecurve_g1_2AJ (p, x0,y0,f1); return; } // must use doubling code here, but at least its an affine double (6M+9A)1242_ff_square(t0,e); _ff_mult(t1,t0,e); _ff_mult(t2,t0,a); // t1=e^3, t2=ae^21243_ff_square(t0,f); _ff_sub(p->x,t0,t1); _ff_add(t0,t2,t2); _ff_subfrom(p->x,t0); // x = f^2-e^3-2ae^21244_ff_sub(t0,t2,p->x); _ff_mult(t2,f,t0); _ff_mult(t0,t1,c); _ff_sub(p->y,t2,t0); // y = f(ae^2-x) - ce^31245ff_mult(p->z,p->z,e); // z = 1*z*e1246// 11M+6A1247}124812491250// multiplies p1 in Jacobian coords by p2 Jacobian coords and puts the result in p1. Handles identity and will square (double) if needed1251// this is the slowest case and should be avoided when possible, AJ, AJC, and JCJC are all better1252static inline void hecurve_g1_JJ (hec_j_t *p1, hec_j_t *p2, ff_t f1)1253{1254register ff_t a, b, c, d, e, f, t0, t1, t2;12551256// we need to handle identity cases in general (although in some cases we might be able to rule this out)1257if ( _ff_zero(p2->z) ) return;1258if ( _ff_zero(p1->z) ) { *p1=*p2; return; }1259_ff_square(t0,p2->z); _ff_mult(a,p1->x,t0); // a = x1z2^2, and t0=z2^21260_ff_square(t1,p1->z); _ff_mult(b,p2->x,t1); // b = x2z1^2, and t1=z1^21261_ff_sub(e,b,a); // e = a-b1262_ff_mult(t2,t0,p2->z); _ff_mult(c,p1->y,t2); // c = y1z2^31263_ff_mult(t2,t1,p1->z); _ff_mult(d,p2->y,t2); // d = y2z1^31264_ff_sub(f,d,c); // f = d-c1265if ( _ff_zero(e) && _ff_zero(f) ) { hecurve_g1_2J (p1,p1,f1); return; } // must double if pts are equal (10M+10A), inverses will end up with z=0, so we let them through1266_ff_square(t0,e); _ff_mult(t1,t0,e); _ff_mult(t2,t0,a); // t1=e^3, t2=ae^21267_ff_square(t0,f); _ff_sub(p1->x,t0,t1); _ff_add(t0,t2,t2); _ff_subfrom(p1->x,t0); // x = f^2-e^3-2ae^21268_ff_sub(t0,t2,p1->x); _ff_mult(t2,f,t0); _ff_mult(t0,t1,c); _ff_sub(p1->y,t2,t0); // y = f(ae^2-x) - ce^3 -- note we use the new x here1269ff_mult(t0,p1->z,p2->z); _ff_mult(p1->z,t0,e); // z = z1z2e1270// 16M+7A (ouch)1271}127212731274// Computes p=(x,y)^n where (x,y) !=1 is in affine coordinates and p is in Jacobian coordinates1275// It takes advantage of the fact that all additions are of the form J+A, requiring only 11M, doubling is done in J+J form, using 10M1276void hecurve_g1_AJ_exp_ui (hec_j_t *p, ff_t x0, ff_t y0, unsigned long n, ff_t f1)1277{1278register unsigned long m;1279unsigned long pbits, nbits;1280register ff_t negy0;1281int i;12821283if ( n == 0 ) { _ff_set_zero(p->z); return; }1284if ( n == 1 ) { _ff_set(p->x,x0); _ff_set(p->y,y0); _ff_set_one(p->z); return; }1285hecurve_g1_2AJ(p,x0,y0,f1);1286if ( n == 2 ) return;1287ui_NAF(&pbits,&nbits,n);1288i = _asm_highbit(pbits)-2; // we know the top two bits of the NAF are 101289hecurve_expbits+=i+1;1290_ff_neg(negy0,y0);1291m = (1UL<<i);1292for ( ; m ; m >>= 1 ) {1293hecurve_g1_2J(p,p,f1); // 10M+10A1294if ( m&pbits ) hecurve_g1_AJ(p,x0,y0,f1); // 11M+6A1295if ( m&nbits ) hecurve_g1_AJ(p,x0,negy0,f1); // 11M+6A1296}1297}129812991300// Computes p=a^e where a!=1 is in Jacobian coords, and so is p.1301// This is slow and should be avoided. Overlap is ok1302void hecurve_g1_J_exp_ui (hec_j_t *p, hec_j_t *a, unsigned long n, ff_t f1)1303{1304register unsigned long m;1305unsigned long pbits, nbits;1306hec_j_t t, ai;1307int i;13081309if ( n == 0 ) { _ff_set_zero(p->z); return; }1310if ( n == 1 ) { *p=*a; return; }1311hecurve_g1_2J(&t,a,f1); // 10M+10A1312if ( n == 2 ) { *p = t; return; }1313ui_NAF(&pbits,&nbits,n);1314i = _asm_highbit(pbits)-2;1315hecurve_expbits+=i+1;1316ai=*a; ff_negate(ai.y);1317m = (1UL<<i);1318for ( ; m ; m >>= 1 ) {1319hecurve_g1_2J(&t,&t,f1); // 10M+10A1320if ( m&pbits ) hecurve_g1_JJ(&t,a,f1); // 16M+7A1321if ( m&nbits ) hecurve_g1_JJ(&t,&ai,f1); // 16M+7A1322}1323*p = t;1324}132513261327// Combines precomputed values p[i]=p[0]^(2^i) to compute p[0]^n. Assumes all required powers are present: NAF potentially requires one more bit!1328// CAUTION: if this is false, the resulting behavior is very strange and unpredictable. You have been warned.1329void hecurve_g1_J_exp_powers(hec_j_t *o, hec_j_t *p, unsigned long n, ff_t f1)1330{1331register unsigned long m;1332unsigned long pbits, nbits;1333hec_j_t t;1334int i;13351336// handle small n quickly1337switch (n) {1338case 0: _ff_set_zero(o->z); return;1339case 1: *o=p[0]; return;1340case 2: *o=p[1]; return;1341case 3: *o=p[0]; hecurve_g1_JJ(o,p+1,f1); return;1342case 4: *o=p[2]; return;1343case 5: *o=p[2]; hecurve_g1_JJ(o,p,f1); return;1344case 6: *o=p[2]; hecurve_g1_JJ(o,p+1,f1); return;1345case 7: *o=p[0]; ff_negate(o->y); hecurve_g1_JJ(o,p+3,f1); return;1346case 8: *o=p[3]; return;1347}1348ui_NAF(&pbits,&nbits,n);1349i = _asm_highbit(pbits);1350*o = p[i];1351i-=2;1352hecurve_expbits+=i+1;1353m = (1UL<<i);1354for ( ; m ; m >>= 1, i-- ) {1355if ( (m&pbits) ) hecurve_g1_JJ(o,p+i,f1); // 16M+7A (these are expensive)1356if ( (m&nbits) )1357{ ff_negate(o->y); hecurve_g1_JJ(o,p+i,f1); ff_negate(o->y); } // 16M+9A (slightly more expensive)1358}1359}13601361// Sets p[i] = p[0]^(2^i) for i in [0,k]. Input is an affine pt not equal to the identity, output is in Jacobian coords.1362void hecurve_g1_AJ_powers (hec_j_t p[], ff_t x0, ff_t y0, int k, ff_t f1)1363{1364ff_t a, b, c, x, y, z, t0, t1, t2;1365register hec_j_t *e;13661367_ff_set(p->x,x0); _ff_set(p->y,y0); _ff_set_one(p->z);1368hecurve_g1_2AJ (p+1, x0, y0, f1);1369for ( e=p+k,p+=2 ; p <= e ; p++ ) hecurve_g1_2J(p,p-1,f1); // 10M + 10A1370}1371#endif137213731374