/*1* Copyright (c) 2008-2020 Stefan Krah. All rights reserved.2*3* Redistribution and use in source and binary forms, with or without4* modification, are permitted provided that the following conditions5* are met:6*7* 1. Redistributions of source code must retain the above copyright8* notice, this list of conditions and the following disclaimer.9*10* 2. Redistributions in binary form must reproduce the above copyright11* notice, this list of conditions and the following disclaimer in the12* documentation and/or other materials provided with the distribution.13*14* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND15* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE16* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE17* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE18* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL19* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS20* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)21* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT22* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY23* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF24* SUCH DAMAGE.25*/262728#include "mpdecimal.h"2930#include <assert.h>3132#include "constants.h"33#include "fourstep.h"34#include "numbertheory.h"35#include "sixstep.h"36#include "umodarith.h"373839/* Bignum: Cache efficient Matrix Fourier Transform for arrays of the40form 3 * 2**n (See literature/matrix-transform.txt). */414243#ifndef PPRO44static inline void45std_size3_ntt(mpd_uint_t *x1, mpd_uint_t *x2, mpd_uint_t *x3,46mpd_uint_t w3table[3], mpd_uint_t umod)47{48mpd_uint_t r1, r2;49mpd_uint_t w;50mpd_uint_t s, tmp;515253/* k = 0 -> w = 1 */54s = *x1;55s = addmod(s, *x2, umod);56s = addmod(s, *x3, umod);5758r1 = s;5960/* k = 1 */61s = *x1;6263w = w3table[1];64tmp = MULMOD(*x2, w);65s = addmod(s, tmp, umod);6667w = w3table[2];68tmp = MULMOD(*x3, w);69s = addmod(s, tmp, umod);7071r2 = s;7273/* k = 2 */74s = *x1;7576w = w3table[2];77tmp = MULMOD(*x2, w);78s = addmod(s, tmp, umod);7980w = w3table[1];81tmp = MULMOD(*x3, w);82s = addmod(s, tmp, umod);8384*x3 = s;85*x2 = r2;86*x1 = r1;87}88#else /* PPRO */89static inline void90ppro_size3_ntt(mpd_uint_t *x1, mpd_uint_t *x2, mpd_uint_t *x3, mpd_uint_t w3table[3],91mpd_uint_t umod, double *dmod, uint32_t dinvmod[3])92{93mpd_uint_t r1, r2;94mpd_uint_t w;95mpd_uint_t s, tmp;969798/* k = 0 -> w = 1 */99s = *x1;100s = addmod(s, *x2, umod);101s = addmod(s, *x3, umod);102103r1 = s;104105/* k = 1 */106s = *x1;107108w = w3table[1];109tmp = ppro_mulmod(*x2, w, dmod, dinvmod);110s = addmod(s, tmp, umod);111112w = w3table[2];113tmp = ppro_mulmod(*x3, w, dmod, dinvmod);114s = addmod(s, tmp, umod);115116r2 = s;117118/* k = 2 */119s = *x1;120121w = w3table[2];122tmp = ppro_mulmod(*x2, w, dmod, dinvmod);123s = addmod(s, tmp, umod);124125w = w3table[1];126tmp = ppro_mulmod(*x3, w, dmod, dinvmod);127s = addmod(s, tmp, umod);128129*x3 = s;130*x2 = r2;131*x1 = r1;132}133#endif134135136/* forward transform, sign = -1; transform length = 3 * 2**n */137int138four_step_fnt(mpd_uint_t *a, mpd_size_t n, int modnum)139{140mpd_size_t R = 3; /* number of rows */141mpd_size_t C = n / 3; /* number of columns */142mpd_uint_t w3table[3];143mpd_uint_t kernel, w0, w1, wstep;144mpd_uint_t *s, *p0, *p1, *p2;145mpd_uint_t umod;146#ifdef PPRO147double dmod;148uint32_t dinvmod[3];149#endif150mpd_size_t i, k;151152153assert(n >= 48);154assert(n <= 3*MPD_MAXTRANSFORM_2N);155156157/* Length R transform on the columns. */158SETMODULUS(modnum);159_mpd_init_w3table(w3table, -1, modnum);160for (p0=a, p1=p0+C, p2=p0+2*C; p0<a+C; p0++,p1++,p2++) {161162SIZE3_NTT(p0, p1, p2, w3table);163}164165/* Multiply each matrix element (addressed by i*C+k) by r**(i*k). */166kernel = _mpd_getkernel(n, -1, modnum);167for (i = 1; i < R; i++) {168w0 = 1; /* r**(i*0): initial value for k=0 */169w1 = POWMOD(kernel, i); /* r**(i*1): initial value for k=1 */170wstep = MULMOD(w1, w1); /* r**(2*i) */171for (k = 0; k < C-1; k += 2) {172mpd_uint_t x0 = a[i*C+k];173mpd_uint_t x1 = a[i*C+k+1];174MULMOD2(&x0, w0, &x1, w1);175MULMOD2C(&w0, &w1, wstep); /* r**(i*(k+2)) = r**(i*k) * r**(2*i) */176a[i*C+k] = x0;177a[i*C+k+1] = x1;178}179}180181/* Length C transform on the rows. */182for (s = a; s < a+n; s += C) {183if (!six_step_fnt(s, C, modnum)) {184return 0;185}186}187188#if 0189/* An unordered transform is sufficient for convolution. */190/* Transpose the matrix. */191#include "transpose.h"192transpose_3xpow2(a, R, C);193#endif194195return 1;196}197198/* backward transform, sign = 1; transform length = 3 * 2**n */199int200inv_four_step_fnt(mpd_uint_t *a, mpd_size_t n, int modnum)201{202mpd_size_t R = 3; /* number of rows */203mpd_size_t C = n / 3; /* number of columns */204mpd_uint_t w3table[3];205mpd_uint_t kernel, w0, w1, wstep;206mpd_uint_t *s, *p0, *p1, *p2;207mpd_uint_t umod;208#ifdef PPRO209double dmod;210uint32_t dinvmod[3];211#endif212mpd_size_t i, k;213214215assert(n >= 48);216assert(n <= 3*MPD_MAXTRANSFORM_2N);217218219#if 0220/* An unordered transform is sufficient for convolution. */221/* Transpose the matrix, producing an R*C matrix. */222#include "transpose.h"223transpose_3xpow2(a, C, R);224#endif225226/* Length C transform on the rows. */227for (s = a; s < a+n; s += C) {228if (!inv_six_step_fnt(s, C, modnum)) {229return 0;230}231}232233/* Multiply each matrix element (addressed by i*C+k) by r**(i*k). */234SETMODULUS(modnum);235kernel = _mpd_getkernel(n, 1, modnum);236for (i = 1; i < R; i++) {237w0 = 1;238w1 = POWMOD(kernel, i);239wstep = MULMOD(w1, w1);240for (k = 0; k < C; k += 2) {241mpd_uint_t x0 = a[i*C+k];242mpd_uint_t x1 = a[i*C+k+1];243MULMOD2(&x0, w0, &x1, w1);244MULMOD2C(&w0, &w1, wstep);245a[i*C+k] = x0;246a[i*C+k+1] = x1;247}248}249250/* Length R transform on the columns. */251_mpd_init_w3table(w3table, 1, modnum);252for (p0=a, p1=p0+C, p2=p0+2*C; p0<a+C; p0++,p1++,p2++) {253254SIZE3_NTT(p0, p1, p2, w3table);255}256257return 1;258}259260261