Path: blob/main/Modules/_decimal/libmpdec/transpose.c
12 views
/*1* Copyright (c) 2008-2020 Stefan Krah. All rights reserved.2*3* Redistribution and use in source and binary forms, with or without4* modification, are permitted provided that the following conditions5* are met:6*7* 1. Redistributions of source code must retain the above copyright8* notice, this list of conditions and the following disclaimer.9*10* 2. Redistributions in binary form must reproduce the above copyright11* notice, this list of conditions and the following disclaimer in the12* documentation and/or other materials provided with the distribution.13*14* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND15* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE16* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE17* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE18* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL19* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS20* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)21* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT22* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY23* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF24* SUCH DAMAGE.25*/262728#include "mpdecimal.h"2930#include <assert.h>31#include <limits.h>32#include <stdio.h>33#include <stdlib.h>34#include <string.h>3536#include "bits.h"37#include "constants.h"38#include "transpose.h"39#include "typearith.h"404142#define BUFSIZE 409643#define SIDE 128444546/* Bignum: The transpose functions are used for very large transforms47in sixstep.c and fourstep.c. */484950/* Definition of the matrix transpose */51void52std_trans(mpd_uint_t dest[], mpd_uint_t src[], mpd_size_t rows, mpd_size_t cols)53{54mpd_size_t idest, isrc;55mpd_size_t r, c;5657for (r = 0; r < rows; r++) {58isrc = r * cols;59idest = r;60for (c = 0; c < cols; c++) {61dest[idest] = src[isrc];62isrc += 1;63idest += rows;64}65}66}6768/*69* Swap half-rows of 2^n * (2*2^n) matrix.70* FORWARD_CYCLE: even/odd permutation of the halfrows.71* BACKWARD_CYCLE: reverse the even/odd permutation.72*/73static int74swap_halfrows_pow2(mpd_uint_t *matrix, mpd_size_t rows, mpd_size_t cols, int dir)75{76mpd_uint_t buf1[BUFSIZE];77mpd_uint_t buf2[BUFSIZE];78mpd_uint_t *readbuf, *writebuf, *hp;79mpd_size_t *done, dbits;80mpd_size_t b = BUFSIZE, stride;81mpd_size_t hn, hmax; /* halfrow number */82mpd_size_t m, r=0;83mpd_size_t offset;84mpd_size_t next;858687assert(cols == mul_size_t(2, rows));8889if (dir == FORWARD_CYCLE) {90r = rows;91}92else if (dir == BACKWARD_CYCLE) {93r = 2;94}95else {96abort(); /* GCOV_NOT_REACHED */97}9899m = cols - 1;100hmax = rows; /* cycles start at odd halfrows */101dbits = 8 * sizeof *done;102if ((done = mpd_calloc(hmax/(sizeof *done) + 1, sizeof *done)) == NULL) {103return 0;104}105106for (hn = 1; hn <= hmax; hn += 2) {107108if (done[hn/dbits] & mpd_bits[hn%dbits]) {109continue;110}111112readbuf = buf1; writebuf = buf2;113114for (offset = 0; offset < cols/2; offset += b) {115116stride = (offset + b < cols/2) ? b : cols/2-offset;117118hp = matrix + hn*cols/2;119memcpy(readbuf, hp+offset, stride*(sizeof *readbuf));120pointerswap(&readbuf, &writebuf);121122next = mulmod_size_t(hn, r, m);123hp = matrix + next*cols/2;124125while (next != hn) {126127memcpy(readbuf, hp+offset, stride*(sizeof *readbuf));128memcpy(hp+offset, writebuf, stride*(sizeof *writebuf));129pointerswap(&readbuf, &writebuf);130131done[next/dbits] |= mpd_bits[next%dbits];132133next = mulmod_size_t(next, r, m);134hp = matrix + next*cols/2;135136}137138memcpy(hp+offset, writebuf, stride*(sizeof *writebuf));139140done[hn/dbits] |= mpd_bits[hn%dbits];141}142}143144mpd_free(done);145return 1;146}147148/* In-place transpose of a square matrix */149static inline void150squaretrans(mpd_uint_t *buf, mpd_size_t cols)151{152mpd_uint_t tmp;153mpd_size_t idest, isrc;154mpd_size_t r, c;155156for (r = 0; r < cols; r++) {157c = r+1;158isrc = r*cols + c;159idest = c*cols + r;160for (c = r+1; c < cols; c++) {161tmp = buf[isrc];162buf[isrc] = buf[idest];163buf[idest] = tmp;164isrc += 1;165idest += cols;166}167}168}169170/*171* Transpose 2^n * 2^n matrix. For cache efficiency, the matrix is split into172* square blocks with side length 'SIDE'. First, the blocks are transposed,173* then a square transposition is done on each individual block.174*/175static void176squaretrans_pow2(mpd_uint_t *matrix, mpd_size_t size)177{178mpd_uint_t buf1[SIDE*SIDE];179mpd_uint_t buf2[SIDE*SIDE];180mpd_uint_t *to, *from;181mpd_size_t b = size;182mpd_size_t r, c;183mpd_size_t i;184185while (b > SIDE) b >>= 1;186187for (r = 0; r < size; r += b) {188189for (c = r; c < size; c += b) {190191from = matrix + r*size + c;192to = buf1;193for (i = 0; i < b; i++) {194memcpy(to, from, b*(sizeof *to));195from += size;196to += b;197}198squaretrans(buf1, b);199200if (r == c) {201to = matrix + r*size + c;202from = buf1;203for (i = 0; i < b; i++) {204memcpy(to, from, b*(sizeof *to));205from += b;206to += size;207}208continue;209}210else {211from = matrix + c*size + r;212to = buf2;213for (i = 0; i < b; i++) {214memcpy(to, from, b*(sizeof *to));215from += size;216to += b;217}218squaretrans(buf2, b);219220to = matrix + c*size + r;221from = buf1;222for (i = 0; i < b; i++) {223memcpy(to, from, b*(sizeof *to));224from += b;225to += size;226}227228to = matrix + r*size + c;229from = buf2;230for (i = 0; i < b; i++) {231memcpy(to, from, b*(sizeof *to));232from += b;233to += size;234}235}236}237}238239}240241/*242* In-place transposition of a 2^n x 2^n or a 2^n x (2*2^n)243* or a (2*2^n) x 2^n matrix.244*/245int246transpose_pow2(mpd_uint_t *matrix, mpd_size_t rows, mpd_size_t cols)247{248mpd_size_t size = mul_size_t(rows, cols);249250assert(ispower2(rows));251assert(ispower2(cols));252253if (cols == rows) {254squaretrans_pow2(matrix, rows);255}256else if (cols == mul_size_t(2, rows)) {257if (!swap_halfrows_pow2(matrix, rows, cols, FORWARD_CYCLE)) {258return 0;259}260squaretrans_pow2(matrix, rows);261squaretrans_pow2(matrix+(size/2), rows);262}263else if (rows == mul_size_t(2, cols)) {264squaretrans_pow2(matrix, cols);265squaretrans_pow2(matrix+(size/2), cols);266if (!swap_halfrows_pow2(matrix, cols, rows, BACKWARD_CYCLE)) {267return 0;268}269}270else {271abort(); /* GCOV_NOT_REACHED */272}273274return 1;275}276277278