/*1* jidctflt.c2*3* Copyright (C) 1994-1998, Thomas G. Lane.4* Modified 2010-2017 by Guido Vollbeding.5* This file is part of the Independent JPEG Group's software.6* For conditions of distribution and use, see the accompanying README file.7*8* This file contains a floating-point implementation of the9* inverse DCT (Discrete Cosine Transform). In the IJG code, this routine10* must also perform dequantization of the input coefficients.11*12* This implementation should be more accurate than either of the integer13* IDCT implementations. However, it may not give the same results on all14* machines because of differences in roundoff behavior. Speed will depend15* on the hardware's floating point capacity.16*17* A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT18* on each row (or vice versa, but it's more convenient to emit a row at19* a time). Direct algorithms are also available, but they are much more20* complex and seem not to be any faster when reduced to code.21*22* This implementation is based on Arai, Agui, and Nakajima's algorithm for23* scaled DCT. Their original paper (Trans. IEICE E-71(11):1095) is in24* Japanese, but the algorithm is described in the Pennebaker & Mitchell25* JPEG textbook (see REFERENCES section in file README). The following code26* is based directly on figure 4-8 in P&M.27* While an 8-point DCT cannot be done in less than 11 multiplies, it is28* possible to arrange the computation so that many of the multiplies are29* simple scalings of the final outputs. These multiplies can then be30* folded into the multiplications or divisions by the JPEG quantization31* table entries. The AA&N method leaves only 5 multiplies and 29 adds32* to be done in the DCT itself.33* The primary disadvantage of this method is that with a fixed-point34* implementation, accuracy is lost due to imprecise representation of the35* scaled quantization values. However, that problem does not arise if36* we use floating point arithmetic.37*/3839#define JPEG_INTERNALS40#include "jinclude.h"41#include "jpeglib.h"42#include "jdct.h" /* Private declarations for DCT subsystem */4344#ifdef DCT_FLOAT_SUPPORTED454647/*48* This module is specialized to the case DCTSIZE = 8.49*/5051#if DCTSIZE != 852Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */53#endif545556/* Dequantize a coefficient by multiplying it by the multiplier-table57* entry; produce a float result.58*/5960#define DEQUANTIZE(coef,quantval) (((FAST_FLOAT) (coef)) * (quantval))616263/*64* Perform dequantization and inverse DCT on one block of coefficients.65*66* cK represents cos(K*pi/16).67*/6869GLOBAL(void)70jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,71JCOEFPTR coef_block,72JSAMPARRAY output_buf, JDIMENSION output_col)73{74FAST_FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;75FAST_FLOAT tmp10, tmp11, tmp12, tmp13;76FAST_FLOAT z5, z10, z11, z12, z13;77JCOEFPTR inptr;78FLOAT_MULT_TYPE * quantptr;79FAST_FLOAT * wsptr;80JSAMPROW outptr;81JSAMPLE *range_limit = IDCT_range_limit(cinfo);82int ctr;83FAST_FLOAT workspace[DCTSIZE2]; /* buffers data between passes */8485/* Pass 1: process columns from input, store into work array. */8687inptr = coef_block;88quantptr = (FLOAT_MULT_TYPE *) compptr->dct_table;89wsptr = workspace;90for (ctr = DCTSIZE; ctr > 0; ctr--) {91/* Due to quantization, we will usually find that many of the input92* coefficients are zero, especially the AC terms. We can exploit this93* by short-circuiting the IDCT calculation for any column in which all94* the AC terms are zero. In that case each output is equal to the95* DC coefficient (with scale factor as needed).96* With typical images and quantization tables, half or more of the97* column DCT calculations can be simplified this way.98*/99100if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&101inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&102inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&103inptr[DCTSIZE*7] == 0) {104/* AC terms all zero */105FAST_FLOAT dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);106107wsptr[DCTSIZE*0] = dcval;108wsptr[DCTSIZE*1] = dcval;109wsptr[DCTSIZE*2] = dcval;110wsptr[DCTSIZE*3] = dcval;111wsptr[DCTSIZE*4] = dcval;112wsptr[DCTSIZE*5] = dcval;113wsptr[DCTSIZE*6] = dcval;114wsptr[DCTSIZE*7] = dcval;115116inptr++; /* advance pointers to next column */117quantptr++;118wsptr++;119continue;120}121122/* Even part */123124tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);125tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);126tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);127tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);128129tmp10 = tmp0 + tmp2; /* phase 3 */130tmp11 = tmp0 - tmp2;131132tmp13 = tmp1 + tmp3; /* phases 5-3 */133tmp12 = (tmp1 - tmp3) * ((FAST_FLOAT) 1.414213562) - tmp13; /* 2*c4 */134135tmp0 = tmp10 + tmp13; /* phase 2 */136tmp3 = tmp10 - tmp13;137tmp1 = tmp11 + tmp12;138tmp2 = tmp11 - tmp12;139140/* Odd part */141142tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);143tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);144tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);145tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);146147z13 = tmp6 + tmp5; /* phase 6 */148z10 = tmp6 - tmp5;149z11 = tmp4 + tmp7;150z12 = tmp4 - tmp7;151152tmp7 = z11 + z13; /* phase 5 */153tmp11 = (z11 - z13) * ((FAST_FLOAT) 1.414213562); /* 2*c4 */154155z5 = (z10 + z12) * ((FAST_FLOAT) 1.847759065); /* 2*c2 */156tmp10 = z5 - z12 * ((FAST_FLOAT) 1.082392200); /* 2*(c2-c6) */157tmp12 = z5 - z10 * ((FAST_FLOAT) 2.613125930); /* 2*(c2+c6) */158159tmp6 = tmp12 - tmp7; /* phase 2 */160tmp5 = tmp11 - tmp6;161tmp4 = tmp10 - tmp5;162163wsptr[DCTSIZE*0] = tmp0 + tmp7;164wsptr[DCTSIZE*7] = tmp0 - tmp7;165wsptr[DCTSIZE*1] = tmp1 + tmp6;166wsptr[DCTSIZE*6] = tmp1 - tmp6;167wsptr[DCTSIZE*2] = tmp2 + tmp5;168wsptr[DCTSIZE*5] = tmp2 - tmp5;169wsptr[DCTSIZE*3] = tmp3 + tmp4;170wsptr[DCTSIZE*4] = tmp3 - tmp4;171172inptr++; /* advance pointers to next column */173quantptr++;174wsptr++;175}176177/* Pass 2: process rows from work array, store into output array. */178179wsptr = workspace;180for (ctr = 0; ctr < DCTSIZE; ctr++) {181outptr = output_buf[ctr] + output_col;182/* Rows of zeroes can be exploited in the same way as we did with columns.183* However, the column calculation has created many nonzero AC terms, so184* the simplification applies less often (typically 5% to 10% of the time).185* And testing floats for zero is relatively expensive, so we don't bother.186*/187188/* Even part */189190/* Prepare range-limit and float->int conversion */191z5 = wsptr[0] + (((FAST_FLOAT) RANGE_CENTER) + ((FAST_FLOAT) 0.5));192tmp10 = z5 + wsptr[4];193tmp11 = z5 - wsptr[4];194195tmp13 = wsptr[2] + wsptr[6];196tmp12 = (wsptr[2] - wsptr[6]) *197((FAST_FLOAT) 1.414213562) - tmp13; /* 2*c4 */198199tmp0 = tmp10 + tmp13;200tmp3 = tmp10 - tmp13;201tmp1 = tmp11 + tmp12;202tmp2 = tmp11 - tmp12;203204/* Odd part */205206z13 = wsptr[5] + wsptr[3];207z10 = wsptr[5] - wsptr[3];208z11 = wsptr[1] + wsptr[7];209z12 = wsptr[1] - wsptr[7];210211tmp7 = z11 + z13; /* phase 5 */212tmp11 = (z11 - z13) * ((FAST_FLOAT) 1.414213562); /* 2*c4 */213214z5 = (z10 + z12) * ((FAST_FLOAT) 1.847759065); /* 2*c2 */215tmp10 = z5 - z12 * ((FAST_FLOAT) 1.082392200); /* 2*(c2-c6) */216tmp12 = z5 - z10 * ((FAST_FLOAT) 2.613125930); /* 2*(c2+c6) */217218tmp6 = tmp12 - tmp7; /* phase 2 */219tmp5 = tmp11 - tmp6;220tmp4 = tmp10 - tmp5;221222/* Final output stage: float->int conversion and range-limit */223224outptr[0] = range_limit[(int) (tmp0 + tmp7) & RANGE_MASK];225outptr[7] = range_limit[(int) (tmp0 - tmp7) & RANGE_MASK];226outptr[1] = range_limit[(int) (tmp1 + tmp6) & RANGE_MASK];227outptr[6] = range_limit[(int) (tmp1 - tmp6) & RANGE_MASK];228outptr[2] = range_limit[(int) (tmp2 + tmp5) & RANGE_MASK];229outptr[5] = range_limit[(int) (tmp2 - tmp5) & RANGE_MASK];230outptr[3] = range_limit[(int) (tmp3 + tmp4) & RANGE_MASK];231outptr[4] = range_limit[(int) (tmp3 - tmp4) & RANGE_MASK];232233wsptr += DCTSIZE; /* advance pointer to next row */234}235}236237#endif /* DCT_FLOAT_SUPPORTED */238239240