/*1* jidctfst.c2*3* Copyright (C) 1994-1998, Thomas G. Lane.4* Modified 2015-2025 by Guido Vollbeding.5* This file is part of the Independent JPEG Group's software.6* For conditions of distribution and use, see the accompanying README file.7*8* This file contains a fast, not so accurate integer implementation of the9* inverse DCT (Discrete Cosine Transform). In the IJG code, this routine10* must also perform dequantization of the input coefficients.11*12* A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT13* on each row (or vice versa, but it's more convenient to emit a row at14* a time). Direct algorithms are also available, but they are much more15* complex and seem not to be any faster when reduced to code.16*17* This implementation is based on Arai, Agui, and Nakajima's algorithm18* for scaled DCT. Their original paper (Trans. IEICE E-71(11):1095) is19* in Japanese, but the algorithm is described in the Pennebaker & Mitchell20* JPEG textbook (see REFERENCES section in file README). The following21* code is based directly on figure 4-8 in P&M.22* While an 8-point DCT cannot be done in less than 11 multiplies, it is23* possible to arrange the computation so that many of the multiplies are24* simple scalings of the final outputs. These multiplies can then be25* folded into the multiplications or divisions by the JPEG quantization26* table entries. The AA&N method leaves only 5 multiplies and 29 adds27* to be done in the DCT itself.28* The primary disadvantage of this method is that with fixed-point math,29* accuracy is lost due to imprecise representation of the scaled30* quantization values. The smaller the quantization table entry,31* the less precise the scaled value, so this implementation does32* worse with high-quality-setting files than with low-quality ones.33*/3435#define JPEG_INTERNALS36#include "jinclude.h"37#include "jpeglib.h"38#include "jdct.h" /* Private declarations for DCT subsystem */3940#ifdef DCT_IFAST_SUPPORTED414243/*44* This module is specialized to the case DCTSIZE = 8.45*/4647#if DCTSIZE != 848Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */49#endif505152/* Scaling decisions are generally the same as in the LL&M algorithm;53* see jidctint.c for more details. However, we choose to descale54* (right shift) multiplication products as soon as they are formed,55* rather than carrying additional fractional bits into subsequent additions.56* This compromises accuracy slightly, but it lets us save a few shifts.57* More importantly, 16-bit arithmetic is then adequate (for up to 10-bit58* data) everywhere except in the multiplications proper;59* this saves a good deal of work on 16-bit-int machines.60*61* The dequantized coefficients are not integers because the AA&N scaling62* factors have been incorporated. We represent them scaled up by PASS1_BITS,63* so that the first and second IDCT rounds have the same input scaling.64* For up to 10-bit data, we choose IFAST_SCALE_BITS = PASS1_BITS so as to65* avoid a descaling shift; this compromises accuracy rather drastically66* for small quantization table entries, but it saves a lot of shifts.67* For higher bit depths, there's no hope of using 16x16 multiplies anyway,68* so we use a much larger scaling factor to preserve accuracy.69*70* A final compromise is to represent the multiplicative constants to only71* 8 fractional bits, rather than 13. This saves some shifting work on some72* machines, and may also reduce the cost of multiplication (since there73* are fewer one-bits in the constants).74*/7576#if JPEG_DATA_PRECISION <= 10 && BITS_IN_JSAMPLE <= 1377#define CONST_BITS 878#define PASS1_BITS (10 - JPEG_DATA_PRECISION)79#define PASS2_BITS (13 - BITS_IN_JSAMPLE)80#else81#if JPEG_DATA_PRECISION <= 13 && BITS_IN_JSAMPLE <= 1682#define CONST_BITS 883#define PASS1_BITS (13 - JPEG_DATA_PRECISION)84#define PASS2_BITS (16 - BITS_IN_JSAMPLE)85#endif86#endif8788/* Some C compilers fail to reduce "FIX(constant)" at compile time,89* thus causing a lot of useless floating-point operations at run time.90* To get around this we use the following pre-calculated constants.91* If you change CONST_BITS you may want to add appropriate values.92* (With a reasonable C compiler, you can just rely on the FIX() macro...)93*/9495#if CONST_BITS == 896#define FIX_1_082392200 ((INT32) 277) /* FIX(1.082392200) */97#define FIX_1_414213562 ((INT32) 362) /* FIX(1.414213562) */98#define FIX_1_847759065 ((INT32) 473) /* FIX(1.847759065) */99#define FIX_2_613125930 ((INT32) 669) /* FIX(2.613125930) */100#else101#define FIX_1_082392200 FIX(1.082392200)102#define FIX_1_414213562 FIX(1.414213562)103#define FIX_1_847759065 FIX(1.847759065)104#define FIX_2_613125930 FIX(2.613125930)105#endif106107108/* We can gain a little more speed, with a further compromise109* in accuracy, by omitting the addition in a descaling shift.110* This yields an incorrectly rounded result half the time...111*/112113#ifndef USE_ACCURATE_ROUNDING114#undef DESCALE115#define DESCALE(x,n) RIGHT_SHIFT(x, n)116#endif117118119/* Multiply a DCTELEM variable by an INT32 constant,120* and immediately descale to yield a DCTELEM result.121*/122123#define MULTIPLY(var,const) ((DCTELEM) DESCALE((var) * (const), CONST_BITS))124125126/* Dequantize a coefficient by multiplying it by the multiplier-table127* entry; produce a DCTELEM result. For up to 10-bit data a 16x16->16128* multiplication will do. For higher bit depths, the multiplier table129* is declared INT32, so a 32-bit multiply will be used.130*/131132#if JPEG_DATA_PRECISION <= 10 && BITS_IN_JSAMPLE <= 13133#define DEQUANTIZE(coef,quantval) (((IFAST_MULT_TYPE) (coef)) * (quantval))134#else135#define DEQUANTIZE(coef,quantval) \136DESCALE((coef)*(quantval), IFAST_SCALE_BITS-PASS1_BITS)137#endif138139140/* Final output conversion: scale down and range-limit. */141142#if PASS2_BITS > 0143#define FINAL_OUTPUT(x) \144range_limit[(int) IRIGHT_SHIFT(x, PASS2_BITS) & RANGE_MASK]145#else146#define FINAL_OUTPUT(x) range_limit[(int) (x) & RANGE_MASK]147#endif148149150/*151* Perform dequantization and inverse DCT on one block of coefficients.152*153* cK represents cos(K*pi/16).154*/155156GLOBAL(void)157jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,158JCOEFPTR coef_block,159JSAMPARRAY output_buf, JDIMENSION output_col)160{161DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;162DCTELEM tmp10, tmp11, tmp12, tmp13;163DCTELEM z5, z10, z11, z12, z13;164JCOEFPTR inptr;165IFAST_MULT_TYPE * quantptr;166int * wsptr;167JSAMPROW outptr;168JSAMPLE *range_limit = IDCT_range_limit(cinfo);169int ctr;170int workspace[DCTSIZE2]; /* buffers data between passes */171SHIFT_TEMPS /* for DESCALE */172ISHIFT_TEMPS /* for IRIGHT_SHIFT */173174/* Pass 1: process columns from input, store into work array. */175176inptr = coef_block;177quantptr = (IFAST_MULT_TYPE *) compptr->dct_table;178wsptr = workspace;179for (ctr = DCTSIZE; ctr > 0; ctr--) {180/* Due to quantization, we will usually find that many of the input181* coefficients are zero, especially the AC terms. We can exploit this182* by short-circuiting the IDCT calculation for any column in which all183* the AC terms are zero. In that case each output is equal to the184* DC coefficient (with scale factor as needed).185* With typical images and quantization tables, half or more of the186* column DCT calculations can be simplified this way.187*/188189if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&190inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&191inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&192inptr[DCTSIZE*7] == 0) {193/* AC terms all zero */194int dcval = (int) DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);195196wsptr[DCTSIZE*0] = dcval;197wsptr[DCTSIZE*1] = dcval;198wsptr[DCTSIZE*2] = dcval;199wsptr[DCTSIZE*3] = dcval;200wsptr[DCTSIZE*4] = dcval;201wsptr[DCTSIZE*5] = dcval;202wsptr[DCTSIZE*6] = dcval;203wsptr[DCTSIZE*7] = dcval;204205inptr++; /* advance pointers to next column */206quantptr++;207wsptr++;208continue;209}210211/* Even part */212213tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);214tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);215tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);216tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);217218tmp10 = tmp0 + tmp2; /* phase 3 */219tmp11 = tmp0 - tmp2;220221tmp13 = tmp1 + tmp3; /* phases 5-3 */222tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */223224tmp0 = tmp10 + tmp13; /* phase 2 */225tmp3 = tmp10 - tmp13;226tmp1 = tmp11 + tmp12;227tmp2 = tmp11 - tmp12;228229/* Odd part */230231tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);232tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);233tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);234tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);235236z13 = tmp6 + tmp5; /* phase 6 */237z10 = tmp6 - tmp5;238z11 = tmp4 + tmp7;239z12 = tmp4 - tmp7;240241tmp7 = z11 + z13; /* phase 5 */242tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */243244z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */245tmp10 = z5 - MULTIPLY(z12, FIX_1_082392200); /* 2*(c2-c6) */246tmp12 = z5 - MULTIPLY(z10, FIX_2_613125930); /* 2*(c2+c6) */247248tmp6 = tmp12 - tmp7; /* phase 2 */249tmp5 = tmp11 - tmp6;250tmp4 = tmp10 - tmp5;251252wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);253wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);254wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);255wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);256wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5);257wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);258wsptr[DCTSIZE*3] = (int) (tmp3 + tmp4);259wsptr[DCTSIZE*4] = (int) (tmp3 - tmp4);260261inptr++; /* advance pointers to next column */262quantptr++;263wsptr++;264}265266/* Pass 2: process rows from work array, store into output array.267* Note that we must descale the results by a factor of 8 == 2**3,268* which is folded into the PASS2_BITS value.269*/270271wsptr = workspace;272for (ctr = 0; ctr < DCTSIZE; ctr++) {273outptr = output_buf[ctr] + output_col;274275/* Add range center and fudge factor for final descale and range-limit. */276#if PASS2_BITS > 1277z5 = (DCTELEM) wsptr[0] +278((((DCTELEM) RANGE_CENTER) << PASS2_BITS) + (1 << (PASS2_BITS-1)));279#else280#if PASS2_BITS > 0281z5 = (DCTELEM) wsptr[0] + ((((DCTELEM) RANGE_CENTER) << 1) + 1);282#else283z5 = (DCTELEM) wsptr[0] + (DCTELEM) RANGE_CENTER;284#endif285#endif286287/* Rows of zeroes can be exploited in the same way as we did with columns.288* However, the column calculation has created many nonzero AC terms, so289* the simplification applies less often (typically 5% to 10% of the time).290* On machines with very fast multiplication, it's possible that the291* test takes more time than it's worth. In that case this section292* may be commented out.293*/294295#ifndef NO_ZERO_ROW_TEST296if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&297wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {298/* AC terms all zero */299JSAMPLE dcval = FINAL_OUTPUT(z5);300301outptr[0] = dcval;302outptr[1] = dcval;303outptr[2] = dcval;304outptr[3] = dcval;305outptr[4] = dcval;306outptr[5] = dcval;307outptr[6] = dcval;308outptr[7] = dcval;309310wsptr += DCTSIZE; /* advance pointer to next row */311continue;312}313#endif314315/* Even part */316317tmp10 = z5 + (DCTELEM) wsptr[4];318tmp11 = z5 - (DCTELEM) wsptr[4];319320tmp13 = (DCTELEM) wsptr[2] + (DCTELEM) wsptr[6];321tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6],322FIX_1_414213562) - tmp13; /* 2*c4 */323324tmp0 = tmp10 + tmp13;325tmp3 = tmp10 - tmp13;326tmp1 = tmp11 + tmp12;327tmp2 = tmp11 - tmp12;328329/* Odd part */330331z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];332z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];333z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];334z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];335336tmp7 = z11 + z13; /* phase 5 */337tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */338339z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */340tmp10 = z5 - MULTIPLY(z12, FIX_1_082392200); /* 2*(c2-c6) */341tmp12 = z5 - MULTIPLY(z10, FIX_2_613125930); /* 2*(c2+c6) */342343tmp6 = tmp12 - tmp7; /* phase 2 */344tmp5 = tmp11 - tmp6;345tmp4 = tmp10 - tmp5;346347/* Final output stage: scale down and range-limit */348349outptr[0] = FINAL_OUTPUT(tmp0 + tmp7);350outptr[7] = FINAL_OUTPUT(tmp0 - tmp7);351outptr[1] = FINAL_OUTPUT(tmp1 + tmp6);352outptr[6] = FINAL_OUTPUT(tmp1 - tmp6);353outptr[2] = FINAL_OUTPUT(tmp2 + tmp5);354outptr[5] = FINAL_OUTPUT(tmp2 - tmp5);355outptr[3] = FINAL_OUTPUT(tmp3 + tmp4);356outptr[4] = FINAL_OUTPUT(tmp3 - tmp4);357358wsptr += DCTSIZE; /* advance pointer to next row */359}360}361362#endif /* DCT_IFAST_SUPPORTED */363364365