/*
 * CoCalc -- jfdctint.c
 *
 * GitHub Repository: wine-mirror/wine
 * Path: blob/master/libs/jpeg/jfdctint.c
 * 8693 views
 */
/*
 * jfdctint.c
 *
 * Copyright (C) 1991-1996, Thomas G. Lane.
 * Modification developed 2003-2026 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
 * This file contains a slow-but-accurate integer implementation of the
 * forward DCT (Discrete Cosine Transform).
 *
 * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
 * on each column.  Direct algorithms are also available, but they are
 * much more complex and seem not to be any faster when reduced to code.
 *
 * This implementation is based on an algorithm described in
 *   C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
 *   Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
 *   Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
 * The primary algorithm described there uses 11 multiplies and 29 adds.
 * We use their alternate method with 12 multiplies and 32 adds.
 * The advantage of this method is that no data path contains more than one
 * multiplication; this allows a very simple and accurate implementation in
 * scaled fixed-point arithmetic, with a minimal number of shifts.
 *
 * We also provide FDCT routines with various input sample block sizes for
 * direct resolution reduction or enlargement and for direct resolving the
 * common 2x1 and 1x2 subsampling cases without additional resampling: NxN
 * (N=1...16), 2NxN, and Nx2N (N=1...8) samples for one 8x8 output DCT block.
 *
 * For N<8 we fill the remaining block coefficients with zero.
 * For N>8 we apply a partial N-point FDCT on the input samples, computing
 * just the lower 8 frequency coefficients and discarding the rest.
 *
 * We must scale the output coefficients of the N-point FDCT appropriately
 * to the standard 8-point FDCT level by 8/N per 1-D pass.  This scaling
 * is folded into the constant multipliers (pass 2) and/or final/initial
 * shifting.
 *
 * CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases
 * since there would be too many additional constants to pre-calculate.
 */

#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
#include "jdct.h"		/* Private declarations for DCT subsystem */

/* Everything from here on is compiled only when DCT_ISLOW_SUPPORTED
 * is defined.
 */
#ifdef DCT_ISLOW_SUPPORTED


/*
 * This module is specialized to the case DCTSIZE = 8.
 */

#if DCTSIZE != 8
  Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */
#endif


/*
 * The poop on this scaling stuff is as follows:
 *
 * Each 1-D DCT step produces outputs which are a factor of sqrt(N)
 * larger than the true DCT outputs.  The final outputs are therefore
 * a factor of N larger than desired; since N=8 this can be cured by
 * a simple right shift at the end of the algorithm.  The advantage of
 * this arrangement is that we save two multiplications per 1-D DCT,
 * because the y0 and y4 outputs need not be divided by sqrt(N).
 * In the IJG code, this factor of 8 is removed by the quantization step
 * (in jcdctmgr.c), NOT in this module.
 *
 * We have to do addition and subtraction of the integer inputs, which
 * is no problem, and multiplication by fractional constants, which is
 * a problem to do in integer arithmetic.  We multiply all the constants
 * by CONST_SCALE and convert them to integer constants (thus retaining
 * CONST_BITS bits of precision in the constants).  After doing a
 * multiplication we have to divide the product by CONST_SCALE, with
 * proper rounding, to produce the correct output.  This division can
 * be done cheaply as a right shift of CONST_BITS bits.  We postpone
 * shifting as long as possible so that partial sums can be added
 * together with full fractional precision.
 *
 * The outputs of the first pass are scaled up by PASS1_BITS bits so that
 * they are represented to better-than-integral precision.  These outputs
 * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit
 * word with the recommended scaling.  (For higher bit depths, the
 * intermediate array is INT32 anyway.)
 *
 * To avoid overflow of the 32-bit intermediate results in pass 2, we
 * must have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error
 * analysis shows that the values given below are the most effective.
 */

/* Fixed-point scaling parameters, chosen from the sample depth
 * (BITS_IN_JSAMPLE) and the coefficient precision (JPEG_DATA_PRECISION):
 *   CONST_BITS - shift count used when descaling products of the
 *                FIX_xxx constants;
 *   PASS1_BITS - number of bits by which pass 1 output is scaled up;
 *   PASS2_BITS - number of bits by which pass 2 output is scaled down.
 * Nothing is defined here when either precision exceeds 13 bits.
 */
#if BITS_IN_JSAMPLE <= 10 && JPEG_DATA_PRECISION <= 10
#define CONST_BITS  13
#define PASS1_BITS  (10 - BITS_IN_JSAMPLE)
#define PASS2_BITS  (10 - JPEG_DATA_PRECISION)
#else
#if BITS_IN_JSAMPLE <= 13 && JPEG_DATA_PRECISION <= 13
#define CONST_BITS  13
#define PASS1_BITS  (13 - BITS_IN_JSAMPLE)
#define PASS2_BITS  (13 - JPEG_DATA_PRECISION)
#endif
#endif

/* Some C compilers fail to reduce "FIX(constant)" at compile time,
 * thus causing a lot of useless floating-point operations at run time.
 * To get around this we use the following pre-calculated constants.
 * If you change CONST_BITS you may want to add appropriate values.
 * (With a reasonable C compiler, you can just rely on the FIX() macro...)
 */

#if CONST_BITS == 13
#define FIX_0_298631336  ((INT32)  2446)	/* FIX(0.298631336) */
#define FIX_0_390180644  ((INT32)  3196)	/* FIX(0.390180644) */
#define FIX_0_541196100  ((INT32)  4433)	/* FIX(0.541196100) */
#define FIX_0_765366865  ((INT32)  6270)	/* FIX(0.765366865) */
#define FIX_0_899976223  ((INT32)  7373)	/* FIX(0.899976223) */
#define FIX_1_175875602  ((INT32)  9633)	/* FIX(1.175875602) */
#define FIX_1_501321110  ((INT32)  12299)	/* FIX(1.501321110) */
#define FIX_1_847759065  ((INT32)  15137)	/* FIX(1.847759065) */
#define FIX_1_961570560  ((INT32)  16069)	/* FIX(1.961570560) */
#define FIX_2_053119869  ((INT32)  16819)	/* FIX(2.053119869) */
#define FIX_2_562915447  ((INT32)  20995)	/* FIX(2.562915447) */
#define FIX_3_072711026  ((INT32)  25172)	/* FIX(3.072711026) */
#else
/* Any other CONST_BITS: let the FIX() macro compute the constants. */
#define FIX_0_298631336  FIX(0.298631336)
#define FIX_0_390180644  FIX(0.390180644)
#define FIX_0_541196100  FIX(0.541196100)
#define FIX_0_765366865  FIX(0.765366865)
#define FIX_0_899976223  FIX(0.899976223)
#define FIX_1_175875602  FIX(1.175875602)
#define FIX_1_501321110  FIX(1.501321110)
#define FIX_1_847759065  FIX(1.847759065)
#define FIX_1_961570560  FIX(1.961570560)
#define FIX_2_053119869  FIX(2.053119869)
#define FIX_2_562915447  FIX(2.562915447)
#define FIX_3_072711026  FIX(3.072711026)
#endif


/* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
 * For up to 10-bit data with the recommended scaling, all the variable
 * and constant values involved are no more than 16 bits wide, so a
 * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
 * For higher bit depths, a full 32-bit multiplication will be needed.
 */

#if BITS_IN_JSAMPLE <= 10 && JPEG_DATA_PRECISION <= 10
#define MULTIPLY(var,const)  MULTIPLY16C16(var,const)
#else
#define MULTIPLY(var,const)  ((var) * (const))
#endif


/* Pass 1 output: smart scale up.
 * The value is scaled by 2**PASS1_BITS and converted to DCTELEM.
 * The scaling is written as a multiplication because the argument can be
 * negative (e.g. after the unsigned->signed conversion of the DC term),
 * and left-shifting a negative value is undefined behavior in C.
 */

#if PASS1_BITS > 0
#define PASS1_OUTPUT(x)  (DCTELEM) ((x) * (ONE << PASS1_BITS))
#else
#define PASS1_OUTPUT(x)  (DCTELEM) (x)
#endif


/* Pass 2 output: smart scale down.
 * The value is shifted right by PASS2_BITS and converted to DCTELEM.
 * No rounding is done here; callers add the rounding term to the
 * argument beforehand.
 */

#if PASS2_BITS > 0
#define PASS2_OUTPUT(x)  (DCTELEM) RIGHT_SHIFT(x, PASS2_BITS)
#else
#define PASS2_OUTPUT(x)  (DCTELEM) (x)
#endif


175
/*
176
 * Perform the forward DCT on one block of samples.
177
 */
178

179
GLOBAL(void)
180
jpeg_fdct_islow (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
181
{
182
  INT32 tmp0, tmp1, tmp2, tmp3;
183
  INT32 tmp10, tmp11, tmp12, tmp13;
184
  INT32 z1;
185
  DCTELEM *dataptr;
186
  JSAMPROW elemptr;
187
  int ctr;
188
  SHIFT_TEMPS
189

190
  /* Pass 1: process rows.
191
   * Note results are scaled up by sqrt(8) compared to a true DCT;
192
   * furthermore, we scale the results by 2**PASS1_BITS.
193
   * cK represents sqrt(2) * cos(K*pi/16).
194
   */
195

196
  dataptr = data;
197
  for (ctr = 0; ctr < DCTSIZE; ctr++) {
198
    elemptr = sample_data[ctr] + start_col;
199

200
    /* Even part per LL&M figure 1 --- note that published figure is faulty;
201
     * rotator "c1" should be "c6".
202
     */
203

204
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
205
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
206
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
207
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
208

209
    tmp10 = tmp0 + tmp3;
210
    tmp12 = tmp0 - tmp3;
211
    tmp11 = tmp1 + tmp2;
212
    tmp13 = tmp1 - tmp2;
213

214
    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
215
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
216
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
217
    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
218

219
    /* Apply unsigned->signed conversion. */
220
    dataptr[0] = PASS1_OUTPUT(tmp10 + tmp11 - 8 * CENTERJSAMPLE);
221
    dataptr[4] = PASS1_OUTPUT(tmp10 - tmp11);
222

223
    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);       /* c6 */
224
    /* Add fudge factor here for final descale. */
225
    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
226

227
    dataptr[2] = (DCTELEM)
228
      RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
229
		  CONST_BITS-PASS1_BITS);
230
    dataptr[6] = (DCTELEM)
231
      RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
232
		  CONST_BITS-PASS1_BITS);
233

234
    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
235
     * i0..i3 in the paper are tmp0..tmp3 here.
236
     */
237

238
    tmp12 = tmp0 + tmp2;
239
    tmp13 = tmp1 + tmp3;
240

241
    z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602);       /*  c3 */
242
    /* Add fudge factor here for final descale. */
243
    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
244

245
    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);          /* -c3+c5 */
246
    tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);          /* -c3-c5 */
247
    tmp12 += z1;
248
    tmp13 += z1;
249

250
    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223);       /* -c3+c7 */
251
    tmp0 = MULTIPLY(tmp0, FIX_1_501321110);              /*  c1+c3-c5-c7 */
252
    tmp3 = MULTIPLY(tmp3, FIX_0_298631336);              /* -c1+c3+c5-c7 */
253
    tmp0 += z1 + tmp12;
254
    tmp3 += z1 + tmp13;
255

256
    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447);       /* -c1-c3 */
257
    tmp1 = MULTIPLY(tmp1, FIX_3_072711026);              /*  c1+c3+c5-c7 */
258
    tmp2 = MULTIPLY(tmp2, FIX_2_053119869);              /*  c1+c3-c5+c7 */
259
    tmp1 += z1 + tmp13;
260
    tmp2 += z1 + tmp12;
261

262
    dataptr[1] = (DCTELEM) RIGHT_SHIFT(tmp0, CONST_BITS-PASS1_BITS);
263
    dataptr[3] = (DCTELEM) RIGHT_SHIFT(tmp1, CONST_BITS-PASS1_BITS);
264
    dataptr[5] = (DCTELEM) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS);
265
    dataptr[7] = (DCTELEM) RIGHT_SHIFT(tmp3, CONST_BITS-PASS1_BITS);
266

267
    dataptr += DCTSIZE;		/* advance pointer to next row */
268
  }
269

270
  /* Pass 2: process columns.
271
   * We apply the PASS2_BITS scaling, but leave the
272
   * results scaled up by an overall factor of 8.
273
   * cK represents sqrt(2) * cos(K*pi/16).
274
   */
275

276
  dataptr = data;
277
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
278
    /* Even part per LL&M figure 1 --- note that published figure is faulty;
279
     * rotator "c1" should be "c6".
280
     */
281

282
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
283
    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
284
    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
285
    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
286

287
    /* Add fudge factor here for final descale. */
288
#if PASS2_BITS > 1
289
    tmp10 = tmp0 + tmp3 + (ONE << (PASS2_BITS-1));
290
#else
291
#if PASS2_BITS > 0
292
    tmp10 = tmp0 + tmp3 + ONE;
293
#else
294
    tmp10 = tmp0 + tmp3;
295
#endif
296
#endif
297
    tmp12 = tmp0 - tmp3;
298
    tmp11 = tmp1 + tmp2;
299
    tmp13 = tmp1 - tmp2;
300

301
    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
302
    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
303
    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
304
    tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
305

306
    dataptr[DCTSIZE*0] = PASS2_OUTPUT(tmp10 + tmp11);
307
    dataptr[DCTSIZE*4] = PASS2_OUTPUT(tmp10 - tmp11);
308

309
    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);       /* c6 */
310
    /* Add fudge factor here for final descale. */
311
    z1 += ONE << (CONST_BITS+PASS2_BITS-1);
312

313
    dataptr[DCTSIZE*2] = (DCTELEM)
314
      RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
315
		  CONST_BITS+PASS2_BITS);
316
    dataptr[DCTSIZE*6] = (DCTELEM)
317
      RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
318
		  CONST_BITS+PASS2_BITS);
319

320
    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
321
     * i0..i3 in the paper are tmp0..tmp3 here.
322
     */
323

324
    tmp12 = tmp0 + tmp2;
325
    tmp13 = tmp1 + tmp3;
326

327
    z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602);       /*  c3 */
328
    /* Add fudge factor here for final descale. */
329
    z1 += ONE << (CONST_BITS+PASS2_BITS-1);
330

331
    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);          /* -c3+c5 */
332
    tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);          /* -c3-c5 */
333
    tmp12 += z1;
334
    tmp13 += z1;
335

336
    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223);       /* -c3+c7 */
337
    tmp0 = MULTIPLY(tmp0, FIX_1_501321110);              /*  c1+c3-c5-c7 */
338
    tmp3 = MULTIPLY(tmp3, FIX_0_298631336);              /* -c1+c3+c5-c7 */
339
    tmp0 += z1 + tmp12;
340
    tmp3 += z1 + tmp13;
341

342
    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447);       /* -c1-c3 */
343
    tmp1 = MULTIPLY(tmp1, FIX_3_072711026);              /*  c1+c3+c5-c7 */
344
    tmp2 = MULTIPLY(tmp2, FIX_2_053119869);              /*  c1+c3-c5+c7 */
345
    tmp1 += z1 + tmp13;
346
    tmp2 += z1 + tmp12;
347

348
    dataptr[DCTSIZE*1] = (DCTELEM) RIGHT_SHIFT(tmp0, CONST_BITS+PASS2_BITS);
349
    dataptr[DCTSIZE*3] = (DCTELEM) RIGHT_SHIFT(tmp1, CONST_BITS+PASS2_BITS);
350
    dataptr[DCTSIZE*5] = (DCTELEM) RIGHT_SHIFT(tmp2, CONST_BITS+PASS2_BITS);
351
    dataptr[DCTSIZE*7] = (DCTELEM) RIGHT_SHIFT(tmp3, CONST_BITS+PASS2_BITS);
352

353
    dataptr++;			/* advance pointer to next column */
354
  }
355
}

/* The scaled-size FDCT routines that follow are compiled only when
 * DCT_SCALING_SUPPORTED is defined.
 */
#ifdef DCT_SCALING_SUPPORTED


360
/*
361
 * Perform the forward DCT on a 7x7 sample block.
362
 */
363

364
GLOBAL(void)
365
jpeg_fdct_7x7 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
366
{
367
  INT32 tmp0, tmp1, tmp2, tmp3;
368
  INT32 tmp10, tmp11, tmp12;
369
  INT32 z1, z2, z3;
370
  DCTELEM *dataptr;
371
  JSAMPROW elemptr;
372
  int ctr;
373
  SHIFT_TEMPS
374

375
  /* Pre-zero output coefficient block. */
376
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
377

378
  /* Pass 1: process rows.
379
   * Note results are scaled up by sqrt(8) compared to a true DCT;
380
   * furthermore, we scale the results by 2**PASS1_BITS.
381
   * cK represents sqrt(2) * cos(K*pi/14).
382
   */
383

384
  dataptr = data;
385
  for (ctr = 0; ctr < 7; ctr++) {
386
    elemptr = sample_data[ctr] + start_col;
387

388
    /* Even part */
389

390
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]);
391
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]);
392
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]);
393
    tmp3 = GETJSAMPLE(elemptr[3]);
394

395
    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]);
396
    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]);
397
    tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]);
398

399
    z1 = tmp0 + tmp2;
400
    /* Apply unsigned->signed conversion. */
401
    dataptr[0] = PASS1_OUTPUT(z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE);
402
    tmp3 += tmp3;
403
    z1 -= tmp3;
404
    z1 -= tmp3;
405
    z1 = MULTIPLY(z1, FIX(0.353553391));                /* (c2+c6-c4)/2 */
406
    z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002));       /* (c2+c4-c6)/2 */
407
    z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123));       /* c6 */
408
    dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS);
409
    z1 -= z2;
410
    z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734));       /* c4 */
411
    dataptr[4] = (DCTELEM)
412
      DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */
413
	      CONST_BITS-PASS1_BITS);
414
    dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS);
415

416
    /* Odd part */
417

418
    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347));   /* (c3+c1-c5)/2 */
419
    tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339));   /* (c3+c5-c1)/2 */
420
    tmp0 = tmp1 - tmp2;
421
    tmp1 += tmp2;
422
    tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.378756276)); /* -c1 */
423
    tmp1 += tmp2;
424
    tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.613604268));   /* c5 */
425
    tmp0 += tmp3;
426
    tmp2 += tmp3 + MULTIPLY(tmp12, FIX(1.870828693));   /* c3+c1-c5 */
427

428
    dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS);
429
    dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS);
430
    dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS);
431

432
    dataptr += DCTSIZE;		/* advance pointer to next row */
433
  }
434

435
  /* Pass 2: process columns.
436
   * We apply the PASS2_BITS scaling, but leave the
437
   * results scaled up by an overall factor of 8.
438
   * We must also scale the output by (8/7)**2 = 64/49,
439
   * which we fold into the constant multipliers:
440
   * cK now represents sqrt(2) * cos(K*pi/14) * 64/49.
441
   */
442

443
  dataptr = data;
444
  for (ctr = 0; ctr < 7; ctr++) {
445
    /* Even part */
446

447
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*6];
448
    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*5];
449
    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*4];
450
    tmp3 = dataptr[DCTSIZE*3];
451

452
    tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*6];
453
    tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*5];
454
    tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*4];
455

456
    z1 = tmp0 + tmp2;
457
    dataptr[DCTSIZE*0] = (DCTELEM)
458
      DESCALE(MULTIPLY(z1 + tmp1 + tmp3, FIX(1.306122449)), /* 64/49 */
459
	      CONST_BITS+PASS2_BITS);
460
    tmp3 += tmp3;
461
    z1 -= tmp3;
462
    z1 -= tmp3;
463
    z1 = MULTIPLY(z1, FIX(0.461784020));                /* (c2+c6-c4)/2 */
464
    z2 = MULTIPLY(tmp0 - tmp2, FIX(1.202428084));       /* (c2+c4-c6)/2 */
465
    z3 = MULTIPLY(tmp1 - tmp2, FIX(0.411026446));       /* c6 */
466
    dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS+PASS2_BITS);
467
    z1 -= z2;
468
    z2 = MULTIPLY(tmp0 - tmp1, FIX(1.151670509));       /* c4 */
469
    dataptr[DCTSIZE*4] = (DCTELEM)
470
      DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.923568041)), /* c2+c6-c4 */
471
	      CONST_BITS+PASS2_BITS);
472
    dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+PASS2_BITS);
473

474
    /* Odd part */
475

476
    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.221765677));   /* (c3+c1-c5)/2 */
477
    tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.222383464));   /* (c3+c5-c1)/2 */
478
    tmp0 = tmp1 - tmp2;
479
    tmp1 += tmp2;
480
    tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.800824523)); /* -c1 */
481
    tmp1 += tmp2;
482
    tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.801442310));   /* c5 */
483
    tmp0 += tmp3;
484
    tmp2 += tmp3 + MULTIPLY(tmp12, FIX(2.443531355));   /* c3+c1-c5 */
485

486
    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS2_BITS);
487
    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS2_BITS);
488
    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS2_BITS);
489

490
    dataptr++;			/* advance pointer to next column */
491
  }
492
}
493

494

495
/*
496
 * Perform the forward DCT on a 6x6 sample block.
497
 */
498

499
GLOBAL(void)
500
jpeg_fdct_6x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
501
{
502
  INT32 tmp0, tmp1, tmp2;
503
  INT32 tmp10, tmp11, tmp12;
504
  DCTELEM *dataptr;
505
  JSAMPROW elemptr;
506
  int ctr;
507
  SHIFT_TEMPS
508

509
  /* Pre-zero output coefficient block. */
510
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
511

512
  /* Pass 1: process rows.
513
   * Note results are scaled up by sqrt(8) compared to a true DCT;
514
   * furthermore, we scale the results by 2**PASS1_BITS.
515
   * cK represents sqrt(2) * cos(K*pi/12).
516
   */
517

518
  dataptr = data;
519
  for (ctr = 0; ctr < 6; ctr++) {
520
    elemptr = sample_data[ctr] + start_col;
521

522
    /* Even part */
523

524
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
525
    tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
526
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
527

528
    tmp10 = tmp0 + tmp2;
529
    tmp12 = tmp0 - tmp2;
530

531
    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
532
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
533
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
534

535
    /* Apply unsigned->signed conversion. */
536
    dataptr[0] = PASS1_OUTPUT(tmp10 + tmp11 - 6 * CENTERJSAMPLE);
537
    dataptr[2] = (DCTELEM)
538
      DESCALE(MULTIPLY(tmp12, FIX(1.224744871)),                 /* c2 */
539
	      CONST_BITS-PASS1_BITS);
540
    dataptr[4] = (DCTELEM)
541
      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
542
	      CONST_BITS-PASS1_BITS);
543

544
    /* Odd part */
545

546
    tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)),     /* c5 */
547
		    CONST_BITS-PASS1_BITS);
548

549
#if PASS1_BITS > 0
550
    dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS));
551
    dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS);
552
    dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS));
553
#else
554
    dataptr[1] = (DCTELEM) (tmp10 + tmp0 + tmp1);
555
    dataptr[3] = (DCTELEM) (tmp0 - tmp1 - tmp2);
556
    dataptr[5] = (DCTELEM) (tmp10 + tmp2 - tmp1);
557
#endif
558

559
    dataptr += DCTSIZE;		/* advance pointer to next row */
560
  }
561

562
  /* Pass 2: process columns.
563
   * We apply the PASS2_BITS scaling, but leave the
564
   * results scaled up by an overall factor of 8.
565
   * We must also scale the output by (8/6)**2 = 16/9,
566
   * which we fold into the constant multipliers:
567
   * cK now represents sqrt(2) * cos(K*pi/12) * 16/9.
568
   */
569

570
  dataptr = data;
571
  for (ctr = 0; ctr < 6; ctr++) {
572
    /* Even part */
573

574
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5];
575
    tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4];
576
    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];
577

578
    tmp10 = tmp0 + tmp2;
579
    tmp12 = tmp0 - tmp2;
580

581
    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5];
582
    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4];
583
    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];
584

585
    dataptr[DCTSIZE*0] = (DCTELEM)
586
      DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)),         /* 16/9 */
587
	      CONST_BITS+PASS2_BITS);
588
    dataptr[DCTSIZE*2] = (DCTELEM)
589
      DESCALE(MULTIPLY(tmp12, FIX(2.177324216)),                 /* c2 */
590
	      CONST_BITS+PASS2_BITS);
591
    dataptr[DCTSIZE*4] = (DCTELEM)
592
      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
593
	      CONST_BITS+PASS2_BITS);
594

595
    /* Odd part */
596

597
    tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829));             /* c5 */
598

599
    dataptr[DCTSIZE*1] = (DCTELEM)
600
      DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)),   /* 16/9 */
601
	      CONST_BITS+PASS2_BITS);
602
    dataptr[DCTSIZE*3] = (DCTELEM)
603
      DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)),    /* 16/9 */
604
	      CONST_BITS+PASS2_BITS);
605
    dataptr[DCTSIZE*5] = (DCTELEM)
606
      DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)),   /* 16/9 */
607
	      CONST_BITS+PASS2_BITS);
608

609
    dataptr++;			/* advance pointer to next column */
610
  }
611
}
612

613

614
/*
615
 * Perform the forward DCT on a 5x5 sample block.
616
 */
617

618
GLOBAL(void)
619
jpeg_fdct_5x5 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
620
{
621
  INT32 tmp0, tmp1, tmp2;
622
  INT32 tmp10, tmp11;
623
  DCTELEM *dataptr;
624
  JSAMPROW elemptr;
625
  int ctr;
626
  SHIFT_TEMPS
627

628
  /* Pre-zero output coefficient block. */
629
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
630

631
  /* Pass 1: process rows.
632
   * Note results are scaled up by sqrt(8) compared to a true DCT;
633
   * furthermore, we scale the results by 2**PASS1_BITS.
634
   * We scale the results further by 2 as part of output adaption
635
   * scaling for different DCT size.
636
   * cK represents sqrt(2) * cos(K*pi/10).
637
   */
638

639
  dataptr = data;
640
  for (ctr = 0; ctr < 5; ctr++) {
641
    elemptr = sample_data[ctr] + start_col;
642

643
    /* Even part */
644

645
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]);
646
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]);
647
    tmp2 = GETJSAMPLE(elemptr[2]);
648

649
    tmp10 = tmp0 + tmp1;
650
    tmp11 = tmp0 - tmp1;
651

652
    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]);
653
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]);
654

655
    /* Apply unsigned->signed conversion. */
656
    dataptr[0] = (DCTELEM)
657
      ((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << (PASS1_BITS+1));
658
    tmp11 = MULTIPLY(tmp11, FIX(0.790569415));          /* (c2+c4)/2 */
659
    tmp10 -= tmp2 << 2;
660
    tmp10 = MULTIPLY(tmp10, FIX(0.353553391));          /* (c2-c4)/2 */
661
    dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS-1);
662
    dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS-1);
663

664
    /* Odd part */
665

666
    tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876));    /* c3 */
667

668
    dataptr[1] = (DCTELEM)
669
      DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */
670
	      CONST_BITS-PASS1_BITS-1);
671
    dataptr[3] = (DCTELEM)
672
      DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */
673
	      CONST_BITS-PASS1_BITS-1);
674

675
    dataptr += DCTSIZE;		/* advance pointer to next row */
676
  }
677

678
  /* Pass 2: process columns.
679
   * We apply the PASS2_BITS scaling, but leave the
680
   * results scaled up by an overall factor of 8.
681
   * We must also scale the output by (8/5)**2 = 64/25, which we partially
682
   * fold into the constant multipliers (other part was done in pass 1):
683
   * cK now represents sqrt(2) * cos(K*pi/10) * 32/25.
684
   */
685

686
  dataptr = data;
687
  for (ctr = 0; ctr < 5; ctr++) {
688
    /* Even part */
689

690
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*4];
691
    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*3];
692
    tmp2 = dataptr[DCTSIZE*2];
693

694
    tmp10 = tmp0 + tmp1;
695
    tmp11 = tmp0 - tmp1;
696

697
    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*4];
698
    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*3];
699

700
    dataptr[DCTSIZE*0] = (DCTELEM)
701
      DESCALE(MULTIPLY(tmp10 + tmp2, FIX(1.28)),        /* 32/25 */
702
	      CONST_BITS+PASS2_BITS);
703
    tmp11 = MULTIPLY(tmp11, FIX(1.011928851));          /* (c2+c4)/2 */
704
    tmp10 -= tmp2 << 2;
705
    tmp10 = MULTIPLY(tmp10, FIX(0.452548340));          /* (c2-c4)/2 */
706
    dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS+PASS2_BITS);
707
    dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS+PASS2_BITS);
708

709
    /* Odd part */
710

711
    tmp10 = MULTIPLY(tmp0 + tmp1, FIX(1.064004961));    /* c3 */
712

713
    dataptr[DCTSIZE*1] = (DCTELEM)
714
      DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.657591230)), /* c1-c3 */
715
	      CONST_BITS+PASS2_BITS);
716
    dataptr[DCTSIZE*3] = (DCTELEM)
717
      DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.785601151)), /* c1+c3 */
718
	      CONST_BITS+PASS2_BITS);
719

720
    dataptr++;			/* advance pointer to next column */
721
  }
722
}
723

724

725
/*
726
 * Perform the forward DCT on a 4x4 sample block.
727
 */
728

729
GLOBAL(void)
730
jpeg_fdct_4x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
731
{
732
  INT32 tmp0, tmp1;
733
  INT32 tmp10, tmp11;
734
  DCTELEM *dataptr;
735
  JSAMPROW elemptr;
736
  int ctr;
737
  SHIFT_TEMPS
738

739
  /* Pre-zero output coefficient block. */
740
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
741

742
  /* Pass 1: process rows.
743
   * Note results are scaled up by sqrt(8) compared to a true DCT;
744
   * furthermore, we scale the results by 2**PASS1_BITS.
745
   * We must also scale the output by (8/4)**2 = 2**2, which we add here.
746
   * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
747
   */
748

749
  dataptr = data;
750
  for (ctr = 0; ctr < 4; ctr++) {
751
    elemptr = sample_data[ctr] + start_col;
752

753
    /* Even part */
754

755
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
756
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
757

758
    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
759
    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
760

761
    /* Apply unsigned->signed conversion. */
762
    dataptr[0] = (DCTELEM)
763
      ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+2));
764
    dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+2));
765

766
    /* Odd part */
767

768
    tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
769
    /* Add fudge factor here for final descale. */
770
    tmp0 += ONE << (CONST_BITS-PASS1_BITS-3);
771

772
    dataptr[1] = (DCTELEM)
773
      RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
774
		  CONST_BITS-PASS1_BITS-2);
775
    dataptr[3] = (DCTELEM)
776
      RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
777
		  CONST_BITS-PASS1_BITS-2);
778

779
    dataptr += DCTSIZE;		/* advance pointer to next row */
780
  }
781

782
  /* Pass 2: process columns.
783
   * We apply the PASS2_BITS scaling, but leave the
784
   * results scaled up by an overall factor of 8.
785
   * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
786
   */
787

788
  dataptr = data;
789
  for (ctr = 0; ctr < 4; ctr++) {
790
    /* Even part */
791

792
    /* Add fudge factor here for final descale. */
793
#if PASS2_BITS > 1
794
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3] + (ONE << (PASS2_BITS-1));
795
#else
796
#if PASS2_BITS > 0
797
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3] + ONE;
798
#else
799
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3];
800
#endif
801
#endif
802
    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];
803

804
    tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
805
    tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];
806

807
    dataptr[DCTSIZE*0] = PASS2_OUTPUT(tmp0 + tmp1);
808
    dataptr[DCTSIZE*2] = PASS2_OUTPUT(tmp0 - tmp1);
809

810
    /* Odd part */
811

812
    tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
813
    /* Add fudge factor here for final descale. */
814
    tmp0 += ONE << (CONST_BITS+PASS2_BITS-1);
815

816
    dataptr[DCTSIZE*1] = (DCTELEM)
817
      RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
818
		  CONST_BITS+PASS2_BITS);
819
    dataptr[DCTSIZE*3] = (DCTELEM)
820
      RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
821
		  CONST_BITS+PASS2_BITS);
822

823
    dataptr++;			/* advance pointer to next column */
824
  }
825
}
826

827

828
/*
829
 * Perform the forward DCT on a 3x3 sample block.
830
 */
831

832
GLOBAL(void)
833
jpeg_fdct_3x3 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
834
{
835
  INT32 tmp0, tmp1, tmp2;
836
  DCTELEM *dataptr;
837
  JSAMPROW elemptr;
838
  int ctr;
839
  SHIFT_TEMPS
840

841
  /* Pre-zero output coefficient block. */
842
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
843

844
  /* Pass 1: process rows.
845
   * Note results are scaled up by sqrt(8) compared to a true DCT;
846
   * furthermore, we scale the results by 2**PASS1_BITS.
847
   * We scale the results further by 2**2 as part of output adaption
848
   * scaling for different DCT size.
849
   * cK represents sqrt(2) * cos(K*pi/6).
850
   */
851

852
  dataptr = data;
853
  for (ctr = 0; ctr < 3; ctr++) {
854
    elemptr = sample_data[ctr] + start_col;
855

856
    /* Even part */
857

858
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]);
859
    tmp1 = GETJSAMPLE(elemptr[1]);
860

861
    tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]);
862

863
    /* Apply unsigned->signed conversion. */
864
    dataptr[0] = (DCTELEM)
865
      ((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+2));
866
    dataptr[2] = (DCTELEM)
867
      DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */
868
	      CONST_BITS-PASS1_BITS-2);
869

870
    /* Odd part */
871

872
    dataptr[1] = (DCTELEM)
873
      DESCALE(MULTIPLY(tmp2, FIX(1.224744871)),               /* c1 */
874
	      CONST_BITS-PASS1_BITS-2);
875

876
    dataptr += DCTSIZE;		/* advance pointer to next row */
877
  }
878

879
  /* Pass 2: process columns.
880
   * We apply the PASS2_BITS scaling, but leave the
881
   * results scaled up by an overall factor of 8.
882
   * We must also scale the output by (8/3)**2 = 64/9, which we partially
883
   * fold into the constant multipliers (other part was done in pass 1):
884
   * cK now represents sqrt(2) * cos(K*pi/6) * 16/9.
885
   */
886

887
  dataptr = data;
888
  for (ctr = 0; ctr < 3; ctr++) {
889
    /* Even part */
890

891
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*2];
892
    tmp1 = dataptr[DCTSIZE*1];
893

894
    tmp2 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*2];
895

896
    dataptr[DCTSIZE*0] = (DCTELEM)
897
      DESCALE(MULTIPLY(tmp0 + tmp1, FIX(1.777777778)),        /* 16/9 */
898
	      CONST_BITS+PASS2_BITS);
899
    dataptr[DCTSIZE*2] = (DCTELEM)
900
      DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(1.257078722)), /* c2 */
901
	      CONST_BITS+PASS2_BITS);
902

903
    /* Odd part */
904

905
    dataptr[DCTSIZE*1] = (DCTELEM)
906
      DESCALE(MULTIPLY(tmp2, FIX(2.177324216)),               /* c1 */
907
	      CONST_BITS+PASS2_BITS);
908

909
    dataptr++;			/* advance pointer to next column */
910
  }
911
}
912

913

914
/*
915
 * Perform the forward DCT on a 2x2 sample block.
916
 */
917

918
GLOBAL(void)
919
jpeg_fdct_2x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
920
{
921
  DCTELEM tmp0, tmp1, tmp2, tmp3;
922
  JSAMPROW elemptr;
923

924
  /* Pre-zero output coefficient block. */
925
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
926

927
  /* Pass 1: process rows.
928
   * Note results are scaled up by sqrt(8) compared to a true DCT.
929
   */
930

931
  /* Row 0 */
932
  elemptr = sample_data[0] + start_col;
933

934
  tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]);
935
  tmp1 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]);
936

937
  /* Row 1 */
938
  elemptr = sample_data[1] + start_col;
939

940
  tmp2 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]);
941
  tmp3 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]);
942

943
  /* Pass 2: process columns.
944
   * We leave the results scaled up by an overall factor of 8.
945
   * We must also scale the output by (8/2)**2 = 2**4.
946
   */
947

948
  /* Column 0 */
949
  /* Apply unsigned->signed conversion. */
950

951
#if PASS2_BITS < PASS1_BITS + 4
952
  data[DCTSIZE*0] =
953
    (tmp0 + tmp2 - 4 * CENTERJSAMPLE) << (4+PASS1_BITS-PASS2_BITS);
954
  data[DCTSIZE*1] = (tmp0 - tmp2) << (4+PASS1_BITS-PASS2_BITS);
955

956
  /* Column 1 */
957
  data[DCTSIZE*0+1] = (tmp1 + tmp3) << (4+PASS1_BITS-PASS2_BITS);
958
  data[DCTSIZE*1+1] = (tmp1 - tmp3) << (4+PASS1_BITS-PASS2_BITS);
959
#else
960
  data[DCTSIZE*0] = tmp0 + tmp2 - 4 * CENTERJSAMPLE;
961
  data[DCTSIZE*1] = tmp0 - tmp2;
962

963
  /* Column 1 */
964
  data[DCTSIZE*0+1] = tmp1 + tmp3;
965
  data[DCTSIZE*1+1] = tmp1 - tmp3;
966
#endif
967
}
968

969

970
/*
971
 * Perform the forward DCT on a 1x1 sample block.
972
 */
973

974
GLOBAL(void)
975
jpeg_fdct_1x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
976
{
977
  DCTELEM dcval;
978

979
  /* Pre-zero output coefficient block. */
980
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
981

982
  dcval = GETJSAMPLE(sample_data[0][start_col]);
983

984
  /* We leave the result scaled up by an overall factor of 8. */
985
  /* We must also scale the output by (8/1)**2 = 2**6. */
986
  /* Apply unsigned->signed conversion. */
987
  data[0] = (dcval - CENTERJSAMPLE) << (6+PASS1_BITS-PASS2_BITS);
988
}
989

990

991
/* Pass 1 bits decrement scaling for block sizes 9, 10, 11.
 *
 * PASS1_DECR is PASS1_BITS - 1, but never less than 0.
 * PASS1_OUTDEC(x) yields x scaled up by 2**PASS1_DECR and cast to DCTELEM;
 * when PASS1_DECR is 0 the shift is omitted and only the cast remains.
 * The expansion is fully parenthesized so that it can be used safely
 * inside larger expressions.
 */

#if PASS1_BITS > 0
#define PASS1_DECR  (PASS1_BITS - 1)
#else
#define PASS1_DECR  0
#endif

#if PASS1_DECR > 0
#define PASS1_OUTDEC(x)  ((DCTELEM) ((x) << PASS1_DECR))
#else
#define PASS1_OUTDEC(x)  ((DCTELEM) (x))
#endif


/*
1007
 * Perform the forward DCT on a 9x9 sample block.
1008
 */
1009

1010
GLOBAL(void)
1011
jpeg_fdct_9x9 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1012
{
1013
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
1014
  INT32 tmp10, tmp11, tmp12, tmp13;
1015
  INT32 z1, z2;
1016
  DCTELEM workspace[8];
1017
  DCTELEM *dataptr;
1018
  DCTELEM *wsptr;
1019
  JSAMPROW elemptr;
1020
  int ctr;
1021
  SHIFT_TEMPS
1022

1023
  /* Pass 1: process rows.
1024
   * Note results are scaled up by sqrt(8) compared to a true DCT;
1025
   * furthermore, we scale the results by 2**PASS1_DECR.
1026
   * cK represents sqrt(2) * cos(K*pi/18).
1027
   */
1028

1029
  dataptr = data;
1030
  ctr = 0;
1031
  for (;;) {
1032
    elemptr = sample_data[ctr] + start_col;
1033

1034
    /* Even part */
1035

1036
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[8]);
1037
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[7]);
1038
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[6]);
1039
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[5]);
1040
    tmp4 = GETJSAMPLE(elemptr[4]);
1041

1042
    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[8]);
1043
    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[7]);
1044
    tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[6]);
1045
    tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[5]);
1046

1047
    z1 = tmp0 + tmp2 + tmp3;
1048
    z2 = tmp1 + tmp4;
1049
    /* Apply unsigned->signed conversion. */
1050
    dataptr[0] = PASS1_OUTDEC(z1 + z2 - 9 * CENTERJSAMPLE);
1051
    dataptr[6] = (DCTELEM)
1052
      DESCALE(MULTIPLY(z1 - z2 - z2, FIX(0.707106781)),  /* c6 */
1053
	      CONST_BITS-PASS1_DECR);
1054
    z1 = MULTIPLY(tmp0 - tmp2, FIX(1.328926049));        /* c2 */
1055
    z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(0.707106781)); /* c6 */
1056
    dataptr[2] = (DCTELEM)
1057
      DESCALE(MULTIPLY(tmp2 - tmp3, FIX(1.083350441))    /* c4 */
1058
	      + z1 + z2, CONST_BITS-PASS1_DECR);
1059
    dataptr[4] = (DCTELEM)
1060
      DESCALE(MULTIPLY(tmp3 - tmp0, FIX(0.245575608))    /* c8 */
1061
	      + z1 - z2, CONST_BITS-PASS1_DECR);
1062

1063
    /* Odd part */
1064

1065
    dataptr[3] = (DCTELEM)
1066
      DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(1.224744871)), /* c3 */
1067
	      CONST_BITS-PASS1_DECR);
1068

1069
    tmp11 = MULTIPLY(tmp11, FIX(1.224744871));        /* c3 */
1070
    tmp0 = MULTIPLY(tmp10 + tmp12, FIX(0.909038955)); /* c5 */
1071
    tmp1 = MULTIPLY(tmp10 + tmp13, FIX(0.483689525)); /* c7 */
1072

1073
    dataptr[1] = (DCTELEM)
1074
      DESCALE(tmp11 + tmp0 + tmp1, CONST_BITS-PASS1_DECR);
1075

1076
    tmp2 = MULTIPLY(tmp12 - tmp13, FIX(1.392728481)); /* c1 */
1077

1078
    dataptr[5] = (DCTELEM)
1079
      DESCALE(tmp0 - tmp11 - tmp2, CONST_BITS-PASS1_DECR);
1080
    dataptr[7] = (DCTELEM)
1081
      DESCALE(tmp1 - tmp11 + tmp2, CONST_BITS-PASS1_DECR);
1082

1083
    ctr++;
1084

1085
    if (ctr != DCTSIZE) {
1086
      if (ctr == 9)
1087
	break;			/* Done. */
1088
      dataptr += DCTSIZE;	/* advance pointer to next row */
1089
    } else
1090
      dataptr = workspace;	/* switch pointer to extended workspace */
1091
  }
1092

1093
  /* Pass 2: process columns.
1094
   * We remove the PASS1_DECR scaling, but leave the results scaled up
1095
   * by an overall factor of 8.
1096
   * We must also scale the output by (8/9)**2 = 64/81, which we partially
1097
   * fold into the constant multipliers and final shifting:
1098
   * cK now represents sqrt(2) * cos(K*pi/18) * 128/81.
1099
   */
1100

1101
  dataptr = data;
1102
  wsptr = workspace;
1103
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1104
    /* Even part */
1105

1106
    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*0];
1107
    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*7];
1108
    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*6];
1109
    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*5];
1110
    tmp4 = dataptr[DCTSIZE*4];
1111

1112
    tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*0];
1113
    tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*7];
1114
    tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*6];
1115
    tmp13 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*5];
1116

1117
    z1 = tmp0 + tmp2 + tmp3;
1118
    z2 = tmp1 + tmp4;
1119
    dataptr[DCTSIZE*0] = (DCTELEM)
1120
      DESCALE(MULTIPLY(z1 + z2, FIX(1.580246914)),       /* 128/81 */
1121
	      CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1122
    dataptr[DCTSIZE*6] = (DCTELEM)
1123
      DESCALE(MULTIPLY(z1 - z2 - z2, FIX(1.117403309)),  /* c6 */
1124
	      CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1125
    z1 = MULTIPLY(tmp0 - tmp2, FIX(2.100031287));        /* c2 */
1126
    z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(1.117403309)); /* c6 */
1127
    dataptr[DCTSIZE*2] = (DCTELEM)
1128
      DESCALE(MULTIPLY(tmp2 - tmp3, FIX(1.711961190))    /* c4 */
1129
	      + z1 + z2, CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1130
    dataptr[DCTSIZE*4] = (DCTELEM)
1131
      DESCALE(MULTIPLY(tmp3 - tmp0, FIX(0.388070096))    /* c8 */
1132
	      + z1 - z2, CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1133

1134
    /* Odd part */
1135

1136
    dataptr[DCTSIZE*3] = (DCTELEM)
1137
      DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(1.935399303)), /* c3 */
1138
	      CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1139

1140
    tmp11 = MULTIPLY(tmp11, FIX(1.935399303));        /* c3 */
1141
    tmp0 = MULTIPLY(tmp10 + tmp12, FIX(1.436506004)); /* c5 */
1142
    tmp1 = MULTIPLY(tmp10 + tmp13, FIX(0.764348879)); /* c7 */
1143

1144
    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp11 + tmp0 + tmp1,
1145
      CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1146

1147
    tmp2 = MULTIPLY(tmp12 - tmp13, FIX(2.200854883)); /* c1 */
1148

1149
    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp0 - tmp11 - tmp2,
1150
      CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1151
    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp1 - tmp11 + tmp2,
1152
      CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1153

1154
    dataptr++;			/* advance pointer to next column */
1155
    wsptr++;			/* advance pointer to next column */
1156
  }
1157
}
1158

1159

1160
/*
1161
 * Perform the forward DCT on a 10x10 sample block.
1162
 */
1163

1164
GLOBAL(void)
1165
jpeg_fdct_10x10 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1166
{
1167
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
1168
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
1169
  DCTELEM workspace[8*2];
1170
  DCTELEM *dataptr;
1171
  DCTELEM *wsptr;
1172
  JSAMPROW elemptr;
1173
  int ctr;
1174
  SHIFT_TEMPS
1175

1176
  /* Pass 1: process rows.
1177
   * Note results are scaled up by sqrt(8) compared to a true DCT;
1178
   * furthermore, we scale the results by 2**PASS1_DECR.
1179
   * cK represents sqrt(2) * cos(K*pi/20).
1180
   */
1181

1182
  dataptr = data;
1183
  ctr = 0;
1184
  for (;;) {
1185
    elemptr = sample_data[ctr] + start_col;
1186

1187
    /* Even part */
1188

1189
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]);
1190
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]);
1191
    tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]);
1192
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]);
1193
    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]);
1194

1195
    tmp10 = tmp0 + tmp4;
1196
    tmp13 = tmp0 - tmp4;
1197
    tmp11 = tmp1 + tmp3;
1198
    tmp14 = tmp1 - tmp3;
1199

1200
    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]);
1201
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]);
1202
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]);
1203
    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]);
1204
    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]);
1205

1206
    /* Apply unsigned->signed conversion. */
1207
    dataptr[0] =
1208
      PASS1_OUTDEC(tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE);
1209
    tmp12 += tmp12;
1210
    dataptr[4] = (DCTELEM)
1211
      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */
1212
	      MULTIPLY(tmp11 - tmp12, FIX(0.437016024)),  /* c8 */
1213
	      CONST_BITS-PASS1_DECR);
1214
    tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876));    /* c6 */
1215
    dataptr[2] = (DCTELEM)
1216
      DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)),  /* c2-c6 */
1217
	      CONST_BITS-PASS1_DECR);
1218
    dataptr[6] = (DCTELEM)
1219
      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)),  /* c2+c6 */
1220
	      CONST_BITS-PASS1_DECR);
1221

1222
    /* Odd part */
1223

1224
    tmp10 = tmp0 + tmp4;
1225
    tmp11 = tmp1 - tmp3;
1226
    dataptr[5] = PASS1_OUTDEC(tmp10 - tmp11 - tmp2);
1227
    tmp2 <<= CONST_BITS;
1228
    dataptr[1] = (DCTELEM)
1229
      DESCALE(MULTIPLY(tmp0, FIX(1.396802247)) +          /* c1 */
1230
	      MULTIPLY(tmp1, FIX(1.260073511)) + tmp2 +   /* c3 */
1231
	      MULTIPLY(tmp3, FIX(0.642039522)) +          /* c7 */
1232
	      MULTIPLY(tmp4, FIX(0.221231742)),           /* c9 */
1233
	      CONST_BITS-PASS1_DECR);
1234
    tmp12 = MULTIPLY(tmp0 - tmp4, FIX(0.951056516)) -     /* (c3+c7)/2 */
1235
	    MULTIPLY(tmp1 + tmp3, FIX(0.587785252));      /* (c1-c9)/2 */
1236
    tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.309016994)) +   /* (c3-c7)/2 */
1237
	    (tmp11 << (CONST_BITS - 1)) - tmp2;
1238
    dataptr[3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS-PASS1_DECR);
1239
    dataptr[7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS-PASS1_DECR);
1240

1241
    ctr++;
1242

1243
    if (ctr != DCTSIZE) {
1244
      if (ctr == 10)
1245
	break;			/* Done. */
1246
      dataptr += DCTSIZE;	/* advance pointer to next row */
1247
    } else
1248
      dataptr = workspace;	/* switch pointer to extended workspace */
1249
  }
1250

1251
  /* Pass 2: process columns.
1252
   * We remove the PASS1_DECR scaling, but leave the results scaled up
1253
   * by an overall factor of 8.
1254
   * We must also scale the output by (8/10)**2 = 16/25, which we partially
1255
   * fold into the constant multipliers and final shifting:
1256
   * cK now represents sqrt(2) * cos(K*pi/20) * 32/25.
1257
   */
1258

1259
  dataptr = data;
1260
  wsptr = workspace;
1261
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1262
    /* Even part */
1263

1264
    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*1];
1265
    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*0];
1266
    tmp12 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*7];
1267
    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*6];
1268
    tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*5];
1269

1270
    tmp10 = tmp0 + tmp4;
1271
    tmp13 = tmp0 - tmp4;
1272
    tmp11 = tmp1 + tmp3;
1273
    tmp14 = tmp1 - tmp3;
1274

1275
    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*1];
1276
    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*0];
1277
    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*7];
1278
    tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*6];
1279
    tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*5];
1280

1281
    dataptr[DCTSIZE*0] = (DCTELEM)
1282
      DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(1.28)), /* 32/25 */
1283
	      CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1284
    tmp12 += tmp12;
1285
    dataptr[DCTSIZE*4] = (DCTELEM)
1286
      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.464477191)) - /* c4 */
1287
	      MULTIPLY(tmp11 - tmp12, FIX(0.559380511)),  /* c8 */
1288
	      CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1289
    tmp10 = MULTIPLY(tmp13 + tmp14, FIX(1.064004961));    /* c6 */
1290
    dataptr[DCTSIZE*2] = (DCTELEM)
1291
      DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.657591230)),  /* c2-c6 */
1292
	      CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1293
    dataptr[DCTSIZE*6] = (DCTELEM)
1294
      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.785601151)),  /* c2+c6 */
1295
	      CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1296

1297
    /* Odd part */
1298

1299
    tmp10 = tmp0 + tmp4;
1300
    tmp11 = tmp1 - tmp3;
1301
    dataptr[DCTSIZE*5] = (DCTELEM)
1302
      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp2, FIX(1.28)),  /* 32/25 */
1303
	      CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1304
    tmp2 = MULTIPLY(tmp2, FIX(1.28));                     /* 32/25 */
1305
    dataptr[DCTSIZE*1] = (DCTELEM)
1306
      DESCALE(MULTIPLY(tmp0, FIX(1.787906876)) +          /* c1 */
1307
	      MULTIPLY(tmp1, FIX(1.612894094)) + tmp2 +   /* c3 */
1308
	      MULTIPLY(tmp3, FIX(0.821810588)) +          /* c7 */
1309
	      MULTIPLY(tmp4, FIX(0.283176630)),           /* c9 */
1310
	      CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1311
    tmp12 = MULTIPLY(tmp0 - tmp4, FIX(1.217352341)) -     /* (c3+c7)/2 */
1312
	    MULTIPLY(tmp1 + tmp3, FIX(0.752365123));      /* (c1-c9)/2 */
1313
    tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.395541753)) +   /* (c3-c7)/2 */
1314
	    MULTIPLY(tmp11, FIX(0.64)) - tmp2;            /* 16/25 */
1315
    dataptr[DCTSIZE*3] = (DCTELEM)
1316
      DESCALE(tmp12 + tmp13, CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1317
    dataptr[DCTSIZE*7] = (DCTELEM)
1318
      DESCALE(tmp12 - tmp13, CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1319

1320
    dataptr++;			/* advance pointer to next column */
1321
    wsptr++;			/* advance pointer to next column */
1322
  }
1323
}
1324

1325

1326
/*
1327
 * Perform the forward DCT on an 11x11 sample block.
1328
 */
1329

1330
GLOBAL(void)
1331
jpeg_fdct_11x11 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1332
{
1333
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1334
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
1335
  INT32 z1, z2, z3;
1336
  DCTELEM workspace[8*3];
1337
  DCTELEM *dataptr;
1338
  DCTELEM *wsptr;
1339
  JSAMPROW elemptr;
1340
  int ctr;
1341
  SHIFT_TEMPS
1342

1343
  /* Pass 1: process rows.
1344
   * Note results are scaled up by sqrt(8) compared to a true DCT;
1345
   * furthermore, we scale the results by 2**PASS1_DECR.
1346
   * cK represents sqrt(2) * cos(K*pi/22).
1347
   */
1348

1349
  dataptr = data;
1350
  ctr = 0;
1351
  for (;;) {
1352
    elemptr = sample_data[ctr] + start_col;
1353

1354
    /* Even part */
1355

1356
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[10]);
1357
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[9]);
1358
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[8]);
1359
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[7]);
1360
    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[6]);
1361
    tmp5 = GETJSAMPLE(elemptr[5]);
1362

1363
    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[10]);
1364
    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[9]);
1365
    tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[8]);
1366
    tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[7]);
1367
    tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[6]);
1368

1369
    /* Apply unsigned->signed conversion. */
1370
    dataptr[0] =
1371
      PASS1_OUTDEC(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 - 11 * CENTERJSAMPLE);
1372
    tmp5 += tmp5;
1373
    tmp0 -= tmp5;
1374
    tmp1 -= tmp5;
1375
    tmp2 -= tmp5;
1376
    tmp3 -= tmp5;
1377
    tmp4 -= tmp5;
1378
    z1 = MULTIPLY(tmp0 + tmp3, FIX(1.356927976)) +       /* c2 */
1379
	 MULTIPLY(tmp2 + tmp4, FIX(0.201263574));        /* c10 */
1380
    z2 = MULTIPLY(tmp1 - tmp3, FIX(0.926112931));        /* c6 */
1381
    z3 = MULTIPLY(tmp0 - tmp1, FIX(1.189712156));        /* c4 */
1382
    dataptr[2] = (DCTELEM)
1383
      DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(1.018300590)) /* c2+c8-c6 */
1384
	      - MULTIPLY(tmp4, FIX(1.390975730)),        /* c4+c10 */
1385
	      CONST_BITS-PASS1_DECR);
1386
    dataptr[4] = (DCTELEM)
1387
      DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(0.062335650)) /* c4-c6-c10 */
1388
	      - MULTIPLY(tmp2, FIX(1.356927976))         /* c2 */
1389
	      + MULTIPLY(tmp4, FIX(0.587485545)),        /* c8 */
1390
	      CONST_BITS-PASS1_DECR);
1391
    dataptr[6] = (DCTELEM)
1392
      DESCALE(z1 + z3 - MULTIPLY(tmp0, FIX(1.620527200)) /* c2+c4-c6 */
1393
	      - MULTIPLY(tmp2, FIX(0.788749120)),        /* c8+c10 */
1394
	      CONST_BITS-PASS1_DECR);
1395

1396
    /* Odd part */
1397

1398
    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.286413905));    /* c3 */
1399
    tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.068791298));    /* c5 */
1400
    tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.764581576));    /* c7 */
1401
    tmp0 = tmp1 + tmp2 + tmp3 - MULTIPLY(tmp10, FIX(1.719967871)) /* c7+c5+c3-c1 */
1402
	   + MULTIPLY(tmp14, FIX(0.398430003));          /* c9 */
1403
    tmp4 = MULTIPLY(tmp11 + tmp12, - FIX(0.764581576));  /* -c7 */
1404
    tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.399818907));  /* -c1 */
1405
    tmp1 += tmp4 + tmp5 + MULTIPLY(tmp11, FIX(1.276416582)) /* c9+c7+c1-c3 */
1406
	    - MULTIPLY(tmp14, FIX(1.068791298));         /* c5 */
1407
    tmp10 = MULTIPLY(tmp12 + tmp13, FIX(0.398430003));   /* c9 */
1408
    tmp2 += tmp4 + tmp10 - MULTIPLY(tmp12, FIX(1.989053629)) /* c9+c5+c3-c7 */
1409
	    + MULTIPLY(tmp14, FIX(1.399818907));         /* c1 */
1410
    tmp3 += tmp5 + tmp10 + MULTIPLY(tmp13, FIX(1.305598626)) /* c1+c5-c9-c7 */
1411
	    - MULTIPLY(tmp14, FIX(1.286413905));         /* c3 */
1412

1413
    dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_DECR);
1414
    dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_DECR);
1415
    dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_DECR);
1416
    dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS-PASS1_DECR);
1417

1418
    ctr++;
1419

1420
    if (ctr != DCTSIZE) {
1421
      if (ctr == 11)
1422
	break;			/* Done. */
1423
      dataptr += DCTSIZE;	/* advance pointer to next row */
1424
    } else
1425
      dataptr = workspace;	/* switch pointer to extended workspace */
1426
  }
1427

1428
  /* Pass 2: process columns.
1429
   * We remove the PASS1_DECR scaling, but leave the results scaled up
1430
   * by an overall factor of 8.
1431
   * We must also scale the output by (8/11)**2 = 64/121, which we partially
1432
   * fold into the constant multipliers and final shifting:
1433
   * cK now represents sqrt(2) * cos(K*pi/22) * 128/121.
1434
   */
1435

1436
  dataptr = data;
1437
  wsptr = workspace;
1438
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1439
    /* Even part */
1440

1441
    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*2];
1442
    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*1];
1443
    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*0];
1444
    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*7];
1445
    tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*6];
1446
    tmp5 = dataptr[DCTSIZE*5];
1447

1448
    tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*2];
1449
    tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*1];
1450
    tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*0];
1451
    tmp13 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*7];
1452
    tmp14 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*6];
1453

1454
    dataptr[DCTSIZE*0] = (DCTELEM)
1455
      DESCALE(MULTIPLY(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5,
1456
		       FIX(1.057851240)),                /* 128/121 */
1457
	      CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1458
    tmp5 += tmp5;
1459
    tmp0 -= tmp5;
1460
    tmp1 -= tmp5;
1461
    tmp2 -= tmp5;
1462
    tmp3 -= tmp5;
1463
    tmp4 -= tmp5;
1464
    z1 = MULTIPLY(tmp0 + tmp3, FIX(1.435427942)) +       /* c2 */
1465
	 MULTIPLY(tmp2 + tmp4, FIX(0.212906922));        /* c10 */
1466
    z2 = MULTIPLY(tmp1 - tmp3, FIX(0.979689713));        /* c6 */
1467
    z3 = MULTIPLY(tmp0 - tmp1, FIX(1.258538479));        /* c4 */
1468
    dataptr[DCTSIZE*2] = (DCTELEM)
1469
      DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(1.077210542)) /* c2+c8-c6 */
1470
	      - MULTIPLY(tmp4, FIX(1.471445400)),        /* c4+c10 */
1471
	      CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1472
    dataptr[DCTSIZE*4] = (DCTELEM)
1473
      DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(0.065941844)) /* c4-c6-c10 */
1474
	      - MULTIPLY(tmp2, FIX(1.435427942))         /* c2 */
1475
	      + MULTIPLY(tmp4, FIX(0.621472312)),        /* c8 */
1476
	      CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1477
    dataptr[DCTSIZE*6] = (DCTELEM)
1478
      DESCALE(z1 + z3 - MULTIPLY(tmp0, FIX(1.714276708)) /* c2+c4-c6 */
1479
	      - MULTIPLY(tmp2, FIX(0.834379234)),        /* c8+c10 */
1480
	      CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1481

1482
    /* Odd part */
1483

1484
    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.360834544));    /* c3 */
1485
    tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.130622199));    /* c5 */
1486
    tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.808813568));    /* c7 */
1487
    tmp0 = tmp1 + tmp2 + tmp3 - MULTIPLY(tmp10, FIX(1.819470145)) /* c7+c5+c3-c1 */
1488
	   + MULTIPLY(tmp14, FIX(0.421479672));          /* c9 */
1489
    tmp4 = MULTIPLY(tmp11 + tmp12, - FIX(0.808813568));  /* -c7 */
1490
    tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.480800167));  /* -c1 */
1491
    tmp1 += tmp4 + tmp5 + MULTIPLY(tmp11, FIX(1.350258864)) /* c9+c7+c1-c3 */
1492
	    - MULTIPLY(tmp14, FIX(1.130622199));         /* c5 */
1493
    tmp10 = MULTIPLY(tmp12 + tmp13, FIX(0.421479672));   /* c9 */
1494
    tmp2 += tmp4 + tmp10 - MULTIPLY(tmp12, FIX(2.104122847)) /* c9+c5+c3-c7 */
1495
	    + MULTIPLY(tmp14, FIX(1.480800167));         /* c1 */
1496
    tmp3 += tmp5 + tmp10 + MULTIPLY(tmp13, FIX(1.381129125)) /* c1+c5-c9-c7 */
1497
	    - MULTIPLY(tmp14, FIX(1.360834544));         /* c3 */
1498

1499
    dataptr[DCTSIZE*1] = (DCTELEM)
1500
      DESCALE(tmp0, CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1501
    dataptr[DCTSIZE*3] = (DCTELEM)
1502
      DESCALE(tmp1, CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1503
    dataptr[DCTSIZE*5] = (DCTELEM)
1504
      DESCALE(tmp2, CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1505
    dataptr[DCTSIZE*7] = (DCTELEM)
1506
      DESCALE(tmp3, CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1507

1508
    dataptr++;			/* advance pointer to next column */
1509
    wsptr++;			/* advance pointer to next column */
1510
  }
1511
}
1512

1513

1514
/*
1515
 * Perform the forward DCT on a 12x12 sample block.
1516
 */
1517

1518
GLOBAL(void)
1519
jpeg_fdct_12x12 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1520
{
1521
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1522
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
1523
  DCTELEM workspace[8*4];
1524
  DCTELEM *dataptr;
1525
  DCTELEM *wsptr;
1526
  JSAMPROW elemptr;
1527
  int ctr;
1528
  SHIFT_TEMPS
1529

1530
  /* Pass 1: process rows.
1531
   * Note results are scaled up by sqrt(8) compared to a true DCT.
1532
   * cK represents sqrt(2) * cos(K*pi/24).
1533
   */
1534

1535
  dataptr = data;
1536
  ctr = 0;
1537
  for (;;) {
1538
    elemptr = sample_data[ctr] + start_col;
1539

1540
    /* Even part */
1541

1542
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]);
1543
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]);
1544
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]);
1545
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]);
1546
    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]);
1547
    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]);
1548

1549
    tmp10 = tmp0 + tmp5;
1550
    tmp13 = tmp0 - tmp5;
1551
    tmp11 = tmp1 + tmp4;
1552
    tmp14 = tmp1 - tmp4;
1553
    tmp12 = tmp2 + tmp3;
1554
    tmp15 = tmp2 - tmp3;
1555

1556
    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]);
1557
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]);
1558
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]);
1559
    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]);
1560
    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]);
1561
    tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]);
1562

1563
    /* Apply unsigned->signed conversion. */
1564
    dataptr[0] = (DCTELEM) (tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE);
1565
    dataptr[6] = (DCTELEM) (tmp13 - tmp14 - tmp15);
1566
    dataptr[4] = (DCTELEM)
1567
      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */
1568
	      CONST_BITS);
1569
    dataptr[2] = (DCTELEM)
1570
      DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */
1571
	      CONST_BITS);
1572

1573
    /* Odd part */
1574

1575
    tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100);    /* c9 */
1576
    tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865);   /* c3-c9 */
1577
    tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065);   /* c3+c9 */
1578
    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054));   /* c5 */
1579
    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669));   /* c7 */
1580
    tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */
1581
	    + MULTIPLY(tmp5, FIX(0.184591911));        /* c11 */
1582
    tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */
1583
    tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.339493912)) /* c1+c5-c11 */
1584
	    + MULTIPLY(tmp5, FIX(0.860918669));        /* c7 */
1585
    tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.725788011)) /* c1+c11-c7 */
1586
	    - MULTIPLY(tmp5, FIX(1.121971054));        /* c5 */
1587
    tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.306562965)) /* c3 */
1588
	    - MULTIPLY(tmp2 + tmp5, FIX_0_541196100);  /* c9 */
1589

1590
    dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS);
1591
    dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS);
1592
    dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS);
1593
    dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS);
1594

1595
    ctr++;
1596

1597
    if (ctr != DCTSIZE) {
1598
      if (ctr == 12)
1599
	break;			/* Done. */
1600
      dataptr += DCTSIZE;	/* advance pointer to next row */
1601
    } else
1602
      dataptr = workspace;	/* switch pointer to extended workspace */
1603
  }
1604

1605
  /* Pass 2: process columns.
1606
   * We leave the results scaled up by an overall factor of 8.
1607
   * We must also scale the output by (8/12)**2 = 4/9, which we partially
1608
   * fold into the constant multipliers and final shifting:
1609
   * cK now represents sqrt(2) * cos(K*pi/24) * 8/9.
1610
   */
1611

1612
  dataptr = data;
1613
  wsptr = workspace;
1614
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1615
    /* Even part */
1616

1617
    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*3];
1618
    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*2];
1619
    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*1];
1620
    tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*0];
1621
    tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*7];
1622
    tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*6];
1623

1624
    tmp10 = tmp0 + tmp5;
1625
    tmp13 = tmp0 - tmp5;
1626
    tmp11 = tmp1 + tmp4;
1627
    tmp14 = tmp1 - tmp4;
1628
    tmp12 = tmp2 + tmp3;
1629
    tmp15 = tmp2 - tmp3;
1630

1631
    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*3];
1632
    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*2];
1633
    tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*1];
1634
    tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*0];
1635
    tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*7];
1636
    tmp5 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*6];
1637

1638
    dataptr[DCTSIZE*0] = (DCTELEM)
1639
      DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(0.888888889)), /* 8/9 */
1640
	      CONST_BITS+1+PASS2_BITS-PASS1_BITS);
1641
    dataptr[DCTSIZE*6] = (DCTELEM)
1642
      DESCALE(MULTIPLY(tmp13 - tmp14 - tmp15, FIX(0.888888889)), /* 8/9 */
1643
	      CONST_BITS+1+PASS2_BITS-PASS1_BITS);
1644
    dataptr[DCTSIZE*4] = (DCTELEM)
1645
      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.088662108)),         /* c4 */
1646
	      CONST_BITS+1+PASS2_BITS-PASS1_BITS);
1647
    dataptr[DCTSIZE*2] = (DCTELEM)
1648
      DESCALE(MULTIPLY(tmp14 - tmp15, FIX(0.888888889)) +        /* 8/9 */
1649
	      MULTIPLY(tmp13 + tmp15, FIX(1.214244803)),         /* c2 */
1650
	      CONST_BITS+1+PASS2_BITS-PASS1_BITS);
1651

1652
    /* Odd part */
1653

1654
    tmp10 = MULTIPLY(tmp1 + tmp4, FIX(0.481063200));   /* c9 */
1655
    tmp14 = tmp10 + MULTIPLY(tmp1, FIX(0.680326102));  /* c3-c9 */
1656
    tmp15 = tmp10 - MULTIPLY(tmp4, FIX(1.642452502));  /* c3+c9 */
1657
    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(0.997307603));   /* c5 */
1658
    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.765261039));   /* c7 */
1659
    tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.516244403)) /* c5+c7-c1 */
1660
	    + MULTIPLY(tmp5, FIX(0.164081699));        /* c11 */
1661
    tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.164081699)); /* -c11 */
1662
    tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.079550144)) /* c1+c5-c11 */
1663
	    + MULTIPLY(tmp5, FIX(0.765261039));        /* c7 */
1664
    tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.645144899)) /* c1+c11-c7 */
1665
	    - MULTIPLY(tmp5, FIX(0.997307603));        /* c5 */
1666
    tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.161389302)) /* c3 */
1667
	    - MULTIPLY(tmp2 + tmp5, FIX(0.481063200)); /* c9 */
1668

1669
    dataptr[DCTSIZE*1] = (DCTELEM)
1670
      DESCALE(tmp10, CONST_BITS+1+PASS2_BITS-PASS1_BITS);
1671
    dataptr[DCTSIZE*3] = (DCTELEM)
1672
      DESCALE(tmp11, CONST_BITS+1+PASS2_BITS-PASS1_BITS);
1673
    dataptr[DCTSIZE*5] = (DCTELEM)
1674
      DESCALE(tmp12, CONST_BITS+1+PASS2_BITS-PASS1_BITS);
1675
    dataptr[DCTSIZE*7] = (DCTELEM)
1676
      DESCALE(tmp13, CONST_BITS+1+PASS2_BITS-PASS1_BITS);
1677

1678
    dataptr++;			/* advance pointer to next column */
1679
    wsptr++;			/* advance pointer to next column */
1680
  }
1681
}
1682

1683

1684
/*
1685
 * Perform the forward DCT on a 13x13 sample block.
1686
 */
1687

1688
GLOBAL(void)
1689
jpeg_fdct_13x13 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1690
{
1691
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1692
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
1693
  INT32 z1, z2;
1694
  DCTELEM workspace[8*5];
1695
  DCTELEM *dataptr;
1696
  DCTELEM *wsptr;
1697
  JSAMPROW elemptr;
1698
  int ctr;
1699
  SHIFT_TEMPS
1700

1701
  /* Pass 1: process rows.
1702
   * Note results are scaled up by sqrt(8) compared to a true DCT.
1703
   * cK represents sqrt(2) * cos(K*pi/26).
1704
   */
1705

1706
  dataptr = data;
1707
  ctr = 0;
1708
  for (;;) {
1709
    elemptr = sample_data[ctr] + start_col;
1710

1711
    /* Even part */
1712

1713
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[12]);
1714
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[11]);
1715
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[10]);
1716
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[9]);
1717
    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[8]);
1718
    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[7]);
1719
    tmp6 = GETJSAMPLE(elemptr[6]);
1720

1721
    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[12]);
1722
    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[11]);
1723
    tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[10]);
1724
    tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[9]);
1725
    tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[8]);
1726
    tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[7]);
1727

1728
    /* Apply unsigned->signed conversion. */
1729
    dataptr[0] = (DCTELEM)
1730
      (tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6 - 13 * CENTERJSAMPLE);
1731
    tmp6 += tmp6;
1732
    tmp0 -= tmp6;
1733
    tmp1 -= tmp6;
1734
    tmp2 -= tmp6;
1735
    tmp3 -= tmp6;
1736
    tmp4 -= tmp6;
1737
    tmp5 -= tmp6;
1738
    dataptr[2] = (DCTELEM)
1739
      DESCALE(MULTIPLY(tmp0, FIX(1.373119086)) +   /* c2 */
1740
	      MULTIPLY(tmp1, FIX(1.058554052)) +   /* c6 */
1741
	      MULTIPLY(tmp2, FIX(0.501487041)) -   /* c10 */
1742
	      MULTIPLY(tmp3, FIX(0.170464608)) -   /* c12 */
1743
	      MULTIPLY(tmp4, FIX(0.803364869)) -   /* c8 */
1744
	      MULTIPLY(tmp5, FIX(1.252223920)),    /* c4 */
1745
	      CONST_BITS);
1746
    z1 = MULTIPLY(tmp0 - tmp2, FIX(1.155388986)) - /* (c4+c6)/2 */
1747
	 MULTIPLY(tmp3 - tmp4, FIX(0.435816023)) - /* (c2-c10)/2 */
1748
	 MULTIPLY(tmp1 - tmp5, FIX(0.316450131));  /* (c8-c12)/2 */
1749
    z2 = MULTIPLY(tmp0 + tmp2, FIX(0.096834934)) - /* (c4-c6)/2 */
1750
	 MULTIPLY(tmp3 + tmp4, FIX(0.937303064)) + /* (c2+c10)/2 */
1751
	 MULTIPLY(tmp1 + tmp5, FIX(0.486914739));  /* (c8+c12)/2 */
1752

1753
    dataptr[4] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS);
1754
    dataptr[6] = (DCTELEM) DESCALE(z1 - z2, CONST_BITS);
1755

1756
    /* Odd part */
1757

1758
    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.322312651));   /* c3 */
1759
    tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.163874945));   /* c5 */
1760
    tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.937797057)) +  /* c7 */
1761
	   MULTIPLY(tmp14 + tmp15, FIX(0.338443458));   /* c11 */
1762
    tmp0 = tmp1 + tmp2 + tmp3 -
1763
	   MULTIPLY(tmp10, FIX(2.020082300)) +          /* c3+c5+c7-c1 */
1764
	   MULTIPLY(tmp14, FIX(0.318774355));           /* c9-c11 */
1765
    tmp4 = MULTIPLY(tmp14 - tmp15, FIX(0.937797057)) -  /* c7 */
1766
	   MULTIPLY(tmp11 + tmp12, FIX(0.338443458));   /* c11 */
1767
    tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.163874945)); /* -c5 */
1768
    tmp1 += tmp4 + tmp5 +
1769
	    MULTIPLY(tmp11, FIX(0.837223564)) -         /* c5+c9+c11-c3 */
1770
	    MULTIPLY(tmp14, FIX(2.341699410));          /* c1+c7 */
1771
    tmp6 = MULTIPLY(tmp12 + tmp13, - FIX(0.657217813)); /* -c9 */
1772
    tmp2 += tmp4 + tmp6 -
1773
	    MULTIPLY(tmp12, FIX(1.572116027)) +         /* c1+c5-c9-c11 */
1774
	    MULTIPLY(tmp15, FIX(2.260109708));          /* c3+c7 */
1775
    tmp3 += tmp5 + tmp6 +
1776
	    MULTIPLY(tmp13, FIX(2.205608352)) -         /* c3+c5+c9-c7 */
1777
	    MULTIPLY(tmp15, FIX(1.742345811));          /* c1+c11 */
1778

1779
    dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS);
1780
    dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS);
1781
    dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS);
1782
    dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS);
1783

1784
    ctr++;
1785

1786
    if (ctr != DCTSIZE) {
1787
      if (ctr == 13)
1788
	break;			/* Done. */
1789
      dataptr += DCTSIZE;	/* advance pointer to next row */
1790
    } else
1791
      dataptr = workspace;	/* switch pointer to extended workspace */
1792
  }
1793

1794
  /* Pass 2: process columns.
1795
   * We leave the results scaled up by an overall factor of 8.
1796
   * We must also scale the output by (8/13)**2 = 64/169, which we partially
1797
   * fold into the constant multipliers and final shifting:
1798
   * cK now represents sqrt(2) * cos(K*pi/26) * 128/169.
1799
   */
1800

1801
  dataptr = data;
1802
  wsptr = workspace;
1803
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1804
    /* Even part */
1805

1806
    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*4];
1807
    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*3];
1808
    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*2];
1809
    tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*1];
1810
    tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*0];
1811
    tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*7];
1812
    tmp6 = dataptr[DCTSIZE*6];
1813

1814
    tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*4];
1815
    tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*3];
1816
    tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*2];
1817
    tmp13 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*1];
1818
    tmp14 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*0];
1819
    tmp15 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*7];
1820

1821
    dataptr[DCTSIZE*0] = (DCTELEM)
1822
      DESCALE(MULTIPLY(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6,
1823
		       FIX(0.757396450)),          /* 128/169 */
1824
	      CONST_BITS+1+PASS2_BITS-PASS1_BITS);
1825
    tmp6 += tmp6;
1826
    tmp0 -= tmp6;
1827
    tmp1 -= tmp6;
1828
    tmp2 -= tmp6;
1829
    tmp3 -= tmp6;
1830
    tmp4 -= tmp6;
1831
    tmp5 -= tmp6;
1832
    dataptr[DCTSIZE*2] = (DCTELEM)
1833
      DESCALE(MULTIPLY(tmp0, FIX(1.039995521)) +   /* c2 */
1834
	      MULTIPLY(tmp1, FIX(0.801745081)) +   /* c6 */
1835
	      MULTIPLY(tmp2, FIX(0.379824504)) -   /* c10 */
1836
	      MULTIPLY(tmp3, FIX(0.129109289)) -   /* c12 */
1837
	      MULTIPLY(tmp4, FIX(0.608465700)) -   /* c8 */
1838
	      MULTIPLY(tmp5, FIX(0.948429952)),    /* c4 */
1839
	      CONST_BITS+1+PASS2_BITS-PASS1_BITS);
1840
    z1 = MULTIPLY(tmp0 - tmp2, FIX(0.875087516)) - /* (c4+c6)/2 */
1841
	 MULTIPLY(tmp3 - tmp4, FIX(0.330085509)) - /* (c2-c10)/2 */
1842
	 MULTIPLY(tmp1 - tmp5, FIX(0.239678205));  /* (c8-c12)/2 */
1843
    z2 = MULTIPLY(tmp0 + tmp2, FIX(0.073342435)) - /* (c4-c6)/2 */
1844
	 MULTIPLY(tmp3 + tmp4, FIX(0.709910013)) + /* (c2+c10)/2 */
1845
	 MULTIPLY(tmp1 + tmp5, FIX(0.368787494));  /* (c8+c12)/2 */
1846

1847
    dataptr[DCTSIZE*4] = (DCTELEM)
1848
      DESCALE(z1 + z2, CONST_BITS+1+PASS2_BITS-PASS1_BITS);
1849
    dataptr[DCTSIZE*6] = (DCTELEM)
1850
      DESCALE(z1 - z2, CONST_BITS+1+PASS2_BITS-PASS1_BITS);
1851

1852
    /* Odd part */
1853

1854
    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.001514908));   /* c3 */
1855
    tmp2 = MULTIPLY(tmp10 + tmp12, FIX(0.881514751));   /* c5 */
1856
    tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.710284161)) +  /* c7 */
1857
	   MULTIPLY(tmp14 + tmp15, FIX(0.256335874));   /* c11 */
1858
    tmp0 = tmp1 + tmp2 + tmp3 -
1859
	   MULTIPLY(tmp10, FIX(1.530003162)) +          /* c3+c5+c7-c1 */
1860
	   MULTIPLY(tmp14, FIX(0.241438564));           /* c9-c11 */
1861
    tmp4 = MULTIPLY(tmp14 - tmp15, FIX(0.710284161)) -  /* c7 */
1862
	   MULTIPLY(tmp11 + tmp12, FIX(0.256335874));   /* c11 */
1863
    tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(0.881514751)); /* -c5 */
1864
    tmp1 += tmp4 + tmp5 +
1865
	    MULTIPLY(tmp11, FIX(0.634110155)) -         /* c5+c9+c11-c3 */
1866
	    MULTIPLY(tmp14, FIX(1.773594819));          /* c1+c7 */
1867
    tmp6 = MULTIPLY(tmp12 + tmp13, - FIX(0.497774438)); /* -c9 */
1868
    tmp2 += tmp4 + tmp6 -
1869
	    MULTIPLY(tmp12, FIX(1.190715098)) +         /* c1+c5-c9-c11 */
1870
	    MULTIPLY(tmp15, FIX(1.711799069));          /* c3+c7 */
1871
    tmp3 += tmp5 + tmp6 +
1872
	    MULTIPLY(tmp13, FIX(1.670519935)) -         /* c3+c5+c9-c7 */
1873
	    MULTIPLY(tmp15, FIX(1.319646532));          /* c1+c11 */
1874

1875
    dataptr[DCTSIZE*1] = (DCTELEM)
1876
      DESCALE(tmp0, CONST_BITS+1+PASS2_BITS-PASS1_BITS);
1877
    dataptr[DCTSIZE*3] = (DCTELEM)
1878
      DESCALE(tmp1, CONST_BITS+1+PASS2_BITS-PASS1_BITS);
1879
    dataptr[DCTSIZE*5] = (DCTELEM)
1880
      DESCALE(tmp2, CONST_BITS+1+PASS2_BITS-PASS1_BITS);
1881
    dataptr[DCTSIZE*7] = (DCTELEM)
1882
      DESCALE(tmp3, CONST_BITS+1+PASS2_BITS-PASS1_BITS);
1883

1884
    dataptr++;			/* advance pointer to next column */
1885
    wsptr++;			/* advance pointer to next column */
1886
  }
1887
}
1888

1889

1890
/*
1891
 * Perform the forward DCT on a 14x14 sample block.
1892
 */
1893

1894
GLOBAL(void)
1895
jpeg_fdct_14x14 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1896
{
1897
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1898
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
1899
  DCTELEM workspace[8*6];
1900
  DCTELEM *dataptr;
1901
  DCTELEM *wsptr;
1902
  JSAMPROW elemptr;
1903
  int ctr;
1904
  SHIFT_TEMPS
1905

1906
  /* Pass 1: process rows.
1907
   * Note results are scaled up by sqrt(8) compared to a true DCT.
1908
   * cK represents sqrt(2) * cos(K*pi/28).
1909
   */
1910

1911
  dataptr = data;
1912
  ctr = 0;
1913
  for (;;) {
1914
    elemptr = sample_data[ctr] + start_col;
1915

1916
    /* Even part */
1917

1918
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]);
1919
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]);
1920
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]);
1921
    tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]);
1922
    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]);
1923
    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]);
1924
    tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]);
1925

1926
    tmp10 = tmp0 + tmp6;
1927
    tmp14 = tmp0 - tmp6;
1928
    tmp11 = tmp1 + tmp5;
1929
    tmp15 = tmp1 - tmp5;
1930
    tmp12 = tmp2 + tmp4;
1931
    tmp16 = tmp2 - tmp4;
1932

1933
    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]);
1934
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]);
1935
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]);
1936
    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]);
1937
    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]);
1938
    tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]);
1939
    tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]);
1940

1941
    /* Apply unsigned->signed conversion. */
1942
    dataptr[0] = (DCTELEM)
1943
      (tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE);
1944
    tmp13 += tmp13;
1945
    dataptr[4] = (DCTELEM)
1946
      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */
1947
	      MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */
1948
	      MULTIPLY(tmp12 - tmp13, FIX(0.881747734)),  /* c8 */
1949
	      CONST_BITS);
1950

1951
    tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686));    /* c6 */
1952

1953
    dataptr[2] = (DCTELEM)
1954
      DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590))   /* c2-c6 */
1955
	      + MULTIPLY(tmp16, FIX(0.613604268)),        /* c10 */
1956
	      CONST_BITS);
1957
    dataptr[6] = (DCTELEM)
1958
      DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954))   /* c6+c10 */
1959
	      - MULTIPLY(tmp16, FIX(1.378756276)),        /* c2 */
1960
	      CONST_BITS);
1961

1962
    /* Odd part */
1963

1964
    tmp10 = tmp1 + tmp2;
1965
    tmp11 = tmp5 - tmp4;
1966
    dataptr[7] = (DCTELEM) (tmp0 - tmp10 + tmp3 - tmp11 - tmp6);
1967
    tmp3 <<= CONST_BITS;
1968
    tmp10 = MULTIPLY(tmp10, - FIX(0.158341681));          /* -c13 */
1969
    tmp11 = MULTIPLY(tmp11, FIX(1.405321284));            /* c1 */
1970
    tmp10 += tmp11 - tmp3;
1971
    tmp11 = MULTIPLY(tmp0 + tmp2, FIX(1.197448846)) +     /* c5 */
1972
	    MULTIPLY(tmp4 + tmp6, FIX(0.752406978));      /* c9 */
1973
    dataptr[5] = (DCTELEM)
1974
      DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(2.373959773)) /* c3+c5-c13 */
1975
	      + MULTIPLY(tmp4, FIX(1.119999435)),         /* c1+c11-c9 */
1976
	      CONST_BITS);
1977
    tmp12 = MULTIPLY(tmp0 + tmp1, FIX(1.334852607)) +     /* c3 */
1978
	    MULTIPLY(tmp5 - tmp6, FIX(0.467085129));      /* c11 */
1979
    dataptr[3] = (DCTELEM)
1980
      DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.424103948)) /* c3-c9-c13 */
1981
	      - MULTIPLY(tmp5, FIX(3.069855259)),         /* c1+c5+c11 */
1982
	      CONST_BITS);
1983
    dataptr[1] = (DCTELEM)
1984
      DESCALE(tmp11 + tmp12 + tmp3 + tmp6 -
1985
	      MULTIPLY(tmp0 + tmp6, FIX(1.126980169)),    /* c3+c5-c1 */
1986
	      CONST_BITS);
1987

1988
    ctr++;
1989

1990
    if (ctr != DCTSIZE) {
1991
      if (ctr == 14)
1992
	break;			/* Done. */
1993
      dataptr += DCTSIZE;	/* advance pointer to next row */
1994
    } else
1995
      dataptr = workspace;	/* switch pointer to extended workspace */
1996
  }
1997

1998
  /* Pass 2: process columns.
1999
   * We leave the results scaled up by an overall factor of 8.
2000
   * We must also scale the output by (8/14)**2 = 16/49, which we partially
2001
   * fold into the constant multipliers and final shifting:
2002
   * cK now represents sqrt(2) * cos(K*pi/28) * 32/49.
2003
   */
2004

2005
  dataptr = data;
2006
  wsptr = workspace;
2007
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2008
    /* Even part */
2009

2010
    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*5];
2011
    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*4];
2012
    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*3];
2013
    tmp13 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*2];
2014
    tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*1];
2015
    tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*0];
2016
    tmp6 = dataptr[DCTSIZE*6] + dataptr[DCTSIZE*7];
2017

2018
    tmp10 = tmp0 + tmp6;
2019
    tmp14 = tmp0 - tmp6;
2020
    tmp11 = tmp1 + tmp5;
2021
    tmp15 = tmp1 - tmp5;
2022
    tmp12 = tmp2 + tmp4;
2023
    tmp16 = tmp2 - tmp4;
2024

2025
    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*5];
2026
    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*4];
2027
    tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*3];
2028
    tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*2];
2029
    tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*1];
2030
    tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*0];
2031
    tmp6 = dataptr[DCTSIZE*6] - dataptr[DCTSIZE*7];
2032

2033
    dataptr[DCTSIZE*0] = (DCTELEM)
2034
      DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12 + tmp13,
2035
		       FIX(0.653061224)),                 /* 32/49 */
2036
	      CONST_BITS+1+PASS2_BITS-PASS1_BITS);
2037
    tmp13 += tmp13;
2038
    dataptr[DCTSIZE*4] = (DCTELEM)
2039
      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(0.832106052)) + /* c4 */
2040
	      MULTIPLY(tmp11 - tmp13, FIX(0.205513223)) - /* c12 */
2041
	      MULTIPLY(tmp12 - tmp13, FIX(0.575835255)),  /* c8 */
2042
	      CONST_BITS+1+PASS2_BITS-PASS1_BITS);
2043

2044
    tmp10 = MULTIPLY(tmp14 + tmp15, FIX(0.722074570));    /* c6 */
2045

2046
    dataptr[DCTSIZE*2] = (DCTELEM)
2047
      DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.178337691))   /* c2-c6 */
2048
	      + MULTIPLY(tmp16, FIX(0.400721155)),        /* c10 */
2049
	      CONST_BITS+1+PASS2_BITS-PASS1_BITS);
2050
    dataptr[DCTSIZE*6] = (DCTELEM)
2051
      DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.122795725))   /* c6+c10 */
2052
	      - MULTIPLY(tmp16, FIX(0.900412262)),        /* c2 */
2053
	      CONST_BITS+1+PASS2_BITS-PASS1_BITS);
2054

2055
    /* Odd part */
2056

2057
    tmp10 = tmp1 + tmp2;
2058
    tmp11 = tmp5 - tmp4;
2059
    dataptr[DCTSIZE*7] = (DCTELEM)
2060
      DESCALE(MULTIPLY(tmp0 - tmp10 + tmp3 - tmp11 - tmp6,
2061
		       FIX(0.653061224)),                 /* 32/49 */
2062
	      CONST_BITS+1+PASS2_BITS-PASS1_BITS);
2063
    tmp3  = MULTIPLY(tmp3 , FIX(0.653061224));            /* 32/49 */
2064
    tmp10 = MULTIPLY(tmp10, - FIX(0.103406812));          /* -c13 */
2065
    tmp11 = MULTIPLY(tmp11, FIX(0.917760839));            /* c1 */
2066
    tmp10 += tmp11 - tmp3;
2067
    tmp11 = MULTIPLY(tmp0 + tmp2, FIX(0.782007410)) +     /* c5 */
2068
	    MULTIPLY(tmp4 + tmp6, FIX(0.491367823));      /* c9 */
2069
    dataptr[DCTSIZE*5] = (DCTELEM)
2070
      DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(1.550341076)) /* c3+c5-c13 */
2071
	      + MULTIPLY(tmp4, FIX(0.731428202)),         /* c1+c11-c9 */
2072
	      CONST_BITS+1+PASS2_BITS-PASS1_BITS);
2073
    tmp12 = MULTIPLY(tmp0 + tmp1, FIX(0.871740478)) +     /* c3 */
2074
	    MULTIPLY(tmp5 - tmp6, FIX(0.305035186));      /* c11 */
2075
    dataptr[DCTSIZE*3] = (DCTELEM)
2076
      DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.276965844)) /* c3-c9-c13 */
2077
	      - MULTIPLY(tmp5, FIX(2.004803435)),         /* c1+c5+c11 */
2078
	      CONST_BITS+1+PASS2_BITS-PASS1_BITS);
2079
    dataptr[DCTSIZE*1] = (DCTELEM)
2080
      DESCALE(tmp11 + tmp12 + tmp3
2081
	      - MULTIPLY(tmp0, FIX(0.735987049))          /* c3+c5-c1 */
2082
	      - MULTIPLY(tmp6, FIX(0.082925825)),         /* c9-c11-c13 */
2083
	      CONST_BITS+1+PASS2_BITS-PASS1_BITS);
2084

2085
    dataptr++;			/* advance pointer to next column */
2086
    wsptr++;			/* advance pointer to next column */
2087
  }
2088
}
2089

2090

2091
/*
2092
 * Perform the forward DCT on a 15x15 sample block.
2093
 */
2094

2095
GLOBAL(void)
2096
jpeg_fdct_15x15 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2097
{
2098
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2099
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
2100
  INT32 z1, z2, z3;
2101
  DCTELEM workspace[8*7];
2102
  DCTELEM *dataptr;
2103
  DCTELEM *wsptr;
2104
  JSAMPROW elemptr;
2105
  int ctr;
2106
  SHIFT_TEMPS
2107

2108
  /* Pass 1: process rows.
2109
   * Note results are scaled up by sqrt(8) compared to a true DCT.
2110
   * cK represents sqrt(2) * cos(K*pi/30).
2111
   */
2112

2113
  dataptr = data;
2114
  ctr = 0;
2115
  for (;;) {
2116
    elemptr = sample_data[ctr] + start_col;
2117

2118
    /* Even part */
2119

2120
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[14]);
2121
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[13]);
2122
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[12]);
2123
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[11]);
2124
    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[10]);
2125
    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[9]);
2126
    tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[8]);
2127
    tmp7 = GETJSAMPLE(elemptr[7]);
2128

2129
    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[14]);
2130
    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[13]);
2131
    tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[12]);
2132
    tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[11]);
2133
    tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[10]);
2134
    tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[9]);
2135
    tmp16 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[8]);
2136

2137
    z1 = tmp0 + tmp4 + tmp5;
2138
    z2 = tmp1 + tmp3 + tmp6;
2139
    z3 = tmp2 + tmp7;
2140
    /* Apply unsigned->signed conversion. */
2141
    dataptr[0] = (DCTELEM) (z1 + z2 + z3 - 15 * CENTERJSAMPLE);
2142
    z3 += z3;
2143
    dataptr[6] = (DCTELEM)
2144
      DESCALE(MULTIPLY(z1 - z3, FIX(1.144122806)) - /* c6 */
2145
	      MULTIPLY(z2 - z3, FIX(0.437016024)),  /* c12 */
2146
	      CONST_BITS);
2147
    tmp2 += ((tmp1 + tmp4) >> 1) - tmp7 - tmp7;
2148
    z1 = MULTIPLY(tmp3 - tmp2, FIX(1.531135173)) -  /* c2+c14 */
2149
         MULTIPLY(tmp6 - tmp2, FIX(2.238241955));   /* c4+c8 */
2150
    z2 = MULTIPLY(tmp5 - tmp2, FIX(0.798468008)) -  /* c8-c14 */
2151
	 MULTIPLY(tmp0 - tmp2, FIX(0.091361227));   /* c2-c4 */
2152
    z3 = MULTIPLY(tmp0 - tmp3, FIX(1.383309603)) +  /* c2 */
2153
	 MULTIPLY(tmp6 - tmp5, FIX(0.946293579)) +  /* c8 */
2154
	 MULTIPLY(tmp1 - tmp4, FIX(0.790569415));   /* (c6+c12)/2 */
2155

2156
    dataptr[2] = (DCTELEM) DESCALE(z1 + z3, CONST_BITS);
2157
    dataptr[4] = (DCTELEM) DESCALE(z2 + z3, CONST_BITS);
2158

2159
    /* Odd part */
2160

2161
    tmp2 = MULTIPLY(tmp10 - tmp12 - tmp13 + tmp15 + tmp16,
2162
		    FIX(1.224744871));                         /* c5 */
2163
    tmp1 = MULTIPLY(tmp10 - tmp14 - tmp15, FIX(1.344997024)) + /* c3 */
2164
	   MULTIPLY(tmp11 - tmp13 - tmp16, FIX(0.831253876));  /* c9 */
2165
    tmp12 = MULTIPLY(tmp12, FIX(1.224744871));                 /* c5 */
2166
    tmp4 = MULTIPLY(tmp10 - tmp16, FIX(1.406466353)) +         /* c1 */
2167
	   MULTIPLY(tmp11 + tmp14, FIX(1.344997024)) +         /* c3 */
2168
	   MULTIPLY(tmp13 + tmp15, FIX(0.575212477));          /* c11 */
2169
    tmp0 = MULTIPLY(tmp13, FIX(0.475753014)) -                 /* c7-c11 */
2170
	   MULTIPLY(tmp14, FIX(0.513743148)) +                 /* c3-c9 */
2171
	   MULTIPLY(tmp16, FIX(1.700497885)) + tmp4 + tmp12;   /* c1+c13 */
2172
    tmp3 = MULTIPLY(tmp10, - FIX(0.355500862)) -               /* -(c1-c7) */
2173
	   MULTIPLY(tmp11, FIX(2.176250899)) -                 /* c3+c9 */
2174
	   MULTIPLY(tmp15, FIX(0.869244010)) + tmp4 - tmp12;   /* c11+c13 */
2175

2176
    dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS);
2177
    dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS);
2178
    dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS);
2179
    dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS);
2180

2181
    ctr++;
2182

2183
    if (ctr != DCTSIZE) {
2184
      if (ctr == 15)
2185
	break;			/* Done. */
2186
      dataptr += DCTSIZE;	/* advance pointer to next row */
2187
    } else
2188
      dataptr = workspace;	/* switch pointer to extended workspace */
2189
  }
2190

2191
  /* Pass 2: process columns.
2192
   * We leave the results scaled up by an overall factor of 8.
2193
   * We must also scale the output by (8/15)**2 = 64/225, which we partially
2194
   * fold into the constant multipliers and final shifting:
2195
   * cK now represents sqrt(2) * cos(K*pi/30) * 256/225.
2196
   */
2197

2198
  dataptr = data;
2199
  wsptr = workspace;
2200
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2201
    /* Even part */
2202

2203
    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*6];
2204
    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*5];
2205
    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*4];
2206
    tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*3];
2207
    tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*2];
2208
    tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*1];
2209
    tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*0];
2210
    tmp7 = dataptr[DCTSIZE*7];
2211

2212
    tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*6];
2213
    tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*5];
2214
    tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*4];
2215
    tmp13 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*3];
2216
    tmp14 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*2];
2217
    tmp15 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*1];
2218
    tmp16 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*0];
2219

2220
    z1 = tmp0 + tmp4 + tmp5;
2221
    z2 = tmp1 + tmp3 + tmp6;
2222
    z3 = tmp2 + tmp7;
2223
    dataptr[DCTSIZE*0] = (DCTELEM)
2224
      DESCALE(MULTIPLY(z1 + z2 + z3, FIX(1.137777778)), /* 256/225 */
2225
	      CONST_BITS+2+PASS2_BITS-PASS1_BITS);
2226
    z3 += z3;
2227
    dataptr[DCTSIZE*6] = (DCTELEM)
2228
      DESCALE(MULTIPLY(z1 - z3, FIX(1.301757503)) - /* c6 */
2229
	      MULTIPLY(z2 - z3, FIX(0.497227121)),  /* c12 */
2230
	      CONST_BITS+2+PASS2_BITS-PASS1_BITS);
2231
    tmp2 += ((tmp1 + tmp4) >> 1) - tmp7 - tmp7;
2232
    z1 = MULTIPLY(tmp3 - tmp2, FIX(1.742091575)) -  /* c2+c14 */
2233
         MULTIPLY(tmp6 - tmp2, FIX(2.546621957));   /* c4+c8 */
2234
    z2 = MULTIPLY(tmp5 - tmp2, FIX(0.908479156)) -  /* c8-c14 */
2235
	 MULTIPLY(tmp0 - tmp2, FIX(0.103948774));   /* c2-c4 */
2236
    z3 = MULTIPLY(tmp0 - tmp3, FIX(1.573898926)) +  /* c2 */
2237
	 MULTIPLY(tmp6 - tmp5, FIX(1.076671805)) +  /* c8 */
2238
	 MULTIPLY(tmp1 - tmp4, FIX(0.899492312));   /* (c6+c12)/2 */
2239

2240
    dataptr[DCTSIZE*2] = (DCTELEM)
2241
      DESCALE(z1 + z3, CONST_BITS+2+PASS2_BITS-PASS1_BITS);
2242
    dataptr[DCTSIZE*4] = (DCTELEM)
2243
      DESCALE(z2 + z3, CONST_BITS+2+PASS2_BITS-PASS1_BITS);
2244

2245
    /* Odd part */
2246

2247
    tmp2 = MULTIPLY(tmp10 - tmp12 - tmp13 + tmp15 + tmp16,
2248
		    FIX(1.393487498));                         /* c5 */
2249
    tmp1 = MULTIPLY(tmp10 - tmp14 - tmp15, FIX(1.530307725)) + /* c3 */
2250
	   MULTIPLY(tmp11 - tmp13 - tmp16, FIX(0.945782187));  /* c9 */
2251
    tmp12 = MULTIPLY(tmp12, FIX(1.393487498));                 /* c5 */
2252
    tmp4 = MULTIPLY(tmp10 - tmp16, FIX(1.600246161)) +         /* c1 */
2253
	   MULTIPLY(tmp11 + tmp14, FIX(1.530307725)) +         /* c3 */
2254
	   MULTIPLY(tmp13 + tmp15, FIX(0.654463974));          /* c11 */
2255
    tmp0 = MULTIPLY(tmp13, FIX(0.541301207)) -                 /* c7-c11 */
2256
	   MULTIPLY(tmp14, FIX(0.584525538)) +                 /* c3-c9 */
2257
	   MULTIPLY(tmp16, FIX(1.934788705)) + tmp4 + tmp12;   /* c1+c13 */
2258
    tmp3 = MULTIPLY(tmp10, - FIX(0.404480980)) -               /* -(c1-c7) */
2259
	   MULTIPLY(tmp11, FIX(2.476089912)) -                 /* c3+c9 */
2260
	   MULTIPLY(tmp15, FIX(0.989006518)) + tmp4 - tmp12;   /* c11+c13 */
2261

2262
    dataptr[DCTSIZE*1] = (DCTELEM)
2263
      DESCALE(tmp0, CONST_BITS+2+PASS2_BITS-PASS1_BITS);
2264
    dataptr[DCTSIZE*3] = (DCTELEM)
2265
      DESCALE(tmp1, CONST_BITS+2+PASS2_BITS-PASS1_BITS);
2266
    dataptr[DCTSIZE*5] = (DCTELEM)
2267
      DESCALE(tmp2, CONST_BITS+2+PASS2_BITS-PASS1_BITS);
2268
    dataptr[DCTSIZE*7] = (DCTELEM)
2269
      DESCALE(tmp3, CONST_BITS+2+PASS2_BITS-PASS1_BITS);
2270

2271
    dataptr++;			/* advance pointer to next column */
2272
    wsptr++;			/* advance pointer to next column */
2273
  }
2274
}
2275

2276

2277
/*
2278
 * Perform the forward DCT on a 16x16 sample block.
2279
 */
2280

2281
GLOBAL(void)
2282
jpeg_fdct_16x16 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2283
{
2284
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2285
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
2286
  DCTELEM workspace[DCTSIZE2];
2287
  DCTELEM *dataptr;
2288
  DCTELEM *wsptr;
2289
  JSAMPROW elemptr;
2290
  int ctr;
2291
  SHIFT_TEMPS
2292

2293
  /* Pass 1: process rows.
2294
   * Note results are scaled up by sqrt(8) compared to a true DCT;
2295
   * furthermore, we scale the results by 2**PASS1_BITS.
2296
   * cK represents sqrt(2) * cos(K*pi/32).
2297
   */
2298

2299
  dataptr = data;
2300
  ctr = 0;
2301
  for (;;) {
2302
    elemptr = sample_data[ctr] + start_col;
2303

2304
    /* Even part */
2305

2306
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]);
2307
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]);
2308
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]);
2309
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]);
2310
    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]);
2311
    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]);
2312
    tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]);
2313
    tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]);
2314

2315
    tmp10 = tmp0 + tmp7;
2316
    tmp14 = tmp0 - tmp7;
2317
    tmp11 = tmp1 + tmp6;
2318
    tmp15 = tmp1 - tmp6;
2319
    tmp12 = tmp2 + tmp5;
2320
    tmp16 = tmp2 - tmp5;
2321
    tmp13 = tmp3 + tmp4;
2322
    tmp17 = tmp3 - tmp4;
2323

2324
    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]);
2325
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]);
2326
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]);
2327
    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]);
2328
    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]);
2329
    tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]);
2330
    tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]);
2331
    tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]);
2332

2333
    /* Apply unsigned->signed conversion. */
2334
    dataptr[0] =
2335
      PASS1_OUTPUT(tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE);
2336
    dataptr[4] = (DCTELEM)
2337
      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
2338
	      MULTIPLY(tmp11 - tmp12, FIX_0_541196100),   /* c12[16] = c6[8] */
2339
	      CONST_BITS-PASS1_BITS);
2340

2341
    tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) +   /* c14[16] = c7[8] */
2342
	    MULTIPLY(tmp14 - tmp16, FIX(1.387039845));    /* c2[16] = c1[8] */
2343

2344
    dataptr[2] = (DCTELEM)
2345
      DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982))   /* c6+c14 */
2346
	      + MULTIPLY(tmp16, FIX(2.172734804)),        /* c2+c10 */
2347
	      CONST_BITS-PASS1_BITS);
2348
    dataptr[6] = (DCTELEM)
2349
      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243))   /* c2-c6 */
2350
	      - MULTIPLY(tmp17, FIX(1.061594338)),        /* c10+c14 */
2351
	      CONST_BITS-PASS1_BITS);
2352

2353
    /* Odd part */
2354

2355
    tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) +         /* c3 */
2356
	    MULTIPLY(tmp6 - tmp7, FIX(0.410524528));          /* c13 */
2357
    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) +         /* c5 */
2358
	    MULTIPLY(tmp5 + tmp7, FIX(0.666655658));          /* c11 */
2359
    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) +         /* c7 */
2360
	    MULTIPLY(tmp4 - tmp7, FIX(0.897167586));          /* c9 */
2361
    tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) +         /* c15 */
2362
	    MULTIPLY(tmp6 - tmp5, FIX(1.407403738));          /* c1 */
2363
    tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) +       /* -c11 */
2364
	    MULTIPLY(tmp4 + tmp6, - FIX(1.247225013));        /* -c5 */
2365
    tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) +       /* -c3 */
2366
	    MULTIPLY(tmp5 - tmp4, FIX(0.410524528));          /* c13 */
2367
    tmp10 = tmp11 + tmp12 + tmp13 -
2368
	    MULTIPLY(tmp0, FIX(2.286341144)) +                /* c7+c5+c3-c1 */
2369
	    MULTIPLY(tmp7, FIX(0.779653625));                 /* c15+c13-c11+c9 */
2370
    tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
2371
	     - MULTIPLY(tmp6, FIX(1.663905119));              /* c7+c13+c1-c5 */
2372
    tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
2373
	     + MULTIPLY(tmp5, FIX(1.227391138));              /* c9-c11+c1-c13 */
2374
    tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
2375
	     + MULTIPLY(tmp4, FIX(2.167985692));              /* c1+c13+c5-c9 */
2376

2377
    dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
2378
    dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
2379
    dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
2380
    dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
2381

2382
    ctr++;
2383

2384
    if (ctr != DCTSIZE) {
2385
      if (ctr == DCTSIZE * 2)
2386
	break;			/* Done. */
2387
      dataptr += DCTSIZE;	/* advance pointer to next row */
2388
    } else
2389
      dataptr = workspace;	/* switch pointer to extended workspace */
2390
  }
2391

2392
  /* Pass 2: process columns.
2393
   * We apply the PASS2_BITS scaling, but leave the
2394
   * results scaled up by an overall factor of 8.
2395
   * We must also scale the output by (8/16)**2 = 1/2**2.
2396
   * cK represents sqrt(2) * cos(K*pi/32).
2397
   */
2398

2399
  dataptr = data;
2400
  wsptr = workspace;
2401
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2402
    /* Even part */
2403

2404
    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*7];
2405
    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*6];
2406
    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*5];
2407
    tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*4];
2408
    tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*3];
2409
    tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*2];
2410
    tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*1];
2411
    tmp7 = dataptr[DCTSIZE*7] + wsptr[DCTSIZE*0];
2412

2413
    tmp10 = tmp0 + tmp7;
2414
    tmp14 = tmp0 - tmp7;
2415
    tmp11 = tmp1 + tmp6;
2416
    tmp15 = tmp1 - tmp6;
2417
    tmp12 = tmp2 + tmp5;
2418
    tmp16 = tmp2 - tmp5;
2419
    tmp13 = tmp3 + tmp4;
2420
    tmp17 = tmp3 - tmp4;
2421

2422
    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*7];
2423
    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*6];
2424
    tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*5];
2425
    tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*4];
2426
    tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*3];
2427
    tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*2];
2428
    tmp6 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*1];
2429
    tmp7 = dataptr[DCTSIZE*7] - wsptr[DCTSIZE*0];
2430

2431
    dataptr[DCTSIZE*0] = (DCTELEM)
2432
      DESCALE(tmp10 + tmp11 + tmp12 + tmp13, PASS2_BITS+2);
2433
    dataptr[DCTSIZE*4] = (DCTELEM)
2434
      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
2435
	      MULTIPLY(tmp11 - tmp12, FIX_0_541196100),   /* c12[16] = c6[8] */
2436
	      CONST_BITS+PASS2_BITS+2);
2437

2438
    tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) +   /* c14[16] = c7[8] */
2439
	    MULTIPLY(tmp14 - tmp16, FIX(1.387039845));    /* c2[16] = c1[8] */
2440

2441
    dataptr[DCTSIZE*2] = (DCTELEM)
2442
      DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982))   /* c6+c14 */
2443
	      + MULTIPLY(tmp16, FIX(2.172734804)),        /* c2+10 */
2444
	      CONST_BITS+PASS2_BITS+2);
2445
    dataptr[DCTSIZE*6] = (DCTELEM)
2446
      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243))   /* c2-c6 */
2447
	      - MULTIPLY(tmp17, FIX(1.061594338)),        /* c10+c14 */
2448
	      CONST_BITS+PASS2_BITS+2);
2449

2450
    /* Odd part */
2451

2452
    tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) +         /* c3 */
2453
	    MULTIPLY(tmp6 - tmp7, FIX(0.410524528));          /* c13 */
2454
    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) +         /* c5 */
2455
	    MULTIPLY(tmp5 + tmp7, FIX(0.666655658));          /* c11 */
2456
    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) +         /* c7 */
2457
	    MULTIPLY(tmp4 - tmp7, FIX(0.897167586));          /* c9 */
2458
    tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) +         /* c15 */
2459
	    MULTIPLY(tmp6 - tmp5, FIX(1.407403738));          /* c1 */
2460
    tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) +       /* -c11 */
2461
	    MULTIPLY(tmp4 + tmp6, - FIX(1.247225013));        /* -c5 */
2462
    tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) +       /* -c3 */
2463
	    MULTIPLY(tmp5 - tmp4, FIX(0.410524528));          /* c13 */
2464
    tmp10 = tmp11 + tmp12 + tmp13 -
2465
	    MULTIPLY(tmp0, FIX(2.286341144)) +                /* c7+c5+c3-c1 */
2466
	    MULTIPLY(tmp7, FIX(0.779653625));                 /* c15+c13-c11+c9 */
2467
    tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
2468
	     - MULTIPLY(tmp6, FIX(1.663905119));              /* c7+c13+c1-c5 */
2469
    tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
2470
	     + MULTIPLY(tmp5, FIX(1.227391138));              /* c9-c11+c1-c13 */
2471
    tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
2472
	     + MULTIPLY(tmp4, FIX(2.167985692));              /* c1+c13+c5-c9 */
2473

2474
    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS2_BITS+2);
2475
    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS2_BITS+2);
2476
    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS2_BITS+2);
2477
    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS2_BITS+2);
2478

2479
    dataptr++;			/* advance pointer to next column */
2480
    wsptr++;			/* advance pointer to next column */
2481
  }
2482
}
2483

2484

2485
/*
2486
 * Perform the forward DCT on a 16x8 sample block.
2487
 *
2488
 * 16-point FDCT in pass 1 (rows), 8-point in pass 2 (columns).
2489
 */
2490

2491
GLOBAL(void)
2492
jpeg_fdct_16x8 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2493
{
2494
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2495
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
2496
  INT32 z1;
2497
  DCTELEM *dataptr;
2498
  JSAMPROW elemptr;
2499
  int ctr;
2500
  SHIFT_TEMPS
2501

2502
  /* Pass 1: process rows.
2503
   * Note results are scaled up by sqrt(8) compared to a true DCT;
2504
   * furthermore, we scale the results by 2**PASS1_BITS.
2505
   * 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
2506
   */
2507

2508
  dataptr = data;
2509
  ctr = 0;
2510
  for (ctr = 0; ctr < DCTSIZE; ctr++) {
2511
    elemptr = sample_data[ctr] + start_col;
2512

2513
    /* Even part */
2514

2515
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]);
2516
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]);
2517
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]);
2518
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]);
2519
    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]);
2520
    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]);
2521
    tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]);
2522
    tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]);
2523

2524
    tmp10 = tmp0 + tmp7;
2525
    tmp14 = tmp0 - tmp7;
2526
    tmp11 = tmp1 + tmp6;
2527
    tmp15 = tmp1 - tmp6;
2528
    tmp12 = tmp2 + tmp5;
2529
    tmp16 = tmp2 - tmp5;
2530
    tmp13 = tmp3 + tmp4;
2531
    tmp17 = tmp3 - tmp4;
2532

2533
    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]);
2534
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]);
2535
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]);
2536
    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]);
2537
    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]);
2538
    tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]);
2539
    tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]);
2540
    tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]);
2541

2542
    /* Apply unsigned->signed conversion. */
2543
    dataptr[0] =
2544
      PASS1_OUTPUT(tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE);
2545
    dataptr[4] = (DCTELEM)
2546
      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
2547
	      MULTIPLY(tmp11 - tmp12, FIX_0_541196100),   /* c12[16] = c6[8] */
2548
	      CONST_BITS-PASS1_BITS);
2549

2550
    tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) +   /* c14[16] = c7[8] */
2551
	    MULTIPLY(tmp14 - tmp16, FIX(1.387039845));    /* c2[16] = c1[8] */
2552

2553
    dataptr[2] = (DCTELEM)
2554
      DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982))   /* c6+c14 */
2555
	      + MULTIPLY(tmp16, FIX(2.172734804)),        /* c2+c10 */
2556
	      CONST_BITS-PASS1_BITS);
2557
    dataptr[6] = (DCTELEM)
2558
      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243))   /* c2-c6 */
2559
	      - MULTIPLY(tmp17, FIX(1.061594338)),        /* c10+c14 */
2560
	      CONST_BITS-PASS1_BITS);
2561

2562
    /* Odd part */
2563

2564
    tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) +         /* c3 */
2565
	    MULTIPLY(tmp6 - tmp7, FIX(0.410524528));          /* c13 */
2566
    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) +         /* c5 */
2567
	    MULTIPLY(tmp5 + tmp7, FIX(0.666655658));          /* c11 */
2568
    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) +         /* c7 */
2569
	    MULTIPLY(tmp4 - tmp7, FIX(0.897167586));          /* c9 */
2570
    tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) +         /* c15 */
2571
	    MULTIPLY(tmp6 - tmp5, FIX(1.407403738));          /* c1 */
2572
    tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) +       /* -c11 */
2573
	    MULTIPLY(tmp4 + tmp6, - FIX(1.247225013));        /* -c5 */
2574
    tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) +       /* -c3 */
2575
	    MULTIPLY(tmp5 - tmp4, FIX(0.410524528));          /* c13 */
2576
    tmp10 = tmp11 + tmp12 + tmp13 -
2577
	    MULTIPLY(tmp0, FIX(2.286341144)) +                /* c7+c5+c3-c1 */
2578
	    MULTIPLY(tmp7, FIX(0.779653625));                 /* c15+c13-c11+c9 */
2579
    tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
2580
	     - MULTIPLY(tmp6, FIX(1.663905119));              /* c7+c13+c1-c5 */
2581
    tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
2582
	     + MULTIPLY(tmp5, FIX(1.227391138));              /* c9-c11+c1-c13 */
2583
    tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
2584
	     + MULTIPLY(tmp4, FIX(2.167985692));              /* c1+c13+c5-c9 */
2585

2586
    dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
2587
    dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
2588
    dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
2589
    dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
2590

2591
    dataptr += DCTSIZE;		/* advance pointer to next row */
2592
  }
2593

2594
  /* Pass 2: process columns.
2595
   * We apply the PASS2_BITS scaling, but leave the
2596
   * results scaled up by an overall factor of 8.
2597
   * We must also scale the output by 8/16 = 1/2.
2598
   * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
2599
   */
2600

2601
  dataptr = data;
2602
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2603
    /* Even part per LL&M figure 1 --- note that published figure is faulty;
2604
     * rotator "c1" should be "c6".
2605
     */
2606

2607
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
2608
    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
2609
    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
2610
    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
2611

2612
    /* Add fudge factor here for final descale. */
2613
#if PASS2_BITS > 0
2614
    tmp10 = tmp0 + tmp3 + (ONE << PASS2_BITS);
2615
#else
2616
    tmp10 = tmp0 + tmp3 + ONE;
2617
#endif
2618
    tmp12 = tmp0 - tmp3;
2619
    tmp11 = tmp1 + tmp2;
2620
    tmp13 = tmp1 - tmp2;
2621

2622
    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
2623
    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
2624
    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
2625
    tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
2626

2627
    dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS2_BITS+1);
2628
    dataptr[DCTSIZE*4] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS2_BITS+1);
2629

2630
    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);       /* c6 */
2631
    /* Add fudge factor here for final descale. */
2632
    z1 += ONE << (CONST_BITS+PASS2_BITS);
2633

2634
    dataptr[DCTSIZE*2] = (DCTELEM)
2635
      RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
2636
		  CONST_BITS+PASS2_BITS+1);
2637
    dataptr[DCTSIZE*6] = (DCTELEM)
2638
      RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
2639
		  CONST_BITS+PASS2_BITS+1);
2640

2641
    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
2642
     * i0..i3 in the paper are tmp0..tmp3 here.
2643
     */
2644

2645
    tmp12 = tmp0 + tmp2;
2646
    tmp13 = tmp1 + tmp3;
2647

2648
    z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602);       /*  c3 */
2649
    /* Add fudge factor here for final descale. */
2650
    z1 += ONE << (CONST_BITS+PASS2_BITS);
2651

2652
    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);          /* -c3+c5 */
2653
    tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);          /* -c3-c5 */
2654
    tmp12 += z1;
2655
    tmp13 += z1;
2656

2657
    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223);       /* -c3+c7 */
2658
    tmp0 = MULTIPLY(tmp0, FIX_1_501321110);              /*  c1+c3-c5-c7 */
2659
    tmp3 = MULTIPLY(tmp3, FIX_0_298631336);              /* -c1+c3+c5-c7 */
2660
    tmp0 += z1 + tmp12;
2661
    tmp3 += z1 + tmp13;
2662

2663
    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447);       /* -c1-c3 */
2664
    tmp1 = MULTIPLY(tmp1, FIX_3_072711026);              /*  c1+c3+c5-c7 */
2665
    tmp2 = MULTIPLY(tmp2, FIX_2_053119869);              /*  c1+c3-c5+c7 */
2666
    tmp1 += z1 + tmp13;
2667
    tmp2 += z1 + tmp12;
2668

2669
    dataptr[DCTSIZE*1] = (DCTELEM) RIGHT_SHIFT(tmp0, CONST_BITS+PASS2_BITS+1);
2670
    dataptr[DCTSIZE*3] = (DCTELEM) RIGHT_SHIFT(tmp1, CONST_BITS+PASS2_BITS+1);
2671
    dataptr[DCTSIZE*5] = (DCTELEM) RIGHT_SHIFT(tmp2, CONST_BITS+PASS2_BITS+1);
2672
    dataptr[DCTSIZE*7] = (DCTELEM) RIGHT_SHIFT(tmp3, CONST_BITS+PASS2_BITS+1);
2673

2674
    dataptr++;			/* advance pointer to next column */
2675
  }
2676
}
2677

2678

2679
/*
2680
 * Perform the forward DCT on a 14x7 sample block.
2681
 *
2682
 * 14-point FDCT in pass 1 (rows), 7-point in pass 2 (columns).
2683
 */
2684

2685
GLOBAL(void)
2686
jpeg_fdct_14x7 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2687
{
2688
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
2689
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
2690
  INT32 z1, z2, z3;
2691
  DCTELEM *dataptr;
2692
  JSAMPROW elemptr;
2693
  int ctr;
2694
  SHIFT_TEMPS
2695

2696
  /* Zero bottom row of output coefficient block. */
2697
  MEMZERO(&data[DCTSIZE*7], SIZEOF(DCTELEM) * DCTSIZE);
2698

2699
  /* Pass 1: process rows.
2700
   * Note results are scaled up by sqrt(8) compared to a true DCT;
2701
   * furthermore, we scale the results by 2**PASS1_BITS.
2702
   * 14-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/28).
2703
   */
2704

2705
  dataptr = data;
2706
  for (ctr = 0; ctr < 7; ctr++) {
2707
    elemptr = sample_data[ctr] + start_col;
2708

2709
    /* Even part */
2710

2711
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]);
2712
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]);
2713
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]);
2714
    tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]);
2715
    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]);
2716
    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]);
2717
    tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]);
2718

2719
    tmp10 = tmp0 + tmp6;
2720
    tmp14 = tmp0 - tmp6;
2721
    tmp11 = tmp1 + tmp5;
2722
    tmp15 = tmp1 - tmp5;
2723
    tmp12 = tmp2 + tmp4;
2724
    tmp16 = tmp2 - tmp4;
2725

2726
    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]);
2727
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]);
2728
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]);
2729
    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]);
2730
    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]);
2731
    tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]);
2732
    tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]);
2733

2734
    /* Apply unsigned->signed conversion. */
2735
    dataptr[0] =
2736
      PASS1_OUTPUT(tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE);
2737
    tmp13 += tmp13;
2738
    dataptr[4] = (DCTELEM)
2739
      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */
2740
	      MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */
2741
	      MULTIPLY(tmp12 - tmp13, FIX(0.881747734)),  /* c8 */
2742
	      CONST_BITS-PASS1_BITS);
2743

2744
    tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686));    /* c6 */
2745

2746
    dataptr[2] = (DCTELEM)
2747
      DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590))   /* c2-c6 */
2748
	      + MULTIPLY(tmp16, FIX(0.613604268)),        /* c10 */
2749
	      CONST_BITS-PASS1_BITS);
2750
    dataptr[6] = (DCTELEM)
2751
      DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954))   /* c6+c10 */
2752
	      - MULTIPLY(tmp16, FIX(1.378756276)),        /* c2 */
2753
	      CONST_BITS-PASS1_BITS);
2754

2755
    /* Odd part */
2756

2757
    tmp10 = tmp1 + tmp2;
2758
    tmp11 = tmp5 - tmp4;
2759
    dataptr[7] = PASS1_OUTPUT(tmp0 - tmp10 + tmp3 - tmp11 - tmp6);
2760
    tmp3 <<= CONST_BITS;
2761
    tmp10 = MULTIPLY(tmp10, - FIX(0.158341681));          /* -c13 */
2762
    tmp11 = MULTIPLY(tmp11, FIX(1.405321284));            /* c1 */
2763
    tmp10 += tmp11 - tmp3;
2764
    tmp11 = MULTIPLY(tmp0 + tmp2, FIX(1.197448846)) +     /* c5 */
2765
	    MULTIPLY(tmp4 + tmp6, FIX(0.752406978));      /* c9 */
2766
    dataptr[5] = (DCTELEM)
2767
      DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(2.373959773)) /* c3+c5-c13 */
2768
	      + MULTIPLY(tmp4, FIX(1.119999435)),         /* c1+c11-c9 */
2769
	      CONST_BITS-PASS1_BITS);
2770
    tmp12 = MULTIPLY(tmp0 + tmp1, FIX(1.334852607)) +     /* c3 */
2771
	    MULTIPLY(tmp5 - tmp6, FIX(0.467085129));      /* c11 */
2772
    dataptr[3] = (DCTELEM)
2773
      DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.424103948)) /* c3-c9-c13 */
2774
	      - MULTIPLY(tmp5, FIX(3.069855259)),         /* c1+c5+c11 */
2775
	      CONST_BITS-PASS1_BITS);
2776
    dataptr[1] = (DCTELEM)
2777
      DESCALE(tmp11 + tmp12 + tmp3 + tmp6 -
2778
	      MULTIPLY(tmp0 + tmp6, FIX(1.126980169)),    /* c3+c5-c1 */
2779
	      CONST_BITS-PASS1_BITS);
2780

2781
    dataptr += DCTSIZE;		/* advance pointer to next row */
2782
  }
2783

2784
  /* Pass 2: process columns.
2785
   * We apply the PASS2_BITS scaling, but leave the
2786
   * results scaled up by an overall factor of 8.
2787
   * We must also scale the output by (8/14)*(8/7) = 32/49, which we
2788
   * partially fold into the constant multipliers and final shifting:
2789
   * 7-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/14) * 64/49.
2790
   */
2791

2792
  dataptr = data;
2793
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2794
    /* Even part */
2795

2796
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*6];
2797
    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*5];
2798
    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*4];
2799
    tmp3 = dataptr[DCTSIZE*3];
2800

2801
    tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*6];
2802
    tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*5];
2803
    tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*4];
2804

2805
    z1 = tmp0 + tmp2;
2806
    dataptr[DCTSIZE*0] = (DCTELEM)
2807
      DESCALE(MULTIPLY(z1 + tmp1 + tmp3, FIX(1.306122449)), /* 64/49 */
2808
	      CONST_BITS+PASS2_BITS+1);
2809
    tmp3 += tmp3;
2810
    z1 -= tmp3;
2811
    z1 -= tmp3;
2812
    z1 = MULTIPLY(z1, FIX(0.461784020));                /* (c2+c6-c4)/2 */
2813
    z2 = MULTIPLY(tmp0 - tmp2, FIX(1.202428084));       /* (c2+c4-c6)/2 */
2814
    z3 = MULTIPLY(tmp1 - tmp2, FIX(0.411026446));       /* c6 */
2815
    dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS+PASS2_BITS+1);
2816
    z1 -= z2;
2817
    z2 = MULTIPLY(tmp0 - tmp1, FIX(1.151670509));       /* c4 */
2818
    dataptr[DCTSIZE*4] = (DCTELEM)
2819
      DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.923568041)), /* c2+c6-c4 */
2820
	      CONST_BITS+PASS2_BITS+1);
2821
    dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+PASS2_BITS+1);
2822

2823
    /* Odd part */
2824

2825
    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.221765677));   /* (c3+c1-c5)/2 */
2826
    tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.222383464));   /* (c3+c5-c1)/2 */
2827
    tmp0 = tmp1 - tmp2;
2828
    tmp1 += tmp2;
2829
    tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.800824523)); /* -c1 */
2830
    tmp1 += tmp2;
2831
    tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.801442310));   /* c5 */
2832
    tmp0 += tmp3;
2833
    tmp2 += tmp3 + MULTIPLY(tmp12, FIX(2.443531355));   /* c3+c1-c5 */
2834

2835
    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS2_BITS+1);
2836
    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS2_BITS+1);
2837
    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS2_BITS+1);
2838

2839
    dataptr++;			/* advance pointer to next column */
2840
  }
2841
}
2842

2843

2844
/*
2845
 * Perform the forward DCT on a 12x6 sample block.
2846
 *
2847
 * 12-point FDCT in pass 1 (rows), 6-point in pass 2 (columns).
2848
 */
2849

2850
GLOBAL(void)
2851
jpeg_fdct_12x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2852
{
2853
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
2854
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
2855
  DCTELEM *dataptr;
2856
  JSAMPROW elemptr;
2857
  int ctr;
2858
  SHIFT_TEMPS
2859

2860
  /* Zero 2 bottom rows of output coefficient block. */
2861
  MEMZERO(&data[DCTSIZE*6], SIZEOF(DCTELEM) * DCTSIZE * 2);
2862

2863
  /* Pass 1: process rows.
2864
   * Note results are scaled up by sqrt(8) compared to a true DCT;
2865
   * furthermore, we scale the results by 2**PASS1_BITS.
2866
   * 12-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/24).
2867
   */
2868

2869
  dataptr = data;
2870
  for (ctr = 0; ctr < 6; ctr++) {
2871
    elemptr = sample_data[ctr] + start_col;
2872

2873
    /* Even part */
2874

2875
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]);
2876
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]);
2877
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]);
2878
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]);
2879
    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]);
2880
    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]);
2881

2882
    tmp10 = tmp0 + tmp5;
2883
    tmp13 = tmp0 - tmp5;
2884
    tmp11 = tmp1 + tmp4;
2885
    tmp14 = tmp1 - tmp4;
2886
    tmp12 = tmp2 + tmp3;
2887
    tmp15 = tmp2 - tmp3;
2888

2889
    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]);
2890
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]);
2891
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]);
2892
    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]);
2893
    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]);
2894
    tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]);
2895

2896
    /* Apply unsigned->signed conversion. */
2897
    dataptr[0] =
2898
      PASS1_OUTPUT(tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE);
2899
    dataptr[6] = PASS1_OUTPUT(tmp13 - tmp14 - tmp15);
2900
    dataptr[4] = (DCTELEM)
2901
      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */
2902
	      CONST_BITS-PASS1_BITS);
2903
    dataptr[2] = (DCTELEM)
2904
      DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */
2905
	      CONST_BITS-PASS1_BITS);
2906

2907
    /* Odd part */
2908

2909
    tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100);    /* c9 */
2910
    tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865);   /* c3-c9 */
2911
    tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065);   /* c3+c9 */
2912
    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054));   /* c5 */
2913
    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669));   /* c7 */
2914
    tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */
2915
	    + MULTIPLY(tmp5, FIX(0.184591911));        /* c11 */
2916
    tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */
2917
    tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.339493912)) /* c1+c5-c11 */
2918
	    + MULTIPLY(tmp5, FIX(0.860918669));        /* c7 */
2919
    tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.725788011)) /* c1+c11-c7 */
2920
	    - MULTIPLY(tmp5, FIX(1.121971054));        /* c5 */
2921
    tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.306562965)) /* c3 */
2922
	    - MULTIPLY(tmp2 + tmp5, FIX_0_541196100);  /* c9 */
2923

2924
    dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
2925
    dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
2926
    dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
2927
    dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
2928

2929
    dataptr += DCTSIZE;		/* advance pointer to next row */
2930
  }
2931

2932
  /* Pass 2: process columns.
2933
   * We apply the PASS2_BITS scaling, but leave the
2934
   * results scaled up by an overall factor of 8.
2935
   * We must also scale the output by (8/12)*(8/6) = 8/9, which we
2936
   * partially fold into the constant multipliers and final shifting:
2937
   * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12) * 16/9.
2938
   */
2939

2940
  dataptr = data;
2941
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2942
    /* Even part */
2943

2944
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5];
2945
    tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4];
2946
    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];
2947

2948
    tmp10 = tmp0 + tmp2;
2949
    tmp12 = tmp0 - tmp2;
2950

2951
    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5];
2952
    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4];
2953
    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];
2954

2955
    dataptr[DCTSIZE*0] = (DCTELEM)
2956
      DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)),         /* 16/9 */
2957
	      CONST_BITS+PASS2_BITS+1);
2958
    dataptr[DCTSIZE*2] = (DCTELEM)
2959
      DESCALE(MULTIPLY(tmp12, FIX(2.177324216)),                 /* c2 */
2960
	      CONST_BITS+PASS2_BITS+1);
2961
    dataptr[DCTSIZE*4] = (DCTELEM)
2962
      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
2963
	      CONST_BITS+PASS2_BITS+1);
2964

2965
    /* Odd part */
2966

2967
    tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829));             /* c5 */
2968

2969
    dataptr[DCTSIZE*1] = (DCTELEM)
2970
      DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)),   /* 16/9 */
2971
	      CONST_BITS+PASS2_BITS+1);
2972
    dataptr[DCTSIZE*3] = (DCTELEM)
2973
      DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)),    /* 16/9 */
2974
	      CONST_BITS+PASS2_BITS+1);
2975
    dataptr[DCTSIZE*5] = (DCTELEM)
2976
      DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)),   /* 16/9 */
2977
	      CONST_BITS+PASS2_BITS+1);
2978

2979
    dataptr++;			/* advance pointer to next column */
2980
  }
2981
}
2982

2983

2984
/*
2985
 * Perform the forward DCT on a 10x5 sample block.
2986
 *
2987
 * 10-point FDCT in pass 1 (rows), 5-point in pass 2 (columns).
2988
 */
2989

2990
GLOBAL(void)
2991
jpeg_fdct_10x5 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2992
{
2993
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
2994
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
2995
  DCTELEM *dataptr;
2996
  JSAMPROW elemptr;
2997
  int ctr;
2998
  SHIFT_TEMPS
2999

3000
  /* Zero 3 bottom rows of output coefficient block. */
3001
  MEMZERO(&data[DCTSIZE*5], SIZEOF(DCTELEM) * DCTSIZE * 3);
3002

3003
  /* Pass 1: process rows.
3004
   * Note results are scaled up by sqrt(8) compared to a true DCT;
3005
   * furthermore, we scale the results by 2**PASS1_BITS.
3006
   * 10-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/20).
3007
   */
3008

3009
  dataptr = data;
3010
  for (ctr = 0; ctr < 5; ctr++) {
3011
    elemptr = sample_data[ctr] + start_col;
3012

3013
    /* Even part */
3014

3015
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]);
3016
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]);
3017
    tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]);
3018
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]);
3019
    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]);
3020

3021
    tmp10 = tmp0 + tmp4;
3022
    tmp13 = tmp0 - tmp4;
3023
    tmp11 = tmp1 + tmp3;
3024
    tmp14 = tmp1 - tmp3;
3025

3026
    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]);
3027
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]);
3028
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]);
3029
    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]);
3030
    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]);
3031

3032
    /* Apply unsigned->signed conversion. */
3033
    dataptr[0] =
3034
      PASS1_OUTPUT(tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE);
3035
    tmp12 += tmp12;
3036
    dataptr[4] = (DCTELEM)
3037
      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */
3038
	      MULTIPLY(tmp11 - tmp12, FIX(0.437016024)),  /* c8 */
3039
	      CONST_BITS-PASS1_BITS);
3040
    tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876));    /* c6 */
3041
    dataptr[2] = (DCTELEM)
3042
      DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)),  /* c2-c6 */
3043
	      CONST_BITS-PASS1_BITS);
3044
    dataptr[6] = (DCTELEM)
3045
      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)),  /* c2+c6 */
3046
	      CONST_BITS-PASS1_BITS);
3047

3048
    /* Odd part */
3049

3050
    tmp10 = tmp0 + tmp4;
3051
    tmp11 = tmp1 - tmp3;
3052
    dataptr[5] = PASS1_OUTPUT(tmp10 - tmp11 - tmp2);
3053
    tmp2 <<= CONST_BITS;
3054
    dataptr[1] = (DCTELEM)
3055
      DESCALE(MULTIPLY(tmp0, FIX(1.396802247)) +          /* c1 */
3056
	      MULTIPLY(tmp1, FIX(1.260073511)) + tmp2 +   /* c3 */
3057
	      MULTIPLY(tmp3, FIX(0.642039522)) +          /* c7 */
3058
	      MULTIPLY(tmp4, FIX(0.221231742)),           /* c9 */
3059
	      CONST_BITS-PASS1_BITS);
3060
    tmp12 = MULTIPLY(tmp0 - tmp4, FIX(0.951056516)) -     /* (c3+c7)/2 */
3061
	    MULTIPLY(tmp1 + tmp3, FIX(0.587785252));      /* (c1-c9)/2 */
3062
    tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.309016994)) +   /* (c3-c7)/2 */
3063
	    (tmp11 << (CONST_BITS - 1)) - tmp2;
3064
    dataptr[3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS-PASS1_BITS);
3065
    dataptr[7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS-PASS1_BITS);
3066

3067
    dataptr += DCTSIZE;		/* advance pointer to next row */
3068
  }
3069

3070
  /* Pass 2: process columns.
3071
   * We apply the PASS2_BITS scaling, but leave the
3072
   * results scaled up by an overall factor of 8.
3073
   * We must also scale the output by (8/10)*(8/5) = 32/25,
3074
   * which we fold into the constant multipliers:
3075
   * 5-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/10) * 32/25.
3076
   */
3077

3078
  dataptr = data;
3079
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
3080
    /* Even part */
3081

3082
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*4];
3083
    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*3];
3084
    tmp2 = dataptr[DCTSIZE*2];
3085

3086
    tmp10 = tmp0 + tmp1;
3087
    tmp11 = tmp0 - tmp1;
3088

3089
    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*4];
3090
    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*3];
3091

3092
    dataptr[DCTSIZE*0] = (DCTELEM)
3093
      DESCALE(MULTIPLY(tmp10 + tmp2, FIX(1.28)),        /* 32/25 */
3094
	      CONST_BITS+PASS2_BITS);
3095
    tmp11 = MULTIPLY(tmp11, FIX(1.011928851));          /* (c2+c4)/2 */
3096
    tmp10 -= tmp2 << 2;
3097
    tmp10 = MULTIPLY(tmp10, FIX(0.452548340));          /* (c2-c4)/2 */
3098
    dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS+PASS2_BITS);
3099
    dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS+PASS2_BITS);
3100

3101
    /* Odd part */
3102

3103
    tmp10 = MULTIPLY(tmp0 + tmp1, FIX(1.064004961));    /* c3 */
3104

3105
    dataptr[DCTSIZE*1] = (DCTELEM)
3106
      DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.657591230)), /* c1-c3 */
3107
	      CONST_BITS+PASS2_BITS);
3108
    dataptr[DCTSIZE*3] = (DCTELEM)
3109
      DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.785601151)), /* c1+c3 */
3110
	      CONST_BITS+PASS2_BITS);
3111

3112
    dataptr++;			/* advance pointer to next column */
3113
  }
3114
}
3115

3116

3117
/*
3118
 * Perform the forward DCT on an 8x4 sample block.
3119
 *
3120
 * 8-point FDCT in pass 1 (rows), 4-point in pass 2 (columns).
3121
 */
3122

3123
GLOBAL(void)
3124
jpeg_fdct_8x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3125
{
3126
  INT32 tmp0, tmp1, tmp2, tmp3;
3127
  INT32 tmp10, tmp11, tmp12, tmp13;
3128
  INT32 z1;
3129
  DCTELEM *dataptr;
3130
  JSAMPROW elemptr;
3131
  int ctr;
3132
  SHIFT_TEMPS
3133

3134
  /* Zero 4 bottom rows of output coefficient block. */
3135
  MEMZERO(&data[DCTSIZE*4], SIZEOF(DCTELEM) * DCTSIZE * 4);
3136

3137
  /* Pass 1: process rows.
3138
   * Note results are scaled up by sqrt(8) compared to a true DCT;
3139
   * furthermore, we scale the results by 2**PASS1_BITS.
3140
   * We must also scale the output by 8/4 = 2, which we add here.
3141
   * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
3142
   */
3143

3144
  dataptr = data;
3145
  for (ctr = 0; ctr < 4; ctr++) {
3146
    elemptr = sample_data[ctr] + start_col;
3147

3148
    /* Even part per LL&M figure 1 --- note that published figure is faulty;
3149
     * rotator "c1" should be "c6".
3150
     */
3151

3152
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
3153
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
3154
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
3155
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
3156

3157
    tmp10 = tmp0 + tmp3;
3158
    tmp12 = tmp0 - tmp3;
3159
    tmp11 = tmp1 + tmp2;
3160
    tmp13 = tmp1 - tmp2;
3161

3162
    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
3163
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
3164
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
3165
    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
3166

3167
    /* Apply unsigned->signed conversion. */
3168
    dataptr[0] = (DCTELEM)
3169
      ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << (PASS1_BITS+1));
3170
    dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << (PASS1_BITS+1));
3171

3172
    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);       /* c6 */
3173
    /* Add fudge factor here for final descale. */
3174
    z1 += ONE << (CONST_BITS-PASS1_BITS-2);
3175

3176
    dataptr[2] = (DCTELEM)
3177
      RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
3178
		  CONST_BITS-PASS1_BITS-1);
3179
    dataptr[6] = (DCTELEM)
3180
      RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
3181
		  CONST_BITS-PASS1_BITS-1);
3182

3183
    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
3184
     * i0..i3 in the paper are tmp0..tmp3 here.
3185
     */
3186

3187
    tmp12 = tmp0 + tmp2;
3188
    tmp13 = tmp1 + tmp3;
3189

3190
    z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602);       /*  c3 */
3191
    /* Add fudge factor here for final descale. */
3192
    z1 += ONE << (CONST_BITS-PASS1_BITS-2);
3193

3194
    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);          /* -c3+c5 */
3195
    tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);          /* -c3-c5 */
3196
    tmp12 += z1;
3197
    tmp13 += z1;
3198

3199
    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223);       /* -c3+c7 */
3200
    tmp0 = MULTIPLY(tmp0, FIX_1_501321110);              /*  c1+c3-c5-c7 */
3201
    tmp3 = MULTIPLY(tmp3, FIX_0_298631336);              /* -c1+c3+c5-c7 */
3202
    tmp0 += z1 + tmp12;
3203
    tmp3 += z1 + tmp13;
3204

3205
    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447);       /* -c1-c3 */
3206
    tmp1 = MULTIPLY(tmp1, FIX_3_072711026);              /*  c1+c3+c5-c7 */
3207
    tmp2 = MULTIPLY(tmp2, FIX_2_053119869);              /*  c1+c3-c5+c7 */
3208
    tmp1 += z1 + tmp13;
3209
    tmp2 += z1 + tmp12;
3210

3211
    dataptr[1] = (DCTELEM) RIGHT_SHIFT(tmp0, CONST_BITS-PASS1_BITS-1);
3212
    dataptr[3] = (DCTELEM) RIGHT_SHIFT(tmp1, CONST_BITS-PASS1_BITS-1);
3213
    dataptr[5] = (DCTELEM) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS-1);
3214
    dataptr[7] = (DCTELEM) RIGHT_SHIFT(tmp3, CONST_BITS-PASS1_BITS-1);
3215

3216
    dataptr += DCTSIZE;		/* advance pointer to next row */
3217
  }
3218

3219
  /* Pass 2: process columns.
3220
   * We apply the PASS2_BITS scaling, but leave the
3221
   * results scaled up by an overall factor of 8.
3222
   * 4-point FDCT kernel,
3223
   * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
3224
   */
3225

3226
  dataptr = data;
3227
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
3228
    /* Even part */
3229

3230
    /* Add fudge factor here for final descale. */
3231
#if PASS2_BITS > 1
3232
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3] + (ONE << (PASS2_BITS-1));
3233
#else
3234
#if PASS2_BITS > 0
3235
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3] + ONE;
3236
#else
3237
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3];
3238
#endif
3239
#endif
3240
    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];
3241

3242
    tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
3243
    tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];
3244

3245
    dataptr[DCTSIZE*0] = PASS2_OUTPUT(tmp0 + tmp1);
3246
    dataptr[DCTSIZE*2] = PASS2_OUTPUT(tmp0 - tmp1);
3247

3248
    /* Odd part */
3249

3250
    tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
3251
    /* Add fudge factor here for final descale. */
3252
    tmp0 += ONE << (CONST_BITS+PASS2_BITS-1);
3253

3254
    dataptr[DCTSIZE*1] = (DCTELEM)
3255
      RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
3256
		  CONST_BITS+PASS2_BITS);
3257
    dataptr[DCTSIZE*3] = (DCTELEM)
3258
      RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
3259
		  CONST_BITS+PASS2_BITS);
3260

3261
    dataptr++;			/* advance pointer to next column */
3262
  }
3263
}
3264

3265

3266
/*
3267
 * Perform the forward DCT on a 6x3 sample block.
3268
 *
3269
 * 6-point FDCT in pass 1 (rows), 3-point in pass 2 (columns).
3270
 */
3271

3272
GLOBAL(void)
3273
jpeg_fdct_6x3 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3274
{
3275
  INT32 tmp0, tmp1, tmp2;
3276
  INT32 tmp10, tmp11, tmp12;
3277
  DCTELEM *dataptr;
3278
  JSAMPROW elemptr;
3279
  int ctr;
3280
  SHIFT_TEMPS
3281

3282
  /* Pre-zero output coefficient block. */
3283
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3284

3285
  /* Pass 1: process rows.
3286
   * Note results are scaled up by sqrt(8) compared to a true DCT;
3287
   * furthermore, we scale the results by 2**PASS1_BITS.
3288
   * We scale the results further by 2 as part of output adaption
3289
   * scaling for different DCT size.
3290
   * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
3291
   */
3292

3293
  dataptr = data;
3294
  for (ctr = 0; ctr < 3; ctr++) {
3295
    elemptr = sample_data[ctr] + start_col;
3296

3297
    /* Even part */
3298

3299
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
3300
    tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
3301
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
3302

3303
    tmp10 = tmp0 + tmp2;
3304
    tmp12 = tmp0 - tmp2;
3305

3306
    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
3307
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
3308
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
3309

3310
    /* Apply unsigned->signed conversion. */
3311
    dataptr[0] = (DCTELEM)
3312
      ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << (PASS1_BITS+1));
3313
    dataptr[2] = (DCTELEM)
3314
      DESCALE(MULTIPLY(tmp12, FIX(1.224744871)),                 /* c2 */
3315
	      CONST_BITS-PASS1_BITS-1);
3316
    dataptr[4] = (DCTELEM)
3317
      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
3318
	      CONST_BITS-PASS1_BITS-1);
3319

3320
    /* Odd part */
3321

3322
    tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)),     /* c5 */
3323
		    CONST_BITS-PASS1_BITS-1);
3324

3325
    dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << (PASS1_BITS+1)));
3326
    dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << (PASS1_BITS+1));
3327
    dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << (PASS1_BITS+1)));
3328

3329
    dataptr += DCTSIZE;		/* advance pointer to next row */
3330
  }
3331

3332
  /* Pass 2: process columns.
3333
   * We apply the PASS2_BITS scaling, but leave the
3334
   * results scaled up by an overall factor of 8.
3335
   * We must also scale the output by (8/6)*(8/3) = 32/9, which we partially
3336
   * fold into the constant multipliers (other part was done in pass 1):
3337
   * 3-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/6) * 16/9.
3338
   */
3339

3340
  dataptr = data;
3341
  for (ctr = 0; ctr < 6; ctr++) {
3342
    /* Even part */
3343

3344
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*2];
3345
    tmp1 = dataptr[DCTSIZE*1];
3346

3347
    tmp2 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*2];
3348

3349
    dataptr[DCTSIZE*0] = (DCTELEM)
3350
      DESCALE(MULTIPLY(tmp0 + tmp1, FIX(1.777777778)),        /* 16/9 */
3351
	      CONST_BITS+PASS2_BITS);
3352
    dataptr[DCTSIZE*2] = (DCTELEM)
3353
      DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(1.257078722)), /* c2 */
3354
	      CONST_BITS+PASS2_BITS);
3355

3356
    /* Odd part */
3357

3358
    dataptr[DCTSIZE*1] = (DCTELEM)
3359
      DESCALE(MULTIPLY(tmp2, FIX(2.177324216)),               /* c1 */
3360
	      CONST_BITS+PASS2_BITS);
3361

3362
    dataptr++;			/* advance pointer to next column */
3363
  }
3364
}
3365

3366

3367
/*
3368
 * Perform the forward DCT on a 4x2 sample block.
3369
 *
3370
 * 4-point FDCT in pass 1 (rows), 2-point in pass 2 (columns).
3371
 */
3372

3373
GLOBAL(void)
3374
jpeg_fdct_4x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3375
{
3376
  DCTELEM tmp0, tmp2, tmp10, tmp12, tmp4, tmp5;
3377
  INT32 tmp1, tmp3, tmp11, tmp13;
3378
  INT32 z1, z2, z3;
3379
  JSAMPROW elemptr;
3380
  SHIFT_TEMPS
3381
#if PASS2_BITS > PASS1_BITS + 3
3382
  ISHIFT_TEMPS
3383
#endif
3384

3385
  /* Pre-zero output coefficient block. */
3386
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3387

3388
  /* Pass 1: process rows.
3389
   * Note results are scaled up by sqrt(8) compared to a true DCT.
3390
   * 4-point FDCT kernel,
3391
   * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
3392
   */
3393

3394
  /* Row 0 */
3395
  elemptr = sample_data[0] + start_col;
3396

3397
  /* Even part */
3398

3399
  tmp4 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
3400
  tmp5 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
3401

3402
#if PASS2_BITS > PASS1_BITS + 3
3403
  /* Add fudge factor here for final downscale. */
3404
#if PASS2_BITS > PASS1_BITS + 4
3405
  tmp4 += 1 << (PASS2_BITS-PASS1_BITS-3-1);
3406
#else
3407
  tmp4 += 1;
3408
#endif
3409
#endif
3410

3411
  tmp0 = tmp4 + tmp5;
3412
  tmp2 = tmp4 - tmp5;
3413

3414
  /* Odd part */
3415

3416
  z2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
3417
  z3 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
3418

3419
  z1 = MULTIPLY(z2 + z3, FIX_0_541196100);    /* c6 */
3420
  /* Add fudge factor here for final descale. */
3421
  z1 += ONE << (CONST_BITS+PASS2_BITS-PASS1_BITS-3-1);
3422
  tmp1 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
3423
  tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
3424

3425
  /* Row 1 */
3426
  elemptr = sample_data[1] + start_col;
3427

3428
  /* Even part */
3429

3430
  tmp4 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
3431
  tmp5 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
3432

3433
  tmp10 = tmp4 + tmp5;
3434
  tmp12 = tmp4 - tmp5;
3435

3436
  /* Odd part */
3437

3438
  z2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
3439
  z3 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
3440

3441
  z1 = MULTIPLY(z2 + z3, FIX_0_541196100);    /* c6 */
3442
  tmp11 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
3443
  tmp13 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
3444

3445
  /* Pass 2: process columns.
3446
   * We leave the results scaled up by an overall factor of 8.
3447
   * We must also scale the output by (8/4)*(8/2) = 2**3.
3448
   */
3449

3450
  /* Column 0 */
3451
  /* Apply unsigned->signed conversion. */
3452

3453
#if PASS2_BITS < PASS1_BITS + 3
3454
  data[DCTSIZE*0] =
3455
    (tmp0 + tmp10 - 8 * CENTERJSAMPLE) << (3+PASS1_BITS-PASS2_BITS);
3456
  data[DCTSIZE*1] = (tmp0 - tmp10) << (3+PASS1_BITS-PASS2_BITS);
3457

3458
  /* Column 2 */
3459
  data[DCTSIZE*0+2] = (tmp2 + tmp12) << (3+PASS1_BITS-PASS2_BITS);
3460
  data[DCTSIZE*1+2] = (tmp2 - tmp12) << (3+PASS1_BITS-PASS2_BITS);
3461
#else
3462
#if PASS2_BITS == PASS1_BITS + 3
3463
  data[DCTSIZE*0] = tmp0 + tmp10 - 8 * CENTERJSAMPLE;
3464
  data[DCTSIZE*1] = tmp0 - tmp10;
3465

3466
  /* Column 2 */
3467
  data[DCTSIZE*0+2] = tmp2 + tmp12;
3468
  data[DCTSIZE*1+2] = tmp2 - tmp12;
3469
#else
3470
  data[DCTSIZE*0] =
3471
    IRIGHT_SHIFT(tmp0 + tmp10 - 8 * CENTERJSAMPLE,
3472
		 PASS2_BITS-PASS1_BITS-3);
3473
  data[DCTSIZE*1] =
3474
    IRIGHT_SHIFT(tmp0 - tmp10, PASS2_BITS-PASS1_BITS-3);
3475

3476
  /* Column 2 */
3477
  data[DCTSIZE*0+2] =
3478
    IRIGHT_SHIFT(tmp2 + tmp12, PASS2_BITS-PASS1_BITS-3);
3479
  data[DCTSIZE*1+2] =
3480
    IRIGHT_SHIFT(tmp2 - tmp12, PASS2_BITS-PASS1_BITS-3);
3481
#endif
3482
#endif
3483

3484
  /* Column 1 */
3485
  data[DCTSIZE*0+1] = (DCTELEM)
3486
    RIGHT_SHIFT(tmp1 + tmp11, CONST_BITS+PASS2_BITS-PASS1_BITS-3);
3487
  data[DCTSIZE*1+1] = (DCTELEM)
3488
    RIGHT_SHIFT(tmp1 - tmp11, CONST_BITS+PASS2_BITS-PASS1_BITS-3);
3489

3490
  /* Column 3 */
3491
  data[DCTSIZE*0+3] = (DCTELEM)
3492
    RIGHT_SHIFT(tmp3 + tmp13, CONST_BITS+PASS2_BITS-PASS1_BITS-3);
3493
  data[DCTSIZE*1+3] = (DCTELEM)
3494
    RIGHT_SHIFT(tmp3 - tmp13, CONST_BITS+PASS2_BITS-PASS1_BITS-3);
3495
}
3496

3497

3498
/*
3499
 * Perform the forward DCT on a 2x1 sample block.
3500
 *
3501
 * 2-point FDCT in pass 1 (rows), 1-point in pass 2 (columns).
3502
 */
3503

3504
GLOBAL(void)
3505
jpeg_fdct_2x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3506
{
3507
  DCTELEM tmp0, tmp1;
3508
  JSAMPROW elemptr;
3509

3510
  /* Pre-zero output coefficient block. */
3511
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3512

3513
  elemptr = sample_data[0] + start_col;
3514

3515
  tmp0 = GETJSAMPLE(elemptr[0]);
3516
  tmp1 = GETJSAMPLE(elemptr[1]);
3517

3518
  /* We leave the results scaled up by an overall factor of 8.
3519
   * We must also scale the output by (8/2)*(8/1) = 2**5.
3520
   */
3521

3522
  /* Even part */
3523

3524
  /* Apply unsigned->signed conversion. */
3525
  data[0] =
3526
    (tmp0 + tmp1 - 2 * CENTERJSAMPLE) << (5+PASS1_BITS-PASS2_BITS);
3527

3528
  /* Odd part */
3529

3530
  data[1] = (tmp0 - tmp1) << (5+PASS1_BITS-PASS2_BITS);
3531
}
3532

3533

3534
/*
3535
 * Perform the forward DCT on an 8x16 sample block.
3536
 *
3537
 * 8-point FDCT in pass 1 (rows), 16-point in pass 2 (columns).
3538
 */
3539

3540
GLOBAL(void)
3541
jpeg_fdct_8x16 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3542
{
3543
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3544
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
3545
  INT32 z1;
3546
  DCTELEM workspace[DCTSIZE2];
3547
  DCTELEM *dataptr;
3548
  DCTELEM *wsptr;
3549
  JSAMPROW elemptr;
3550
  int ctr;
3551
  SHIFT_TEMPS
3552

3553
  /* Pass 1: process rows.
3554
   * Note results are scaled up by sqrt(8) compared to a true DCT;
3555
   * furthermore, we scale the results by 2**PASS1_BITS.
3556
   * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
3557
   */
3558

3559
  dataptr = data;
3560
  ctr = 0;
3561
  for (;;) {
3562
    elemptr = sample_data[ctr] + start_col;
3563

3564
    /* Even part per LL&M figure 1 --- note that published figure is faulty;
3565
     * rotator "c1" should be "c6".
3566
     */
3567

3568
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
3569
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
3570
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
3571
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
3572

3573
    tmp10 = tmp0 + tmp3;
3574
    tmp12 = tmp0 - tmp3;
3575
    tmp11 = tmp1 + tmp2;
3576
    tmp13 = tmp1 - tmp2;
3577

3578
    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
3579
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
3580
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
3581
    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
3582

3583
    /* Apply unsigned->signed conversion. */
3584
    dataptr[0] = PASS1_OUTPUT(tmp10 + tmp11 - 8 * CENTERJSAMPLE);
3585
    dataptr[4] = PASS1_OUTPUT(tmp10 - tmp11);
3586

3587
    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);       /* c6 */
3588
    /* Add fudge factor here for final descale. */
3589
    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
3590

3591
    dataptr[2] = (DCTELEM)
3592
      RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
3593
		  CONST_BITS-PASS1_BITS);
3594
    dataptr[6] = (DCTELEM)
3595
      RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
3596
		  CONST_BITS-PASS1_BITS);
3597

3598
    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
3599
     * i0..i3 in the paper are tmp0..tmp3 here.
3600
     */
3601

3602
    tmp12 = tmp0 + tmp2;
3603
    tmp13 = tmp1 + tmp3;
3604

3605
    z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602);       /*  c3 */
3606
    /* Add fudge factor here for final descale. */
3607
    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
3608

3609
    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);          /* -c3+c5 */
3610
    tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);          /* -c3-c5 */
3611
    tmp12 += z1;
3612
    tmp13 += z1;
3613

3614
    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223);       /* -c3+c7 */
3615
    tmp0 = MULTIPLY(tmp0, FIX_1_501321110);              /*  c1+c3-c5-c7 */
3616
    tmp3 = MULTIPLY(tmp3, FIX_0_298631336);              /* -c1+c3+c5-c7 */
3617
    tmp0 += z1 + tmp12;
3618
    tmp3 += z1 + tmp13;
3619

3620
    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447);       /* -c1-c3 */
3621
    tmp1 = MULTIPLY(tmp1, FIX_3_072711026);              /*  c1+c3+c5-c7 */
3622
    tmp2 = MULTIPLY(tmp2, FIX_2_053119869);              /*  c1+c3-c5+c7 */
3623
    tmp1 += z1 + tmp13;
3624
    tmp2 += z1 + tmp12;
3625

3626
    dataptr[1] = (DCTELEM) RIGHT_SHIFT(tmp0, CONST_BITS-PASS1_BITS);
3627
    dataptr[3] = (DCTELEM) RIGHT_SHIFT(tmp1, CONST_BITS-PASS1_BITS);
3628
    dataptr[5] = (DCTELEM) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS);
3629
    dataptr[7] = (DCTELEM) RIGHT_SHIFT(tmp3, CONST_BITS-PASS1_BITS);
3630

3631
    ctr++;
3632

3633
    if (ctr != DCTSIZE) {
3634
      if (ctr == DCTSIZE * 2)
3635
	break;			/* Done. */
3636
      dataptr += DCTSIZE;	/* advance pointer to next row */
3637
    } else
3638
      dataptr = workspace;	/* switch pointer to extended workspace */
3639
  }
3640

3641
  /* Pass 2: process columns.
3642
   * We apply the PASS2_BITS scaling, but leave the
3643
   * results scaled up by an overall factor of 8.
3644
   * We must also scale the output by 8/16 = 1/2.
3645
   * 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
3646
   */
3647

3648
  dataptr = data;
3649
  wsptr = workspace;
3650
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
3651
    /* Even part */
3652

3653
    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*7];
3654
    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*6];
3655
    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*5];
3656
    tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*4];
3657
    tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*3];
3658
    tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*2];
3659
    tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*1];
3660
    tmp7 = dataptr[DCTSIZE*7] + wsptr[DCTSIZE*0];
3661

3662
    tmp10 = tmp0 + tmp7;
3663
    tmp14 = tmp0 - tmp7;
3664
    tmp11 = tmp1 + tmp6;
3665
    tmp15 = tmp1 - tmp6;
3666
    tmp12 = tmp2 + tmp5;
3667
    tmp16 = tmp2 - tmp5;
3668
    tmp13 = tmp3 + tmp4;
3669
    tmp17 = tmp3 - tmp4;
3670

3671
    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*7];
3672
    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*6];
3673
    tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*5];
3674
    tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*4];
3675
    tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*3];
3676
    tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*2];
3677
    tmp6 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*1];
3678
    tmp7 = dataptr[DCTSIZE*7] - wsptr[DCTSIZE*0];
3679

3680
    dataptr[DCTSIZE*0] = (DCTELEM)
3681
#if PASS2_BITS > 0
3682
      RIGHT_SHIFT(tmp10 + tmp11 + tmp12 + tmp13 + (ONE << PASS2_BITS),
3683
		  PASS2_BITS+1);
3684
#else
3685
      RIGHT_SHIFT(tmp10 + tmp11 + tmp12 + tmp13 + ONE, 1);
3686
#endif
3687
    dataptr[DCTSIZE*4] = (DCTELEM)
3688
      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
3689
	      MULTIPLY(tmp11 - tmp12, FIX_0_541196100),   /* c12[16] = c6[8] */
3690
	      CONST_BITS+PASS2_BITS+1);
3691

3692
    tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) +   /* c14[16] = c7[8] */
3693
	    MULTIPLY(tmp14 - tmp16, FIX(1.387039845));    /* c2[16] = c1[8] */
3694

3695
    dataptr[DCTSIZE*2] = (DCTELEM)
3696
      DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982))   /* c6+c14 */
3697
	      + MULTIPLY(tmp16, FIX(2.172734804)),        /* c2+c10 */
3698
	      CONST_BITS+PASS2_BITS+1);
3699
    dataptr[DCTSIZE*6] = (DCTELEM)
3700
      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243))   /* c2-c6 */
3701
	      - MULTIPLY(tmp17, FIX(1.061594338)),        /* c10+c14 */
3702
	      CONST_BITS+PASS2_BITS+1);
3703

3704
    /* Odd part */
3705

3706
    tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) +         /* c3 */
3707
	    MULTIPLY(tmp6 - tmp7, FIX(0.410524528));          /* c13 */
3708
    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) +         /* c5 */
3709
	    MULTIPLY(tmp5 + tmp7, FIX(0.666655658));          /* c11 */
3710
    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) +         /* c7 */
3711
	    MULTIPLY(tmp4 - tmp7, FIX(0.897167586));          /* c9 */
3712
    tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) +         /* c15 */
3713
	    MULTIPLY(tmp6 - tmp5, FIX(1.407403738));          /* c1 */
3714
    tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) +       /* -c11 */
3715
	    MULTIPLY(tmp4 + tmp6, - FIX(1.247225013));        /* -c5 */
3716
    tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) +       /* -c3 */
3717
	    MULTIPLY(tmp5 - tmp4, FIX(0.410524528));          /* c13 */
3718
    tmp10 = tmp11 + tmp12 + tmp13 -
3719
	    MULTIPLY(tmp0, FIX(2.286341144)) +                /* c7+c5+c3-c1 */
3720
	    MULTIPLY(tmp7, FIX(0.779653625));                 /* c15+c13-c11+c9 */
3721
    tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
3722
	     - MULTIPLY(tmp6, FIX(1.663905119));              /* c7+c13+c1-c5 */
3723
    tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
3724
	     + MULTIPLY(tmp5, FIX(1.227391138));              /* c9-c11+c1-c13 */
3725
    tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
3726
	     + MULTIPLY(tmp4, FIX(2.167985692));              /* c1+c13+c5-c9 */
3727

3728
    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS2_BITS+1);
3729
    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS2_BITS+1);
3730
    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS2_BITS+1);
3731
    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS2_BITS+1);
3732

3733
    dataptr++;			/* advance pointer to next column */
3734
    wsptr++;			/* advance pointer to next column */
3735
  }
3736
}
3737

3738

3739
/*
3740
 * Perform the forward DCT on a 7x14 sample block.
3741
 *
3742
 * 7-point FDCT in pass 1 (rows), 14-point in pass 2 (columns).
3743
 */
3744

3745
GLOBAL(void)
3746
jpeg_fdct_7x14 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3747
{
3748
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
3749
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
3750
  INT32 z1, z2, z3;
3751
  DCTELEM workspace[8*6];
3752
  DCTELEM *dataptr;
3753
  DCTELEM *wsptr;
3754
  JSAMPROW elemptr;
3755
  int ctr;
3756
  SHIFT_TEMPS
3757

3758
  /* Pre-zero output coefficient block. */
3759
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3760

3761
  /* Pass 1: process rows.
3762
   * Note results are scaled up by sqrt(8) compared to a true DCT;
3763
   * furthermore, we scale the results by 2**PASS1_BITS.
3764
   * 7-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/14).
3765
   */
3766

3767
  dataptr = data;
3768
  ctr = 0;
3769
  for (;;) {
3770
    elemptr = sample_data[ctr] + start_col;
3771

3772
    /* Even part */
3773

3774
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]);
3775
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]);
3776
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]);
3777
    tmp3 = GETJSAMPLE(elemptr[3]);
3778

3779
    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]);
3780
    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]);
3781
    tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]);
3782

3783
    z1 = tmp0 + tmp2;
3784
    /* Apply unsigned->signed conversion. */
3785
    dataptr[0] = PASS1_OUTPUT(z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE);
3786
    tmp3 += tmp3;
3787
    z1 -= tmp3;
3788
    z1 -= tmp3;
3789
    z1 = MULTIPLY(z1, FIX(0.353553391));                /* (c2+c6-c4)/2 */
3790
    z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002));       /* (c2+c4-c6)/2 */
3791
    z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123));       /* c6 */
3792
    dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS);
3793
    z1 -= z2;
3794
    z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734));       /* c4 */
3795
    dataptr[4] = (DCTELEM)
3796
      DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */
3797
	      CONST_BITS-PASS1_BITS);
3798
    dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS);
3799

3800
    /* Odd part */
3801

3802
    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347));   /* (c3+c1-c5)/2 */
3803
    tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339));   /* (c3+c5-c1)/2 */
3804
    tmp0 = tmp1 - tmp2;
3805
    tmp1 += tmp2;
3806
    tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.378756276)); /* -c1 */
3807
    tmp1 += tmp2;
3808
    tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.613604268));   /* c5 */
3809
    tmp0 += tmp3;
3810
    tmp2 += tmp3 + MULTIPLY(tmp12, FIX(1.870828693));   /* c3+c1-c5 */
3811

3812
    dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS);
3813
    dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS);
3814
    dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS);
3815

3816
    ctr++;
3817

3818
    if (ctr != DCTSIZE) {
3819
      if (ctr == 14)
3820
	break;			/* Done. */
3821
      dataptr += DCTSIZE;	/* advance pointer to next row */
3822
    } else
3823
      dataptr = workspace;	/* switch pointer to extended workspace */
3824
  }
3825

3826
  /* Pass 2: process columns.
3827
   * We apply the PASS2_BITS scaling, but leave the
3828
   * results scaled up by an overall factor of 8.
3829
   * We must also scale the output by (8/7)*(8/14) = 32/49,
3830
   * which we fold into the constant multipliers:
3831
   * 14-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/28) * 32/49.
3832
   */
3833

3834
  dataptr = data;
3835
  wsptr = workspace;
3836
  for (ctr = 0; ctr < 7; ctr++) {
3837
    /* Even part */
3838

3839
    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*5];
3840
    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*4];
3841
    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*3];
3842
    tmp13 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*2];
3843
    tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*1];
3844
    tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*0];
3845
    tmp6 = dataptr[DCTSIZE*6] + dataptr[DCTSIZE*7];
3846

3847
    tmp10 = tmp0 + tmp6;
3848
    tmp14 = tmp0 - tmp6;
3849
    tmp11 = tmp1 + tmp5;
3850
    tmp15 = tmp1 - tmp5;
3851
    tmp12 = tmp2 + tmp4;
3852
    tmp16 = tmp2 - tmp4;
3853

3854
    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*5];
3855
    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*4];
3856
    tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*3];
3857
    tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*2];
3858
    tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*1];
3859
    tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*0];
3860
    tmp6 = dataptr[DCTSIZE*6] - dataptr[DCTSIZE*7];
3861

3862
    dataptr[DCTSIZE*0] = (DCTELEM)
3863
      DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12 + tmp13,
3864
		       FIX(0.653061224)),                 /* 32/49 */
3865
	      CONST_BITS+PASS2_BITS);
3866
    tmp13 += tmp13;
3867
    dataptr[DCTSIZE*4] = (DCTELEM)
3868
      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(0.832106052)) + /* c4 */
3869
	      MULTIPLY(tmp11 - tmp13, FIX(0.205513223)) - /* c12 */
3870
	      MULTIPLY(tmp12 - tmp13, FIX(0.575835255)),  /* c8 */
3871
	      CONST_BITS+PASS2_BITS);
3872

3873
    tmp10 = MULTIPLY(tmp14 + tmp15, FIX(0.722074570));    /* c6 */
3874

3875
    dataptr[DCTSIZE*2] = (DCTELEM)
3876
      DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.178337691))   /* c2-c6 */
3877
	      + MULTIPLY(tmp16, FIX(0.400721155)),        /* c10 */
3878
	      CONST_BITS+PASS2_BITS);
3879
    dataptr[DCTSIZE*6] = (DCTELEM)
3880
      DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.122795725))   /* c6+c10 */
3881
	      - MULTIPLY(tmp16, FIX(0.900412262)),        /* c2 */
3882
	      CONST_BITS+PASS2_BITS);
3883

3884
    /* Odd part */
3885

3886
    tmp10 = tmp1 + tmp2;
3887
    tmp11 = tmp5 - tmp4;
3888
    dataptr[DCTSIZE*7] = (DCTELEM)
3889
      DESCALE(MULTIPLY(tmp0 - tmp10 + tmp3 - tmp11 - tmp6,
3890
		       FIX(0.653061224)),                 /* 32/49 */
3891
	      CONST_BITS+PASS2_BITS);
3892
    tmp3  = MULTIPLY(tmp3 , FIX(0.653061224));            /* 32/49 */
3893
    tmp10 = MULTIPLY(tmp10, - FIX(0.103406812));          /* -c13 */
3894
    tmp11 = MULTIPLY(tmp11, FIX(0.917760839));            /* c1 */
3895
    tmp10 += tmp11 - tmp3;
3896
    tmp11 = MULTIPLY(tmp0 + tmp2, FIX(0.782007410)) +     /* c5 */
3897
	    MULTIPLY(tmp4 + tmp6, FIX(0.491367823));      /* c9 */
3898
    dataptr[DCTSIZE*5] = (DCTELEM)
3899
      DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(1.550341076)) /* c3+c5-c13 */
3900
	      + MULTIPLY(tmp4, FIX(0.731428202)),         /* c1+c11-c9 */
3901
	      CONST_BITS+PASS2_BITS);
3902
    tmp12 = MULTIPLY(tmp0 + tmp1, FIX(0.871740478)) +     /* c3 */
3903
	    MULTIPLY(tmp5 - tmp6, FIX(0.305035186));      /* c11 */
3904
    dataptr[DCTSIZE*3] = (DCTELEM)
3905
      DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.276965844)) /* c3-c9-c13 */
3906
	      - MULTIPLY(tmp5, FIX(2.004803435)),         /* c1+c5+c11 */
3907
	      CONST_BITS+PASS2_BITS);
3908
    dataptr[DCTSIZE*1] = (DCTELEM)
3909
      DESCALE(tmp11 + tmp12 + tmp3
3910
	      - MULTIPLY(tmp0, FIX(0.735987049))          /* c3+c5-c1 */
3911
	      - MULTIPLY(tmp6, FIX(0.082925825)),         /* c9-c11-c13 */
3912
	      CONST_BITS+PASS2_BITS);
3913

3914
    dataptr++;			/* advance pointer to next column */
3915
    wsptr++;			/* advance pointer to next column */
3916
  }
3917
}
3918

3919

3920
/*
3921
 * Perform the forward DCT on a 6x12 sample block.
3922
 *
3923
 * 6-point FDCT in pass 1 (rows), 12-point in pass 2 (columns).
3924
 */
3925

3926
GLOBAL(void)
3927
jpeg_fdct_6x12 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3928
{
3929
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3930
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
3931
  DCTELEM workspace[8*4];
3932
  DCTELEM *dataptr;
3933
  DCTELEM *wsptr;
3934
  JSAMPROW elemptr;
3935
  int ctr;
3936
  SHIFT_TEMPS
3937

3938
  /* Pre-zero output coefficient block. */
3939
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3940

3941
  /* Pass 1: process rows.
3942
   * Note results are scaled up by sqrt(8) compared to a true DCT;
3943
   * furthermore, we scale the results by 2**PASS1_BITS.
3944
   * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
3945
   */
3946

3947
  dataptr = data;
3948
  ctr = 0;
3949
  for (;;) {
3950
    elemptr = sample_data[ctr] + start_col;
3951

3952
    /* Even part */
3953

3954
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
3955
    tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
3956
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
3957

3958
    tmp10 = tmp0 + tmp2;
3959
    tmp12 = tmp0 - tmp2;
3960

3961
    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
3962
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
3963
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
3964

3965
    /* Apply unsigned->signed conversion. */
3966
    dataptr[0] = PASS1_OUTPUT(tmp10 + tmp11 - 6 * CENTERJSAMPLE);
3967
    dataptr[2] = (DCTELEM)
3968
      DESCALE(MULTIPLY(tmp12, FIX(1.224744871)),                 /* c2 */
3969
	      CONST_BITS-PASS1_BITS);
3970
    dataptr[4] = (DCTELEM)
3971
      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
3972
	      CONST_BITS-PASS1_BITS);
3973

3974
    /* Odd part */
3975

3976
    tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)),     /* c5 */
3977
		    CONST_BITS-PASS1_BITS);
3978

3979
#if PASS1_BITS > 0
3980
    dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS));
3981
    dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS);
3982
    dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS));
3983
#else
3984
    dataptr[1] = (DCTELEM) (tmp10 + tmp0 + tmp1);
3985
    dataptr[3] = (DCTELEM) (tmp0 - tmp1 - tmp2);
3986
    dataptr[5] = (DCTELEM) (tmp10 + tmp2 - tmp1);
3987
#endif
3988

3989
    ctr++;
3990

3991
    if (ctr != DCTSIZE) {
3992
      if (ctr == 12)
3993
	break;			/* Done. */
3994
      dataptr += DCTSIZE;	/* advance pointer to next row */
3995
    } else
3996
      dataptr = workspace;	/* switch pointer to extended workspace */
3997
  }
3998

3999
  /* Pass 2: process columns.
4000
   * We apply the PASS2_BITS scaling, but leave the
4001
   * results scaled up by an overall factor of 8.
4002
   * We must also scale the output by (8/6)*(8/12) = 8/9,
4003
   * which we fold into the constant multipliers:
4004
   * 12-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/24) * 8/9.
4005
   */
4006

4007
  dataptr = data;
4008
  wsptr = workspace;
4009
  for (ctr = 0; ctr < 6; ctr++) {
4010
    /* Even part */
4011

4012
    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*3];
4013
    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*2];
4014
    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*1];
4015
    tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*0];
4016
    tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*7];
4017
    tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*6];
4018

4019
    tmp10 = tmp0 + tmp5;
4020
    tmp13 = tmp0 - tmp5;
4021
    tmp11 = tmp1 + tmp4;
4022
    tmp14 = tmp1 - tmp4;
4023
    tmp12 = tmp2 + tmp3;
4024
    tmp15 = tmp2 - tmp3;
4025

4026
    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*3];
4027
    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*2];
4028
    tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*1];
4029
    tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*0];
4030
    tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*7];
4031
    tmp5 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*6];
4032

4033
    dataptr[DCTSIZE*0] = (DCTELEM)
4034
      DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(0.888888889)), /* 8/9 */
4035
	      CONST_BITS+PASS2_BITS);
4036
    dataptr[DCTSIZE*6] = (DCTELEM)
4037
      DESCALE(MULTIPLY(tmp13 - tmp14 - tmp15, FIX(0.888888889)), /* 8/9 */
4038
	      CONST_BITS+PASS2_BITS);
4039
    dataptr[DCTSIZE*4] = (DCTELEM)
4040
      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.088662108)),         /* c4 */
4041
	      CONST_BITS+PASS2_BITS);
4042
    dataptr[DCTSIZE*2] = (DCTELEM)
4043
      DESCALE(MULTIPLY(tmp14 - tmp15, FIX(0.888888889)) +        /* 8/9 */
4044
	      MULTIPLY(tmp13 + tmp15, FIX(1.214244803)),         /* c2 */
4045
	      CONST_BITS+PASS2_BITS);
4046

4047
    /* Odd part */
4048

4049
    tmp10 = MULTIPLY(tmp1 + tmp4, FIX(0.481063200));   /* c9 */
4050
    tmp14 = tmp10 + MULTIPLY(tmp1, FIX(0.680326102));  /* c3-c9 */
4051
    tmp15 = tmp10 - MULTIPLY(tmp4, FIX(1.642452502));  /* c3+c9 */
4052
    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(0.997307603));   /* c5 */
4053
    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.765261039));   /* c7 */
4054
    tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.516244403)) /* c5+c7-c1 */
4055
	    + MULTIPLY(tmp5, FIX(0.164081699));        /* c11 */
4056
    tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.164081699)); /* -c11 */
4057
    tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.079550144)) /* c1+c5-c11 */
4058
	    + MULTIPLY(tmp5, FIX(0.765261039));        /* c7 */
4059
    tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.645144899)) /* c1+c11-c7 */
4060
	    - MULTIPLY(tmp5, FIX(0.997307603));        /* c5 */
4061
    tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.161389302)) /* c3 */
4062
	    - MULTIPLY(tmp2 + tmp5, FIX(0.481063200)); /* c9 */
4063

4064
    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS2_BITS);
4065
    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS2_BITS);
4066
    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS2_BITS);
4067
    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS2_BITS);
4068

4069
    dataptr++;			/* advance pointer to next column */
4070
    wsptr++;			/* advance pointer to next column */
4071
  }
4072
}
4073

4074

4075
/*
4076
 * Perform the forward DCT on a 5x10 sample block.
4077
 *
4078
 * 5-point FDCT in pass 1 (rows), 10-point in pass 2 (columns).
4079
 */
4080

4081
GLOBAL(void)
4082
jpeg_fdct_5x10 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
4083
{
4084
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
4085
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
4086
  DCTELEM workspace[8*2];
4087
  DCTELEM *dataptr;
4088
  DCTELEM *wsptr;
4089
  JSAMPROW elemptr;
4090
  int ctr;
4091
  SHIFT_TEMPS
4092

4093
  /* Pre-zero output coefficient block. */
4094
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
4095

4096
  /* Pass 1: process rows.
4097
   * Note results are scaled up by sqrt(8) compared to a true DCT;
4098
   * furthermore, we scale the results by 2**PASS1_BITS.
4099
   * 5-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/10).
4100
   */
4101

4102
  dataptr = data;
4103
  ctr = 0;
4104
  for (;;) {
4105
    elemptr = sample_data[ctr] + start_col;
4106

4107
    /* Even part */
4108

4109
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]);
4110
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]);
4111
    tmp2 = GETJSAMPLE(elemptr[2]);
4112

4113
    tmp10 = tmp0 + tmp1;
4114
    tmp11 = tmp0 - tmp1;
4115

4116
    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]);
4117
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]);
4118

4119
    /* Apply unsigned->signed conversion. */
4120
    dataptr[0] = PASS1_OUTPUT(tmp10 + tmp2 - 5 * CENTERJSAMPLE);
4121
    tmp11 = MULTIPLY(tmp11, FIX(0.790569415));          /* (c2+c4)/2 */
4122
    tmp10 -= tmp2 << 2;
4123
    tmp10 = MULTIPLY(tmp10, FIX(0.353553391));          /* (c2-c4)/2 */
4124
    dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS);
4125
    dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS);
4126

4127
    /* Odd part */
4128

4129
    tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876));    /* c3 */
4130

4131
    dataptr[1] = (DCTELEM)
4132
      DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */
4133
	      CONST_BITS-PASS1_BITS);
4134
    dataptr[3] = (DCTELEM)
4135
      DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */
4136
	      CONST_BITS-PASS1_BITS);
4137

4138
    ctr++;
4139

4140
    if (ctr != DCTSIZE) {
4141
      if (ctr == 10)
4142
	break;			/* Done. */
4143
      dataptr += DCTSIZE;	/* advance pointer to next row */
4144
    } else
4145
      dataptr = workspace;	/* switch pointer to extended workspace */
4146
  }
4147

4148
  /* Pass 2: process columns.
4149
   * We apply the PASS2_BITS scaling, but leave the
4150
   * results scaled up by an overall factor of 8.
4151
   * We must also scale the output by (8/5)*(8/10) = 32/25,
4152
   * which we fold into the constant multipliers:
4153
   * 10-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/20) * 32/25.
4154
   */
4155

4156
  dataptr = data;
4157
  wsptr = workspace;
4158
  for (ctr = 0; ctr < 5; ctr++) {
4159
    /* Even part */
4160

4161
    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*1];
4162
    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*0];
4163
    tmp12 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*7];
4164
    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*6];
4165
    tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*5];
4166

4167
    tmp10 = tmp0 + tmp4;
4168
    tmp13 = tmp0 - tmp4;
4169
    tmp11 = tmp1 + tmp3;
4170
    tmp14 = tmp1 - tmp3;
4171

4172
    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*1];
4173
    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*0];
4174
    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*7];
4175
    tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*6];
4176
    tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*5];
4177

4178
    dataptr[DCTSIZE*0] = (DCTELEM)
4179
      DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(1.28)), /* 32/25 */
4180
	      CONST_BITS+PASS2_BITS);
4181
    tmp12 += tmp12;
4182
    dataptr[DCTSIZE*4] = (DCTELEM)
4183
      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.464477191)) - /* c4 */
4184
	      MULTIPLY(tmp11 - tmp12, FIX(0.559380511)),  /* c8 */
4185
	      CONST_BITS+PASS2_BITS);
4186
    tmp10 = MULTIPLY(tmp13 + tmp14, FIX(1.064004961));    /* c6 */
4187
    dataptr[DCTSIZE*2] = (DCTELEM)
4188
      DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.657591230)),  /* c2-c6 */
4189
	      CONST_BITS+PASS2_BITS);
4190
    dataptr[DCTSIZE*6] = (DCTELEM)
4191
      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.785601151)),  /* c2+c6 */
4192
	      CONST_BITS+PASS2_BITS);
4193

4194
    /* Odd part */
4195

4196
    tmp10 = tmp0 + tmp4;
4197
    tmp11 = tmp1 - tmp3;
4198
    dataptr[DCTSIZE*5] = (DCTELEM)
4199
      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp2, FIX(1.28)),  /* 32/25 */
4200
	      CONST_BITS+PASS2_BITS);
4201
    tmp2 = MULTIPLY(tmp2, FIX(1.28));                     /* 32/25 */
4202
    dataptr[DCTSIZE*1] = (DCTELEM)
4203
      DESCALE(MULTIPLY(tmp0, FIX(1.787906876)) +          /* c1 */
4204
	      MULTIPLY(tmp1, FIX(1.612894094)) + tmp2 +   /* c3 */
4205
	      MULTIPLY(tmp3, FIX(0.821810588)) +          /* c7 */
4206
	      MULTIPLY(tmp4, FIX(0.283176630)),           /* c9 */
4207
	      CONST_BITS+PASS2_BITS);
4208
    tmp12 = MULTIPLY(tmp0 - tmp4, FIX(1.217352341)) -     /* (c3+c7)/2 */
4209
	    MULTIPLY(tmp1 + tmp3, FIX(0.752365123));      /* (c1-c9)/2 */
4210
    tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.395541753)) +   /* (c3-c7)/2 */
4211
	    MULTIPLY(tmp11, FIX(0.64)) - tmp2;            /* 16/25 */
4212
    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS+PASS2_BITS);
4213
    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS+PASS2_BITS);
4214

4215
    dataptr++;			/* advance pointer to next column */
4216
    wsptr++;			/* advance pointer to next column */
4217
  }
4218
}
4219

4220

4221
/*
4222
 * Perform the forward DCT on a 4x8 sample block.
4223
 *
4224
 * 4-point FDCT in pass 1 (rows), 8-point in pass 2 (columns).
4225
 */
4226

4227
GLOBAL(void)
4228
jpeg_fdct_4x8 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
4229
{
4230
  INT32 tmp0, tmp1, tmp2, tmp3;
4231
  INT32 tmp10, tmp11, tmp12, tmp13;
4232
  INT32 z1;
4233
  DCTELEM *dataptr;
4234
  JSAMPROW elemptr;
4235
  int ctr;
4236
  SHIFT_TEMPS
4237

4238
  /* Pre-zero output coefficient block. */
4239
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
4240

4241
  /* Pass 1: process rows.
4242
   * Note results are scaled up by sqrt(8) compared to a true DCT;
4243
   * furthermore, we scale the results by 2**PASS1_BITS.
4244
   * We must also scale the output by 8/4 = 2, which we add here.
4245
   * 4-point FDCT kernel,
4246
   * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
4247
   */
4248

4249
  dataptr = data;
4250
  for (ctr = 0; ctr < DCTSIZE; ctr++) {
4251
    elemptr = sample_data[ctr] + start_col;
4252

4253
    /* Even part */
4254

4255
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
4256
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
4257

4258
    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
4259
    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
4260

4261
    /* Apply unsigned->signed conversion. */
4262
    dataptr[0] = (DCTELEM)
4263
      ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+1));
4264
    dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+1));
4265

4266
    /* Odd part */
4267

4268
    tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
4269
    /* Add fudge factor here for final descale. */
4270
    tmp0 += ONE << (CONST_BITS-PASS1_BITS-2);
4271

4272
    dataptr[1] = (DCTELEM)
4273
      RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
4274
		  CONST_BITS-PASS1_BITS-1);
4275
    dataptr[3] = (DCTELEM)
4276
      RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
4277
		  CONST_BITS-PASS1_BITS-1);
4278

4279
    dataptr += DCTSIZE;		/* advance pointer to next row */
4280
  }
4281

4282
  /* Pass 2: process columns.
4283
   * We apply the PASS2_BITS scaling, but leave the
4284
   * results scaled up by an overall factor of 8.
4285
   * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
4286
   */
4287

4288
  dataptr = data;
4289
  for (ctr = 0; ctr < 4; ctr++) {
4290
    /* Even part per LL&M figure 1 --- note that published figure is faulty;
4291
     * rotator "c1" should be "c6".
4292
     */
4293

4294
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
4295
    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
4296
    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
4297
    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
4298

4299
    /* Add fudge factor here for final descale. */
4300
#if PASS2_BITS > 1
4301
    tmp10 = tmp0 + tmp3 + (ONE << (PASS2_BITS-1));
4302
#else
4303
#if PASS2_BITS > 0
4304
    tmp10 = tmp0 + tmp3 + ONE;
4305
#else
4306
    tmp10 = tmp0 + tmp3;
4307
#endif
4308
#endif
4309
    tmp12 = tmp0 - tmp3;
4310
    tmp11 = tmp1 + tmp2;
4311
    tmp13 = tmp1 - tmp2;
4312

4313
    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
4314
    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
4315
    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
4316
    tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
4317

4318
    dataptr[DCTSIZE*0] = PASS2_OUTPUT(tmp10 + tmp11);
4319
    dataptr[DCTSIZE*4] = PASS2_OUTPUT(tmp10 - tmp11);
4320

4321
    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);       /* c6 */
4322
    /* Add fudge factor here for final descale. */
4323
    z1 += ONE << (CONST_BITS+PASS2_BITS-1);
4324

4325
    dataptr[DCTSIZE*2] = (DCTELEM)
4326
      RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
4327
		  CONST_BITS+PASS2_BITS);
4328
    dataptr[DCTSIZE*6] = (DCTELEM)
4329
      RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
4330
		  CONST_BITS+PASS2_BITS);
4331

4332
    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
4333
     * i0..i3 in the paper are tmp0..tmp3 here.
4334
     */
4335

4336
    tmp12 = tmp0 + tmp2;
4337
    tmp13 = tmp1 + tmp3;
4338

4339
    z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602);       /*  c3 */
4340
    /* Add fudge factor here for final descale. */
4341
    z1 += ONE << (CONST_BITS+PASS2_BITS-1);
4342

4343
    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);          /* -c3+c5 */
4344
    tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);          /* -c3-c5 */
4345
    tmp12 += z1;
4346
    tmp13 += z1;
4347

4348
    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223);       /* -c3+c7 */
4349
    tmp0 = MULTIPLY(tmp0, FIX_1_501321110);              /*  c1+c3-c5-c7 */
4350
    tmp3 = MULTIPLY(tmp3, FIX_0_298631336);              /* -c1+c3+c5-c7 */
4351
    tmp0 += z1 + tmp12;
4352
    tmp3 += z1 + tmp13;
4353

4354
    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447);       /* -c1-c3 */
4355
    tmp1 = MULTIPLY(tmp1, FIX_3_072711026);              /*  c1+c3+c5-c7 */
4356
    tmp2 = MULTIPLY(tmp2, FIX_2_053119869);              /*  c1+c3-c5+c7 */
4357
    tmp1 += z1 + tmp13;
4358
    tmp2 += z1 + tmp12;
4359

4360
    dataptr[DCTSIZE*1] = (DCTELEM) RIGHT_SHIFT(tmp0, CONST_BITS+PASS2_BITS);
4361
    dataptr[DCTSIZE*3] = (DCTELEM) RIGHT_SHIFT(tmp1, CONST_BITS+PASS2_BITS);
4362
    dataptr[DCTSIZE*5] = (DCTELEM) RIGHT_SHIFT(tmp2, CONST_BITS+PASS2_BITS);
4363
    dataptr[DCTSIZE*7] = (DCTELEM) RIGHT_SHIFT(tmp3, CONST_BITS+PASS2_BITS);
4364

4365
    dataptr++;			/* advance pointer to next column */
4366
  }
4367
}
4368

4369

4370
/*
4371
 * Perform the forward DCT on a 3x6 sample block.
4372
 *
4373
 * 3-point FDCT in pass 1 (rows), 6-point in pass 2 (columns).
4374
 */
4375

4376
GLOBAL(void)
4377
jpeg_fdct_3x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
4378
{
4379
  INT32 tmp0, tmp1, tmp2;
4380
  INT32 tmp10, tmp11, tmp12;
4381
  DCTELEM *dataptr;
4382
  JSAMPROW elemptr;
4383
  int ctr;
4384
  SHIFT_TEMPS
4385

4386
  /* Pre-zero output coefficient block. */
4387
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
4388

4389
  /* Pass 1: process rows.
4390
   * Note results are scaled up by sqrt(8) compared to a true DCT;
4391
   * furthermore, we scale the results by 2**PASS1_BITS.
4392
   * We scale the results further by 2 as part of output adaption
4393
   * scaling for different DCT size.
4394
   * 3-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/6).
4395
   */
4396

4397
  dataptr = data;
4398
  for (ctr = 0; ctr < 6; ctr++) {
4399
    elemptr = sample_data[ctr] + start_col;
4400

4401
    /* Even part */
4402

4403
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]);
4404
    tmp1 = GETJSAMPLE(elemptr[1]);
4405

4406
    tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]);
4407

4408
    /* Apply unsigned->signed conversion. */
4409
    dataptr[0] = (DCTELEM)
4410
      ((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+1));
4411
    dataptr[2] = (DCTELEM)
4412
      DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */
4413
	      CONST_BITS-PASS1_BITS-1);
4414

4415
    /* Odd part */
4416

4417
    dataptr[1] = (DCTELEM)
4418
      DESCALE(MULTIPLY(tmp2, FIX(1.224744871)),               /* c1 */
4419
	      CONST_BITS-PASS1_BITS-1);
4420

4421
    dataptr += DCTSIZE;		/* advance pointer to next row */
4422
  }
4423

4424
  /* Pass 2: process columns.
4425
   * We apply the PASS2_BITS scaling, but leave the
4426
   * results scaled up by an overall factor of 8.
4427
   * We must also scale the output by (8/6)*(8/3) = 32/9, which we partially
4428
   * fold into the constant multipliers (other part was done in pass 1):
4429
   * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12) * 16/9.
4430
   */
4431

4432
  dataptr = data;
4433
  for (ctr = 0; ctr < 3; ctr++) {
4434
    /* Even part */
4435

4436
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5];
4437
    tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4];
4438
    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];
4439

4440
    tmp10 = tmp0 + tmp2;
4441
    tmp12 = tmp0 - tmp2;
4442

4443
    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5];
4444
    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4];
4445
    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];
4446

4447
    dataptr[DCTSIZE*0] = (DCTELEM)
4448
      DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)),         /* 16/9 */
4449
	      CONST_BITS+PASS2_BITS);
4450
    dataptr[DCTSIZE*2] = (DCTELEM)
4451
      DESCALE(MULTIPLY(tmp12, FIX(2.177324216)),                 /* c2 */
4452
	      CONST_BITS+PASS2_BITS);
4453
    dataptr[DCTSIZE*4] = (DCTELEM)
4454
      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
4455
	      CONST_BITS+PASS2_BITS);
4456

4457
    /* Odd part */
4458

4459
    tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829));             /* c5 */
4460

4461
    dataptr[DCTSIZE*1] = (DCTELEM)
4462
      DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)),   /* 16/9 */
4463
	      CONST_BITS+PASS2_BITS);
4464
    dataptr[DCTSIZE*3] = (DCTELEM)
4465
      DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)),    /* 16/9 */
4466
	      CONST_BITS+PASS2_BITS);
4467
    dataptr[DCTSIZE*5] = (DCTELEM)
4468
      DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)),   /* 16/9 */
4469
	      CONST_BITS+PASS2_BITS);
4470

4471
    dataptr++;			/* advance pointer to next column */
4472
  }
4473
}
4474

4475

4476
/*
4477
 * Perform the forward DCT on a 2x4 sample block.
4478
 *
4479
 * 2-point FDCT in pass 1 (rows), 4-point in pass 2 (columns).
4480
 */
4481

4482
GLOBAL(void)
4483
jpeg_fdct_2x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
4484
{
4485
  INT32 tmp0, tmp1;
4486
  INT32 tmp10, tmp11;
4487
  DCTELEM *dataptr;
4488
  JSAMPROW elemptr;
4489
  int ctr;
4490
  SHIFT_TEMPS
4491

4492
  /* Pre-zero output coefficient block. */
4493
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
4494

4495
  /* Pass 1: process rows.
4496
   * Note results are scaled up by sqrt(8) compared to a true DCT.
4497
   */
4498

4499
  dataptr = data;
4500
  for (ctr = 0; ctr < 4; ctr++) {
4501
    elemptr = sample_data[ctr] + start_col;
4502

4503
    /* Even part */
4504

4505
    tmp0 = GETJSAMPLE(elemptr[0]);
4506
    tmp1 = GETJSAMPLE(elemptr[1]);
4507

4508
    /* Apply unsigned->signed conversion. */
4509
    dataptr[0] = (DCTELEM) (tmp0 + tmp1 - 2 * CENTERJSAMPLE);
4510

4511
    /* Odd part */
4512

4513
    dataptr[1] = (DCTELEM) (tmp0 - tmp1);
4514

4515
    dataptr += DCTSIZE;		/* advance pointer to next row */
4516
  }
4517

4518
  /* Pass 2: process columns.
4519
   * We leave the results scaled up by an overall factor of 8.
4520
   * We must also scale the output by (8/2)*(8/4) = 2**3.
4521
   * 4-point FDCT kernel,
4522
   * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
4523
   */
4524

4525
  dataptr = data;
4526
  for (ctr = 0; ctr < 2; ctr++) {
4527
    /* Even part */
4528

4529
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3];
4530
    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];
4531

4532
    tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
4533
    tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];
4534

4535
#if PASS2_BITS < PASS1_BITS + 3
4536
    dataptr[DCTSIZE*0] = (DCTELEM)
4537
      ((tmp0 + tmp1) << (3+PASS1_BITS-PASS2_BITS));
4538
    dataptr[DCTSIZE*2] = (DCTELEM)
4539
      ((tmp0 - tmp1) << (3+PASS1_BITS-PASS2_BITS));
4540
#else
4541
#if PASS2_BITS == PASS1_BITS + 3
4542
    dataptr[DCTSIZE*0] = (DCTELEM) (tmp0 + tmp1);
4543
    dataptr[DCTSIZE*2] = (DCTELEM) (tmp0 - tmp1);
4544
#else
4545
    /* Add fudge factor for descale. */
4546
    tmp0 += ONE << (PASS2_BITS-PASS1_BITS-3-1);
4547

4548
    dataptr[DCTSIZE*0] = (DCTELEM)
4549
      RIGHT_SHIFT(tmp0 + tmp1, PASS2_BITS-PASS1_BITS-3);
4550
    dataptr[DCTSIZE*2] = (DCTELEM)
4551
      RIGHT_SHIFT(tmp0 - tmp1, PASS2_BITS-PASS1_BITS-3);
4552
#endif
4553
#endif
4554

4555
    /* Odd part */
4556

4557
    tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
4558
    /* Add fudge factor for descale. */
4559
    tmp0 += ONE << (CONST_BITS+PASS2_BITS-PASS1_BITS-3-1);
4560

4561
    dataptr[DCTSIZE*1] = (DCTELEM)
4562
      RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
4563
		  CONST_BITS+PASS2_BITS-PASS1_BITS-3);
4564
    dataptr[DCTSIZE*3] = (DCTELEM)
4565
      RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
4566
		  CONST_BITS+PASS2_BITS-PASS1_BITS-3);
4567

4568
    dataptr++;			/* advance pointer to next column */
4569
  }
4570
}
4571

4572

4573
/*
4574
 * Perform the forward DCT on a 1x2 sample block.
4575
 *
4576
 * 1-point FDCT in pass 1 (rows), 2-point in pass 2 (columns).
4577
 */
4578

4579
GLOBAL(void)
4580
jpeg_fdct_1x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
4581
{
4582
  DCTELEM tmp0, tmp1;
4583

4584
  /* Pre-zero output coefficient block. */
4585
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
4586

4587
  /* Pass 1: empty. */
4588

4589
  /* Pass 2: process columns.
4590
   * We leave the results scaled up by an overall factor of 8.
4591
   * We must also scale the output by (8/1)*(8/2) = 2**5.
4592
   */
4593

4594
  /* Even part */
4595

4596
  tmp0 = GETJSAMPLE(sample_data[0][start_col]);
4597
  tmp1 = GETJSAMPLE(sample_data[1][start_col]);
4598

4599
  /* Apply unsigned->signed conversion. */
4600
  data[DCTSIZE*0] =
4601
    (tmp0 + tmp1 - 2 * CENTERJSAMPLE) << (5+PASS1_BITS-PASS2_BITS);
4602

4603
  /* Odd part */
4604

4605
  data[DCTSIZE*1] = (tmp0 - tmp1) << (5+PASS1_BITS-PASS2_BITS);
4606
}
4607

4608
#endif /* DCT_SCALING_SUPPORTED */
4609
#endif /* DCT_ISLOW_SUPPORTED */
4610

4611
Product

Resources

Company