CoCalc -- transform.c

GitHub Repository: BitchX/BitchX1.3
Path: blob/master/dll/amp/transform.c
¹⁷¹⁷ views
1
/* this file is a part of amp software, (C) tomislav uzelac 1996,1997
2
*/
3
 
4
/* transform.c  imdct and polyphase(DCT) transforms
5
 *
6
 * Created by: tomislav uzelac  May 1996
7
 * Karl Anders Oygard optimized this for speed, Mar 13 97
8
 * Some optimisations based on ideas from Michael Hipp's mpg123 package
9
 */
10

11
/*
12
 * Comments for this file:
13
 *
14
 * The polyphase algorithm is clearly the most cpu consuming part of mpeg 1
15
 * layer 3 decoding.  Thus, there has been some effort to optimise this
16
 * particular algorithm.  Currently, everything has been kept in straight C
17
 * with no assembler optimisations, but in order to provide efficient paths
18
 * for different architectures, alternative implementations of some
19
 * critical sections has been done.  You may want to experiment with these,
20
 * to see which suits your architecture better.
21
 *
22
 * Selection of the different implementations is done with the following
23
 * defines:
24
 *
25
 *     HAS_AUTOINCREMENT
26
 *
27
 *         Define this if your architecture supports preincrementation of
28
 *         pointers when referencing (applies to e.g. 68k)
29
 *
30
 * For those who are optimising amp, check out the Pentium rdtsc code
31
 * (define PENTIUM_RDTSC).  This code uses the rdtsc counter for showing
32
 * how many cycles are spent in different parts of the code.
33
 */
34

35
#include <math.h>
36
#include <sys/types.h>
37
#include <unistd.h>
38
#include <string.h>
39

40
#include "audio.h"
41
#include "getdata.h"
42
#include "misc2.h"
43

44
#define TRANSFORM
45
#include "transform.h"
46

47
#define PI12      0.261799387f
48
#define PI36      0.087266462f
49

50
#undef ARCH_i586
51

52
void imdct_init()
53
{
54
  int i;
55

56
  for(i=0;i<36;i++) /* 0 */
57
    win[0][i] = (float) sin(PI36 *(i+0.5));
58
  for(i=0;i<18;i++) /* 1 */
59
    win[1][i] = (float) sin(PI36 *(i+0.5));
60
  for(i=18;i<24;i++)
61
    win[1][i] = 1.0f;
62
  for(i=24;i<30;i++)
63
    win[1][i] = (float) sin(PI12 *(i+0.5-18));
64
  for(i=30;i<36;i++)
65
    win[1][i] = 0.0f;
66
  for(i=0;i<6;i++) /* 3 */
67
    win[3][i] = 0.0f;
68
  for(i=6;i<12;i++)
69
    win[3][i] = (float) sin(PI12 * (i+ 0.5 - 6.0));
70
  for(i=12;i<18;i++)
71
    win[3][i] = 1.0f;
72
  for(i=18;i<36;i++)
73
    win[3][i] = (float) sin(PI36 * (i + 0.5));
74
}
75

76
/* This uses Byeong Gi Lee's Fast Cosine Transform algorithm to decompose
77
   the 36 point and 12 point IDCT's into 9 point and 3 point IDCT's,
78
   respectively. Then the 9 point IDCT is computed by a modified version of
79
   Mikko Tommila's IDCT algorithm, based on the WFTA. See his comments
80
   before the first 9 point IDCT. The 3 point IDCT is already efficient to
81
   implement. -- Jeff Tsay. */
82
/* I got the unrolled IDCT from Jeff Tsay; the code is presumably by 
83
   Francois-Raymond Boyer - I unrolled it a little further. tu */
84

85
void imdct(int win_type,int sb,int ch)
86
{
87
/*------------------------------------------------------------------*/
88
/*                                                                  */
89
/*    Function: Calculation of the inverse MDCT                     */
90
/*    In the case of short blocks the 3 output vectors are already  */
91
/*    overlapped and added in this modul.                           */
92
/*                                                                  */
93
/*    New layer3                                                    */
94
/*                                                                  */
95
/*------------------------------------------------------------------*/
96

97
       float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11;
98

99
       register float  save;
100
       float  pp1, pp2;
101
       float   *win_bt;
102
       int     i, p, ss;
103
		 float *in = xr[ch][sb];
104
		 float *s_p = s[ch][sb];
105
		 float *res_p = res[sb];
106
		 float out[36];
107

108
   if(win_type == 2){
109
		for(p=0;p<36;p+=9) {
110
			out[p]   = out[p+1] = out[p+2] = out[p+3] =
111
			out[p+4] = out[p+5] = out[p+6] = out[p+7] =
112
			out[p+8] = 0.0f;
113
		}
114

115
	for(ss=0;ss<18;ss+=6) {
116

117
	/*
118
	 *  12 point IMDCT
119
	 */
120

121
	        /* Begin 12 point IDCT */
122

123
	        /* Input aliasing for 12 pt IDCT */
124
		in[5+ss]+=in[4+ss];in[4+ss]+=in[3+ss];in[3+ss]+=in[2+ss];
125
		in[2+ss]+=in[1+ss];in[1+ss]+=in[0+ss];
126

127
		/* Input aliasing on odd indices (for 6 point IDCT) */
128
		in[5+ss] += in[3+ss];  in[3+ss]  += in[1+ss];
129

130
		/* 3 point IDCT on even indices */
131

132
		pp2 = in[4+ss] * 0.500000000f;
133
		pp1 = in[2+ss] * 0.866025403f;
134
		save = in[0+ss] + pp2;
135
		tmp1 = in[0+ss] - in[4+ss];
136
		tmp0 = save + pp1;
137
		tmp2 = save - pp1;
138

139
		/* End 3 point IDCT on even indices */
140

141
		/* 3 point IDCT on odd indices (for 6 point IDCT) */
142

143
		pp2 = in[5+ss] * 0.500000000f;
144
		pp1 = in[3+ss] * 0.866025403f;
145
		save = in[1+ss] + pp2;
146
		tmp4 = in[1+ss] - in[5+ss];
147
		tmp5 = save + pp1;
148
		tmp3 = save - pp1;
149

150
		/* End 3 point IDCT on odd indices */
151

152
		/* Twiddle factors on odd indices (for 6 point IDCT) */
153

154
		tmp3 *= 1.931851653f;
155
		tmp4 *= 0.707106781f;
156
		tmp5 *= 0.517638090f;
157

158
		/* Output butterflies on 2 3 point IDCT's (for 6 point IDCT) */
159

160
		save = tmp0;
161
		tmp0 += tmp5;
162
		tmp5 = save - tmp5;
163

164
		save = tmp1;
165
		tmp1 += tmp4;
166
		tmp4 = save - tmp4;
167

168
		save = tmp2;
169
		tmp2 += tmp3;
170
		tmp3 = save - tmp3;
171

172
		/* End 6 point IDCT */
173

174
		/* Twiddle factors on indices (for 12 point IDCT) */
175

176
		tmp0 *= 0.504314480f;
177
		tmp1 *= 0.541196100f;
178
		tmp2 *= 0.630236207f;
179
		tmp3 *= 0.821339815f;
180
		tmp4 *= 1.306562965f;
181
		tmp5 *= 3.830648788f;
182

183
		/* End 12 point IDCT */
184

185
		/* Shift to 12 point modified IDCT, multiply by window type 2 */
186
		tmp8  = tmp0 * -0.793353340f;
187
		tmp9  = tmp0 * -0.608761429f;
188
		tmp7  = tmp1 * -0.923879532f;
189
		tmp10 = tmp1 * -0.382683432f;
190
		tmp6  = tmp2 * -0.991444861f;
191
		tmp11 = tmp2 * -0.130526192f;
192

193
		tmp0  = tmp3;
194
		tmp1  = tmp4 *  0.382683432f;
195
		tmp2  = tmp5 *  0.608761429f;
196

197
		tmp3  = tmp5 * -0.793353340f;
198
		tmp4  = tmp4 * -0.923879532f;
199
		tmp5  = tmp0 * -0.991444861f;
200

201
		tmp0 *= 0.130526192f;
202

203
		out[ss + 6]  += tmp0;
204
		out[ss + 7]  += tmp1;
205
		out[ss + 8]  += tmp2;
206
		out[ss + 9]  += tmp3;
207
		out[ss + 10] += tmp4;
208
		out[ss + 11] += tmp5;
209
		out[ss + 12] += tmp6;
210
		out[ss + 13] += tmp7;
211
		out[ss + 14] += tmp8;
212
		out[ss + 15] += tmp9;
213
		out[ss + 16] += tmp10;
214
		out[ss + 17] += tmp11;
215

216
	}
217
	if (sb&1) {
218
		for (i=0;i<18;i+=2) res_p[i]=out[i] + s_p[i];
219
		for (i=1;i<18;i+=2) res_p[i]=-out[i] - s_p[i];
220
	} else
221
		for (i=0;i<18;i++) res_p[i]=out[i] + s_p[i];
222
	for (i=18;i<36;i++) s_p[i-18]=out[i];
223

224
    } else {
225
/*
226
 * 36 point IDCT ****************************************************************
227
 */
228
	float tmp[18];
229

230
      /* input aliasing for 36 point IDCT */
231
      in[17]+=in[16]; in[16]+=in[15]; in[15]+=in[14]; in[14]+=in[13];
232
      in[13]+=in[12]; in[12]+=in[11]; in[11]+=in[10]; in[10]+=in[9];
233
      in[9] +=in[8];  in[8] +=in[7];  in[7] +=in[6];  in[6] +=in[5];
234
      in[5] +=in[4];  in[4] +=in[3];  in[3] +=in[2];  in[2] +=in[1];
235
      in[1] +=in[0];
236

237
      /* 18 point IDCT for odd indices */
238
      
239
      /* input aliasing for 18 point IDCT */
240
      in[17]+=in[15]; in[15]+=in[13]; in[13]+=in[11]; in[11]+=in[9];
241
      in[9] +=in[7];  in[7] +=in[5];  in[5] +=in[3];  in[3] +=in[1];
242
      
243

244
{
245
   float tmp0,tmp1,tmp2,tmp3,tmp4,tmp0_,tmp1_,tmp2_,tmp3_;
246
   float tmp0o,tmp1o,tmp2o,tmp3o,tmp4o,tmp0_o,tmp1_o,tmp2_o,tmp3_o;
247

248
/* Fast 9 Point Inverse Discrete Cosine Transform
249
//
250
// By  Francois-Raymond Boyer
251
//         mailto:[email protected]
252
//         http://www.iro.umontreal.ca/~boyerf
253
//
254
// The code has been optimized for Intel processors
255
//  (takes a lot of time to convert float to and from iternal FPU representation)
256
//
257
// It is a simple "factorization" of the IDCT matrix.
258
*/
259
   /* 9 point IDCT on even indices */
260
   {
261
   /* 5 points on odd indices (not realy an IDCT) */
262
   float i0 = in[0]+in[0];
263
   float i0p12 = i0 + in[12];
264

265
   tmp0 = i0p12 + in[4]*1.8793852415718f  + in[8]*1.532088886238f   + in[16]*0.34729635533386f;
266
   tmp1 = i0    + in[4]                   - in[8] - in[12] - in[12] - in[16];
267
   tmp2 = i0p12 - in[4]*0.34729635533386f - in[8]*1.8793852415718f  + in[16]*1.532088886238f;
268
   tmp3 = i0p12 - in[4]*1.532088886238f   + in[8]*0.34729635533386f - in[16]*1.8793852415718f;
269
   tmp4 = in[0] - in[4]                   + in[8] - in[12]          + in[16];
270
   }
271
   {
272
   float i6_ = in[6]*1.732050808f;		
273

274
   tmp0_ = in[2]*1.9696155060244f  + i6_ + in[10]*1.2855752193731f  + in[14]*0.68404028665134f;
275
   tmp1_ = (in[2]                        - in[10]                   - in[14])*1.732050808f;
276
   tmp2_ = in[2]*1.2855752193731f  - i6_ - in[10]*0.68404028665134f + in[14]*1.9696155060244f;
277
   tmp3_ = in[2]*0.68404028665134f - i6_ + in[10]*1.9696155060244f  - in[14]*1.2855752193731f;
278
   }
279

280
   /* 9 point IDCT on odd indices */
281
   {
282
   /* 5 points on odd indices (not realy an IDCT) */
283
   float i0 = in[0+1]+in[0+1];
284
   float i0p12 = i0 + in[12+1];
285

286
   tmp0o = i0p12   + in[4+1]*1.8793852415718f  + in[8+1]*1.532088886238f       + in[16+1]*0.34729635533386f;
287
   tmp1o = i0      + in[4+1]                   - in[8+1] - in[12+1] - in[12+1] - in[16+1];
288
   tmp2o = i0p12   - in[4+1]*0.34729635533386f - in[8+1]*1.8793852415718f      + in[16+1]*1.532088886238f;
289
   tmp3o = i0p12   - in[4+1]*1.532088886238f   + in[8+1]*0.34729635533386f     - in[16+1]*1.8793852415718f;
290
   tmp4o = (in[0+1] - in[4+1]                   + in[8+1] - in[12+1]            + in[16+1])*0.707106781f; /* Twiddled */
291
   }
292
   {
293
   /* 4 points on even indices */
294
   float i6_ = in[6+1]*1.732050808f;		/* Sqrt[3] */
295

296
   tmp0_o = in[2+1]*1.9696155060244f  + i6_ + in[10+1]*1.2855752193731f  + in[14+1]*0.68404028665134f;
297
   tmp1_o = (in[2+1]                        - in[10+1]                   - in[14+1])*1.732050808f;
298
   tmp2_o = in[2+1]*1.2855752193731f  - i6_ - in[10+1]*0.68404028665134f + in[14+1]*1.9696155060244f;
299
   tmp3_o = in[2+1]*0.68404028665134f - i6_ + in[10+1]*1.9696155060244f  - in[14+1]*1.2855752193731f;
300
   }
301

302
   /* Twiddle factors on odd indices
303
   // and
304
   // Butterflies on 9 point IDCT's
305
   // and
306
   // twiddle factors for 36 point IDCT
307
   */
308
   {
309
   float e, o;
310
   e = tmp0 + tmp0_; o = (tmp0o + tmp0_o)*0.501909918f; tmp[0] = (e + o)*(-0.500476342f*.5f);    tmp[17] = (e - o)*(-11.46279281f*.5f);
311
   e = tmp1 + tmp1_; o = (tmp1o + tmp1_o)*0.517638090f; tmp[1] = (e + o)*(-0.504314480f*.5f);    tmp[16] = (e - o)*(-3.830648788f*.5f);
312
   e = tmp2 + tmp2_; o = (tmp2o + tmp2_o)*0.551688959f; tmp[2] = (e + o)*(-0.512139757f*.5f);    tmp[15] = (e - o)*(-2.310113158f*.5f);
313
   e = tmp3 + tmp3_; o = (tmp3o + tmp3_o)*0.610387294f; tmp[3] = (e + o)*(-0.524264562f*.5f);    tmp[14] = (e - o)*(-1.662754762f*.5f);
314
                                                        tmp[4] = (tmp4 + tmp4o)*(-0.541196100f); tmp[13] = (tmp4 - tmp4o)*(-1.306562965f);
315
   e = tmp3 - tmp3_; o = (tmp3o - tmp3_o)*0.871723397f; tmp[5] = (e + o)*(-0.563690973f*.5f);    tmp[12] = (e - o)*(-1.082840285f*.5f);
316
   e = tmp2 - tmp2_; o = (tmp2o - tmp2_o)*1.183100792f; tmp[6] = (e + o)*(-0.592844523f*.5f);    tmp[11] = (e - o)*(-0.930579498f*.5f);
317
   e = tmp1 - tmp1_; o = (tmp1o - tmp1_o)*1.931851653f; tmp[7] = (e + o)*(-0.630236207f*.5f);    tmp[10] = (e - o)*(-0.821339815f*.5f);
318
   e = tmp0 - tmp0_; o = (tmp0o - tmp0_o)*5.736856623f; tmp[8] = (e + o)*(-0.678170852f*.5f);    tmp[9] =  (e - o)*(-0.740093616f*.5f);
319
   }
320
   }
321
	/* shift to modified IDCT */
322
	win_bt = win[win_type];
323

324
	if (sb&1) {
325
		res_p[0] =  -tmp[9]  * win_bt[0] + s_p[0];
326
		res_p[1] =-(-tmp[10] * win_bt[1] + s_p[1]);
327
		res_p[2] =  -tmp[11] * win_bt[2] + s_p[2];
328
		res_p[3] =-(-tmp[12] * win_bt[3] + s_p[3]);
329
		res_p[4] =  -tmp[13] * win_bt[4] + s_p[4];
330
		res_p[5] =-(-tmp[14] * win_bt[5] + s_p[5]);
331
		res_p[6] =  -tmp[15] * win_bt[6] + s_p[6];
332
		res_p[7] =-(-tmp[16] * win_bt[7] + s_p[7]);
333
		res_p[8] =  -tmp[17] * win_bt[8] + s_p[8];
334
	   
335
		res_p[9] = -(tmp[17] * win_bt[9] + s_p[9]);
336
		res_p[10]=  tmp[16] * win_bt[10] + s_p[10];
337
		res_p[11]=-(tmp[15] * win_bt[11] + s_p[11]);
338
		res_p[12]=  tmp[14] * win_bt[12] + s_p[12];
339
		res_p[13]=-(tmp[13] * win_bt[13] + s_p[13]);
340
		res_p[14]=  tmp[12] * win_bt[14] + s_p[14];
341
		res_p[15]=-(tmp[11] * win_bt[15] + s_p[15]);
342
		res_p[16]=  tmp[10] * win_bt[16] + s_p[16];
343
		res_p[17]=-(tmp[9]  * win_bt[17] + s_p[17]);
344
	} else {
345
		res_p[0] = -tmp[9]  * win_bt[0] + s_p[0];
346
		res_p[1] = -tmp[10] * win_bt[1] + s_p[1];
347
		res_p[2] = -tmp[11] * win_bt[2] + s_p[2];
348
		res_p[3] = -tmp[12] * win_bt[3] + s_p[3];
349
		res_p[4] = -tmp[13] * win_bt[4] + s_p[4];
350
		res_p[5] = -tmp[14] * win_bt[5] + s_p[5];
351
		res_p[6] = -tmp[15] * win_bt[6] + s_p[6];
352
		res_p[7] = -tmp[16] * win_bt[7] + s_p[7];
353
		res_p[8] = -tmp[17] * win_bt[8] + s_p[8];
354
	   
355
		res_p[9] = tmp[17] * win_bt[9] + s_p[9];
356
		res_p[10]= tmp[16] * win_bt[10] + s_p[10];
357
		res_p[11]= tmp[15] * win_bt[11] + s_p[11];
358
		res_p[12]= tmp[14] * win_bt[12] + s_p[12];
359
		res_p[13]= tmp[13] * win_bt[13] + s_p[13];
360
		res_p[14]= tmp[12] * win_bt[14] + s_p[14];
361
		res_p[15]= tmp[11] * win_bt[15] + s_p[15];
362
		res_p[16]= tmp[10] * win_bt[16] + s_p[16];
363
		res_p[17]= tmp[9]  * win_bt[17] + s_p[17];
364
	}
365

366
	s_p[0]= tmp[8]  * win_bt[18];
367
	s_p[1]= tmp[7]  * win_bt[19];
368
	s_p[2]= tmp[6]  * win_bt[20];
369
	s_p[3]= tmp[5]  * win_bt[21];
370
	s_p[4]= tmp[4]  * win_bt[22];
371
	s_p[5]= tmp[3]  * win_bt[23];
372
	s_p[6]= tmp[2]  * win_bt[24];
373
	s_p[7]= tmp[1]  * win_bt[25];
374
	s_p[8]= tmp[0]  * win_bt[26];
375

376
	s_p[9]= tmp[0]  * win_bt[27];
377
	s_p[10]= tmp[1]  * win_bt[28];
378
	s_p[11]= tmp[2]  * win_bt[29];
379
	s_p[12]= tmp[3]  * win_bt[30];
380
	s_p[13]= tmp[4]  * win_bt[31];
381
	s_p[14]= tmp[5]  * win_bt[32];
382
	s_p[15]= tmp[6]  * win_bt[33];
383
	s_p[16]= tmp[7]  * win_bt[34];
384
	s_p[17]= tmp[8]  * win_bt[35];
385
    }
386
}
387

388
/* fast DCT according to Lee[84]
389
 * reordering according to Konstantinides[94]
390
 */ 
391
void poly(const int ch,int f)
392
{
393
static float u[2][2][17][16]; /* no v[][], it's redundant */
394
static int u_start[2]={0,0}; /* first element of u[][] */
395
static int u_div[2]={0,0}; /* which part of u[][] is currently used */
396
int start = u_start[ch];
397
int div = u_div[ch];
398
float (*u_p)[16];
399

400
#if defined(PENTIUM_RDTSC)
401
unsigned int cnt4, cnt3, cnt2, cnt1;
402
static int min_cycles = 99999999;
403

404
	__asm__(".byte 0x0f,0x31" : "=a" (cnt1), "=d" (cnt4));
405
#endif
406

407
	{
408
	float d16,d17,d18,d19,d20,d21,d22,d23,d24,d25,d26,d27,d28,d29,d30,d31;
409
	float d0,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,d11,d12,d13,d14,d15;
410

411
	/* step 1: initial reordering and 1st (16 wide) butterflies
412
	*/
413

414
	d0 = res[ 0][f]; d16=(d0  - res[31][f]) *  b1; d0 += res[31][f];
415
	d1 = res[ 1][f]; d17=(d1  - res[30][f]) *  b3; d1 += res[30][f];
416
	d3 = res[ 2][f]; d19=(d3  - res[29][f]) *  b5; d3 += res[29][f];
417
	d2 = res[ 3][f]; d18=(d2  - res[28][f]) *  b7; d2 += res[28][f];
418
	d6 = res[ 4][f]; d22=(d6  - res[27][f]) *  b9; d6 += res[27][f];
419
	d7 = res[ 5][f]; d23=(d7  - res[26][f]) * b11; d7 += res[26][f];
420
	d5 = res[ 6][f]; d21=(d5  - res[25][f]) * b13; d5 += res[25][f];
421
	d4 = res[ 7][f]; d20=(d4  - res[24][f]) * b15; d4 += res[24][f];
422
	d12= res[ 8][f]; d28=(d12 - res[23][f]) * b17; d12+= res[23][f];
423
	d13= res[ 9][f]; d29=(d13 - res[22][f]) * b19; d13+= res[22][f];
424
	d15= res[10][f]; d31=(d15 - res[21][f]) * b21; d15+= res[21][f];
425
	d14= res[11][f]; d30=(d14 - res[20][f]) * b23; d14+= res[20][f];
426
	d10= res[12][f]; d26=(d10 - res[19][f]) * b25; d10+= res[19][f];
427
	d11= res[13][f]; d27=(d11 - res[18][f]) * b27; d11+= res[18][f];
428
	d9 = res[14][f]; d25=(d9  - res[17][f]) * b29; d9 += res[17][f];
429
	d8 = res[15][f]; d24=(d8  - res[16][f]) * b31; d8 += res[16][f];
430

431
	{
432
	float c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15;
433

434
/* a test to see what can be done with memory separation
435
 * first we process indexes 0-15
436
*/
437
	c0 = d0 + d8 ; c8 = ( d0 - d8 ) *  b2;
438
	c1 = d1 + d9 ; c9 = ( d1 - d9 ) *  b6;
439
	c2 = d2 + d10; c10= ( d2 - d10) * b14;
440
	c3 = d3 + d11; c11= ( d3 - d11) * b10;
441
	c4 = d4 + d12; c12= ( d4 - d12) * b30;
442
	c5 = d5 + d13; c13= ( d5 - d13) * b26;
443
	c6 = d6 + d14; c14= ( d6 - d14) * b18;
444
	c7 = d7 + d15; c15= ( d7 - d15) * b22;
445
	
446
	/* step 3: 4-wide butterflies
447
	*/
448
	d0 = c0 + c4 ; d4 = ( c0 - c4 ) *  b4;
449
	d1 = c1 + c5 ; d5 = ( c1 - c5 ) * b12;
450
	d2 = c2 + c6 ; d6 = ( c2 - c6 ) * b28;
451
	d3 = c3 + c7 ; d7 = ( c3 - c7 ) * b20;
452
	
453
	d8 = c8 + c12; d12= ( c8 - c12) *  b4;
454
	d9 = c9 + c13; d13= ( c9 - c13) * b12;
455
	d10= c10+ c14; d14= (c10 - c14) * b28;
456
	d11= c11+ c15; d15= (c11 - c15) * b20;
457

458

459
	/* step 4: 2-wide butterflies
460
	*/
461
	{
462
	float rb8 = b8;
463
	float rb24 = b24;
464

465
/**/	c0 = d0 + d2 ; c2 = ( d0 - d2 ) *  rb8;
466
	c1 = d1 + d3 ; c3 = ( d1 - d3 ) * rb24;
467
/**/	c4 = d4 + d6 ; c6 = ( d4 - d6 ) *  rb8;
468
	c5 = d5 + d7 ; c7 = ( d5 - d7 ) * rb24;
469
/**/	c8 = d8 + d10; c10= ( d8 - d10) *  rb8;
470
	c9 = d9 + d11; c11= ( d9 - d11) * rb24;
471
/**/	c12= d12+ d14; c14= (d12 - d14) *  rb8;
472
	c13= d13+ d15; c15= (d13 - d15) * rb24;
473
	}
474

475
	/* step 5: 1-wide butterflies
476
	*/
477
	{
478
	float rb16 = b16;
479

480
	/* this is a little 'hacked up'
481
	*/
482
	d0 = (-c0 -c1) * 2; d1 = ( c0 - c1 ) * rb16; 
483
	d2 = c2 + c3; d3 = ( c2 - c3 ) * rb16; 
484
	d3 -= d2;
485

486
	d4 = c4 +c5; d5 = ( c4 - c5 ) * rb16;
487
	d5 += d4;
488
	d7 = -d5;
489
	d7 += ( c6 - c7 ) * rb16; d6 = +c6 +c7;
490

491
	d8 = c8 + c9 ; d9 = ( c8 - c9 ) * rb16;
492
	d11= +d8 +d9;
493
	d11 +=(c10 - c11) * rb16; d10= c10+ c11; 
494

495
	d12 = c12+ c13; d13 = (c12 - c13) * rb16;
496
	d13 += -d8-d9+d12;
497
	d14 = c14+ c15; d15 = (c14 - c15) * rb16;
498
	d15-=d11;
499
	d14 += -d8 -d10;
500
	}
501

502
        /* step 6: final resolving & reordering
503
         * the other 32 are stored for use with the next granule
504
         */
505

506
        u_p = (float (*)[16]) &u[ch][div][0][start];
507

508
/*16*/  u_p[ 0][0] =+d1 ;
509
        u_p[ 2][0] = +d9 -d14;
510
/*20*/  u_p[ 4][0] = +d5 -d6;
511
        u_p[ 6][0] = -d10 +d13;
512
/*24*/  u_p[ 8][0] =d3;
513
        u_p[10][0] = -d8 -d9 +d11 -d13;
514
/*28*/  u_p[12][0] = +d7;
515
        u_p[14][0] = +d15;
516

517
        /* the other 32 are stored for use with the next granule
518
         */
519

520
        u_p = (float (*)[16]) &u[ch][!div][0][start];
521

522
/*0*/   u_p[16][0] = d0;
523
        u_p[14][0] = -(+d8 );
524
/*4*/   u_p[12][0] = -(+d4 );
525
        u_p[10][0] = -(-d8 +d12 );
526
/*8*/   u_p[ 8][0] = -(+d2 );
527
        u_p[ 6][0] = -(+d8 +d10 -d12 );
528
/*12*/  u_p[ 4][0] = -(-d4 +d6 );
529
        u_p[ 2][0] = -d14;
530
        u_p[ 0][0] = -d1;
531
	}
532

533
	{
534
	float c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15;
535

536
/* memory separation, second part
537
*/
538
/* 2
539
*/
540
        c0=d16 + d24; c8= (d16 - d24) *  b2;
541
        c1=d17 + d25; c9= (d17 - d25) *  b6;
542
        c2=d18 + d26; c10= (d18 - d26) * b14;
543
        c3=d19 + d27; c11= (d19 - d27) * b10;
544
        c4=d20 + d28; c12= (d20 - d28) * b30;
545
        c5=d21 + d29; c13= (d21 - d29) * b26;
546
        c6=d22 + d30; c14= (d22 - d30) * b18;
547
        c7=d23 + d31; c15= (d23 - d31) * b22;
548

549
/* 3
550
*/
551
        d16= c0+ c4; d20= (c0 - c4) *  b4;
552
        d17= c1+ c5; d21= (c1 - c5) * b12;
553
        d18= c2+ c6; d22= (c2 - c6) * b28;
554
        d19= c3+ c7; d23= (c3 - c7) * b20;
555

556
        d24= c8+ c12; d28= (c8 - c12) *  b4;
557
        d25= c9+ c13; d29= (c9 - c13) * b12;
558
        d26= c10+ c14; d30= (c10 - c14) * b28;
559
        d27= c11+ c15; d31= (c11 - c15) * b20;
560

561
/* 4
562
*/
563
        {
564
        float rb8 = b8;
565
        float rb24 = b24;
566

567
/**/    c0= d16+ d18; c2= (d16 - d18) *  rb8;
568
        c1= d17+ d19; c3= (d17 - d19) * rb24;
569
/**/    c4= d20+ d22; c6= (d20 - d22) *  rb8;
570
        c5= d21+ d23; c7= (d21 - d23) * rb24;
571
/**/    c8= d24+ d26; c10= (d24 - d26) *  rb8;
572
        c9= d25+ d27; c11= (d25 - d27) * rb24;
573
/**/    c12= d28+ d30; c14= (d28 - d30) *  rb8;
574
        c13= d29+ d31; c15= (d29 - d31) * rb24;
575
        }
576

577
/* 5
578
*/
579
        {
580
        float rb16 = b16;
581
        d16= c0+ c1; d17= (c0 - c1) * rb16;
582
        d18= c2+ c3; d19= (c2 - c3) * rb16;
583

584
        d20= c4+ c5; d21= (c4 - c5) * rb16;
585
        d20+=d16; d21+=d17;
586
        d22= c6+ c7; d23= (c6 - c7) * rb16;
587
        d22+=d16; d22+=d18;
588
        d23+=d16; d23+=d17; d23+=d19;
589

590

591
        d24= c8+ c9; d25= (c8 - c9) * rb16;
592
        d26= c10+ c11; d27= (c10 - c11) * rb16;
593
        d26+=d24;
594
        d27+=d24; d27+=d25;
595

596
        d28= c12+ c13; d29= (c12 - c13) * rb16;
597
        d28-=d20; d29+=d28; d29-=d21;
598
        d30= c14+ c15; d31= (c14 - c15) * rb16;
599
        d30-=d22;
600
        d31-=d23;
601
	}
602

603
	/* step 6: final resolving & reordering 
604
	 * the other 32 are stored for use with the next granule
605
	 */
606
	
607
	u_p = (float (*)[16]) &u[ch][!div][0][start];
608

609
	u_p[ 1][0] = -(+d30 );	
610
	u_p[ 3][0] = -(+d22 -d26 );
611
	u_p[ 5][0] = -(-d18 -d20 +d26 );
612
	u_p[ 7][0] = -(+d18 -d28 );
613
	u_p[ 9][0] = -(+d28 );
614
	u_p[11][0] = -(+d20 -d24 );
615
	u_p[13][0] = -(-d16 +d24 );
616
	u_p[15][0] = -(+d16 );
617

618
	/* the other 32 are stored for use with the next granule
619
	 */
620

621
	u_p = (float (*)[16]) &u[ch][div][0][start];
622

623
	u_p[15][0] = +d31;
624
	u_p[13][0] = +d23 -d27;
625
	u_p[11][0] = -d19 -d20 -d21 +d27;
626
	u_p[ 9][0] = +d19 -d29;
627
	u_p[ 7][0] = -d18 +d29;
628
	u_p[ 5][0] = +d18 +d20 +d21 -d25 -d26;
629
	u_p[ 3][0] = -d17 -d22 +d25 +d26;
630
	u_p[ 1][0] = +d17 -d30;
631
	}
632
	}
633

634
#if defined(PENTIUM_RDTSC1)
635
	__asm__(".byte 0x0f,0x31" : "=a" (cnt3), "=d" (cnt4));
636
#endif
637

638
	/* we're doing dewindowing and calculating final samples now
639
	 */
640

641
#if defined(ARCH_i586)
642
	/* x86 assembler optimisations.  These optimisations are tuned
643
           specifically for Intel Pentiums. */
644

645
    asm("movl $15,%%eax\n\t"\
646
        "1:\n\t"\
647
	"flds (%0)\n\t"\
648
	"fmuls (%1)\n\t"\
649
	"flds 4(%0)\n\t"\
650
	"fmuls 4(%1)\n\t"\
651
	"flds 8(%0)\n\t"\
652
	"fmuls 8(%1)\n\t"\
653
	"fxch %%st(2)\n\t"\
654
	"faddp\n\t"\
655
	"flds 12(%0)\n\t"\
656
	"fmuls 12(%1)\n\t"\
657
	"fxch %%st(2)\n\t"\
658
	"faddp\n\t"\
659
	"flds 16(%0)\n\t"\
660
	"fmuls 16(%1)\n\t"\
661
	"fxch %%st(2)\n\t"\
662
	"faddp\n\t"\
663
	"flds 20(%0)\n\t"\
664
	"fmuls 20(%1)\n\t"\
665
	"fxch %%st(2)\n\t"\
666
	"faddp\n\t"\
667
	"flds 24(%0)\n\t"\
668
	"fmuls 24(%1)\n\t"\
669
	"fxch %%st(2)\n\t"\
670
	"faddp\n\t"\
671
	"flds 28(%0)\n\t"\
672
	"fmuls 28(%1)\n\t"\
673
	"fxch %%st(2)\n\t"\
674
	"faddp\n\t"\
675
	"flds 32(%0)\n\t"\
676
	"fmuls 32(%1)\n\t"\
677
	"fxch %%st(2)\n\t"\
678
	"faddp\n\t"\
679
	"flds 36(%0)\n\t"\
680
	"fmuls 36(%1)\n\t"\
681
	"fxch %%st(2)\n\t"\
682
	"faddp\n\t"\
683
	"flds 40(%0)\n\t"\
684
	"fmuls 40(%1)\n\t"\
685
	"fxch %%st(2)\n\t"\
686
	"faddp\n\t"\
687
	"flds 44(%0)\n\t"\
688
	"fmuls 44(%1)\n\t"\
689
	"fxch %%st(2)\n\t"\
690
	"faddp\n\t"\
691
	"flds 48(%0)\n\t"\
692
	"fmuls 48(%1)\n\t"\
693
	"fxch %%st(2)\n\t"\
694
	"faddp\n\t"\
695
	"flds 52(%0)\n\t"\
696
	"fmuls 52(%1)\n\t"\
697
	"fxch %%st(2)\n\t"\
698
	"faddp\n\t"\
699
	"flds 56(%0)\n\t"\
700
	"fmuls 56(%1)\n\t"\
701
	"fxch %%st(2)\n\t"\
702
	"faddp\n\t"\
703
	"flds 60(%0)\n\t"\
704
	"fmuls 60(%1)\n\t"\
705
	"fxch %%st(2)\n\t"\
706
	"faddp\n\t"\
707
	"addl $64,%0\n\t"\
708
	"addl $128,%1\n\t"\
709
	"subl $4,%%esp\n\t"\
710
	"faddp\n\t"\
711
	"fistpl (%%esp)\n\t"\
712
	"popl %%ecx\n\t"\
713
	"cmpl $32767,%%ecx\n\t"\
714
	"jle 2f\n\t"\
715
	"movw $32767,%%cx\n\t"\
716
	"jmp 3f\n\t"\
717
	"2: cmpl $-32768,%%ecx\n\t"\
718
	"jge 3f\n\t"\
719
	"movw $-32768,%%cx\n\t"\
720
	"3: movw %%cx,(%2)\n\t"\
721
	"addl %3,%2\n\t"\
722
	"decl %%eax\n\t"\
723
	"jns 1b\n\t"\
724

725
	"testb $1,%4\n\t"\
726
        "je 4f\n\t"
727

728
	"flds (%0)\n\t"\
729
	"fmuls (%1)\n\t"\
730
	"flds 8(%0)\n\t"\
731
	"fmuls 8(%1)\n\t"\
732
	"flds 16(%0)\n\t"\
733
	"fmuls 16(%1)\n\t"\
734
	"fxch %%st(2)\n\t"\
735
	"faddp\n\t"\
736
	"flds 24(%0)\n\t"\
737
	"fmuls 24(%1)\n\t"\
738
	"fxch %%st(2)\n\t"\
739
	"faddp\n\t"\
740
	"flds 32(%0)\n\t"\
741
	"fmuls 32(%1)\n\t"\
742
	"fxch %%st(2)\n\t"\
743
	"faddp\n\t"\
744
	"flds 40(%0)\n\t"\
745
	"fmuls 40(%1)\n\t"\
746
	"fxch %%st(2)\n\t"\
747
	"faddp\n\t"\
748
	"flds 48(%0)\n\t"\
749
	"fmuls 48(%1)\n\t"\
750
	"fxch %%st(2)\n\t"\
751
	"faddp\n\t"\
752
	"flds 56(%0)\n\t"\
753
	"fmuls 56(%1)\n\t"\
754
	"fxch %%st(2)\n\t"\
755
	"faddp\n\t"\
756
	"subl $4,%%esp\n\t"\
757
	"subl $64,%0\n\t"\
758
	"subl $192,%1\n\t"\
759
	"faddp\n\t"\
760
	"fistpl (%%esp)\n\t"\
761
	"popl %%ecx\n\t"\
762
	"cmpl $32767,%%ecx\n\t"\
763
	"jle 2f\n\t"\
764
	"movw $32767,%%cx\n\t"\
765
	"jmp 3f\n\t"\
766
	"2: cmpl $-32768,%%ecx\n\t"\
767
	"jge 3f\n\t"\
768
	"movw $-32768,%%cx\n\t"\
769
	"3: movw %%cx,(%2)\n\t"\
770

771
	"movl %5,%%ecx\n\t"\
772
	"sall $3,%%ecx\n\t"\
773
	"addl %%ecx,%1\n\t"\
774
	"addl %3,%2\n\t"\
775
	"movl $14,%%eax\n\t"\
776

777
	"1:flds 4(%0)\n\t"\
778
	"fmuls 56(%1)\n\t"\
779
	"flds (%0)\n\t"\
780
	"fmuls 60(%1)\n\t"\
781
	"flds 12(%0)\n\t"\
782
	"fmuls 48(%1)\n\t"\
783
	"fxch %%st(2)\n\t"\
784
	"fsubp\n\t"\
785
	"flds 8(%0)\n\t"\
786
	"fmuls 52(%1)\n\t"\
787
	"fxch %%st(2)\n\t"\
788
	"faddp\n\t"\
789
	"flds 20(%0)\n\t"\
790
	"fmuls 40(%1)\n\t"\
791
	"fxch %%st(2)\n\t"\
792
	"fsubrp\n\t"\
793
	"flds 16(%0)\n\t"\
794
	"fmuls 44(%1)\n\t"\
795
	"fxch %%st(2)\n\t"\
796
	"faddp\n\t"\
797
	"flds 28(%0)\n\t"\
798
	"fmuls 32(%1)\n\t"\
799
	"fxch %%st(2)\n\t"\
800
	"fsubrp\n\t"\
801
	"flds 24(%0)\n\t"\
802
	"fmuls 36(%1)\n\t"\
803
	"fxch %%st(2)\n\t"\
804
	"faddp\n\t"\
805
	"flds 36(%0)\n\t"\
806
	"fmuls 24(%1)\n\t"\
807
	"fxch %%st(2)\n\t"\
808
	"fsubrp\n\t"\
809
	"flds 32(%0)\n\t"\
810
	"fmuls 28(%1)\n\t"\
811
	"fxch %%st(2)\n\t"\
812
	"faddp\n\t"\
813
	"flds 44(%0)\n\t"\
814
	"fmuls 16(%1)\n\t"\
815
	"fxch %%st(2)\n\t"\
816
	"fsubrp\n\t"\
817
	"flds 40(%0)\n\t"\
818
	"fmuls 20(%1)\n\t"\
819
	"fxch %%st(2)\n\t"\
820
	"faddp\n\t"\
821
	"flds 52(%0)\n\t"\
822
	"fmuls 8(%1)\n\t"\
823
	"fxch %%st(2)\n\t"\
824
	"fsubrp\n\t"\
825
	"flds 48(%0)\n\t"\
826
	"fmuls 12(%1)\n\t"\
827
	"fxch %%st(2)\n\t"\
828
	"faddp\n\t"\
829
	"flds 60(%0)\n\t"\
830
	"fmuls (%1)\n\t"\
831
	"fxch %%st(2)\n\t"\
832
	"fsubrp\n\t"\
833
	"flds 56(%0)\n\t"\
834
	"fmuls 4(%1)\n\t"\
835
	"fxch %%st(2)\n\t"\
836
	"faddp\n\t"\
837
	"subl $64,%0\n\t"\
838
	"subl $128,%1\n\t"\
839
	"subl $4,%%esp\n\t"\
840
	"fsubp\n\t"\
841
	"fistpl (%%esp)\n\t"\
842
	"popl %%ecx\n\t"\
843
	"cmpl $32767,%%ecx\n\t"\
844
	"jle 2f\n\t"\
845
	"movw $32767,%%cx\n\t"\
846
	"jmp 3f\n\t"\
847
	"2: cmpl $-32768,%%ecx\n\t"\
848
	"jge 3f\n\t"\
849
	"movw $-32768,%%cx\n\t"\
850
	"3: movw %%cx,(%2)\n\t"\
851
	"addl %3,%2\n\t"\
852
	"decl %%eax\n\t"\
853
	"jns 1b\n\t"\
854
	"jmp 5f\n\t"\
855

856
	"4:flds 4(%0)\n\t"\
857
	"fmuls 4(%1)\n\t"\
858
	"flds 12(%0)\n\t"\
859
	"fmuls 12(%1)\n\t"\
860
	"flds 20(%0)\n\t"\
861
	"fmuls 20(%1)\n\t"\
862
	"fxch %%st(2)\n\t"\
863
	"faddp\n\t"\
864
	"flds 28(%0)\n\t"\
865
	"fmuls 28(%1)\n\t"\
866
	"fxch %%st(2)\n\t"\
867
	"faddp\n\t"\
868
	"flds 36(%0)\n\t"\
869
	"fmuls 36(%1)\n\t"\
870
	"fxch %%st(2)\n\t"\
871
	"faddp\n\t"\
872
	"flds 44(%0)\n\t"\
873
	"fmuls 44(%1)\n\t"\
874
	"fxch %%st(2)\n\t"\
875
	"faddp\n\t"\
876
	"flds 52(%0)\n\t"\
877
	"fmuls 52(%1)\n\t"\
878
	"fxch %%st(2)\n\t"\
879
	"faddp\n\t"\
880
	"flds 60(%0)\n\t"\
881
	"fmuls 60(%1)\n\t"\
882
	"fxch %%st(2)\n\t"\
883
	"faddp\n\t"\
884
	"subl $4,%%esp\n\t"\
885
	"subl $64,%0\n\t"\
886
	"subl $192,%1\n\t"\
887
	"faddp\n\t"\
888
	"fistpl (%%esp)\n\t"\
889
	"popl %%ecx\n\t"\
890
	"cmpl $32767,%%ecx\n\t"\
891
	"jle 2f\n\t"\
892
	"movw $32767,%%cx\n\t"\
893
	"jmp 3f\n\t"\
894
	"2: cmpl $-32768,%%ecx\n\t"\
895
	"jge 3f\n\t"\
896
	"movw $-32768,%%cx\n\t"\
897
	"3: movw %%cx,(%2)\n\t"\
898

899
	"movl %5,%%ecx\n\t"\
900
	"sall $3,%%ecx\n\t"\
901
	"addl %%ecx,%1\n\t"\
902
	"addl %3,%2\n\t"\
903

904
	"movl $14,%%eax\n\t"\
905
	"1:flds (%0)\n\t"\
906
	"fmuls 60(%1)\n\t"\
907
	"flds 4(%0)\n\t"\
908
	"fmuls 56(%1)\n\t"\
909
	"flds 8(%0)\n\t"\
910
	"fmuls 52(%1)\n\t"\
911
	"fxch %%st(2)\n\t"\
912
	"fsubp\n\t"\
913
	"flds 12(%0)\n\t"\
914
	"fmuls 48(%1)\n\t"\
915
	"fxch %%st(2)\n\t"\
916
	"faddp\n\t"\
917
	"flds 16(%0)\n\t"\
918
	"fmuls 44(%1)\n\t"\
919
	"fxch %%st(2)\n\t"\
920
	"fsubrp\n\t"\
921
	"flds 20(%0)\n\t"\
922
	"fmuls 40(%1)\n\t"\
923
	"fxch %%st(2)\n\t"\
924
	"faddp\n\t"\
925
	"flds 24(%0)\n\t"\
926
	"fmuls 36(%1)\n\t"\
927
	"fxch %%st(2)\n\t"\
928
	"fsubrp\n\t"\
929
	"flds 28(%0)\n\t"\
930
	"fmuls 32(%1)\n\t"\
931
	"fxch %%st(2)\n\t"\
932
	"faddp\n\t"\
933
	"flds 32(%0)\n\t"\
934
	"fmuls 28(%1)\n\t"\
935
	"fxch %%st(2)\n\t"\
936
	"fsubrp\n\t"\
937
	"flds 36(%0)\n\t"\
938
	"fmuls 24(%1)\n\t"\
939
	"fxch %%st(2)\n\t"\
940
	"faddp\n\t"\
941
	"flds 40(%0)\n\t"\
942
	"fmuls 20(%1)\n\t"\
943
	"fxch %%st(2)\n\t"\
944
	"fsubrp\n\t"\
945
	"flds 44(%0)\n\t"\
946
	"fmuls 16(%1)\n\t"\
947
	"fxch %%st(2)\n\t"\
948
	"faddp\n\t"\
949
	"flds 48(%0)\n\t"\
950
	"fmuls 12(%1)\n\t"\
951
	"fxch %%st(2)\n\t"\
952
	"fsubrp\n\t"\
953
	"flds 52(%0)\n\t"\
954
	"fmuls 8(%1)\n\t"\
955
	"fxch %%st(2)\n\t"\
956
	"faddp\n\t"\
957
	"flds 56(%0)\n\t"\
958
	"fmuls 4(%1)\n\t"\
959
	"fxch %%st(2)\n\t"\
960
	"fsubrp\n\t"\
961
	"flds 60(%0)\n\t"\
962
	"fmuls (%1)\n\t"\
963
	"fxch %%st(2)\n\t"\
964
	"faddp\n\t"\
965
	"subl $64,%0\n\t"\
966
	"subl $128,%1\n\t"\
967
	"subl $4,%%esp\n\t"\
968
	"fsubp\n\t"\
969
	"fistpl (%%esp)\n\t"\
970
	"popl %%ecx\n\t"\
971
	"cmpl $32767,%%ecx\n\t"\
972
	"jle 2f\n\t"\
973
	"movw $32767,%%cx\n\t"\
974
	"jmp 3f\n\t"\
975
	"2: cmpl $-32768,%%ecx\n\t"\
976
	"jge 3f\n\t"\
977
	"movw $-32768,%%cx\n\t"\
978
	"3: movw %%cx,(%2)\n\t"\
979
	"addl %3,%2\n\t"\
980
	"decl %%eax\n\t"\
981
	"jns 1b\n\t"\
982

983
	"5:"\
984
	    : : "b" (u[ch][div]), "d" (t_dewindow[0] + 16 - start), "S" (&sample_buffer[f>>(2-nch)][nch==2?0:(f&1?16:0)][ch]), "m" (sizeof(short) * nch), "m" (div), "m" (start)\
985
	    : "eax", "ecx", "memory");
986
#else
987
	{
988
	  short *samples = (&sample_buffer[f>>(2-nch)][nch==2?0:(f&1?16:0)][ch]);
989
	  int out, j;
990

991
#define PUT_SAMPLE(out)			\
992
		if (out > 32767)	\
993
		  *samples = 32767;	\
994
		else			\
995
		  if (out < -32768)	\
996
		    *samples = -32768;	\
997
		  else			\
998
		    *samples = out;	\
999
					\
1000
		samples += nch;
1001

1002
#if defined(SUPERHACK)
1003
	  /* These is a simple implementation which should be nicer to the
1004
             cache; computation of samples are done in one pass rather than
1005
             two.  However, for various reasons which I do not have time to
1006
             investigate, it runs quite a lot slower than two pass
1007
             computations.  If you have time, you are welcome to look into
1008
             it. */
1009

1010
          {
1011
            float (*u_ptr)[16] = u[ch][div];
1012
	    const float *dewindow2 = t_dewindow[0] + start;
1013

1014
	    {
1015
	      float outf1, outf2, outf3, outf4;
1016

1017
	      outf1  = u_ptr[0][ 0] * dewindow[0x0];
1018
	      outf2  = u_ptr[0][ 1] * dewindow[0x1];
1019
	      outf3  = u_ptr[0][ 2] * dewindow[0x2];
1020
	      outf4  = u_ptr[0][ 3] * dewindow[0x3];
1021
	      outf1 += u_ptr[0][ 4] * dewindow[0x4];
1022
	      outf2 += u_ptr[0][ 5] * dewindow[0x5];
1023
	      outf3 += u_ptr[0][ 6] * dewindow[0x6];
1024
	      outf4 += u_ptr[0][ 7] * dewindow[0x7];
1025
	      outf1 += u_ptr[0][ 8] * dewindow[0x8];
1026
	      outf2 += u_ptr[0][ 9] * dewindow[0x9];
1027
	      outf3 += u_ptr[0][10] * dewindow[0xa];
1028
	      outf4 += u_ptr[0][11] * dewindow[0xb];
1029
	      outf1 += u_ptr[0][12] * dewindow[0xc];
1030
	      outf2 += u_ptr[0][13] * dewindow[0xd];
1031
	      outf3 += u_ptr[0][14] * dewindow[0xe];
1032
	      outf4 += u_ptr[0][15] * dewindow[0xf];
1033

1034
	      out = outf1 + outf2 + outf3 + outf4;
1035

1036
	      dewindow += 32;
1037
	      dewindow2 += 32;
1038
	      u_ptr++;
1039

1040
	      if (out > 32767)
1041
		samples[0] = 32767;
1042
	      else
1043
		if (out < -32768)
1044
		  samples[0] = -32768;
1045
		else
1046
		  samples[0] = out;
1047
	    }
1048

1049
            if (div & 0x1) {
1050
              for (j = 1; j < 16; ++j) {
1051
                float outf1, outf2, outf3, outf4;
1052

1053
                outf1  = u_ptr[0][ 0] * dewindow[0x0];
1054
                outf3  = u_ptr[0][ 0] * dewindow2[0xf];
1055
                outf2  = u_ptr[0][ 1] * dewindow[0x1];
1056
                outf4  = u_ptr[0][ 1] * dewindow2[0xe];
1057
                outf1 += u_ptr[0][ 2] * dewindow[0x2];
1058
                outf3 += u_ptr[0][ 2] * dewindow2[0xd];
1059
                outf2 += u_ptr[0][ 3] * dewindow[0x3];
1060
                outf4 += u_ptr[0][ 3] * dewindow2[0xc];
1061
                outf1 += u_ptr[0][ 4] * dewindow[0x4];
1062
                outf3 += u_ptr[0][ 4] * dewindow2[0xb];
1063
                outf2 += u_ptr[0][ 5] * dewindow[0x5];
1064
                outf4 += u_ptr[0][ 5] * dewindow2[0xa];
1065
                outf1 += u_ptr[0][ 6] * dewindow[0x6];
1066
                outf3 += u_ptr[0][ 6] * dewindow2[0x9];
1067
                outf2 += u_ptr[0][ 7] * dewindow[0x7];
1068
                outf4 += u_ptr[0][ 7] * dewindow2[0x8];
1069
                outf1 += u_ptr[0][ 8] * dewindow[0x8];
1070
                outf3 += u_ptr[0][ 8] * dewindow2[0x7];
1071
                outf2 += u_ptr[0][ 9] * dewindow[0x9];
1072
                outf4 += u_ptr[0][ 9] * dewindow2[0x6];
1073
                outf1 += u_ptr[0][10] * dewindow[0xa];
1074
                outf3 += u_ptr[0][10] * dewindow2[0x5];
1075
                outf2 += u_ptr[0][11] * dewindow[0xb];
1076
                outf4 += u_ptr[0][11] * dewindow2[0x4];
1077
                outf1 += u_ptr[0][12] * dewindow[0xc];
1078
                outf3 += u_ptr[0][12] * dewindow2[0x3];
1079
                outf2 += u_ptr[0][13] * dewindow[0xd];
1080
                outf4 += u_ptr[0][13] * dewindow2[0x2];
1081
                outf1 += u_ptr[0][14] * dewindow[0xe];
1082
                outf3 += u_ptr[0][14] * dewindow2[0x1];
1083
                outf2 += u_ptr[0][15] * dewindow[0xf];
1084
                outf4 += u_ptr[0][15] * dewindow2[0x0];
1085

1086
                dewindow += 32;
1087
                dewindow2 += 32;
1088
                u_ptr++;
1089

1090
                out = outf1 + outf2;
1091

1092
                if (out > 32767)
1093
                  samples[j * 2] = 32767;
1094
                else
1095
                  if (out < -32768)
1096
                    samples[j * 2] = -32768;
1097
                  else
1098
                    samples[j * 2] = out;
1099

1100
                out = outf4 - outf3;
1101

1102
                if (out > 32767)
1103
                  samples[64 - (j * 2)] = 32767;
1104
                else
1105
                  if (out < -32768)
1106
                    samples[64 - (j * 2)] = -32768;
1107
                  else
1108
                    samples[64 - (j * 2)] = out;
1109
              }
1110

1111
              {
1112
                float outf2, outf4;
1113

1114
                outf2  = u_ptr[0][ 0] * dewindow[0x0];
1115
                outf4  = u_ptr[0][ 2] * dewindow[0x2];
1116
                outf2 += u_ptr[0][ 4] * dewindow[0x4];
1117
                outf4 += u_ptr[0][ 6] * dewindow[0x6];
1118
                outf2 += u_ptr[0][ 8] * dewindow[0x8];
1119
                outf4 += u_ptr[0][10] * dewindow[0xa];
1120
                outf2 += u_ptr[0][12] * dewindow[0xc];
1121
                outf4 += u_ptr[0][14] * dewindow[0xe];
1122

1123
                out = outf2 + outf4;
1124

1125
                if (out > 32767)
1126
                  samples[16 * 2] = 32767;
1127
                else
1128
                  if (out < -32768)
1129
                    samples[16 * 2] = -32768;
1130
                  else
1131
                    samples[16 * 2] = out;
1132
              }
1133
            } else {
1134
              for (j = 1; j < 16; ++j) {
1135
                float outf1, outf2, outf3, outf4;
1136

1137
                outf1  = u_ptr[0][ 0] * dewindow[0x0];
1138
                outf3  = u_ptr[0][ 0] * dewindow2[0xf];
1139
                outf2  = u_ptr[0][ 1] * dewindow[0x1];
1140
                outf4  = u_ptr[0][ 1] * dewindow2[0xe];
1141
                outf1 += u_ptr[0][ 2] * dewindow[0x2];
1142
                outf3 += u_ptr[0][ 2] * dewindow2[0xd];
1143
                outf2 += u_ptr[0][ 3] * dewindow[0x3];
1144
                outf4 += u_ptr[0][ 3] * dewindow2[0xc];
1145
                outf1 += u_ptr[0][ 4] * dewindow[0x4];
1146
                outf3 += u_ptr[0][ 4] * dewindow2[0xb];
1147
                outf2 += u_ptr[0][ 5] * dewindow[0x5];
1148
                outf4 += u_ptr[0][ 5] * dewindow2[0xa];
1149
                outf1 += u_ptr[0][ 6] * dewindow[0x6];
1150
                outf3 += u_ptr[0][ 6] * dewindow2[0x9];
1151
                outf2 += u_ptr[0][ 7] * dewindow[0x7];
1152
                outf4 += u_ptr[0][ 7] * dewindow2[0x8];
1153
                outf1 += u_ptr[0][ 8] * dewindow[0x8];
1154
                outf3 += u_ptr[0][ 8] * dewindow2[0x7];
1155
                outf2 += u_ptr[0][ 9] * dewindow[0x9];
1156
                outf4 += u_ptr[0][ 9] * dewindow2[0x6];
1157
                outf1 += u_ptr[0][10] * dewindow[0xa];
1158
                outf3 += u_ptr[0][10] * dewindow2[0x5];
1159
                outf2 += u_ptr[0][11] * dewindow[0xb];
1160
                outf4 += u_ptr[0][11] * dewindow2[0x4];
1161
                outf1 += u_ptr[0][12] * dewindow[0xc];
1162
                outf3 += u_ptr[0][12] * dewindow2[0x3];
1163
                outf2 += u_ptr[0][13] * dewindow[0xd];
1164
                outf4 += u_ptr[0][13] * dewindow2[0x2];
1165
                outf1 += u_ptr[0][14] * dewindow[0xe];
1166
                outf3 += u_ptr[0][14] * dewindow2[0x1];
1167
                outf2 += u_ptr[0][15] * dewindow[0xf];
1168
                outf4 += u_ptr[0][15] * dewindow2[0x0];
1169

1170
                dewindow += 32;
1171
                dewindow2 += 32;
1172
                u_ptr++;
1173

1174
                out = outf1 + outf2;
1175

1176
                if (out > 32767)
1177
                  samples[j * 2] = 32767;
1178
                else
1179
                  if (out < -32768)
1180
                    samples[j * 2] = -32768;
1181
                  else
1182
                    samples[j * 2] = out;
1183

1184
                out = outf3 - outf4;
1185

1186
                if (out > 32767)
1187
                  samples[64 - (j * 2)] = 32767;
1188
                else
1189
                  if (out < -32768)
1190
                    samples[64 - (j * 2)] = -32768;
1191
                  else
1192
                    samples[64 - (j * 2)] = out;
1193
              }
1194

1195
              {
1196
                float outf2, outf4;
1197

1198
                outf2  = u_ptr[0][ 1] * dewindow[0x1];
1199
                outf4  = u_ptr[0][ 3] * dewindow[0x3];
1200
                outf2 += u_ptr[0][ 5] * dewindow[0x5];
1201
                outf4 += u_ptr[0][ 7] * dewindow[0x7];
1202
                outf2 += u_ptr[0][ 9] * dewindow[0x9];
1203
                outf4 += u_ptr[0][11] * dewindow[0xb];
1204
                outf2 += u_ptr[0][13] * dewindow[0xd];
1205
                outf4 += u_ptr[0][15] * dewindow[0xf];
1206

1207
                out = outf2 + outf4;
1208

1209
                if (out > 32767)
1210
                  samples[16 * 2] = 32767;
1211
                else
1212
                  if (out < -32768)
1213
                    samples[16 * 2] = -32768;
1214
                  else
1215
                    samples[16 * 2] = out;
1216
              }
1217
            }
1218
          }
1219
#elif defined(HAS_AUTOINCREMENT)
1220
	  const float *dewindow = t_dewindow[0] + 15 - start;
1221

1222
	  /* This is tuned specifically for architectures with
1223
             autoincrement and -decrement. */
1224

1225
	  {
1226
	    float *u_ptr = (float*) u[ch][div];
1227

1228
	    u_ptr--;
1229

1230
	    for (j = 0; j < 16; ++j) {
1231
	      float outf1, outf2, outf3, outf4;
1232

1233
	      outf1  = *++u_ptr * *++dewindow;
1234
	      outf2  = *++u_ptr * *++dewindow;
1235
	      outf3  = *++u_ptr * *++dewindow;
1236
	      outf4  = *++u_ptr * *++dewindow;
1237
	      outf1 += *++u_ptr * *++dewindow;
1238
	      outf2 += *++u_ptr * *++dewindow;
1239
	      outf3 += *++u_ptr * *++dewindow;
1240
	      outf4 += *++u_ptr * *++dewindow;
1241
	      outf1 += *++u_ptr * *++dewindow;
1242
	      outf2 += *++u_ptr * *++dewindow;
1243
	      outf3 += *++u_ptr * *++dewindow;
1244
	      outf4 += *++u_ptr * *++dewindow;
1245
	      outf1 += *++u_ptr * *++dewindow;
1246
	      outf2 += *++u_ptr * *++dewindow;
1247
	      outf3 += *++u_ptr * *++dewindow;
1248
	      outf4 += *++u_ptr * *++dewindow;
1249

1250
	      out = outf1 + outf2 + outf3 + outf4;
1251

1252
	      dewindow += 16;
1253

1254
	      PUT_SAMPLE(out)
1255
	    }
1256

1257
	    if (div & 0x1) {
1258
	      {
1259
		float outf2, outf4;
1260

1261
		outf2  = u_ptr[ 1] * dewindow[0x1];
1262
		outf4  = u_ptr[ 3] * dewindow[0x3];
1263
		outf2 += u_ptr[ 5] * dewindow[0x5];
1264
		outf4 += u_ptr[ 7] * dewindow[0x7];
1265
		outf2 += u_ptr[ 9] * dewindow[0x9];
1266
		outf4 += u_ptr[11] * dewindow[0xb];
1267
		outf2 += u_ptr[13] * dewindow[0xd];
1268
		outf4 += u_ptr[15] * dewindow[0xf];
1269

1270
		out = outf2 + outf4;
1271

1272
		PUT_SAMPLE(out)
1273
	      }
1274

1275
	      dewindow -= 31;
1276
	      dewindow += start;
1277
	      dewindow += start;
1278
	      u_ptr -= 16;
1279

1280
	      for (; j < 31; ++j) {
1281
		float outf1, outf2, outf3, outf4;
1282

1283
		outf1  = *++u_ptr * *--dewindow;
1284
		outf2  = *++u_ptr * *--dewindow;
1285
		outf3  = *++u_ptr * *--dewindow;
1286
		outf4  = *++u_ptr * *--dewindow;
1287
		outf1 += *++u_ptr * *--dewindow;
1288
		outf2 += *++u_ptr * *--dewindow;
1289
		outf3 += *++u_ptr * *--dewindow;
1290
		outf4 += *++u_ptr * *--dewindow;
1291
		outf1 += *++u_ptr * *--dewindow;
1292
		outf2 += *++u_ptr * *--dewindow;
1293
		outf3 += *++u_ptr * *--dewindow;
1294
		outf4 += *++u_ptr * *--dewindow;
1295
		outf1 += *++u_ptr * *--dewindow;
1296
		outf2 += *++u_ptr * *--dewindow;
1297
		outf3 += *++u_ptr * *--dewindow;
1298
		outf4 += *++u_ptr * *--dewindow;
1299

1300
		out = outf2 - outf1 + outf4 - outf3;
1301

1302
		dewindow -= 16;
1303
		u_ptr -= 32;
1304

1305
		PUT_SAMPLE(out)
1306
	      }
1307
	    } else {
1308
	      {
1309
		float outf2, outf4;
1310

1311
		outf2  = u_ptr[ 2] * dewindow[ 0x2];
1312
		outf4  = u_ptr[ 4] * dewindow[ 0x4];
1313
		outf2 += u_ptr[ 6] * dewindow[ 0x6];
1314
		outf4 += u_ptr[ 8] * dewindow[ 0x8];
1315
		outf2 += u_ptr[10] * dewindow[ 0xa];
1316
		outf4 += u_ptr[12] * dewindow[ 0xc];
1317
		outf2 += u_ptr[14] * dewindow[ 0xe];
1318
		outf4 += u_ptr[16] * dewindow[0x10];
1319

1320
		out = outf2 + outf4;
1321

1322
		PUT_SAMPLE(out)
1323
	      }
1324

1325
	      dewindow -= 31;
1326
	      dewindow += start;
1327
	      dewindow += start;
1328
	      u_ptr -= 16;
1329

1330
	      for (; j < 31; ++j) {
1331
		float outf1, outf2, outf3, outf4;
1332

1333
		outf1  = *++u_ptr * *--dewindow;
1334
		outf2  = *++u_ptr * *--dewindow;
1335
		outf3  = *++u_ptr * *--dewindow;
1336
		outf4  = *++u_ptr * *--dewindow;
1337
		outf1 += *++u_ptr * *--dewindow;
1338
		outf2 += *++u_ptr * *--dewindow;
1339
		outf3 += *++u_ptr * *--dewindow;
1340
		outf4 += *++u_ptr * *--dewindow;
1341
		outf1 += *++u_ptr * *--dewindow;
1342
		outf2 += *++u_ptr * *--dewindow;
1343
		outf3 += *++u_ptr * *--dewindow;
1344
		outf4 += *++u_ptr * *--dewindow;
1345
		outf1 += *++u_ptr * *--dewindow;
1346
		outf2 += *++u_ptr * *--dewindow;
1347
		outf3 += *++u_ptr * *--dewindow;
1348
		outf4 += *++u_ptr * *--dewindow;
1349

1350
		out = outf1 - outf2 + outf3 - outf4;
1351

1352
		dewindow -= 16;
1353
		u_ptr -= 32;
1354

1355
		PUT_SAMPLE(out)
1356
	      }
1357
	    }
1358
	  }
1359
#else
1360
	  const float *dewindow = t_dewindow[0] + 16 - start;
1361

1362
	  /* These optimisations are tuned specifically for architectures
1363
             without autoincrement and -decrement. */
1364

1365
	  {
1366
	    float (*u_ptr)[16] = u[ch][div];
1367

1368
	    for (j = 0; j < 16; ++j) {
1369
	      float outf1, outf2, outf3, outf4;
1370

1371
	      outf1  = u_ptr[0][ 0] * dewindow[0x0];
1372
	      outf2  = u_ptr[0][ 1] * dewindow[0x1];
1373
	      outf3  = u_ptr[0][ 2] * dewindow[0x2];
1374
	      outf4  = u_ptr[0][ 3] * dewindow[0x3];
1375
	      outf1 += u_ptr[0][ 4] * dewindow[0x4];
1376
	      outf2 += u_ptr[0][ 5] * dewindow[0x5];
1377
	      outf3 += u_ptr[0][ 6] * dewindow[0x6];
1378
	      outf4 += u_ptr[0][ 7] * dewindow[0x7];
1379
	      outf1 += u_ptr[0][ 8] * dewindow[0x8];
1380
	      outf2 += u_ptr[0][ 9] * dewindow[0x9];
1381
	      outf3 += u_ptr[0][10] * dewindow[0xa];
1382
	      outf4 += u_ptr[0][11] * dewindow[0xb];
1383
	      outf1 += u_ptr[0][12] * dewindow[0xc];
1384
	      outf2 += u_ptr[0][13] * dewindow[0xd];
1385
	      outf3 += u_ptr[0][14] * dewindow[0xe];
1386
	      outf4 += u_ptr[0][15] * dewindow[0xf];
1387

1388
	      out = outf1 + outf2 + outf3 + outf4;
1389

1390
	      dewindow += 32;
1391
	      u_ptr++;
1392

1393
	      PUT_SAMPLE(out)
1394
	    }
1395

1396
	    if (div & 0x1) {
1397
	      {
1398
		float outf2, outf4;
1399

1400
		outf2  = u_ptr[0][ 0] * dewindow[0x0];
1401
		outf4  = u_ptr[0][ 2] * dewindow[0x2];
1402
		outf2 += u_ptr[0][ 4] * dewindow[0x4];
1403
		outf4 += u_ptr[0][ 6] * dewindow[0x6];
1404
		outf2 += u_ptr[0][ 8] * dewindow[0x8];
1405
		outf4 += u_ptr[0][10] * dewindow[0xa];
1406
		outf2 += u_ptr[0][12] * dewindow[0xc];
1407
		outf4 += u_ptr[0][14] * dewindow[0xe];
1408

1409
		out = outf2 + outf4;
1410

1411
		PUT_SAMPLE(out)
1412
	      }
1413

1414
	      dewindow -= 48;
1415
	      dewindow += start;
1416
	      dewindow += start;
1417

1418
	      for (; j < 31; ++j) {
1419
		float outf1, outf2, outf3, outf4;
1420

1421
		--u_ptr;
1422

1423
		outf1  = u_ptr[0][ 0] * dewindow[0xf];
1424
		outf2  = u_ptr[0][ 1] * dewindow[0xe];
1425
		outf3  = u_ptr[0][ 2] * dewindow[0xd];
1426
		outf4  = u_ptr[0][ 3] * dewindow[0xc];
1427
		outf1 += u_ptr[0][ 4] * dewindow[0xb];
1428
		outf2 += u_ptr[0][ 5] * dewindow[0xa];
1429
		outf3 += u_ptr[0][ 6] * dewindow[0x9];
1430
		outf4 += u_ptr[0][ 7] * dewindow[0x8];
1431
		outf1 += u_ptr[0][ 8] * dewindow[0x7];
1432
		outf2 += u_ptr[0][ 9] * dewindow[0x6];
1433
		outf3 += u_ptr[0][10] * dewindow[0x5];
1434
		outf4 += u_ptr[0][11] * dewindow[0x4];
1435
		outf1 += u_ptr[0][12] * dewindow[0x3];
1436
		outf2 += u_ptr[0][13] * dewindow[0x2];
1437
		outf3 += u_ptr[0][14] * dewindow[0x1];
1438
		outf4 += u_ptr[0][15] * dewindow[0x0];
1439

1440
		out = -outf1 + outf2 - outf3 + outf4;
1441

1442
		dewindow -= 32;
1443

1444
		PUT_SAMPLE(out)
1445
	      }
1446
	    } else {
1447
	      {
1448
		float outf2, outf4;
1449

1450
		outf2  = u_ptr[0][ 1] * dewindow[0x1];
1451
		outf4  = u_ptr[0][ 3] * dewindow[0x3];
1452
		outf2 += u_ptr[0][ 5] * dewindow[0x5];
1453
		outf4 += u_ptr[0][ 7] * dewindow[0x7];
1454
		outf2 += u_ptr[0][ 9] * dewindow[0x9];
1455
		outf4 += u_ptr[0][11] * dewindow[0xb];
1456
		outf2 += u_ptr[0][13] * dewindow[0xd];
1457
		outf4 += u_ptr[0][15] * dewindow[0xf];
1458

1459
		out = outf2 + outf4;
1460

1461
		PUT_SAMPLE(out)
1462
	      }
1463

1464
	      dewindow -= 48;
1465
	      dewindow += start;
1466
	      dewindow += start;
1467

1468
	      for (; j < 31; ++j) {
1469
		float outf1, outf2, outf3, outf4;
1470

1471
		--u_ptr;
1472

1473
		outf1  = u_ptr[0][ 0] * dewindow[0xf];
1474
		outf2  = u_ptr[0][ 1] * dewindow[0xe];
1475
		outf3  = u_ptr[0][ 2] * dewindow[0xd];
1476
		outf4  = u_ptr[0][ 3] * dewindow[0xc];
1477
		outf1 += u_ptr[0][ 4] * dewindow[0xb];
1478
		outf2 += u_ptr[0][ 5] * dewindow[0xa];
1479
		outf3 += u_ptr[0][ 6] * dewindow[0x9];
1480
		outf4 += u_ptr[0][ 7] * dewindow[0x8];
1481
		outf1 += u_ptr[0][ 8] * dewindow[0x7];
1482
		outf2 += u_ptr[0][ 9] * dewindow[0x6];
1483
		outf3 += u_ptr[0][10] * dewindow[0x5];
1484
		outf4 += u_ptr[0][11] * dewindow[0x4];
1485
		outf1 += u_ptr[0][12] * dewindow[0x3];
1486
		outf2 += u_ptr[0][13] * dewindow[0x2];
1487
		outf3 += u_ptr[0][14] * dewindow[0x1];
1488
		outf4 += u_ptr[0][15] * dewindow[0x0];
1489

1490
		out = outf1 - outf2 + outf3 - outf4;
1491

1492
		dewindow -= 32;
1493

1494
		PUT_SAMPLE(out)
1495
	      }
1496
	    }
1497
	  }
1498
#endif								        
1499
	}
1500
#endif
1501

1502
	--u_start[ch];
1503
	u_start[ch] &= 0xf;
1504
	u_div[ch]=u_div[ch] ? 0 : 1;
1505

1506
#if defined(PENTIUM_RDTSC)
1507
	__asm__(".byte 0x0f,0x31" : "=a" (cnt2), "=d" (cnt4));
1508
			
1509
	if (cnt2-cnt1 < min_cycles) {
1510
	  min_cycles = cnt2-cnt1;
1511
	  /*printf("%d, %d cycles, %d\n", cnt3-cnt1, min_cycles, start);*/
1512
	}
1513
#endif
1514
}
1515

1516
void premultiply()
1517
{
1518
  int i,t;
1519

1520
  for (i = 0; i < 17; ++i)
1521
    for (t = 0; t < 32; ++t)
1522
      t_dewindow[i][t] *= 16383.5f;
1523
}
1524

1525
Product

Resources

Company