Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/libtheora/x86/mmxfdct.c
9898 views
1
/********************************************************************
2
* *
3
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
4
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
5
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
7
* *
8
* THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2006 *
9
* by the Xiph.Org Foundation https://www.xiph.org/ *
10
* *
11
********************************************************************/
12
/*MMX fDCT implementation for x86_32*/
13
/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
14
#include "x86enc.h"
15
#include "x86zigzag.h"
16
17
#if defined(OC_X86_ASM)
18
19
/*Stage 1 of the 8-point forward DCT, applied to four 16-bit lanes at once.
  On entry mm0...mm7 hold t0...t7.
  On exit mm0...mm3 hold the differences t7'=t0-t7, t6'=t1-t6, t5'=t2-t5 and
   t4'=t3-t4, and mm7...mm4 hold the sums t0'=t0+t7, t1'=t1+t6, t2'=t2+t5 and
   t3'=t3+t4.
  Each sum is formed as (a-b)+2*b: the subtrahend is doubled in place and the
   difference is added back, so no scratch register is needed.*/
# define OC_FDCT_STAGE1_8x4 \
20
"#OC_FDCT_STAGE1_8x4\n\t" \
21
/*Stage 1:*/ \
22
/*mm0=t7'=t0-t7*/ \
23
"psubw %%mm7,%%mm0\n\t" \
24
"paddw %%mm7,%%mm7\n\t" \
25
/*mm1=t6'=t1-t6*/ \
26
"psubw %%mm6,%%mm1\n\t" \
27
"paddw %%mm6,%%mm6\n\t" \
28
/*mm2=t5'=t2-t5*/ \
29
"psubw %%mm5,%%mm2\n\t" \
30
"paddw %%mm5,%%mm5\n\t" \
31
/*mm3=t4'=t3-t4*/ \
32
"psubw %%mm4,%%mm3\n\t" \
33
"paddw %%mm4,%%mm4\n\t" \
34
/*mm7=t0'=t0+t7*/ \
35
"paddw %%mm0,%%mm7\n\t" \
36
/*mm6=t1'=t1+t6*/ \
37
"paddw %%mm1,%%mm6\n\t" \
38
/*mm5=t2'=t2+t5*/ \
39
"paddw %%mm2,%%mm5\n\t" \
40
/*mm4=t3'=t3+t4*/ \
41
"paddw %%mm3,%%mm4\n\t" \
42
43
/*Stages 2 through 4 of the 8-point forward DCT on four 16-bit lanes at once.
  _r0..._r7 are string-literal displacements from %[y]; the eight quadwords
   they address are used as spill slots while the transform runs.
  %[a] is used as a scratch register for building the multiplier and rounding
   constants.
  On entry the registers hold the stage 1 outputs: mm0=t7', mm1=t6', mm2=t5',
   mm3=t4', mm4=t3', mm5=t2', mm6=t1', mm7=t0'.
  On exit mm4=_y[0], mm6=_y[2], mm0=_y[4], mm5=_y[5], mm3=_y[6], mm1=_y[7],
   while _y[1] and _y[3] are left in memory at _r1(%[y]) and _r3(%[y]).
  mm2 and mm7 hold no result on exit.*/
# define OC_FDCT8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) \
44
"#OC_FDCT8x4\n\t" \
45
/*Stage 2:*/ \
46
/*mm7=t3''=t0'-t3'*/ \
47
"psubw %%mm4,%%mm7\n\t" \
48
"paddw %%mm4,%%mm4\n\t" \
49
/*mm6=t2''=t1'-t2'*/ \
50
"psubw %%mm5,%%mm6\n\t" \
51
"movq %%mm7,"_r6"(%[y])\n\t" \
52
"paddw %%mm5,%%mm5\n\t" \
53
/*mm1=t5''=t6'-t5'*/ \
54
"psubw %%mm2,%%mm1\n\t" \
55
"movq %%mm6,"_r2"(%[y])\n\t" \
56
/*mm4=t0''=t0'+t3'*/ \
57
"paddw %%mm7,%%mm4\n\t" \
58
"paddw %%mm2,%%mm2\n\t" \
59
/*mm5=t1''=t1'+t2'*/ \
60
"movq %%mm4,"_r0"(%[y])\n\t" \
61
"paddw %%mm6,%%mm5\n\t" \
62
/*mm2=t6''=t6'+t5'*/ \
63
"paddw %%mm1,%%mm2\n\t" \
64
"movq %%mm5,"_r4"(%[y])\n\t" \
65
/*mm0=t7', mm1=t5'', mm2=t6'', mm3=t4'.*/ \
66
/*mm4, mm5, mm6, mm7 are free.*/ \
67
/*Stage 3:*/ \
68
/*mm6={2}x4, mm7={27146,0xB500>>1}x2*/ \
69
"mov $0x5A806A0A,%[a]\n\t" \
70
"pcmpeqb %%mm6,%%mm6\n\t" \
71
"movd %[a],%%mm7\n\t" \
72
"psrlw $15,%%mm6\n\t" \
73
"punpckldq %%mm7,%%mm7\n\t" \
74
"paddw %%mm6,%%mm6\n\t" \
75
/*mm0=0, mm2={-1}x4 \
76
mm5:mm4=t5''*27146+0xB500*/ \
77
"movq %%mm1,%%mm4\n\t" \
78
"movq %%mm1,%%mm5\n\t" \
79
"punpcklwd %%mm6,%%mm4\n\t" \
80
"movq %%mm2,"_r3"(%[y])\n\t" \
81
"pmaddwd %%mm7,%%mm4\n\t" \
82
"movq %%mm0,"_r7"(%[y])\n\t" \
83
"punpckhwd %%mm6,%%mm5\n\t" \
84
"pxor %%mm0,%%mm0\n\t" \
85
"pmaddwd %%mm7,%%mm5\n\t" \
86
"pcmpeqb %%mm2,%%mm2\n\t" \
87
/*mm2=t6'', mm1=t5''+(t5''!=0) \
88
mm4=(t5''*27146+0xB500>>16)*/ \
89
"pcmpeqw %%mm1,%%mm0\n\t" \
90
"psrad $16,%%mm4\n\t" \
91
"psubw %%mm2,%%mm0\n\t" \
92
"movq "_r3"(%[y]),%%mm2\n\t" \
93
"psrad $16,%%mm5\n\t" \
94
"paddw %%mm0,%%mm1\n\t" \
95
"packssdw %%mm5,%%mm4\n\t" \
96
/*mm4=s=(t5''*27146+0xB500>>16)+t5''+(t5''!=0)>>1*/ \
97
"paddw %%mm1,%%mm4\n\t" \
98
"movq "_r7"(%[y]),%%mm0\n\t" \
99
"psraw $1,%%mm4\n\t" \
100
"movq %%mm3,%%mm1\n\t" \
101
/*mm3=t4''=t4'+s*/ \
102
"paddw %%mm4,%%mm3\n\t" \
103
/*mm1=t5'''=t4'-s*/ \
104
"psubw %%mm4,%%mm1\n\t" \
105
/*mm1=0, mm3={-1}x4 \
106
mm5:mm4=t6''*27146+0xB500*/ \
107
"movq %%mm2,%%mm4\n\t" \
108
"movq %%mm2,%%mm5\n\t" \
109
"punpcklwd %%mm6,%%mm4\n\t" \
110
"movq %%mm1,"_r5"(%[y])\n\t" \
111
"pmaddwd %%mm7,%%mm4\n\t" \
112
"movq %%mm3,"_r1"(%[y])\n\t" \
113
"punpckhwd %%mm6,%%mm5\n\t" \
114
"pxor %%mm1,%%mm1\n\t" \
115
"pmaddwd %%mm7,%%mm5\n\t" \
116
"pcmpeqb %%mm3,%%mm3\n\t" \
117
/*mm2=t6''+(t6''!=0), mm4=(t6''*27146+0xB500>>16)*/ \
118
"psrad $16,%%mm4\n\t" \
119
"pcmpeqw %%mm2,%%mm1\n\t" \
120
"psrad $16,%%mm5\n\t" \
121
"psubw %%mm3,%%mm1\n\t" \
122
"packssdw %%mm5,%%mm4\n\t" \
123
"paddw %%mm1,%%mm2\n\t" \
124
/*mm1=t1'' \
125
mm4=s=(t6''*27146+0xB500>>16)+t6''+(t6''!=0)>>1*/ \
126
"paddw %%mm2,%%mm4\n\t" \
127
"movq "_r4"(%[y]),%%mm1\n\t" \
128
"psraw $1,%%mm4\n\t" \
129
"movq %%mm0,%%mm2\n\t" \
130
/*mm7={54491-0x7FFF,0x7FFF}x2 \
131
mm0=t7''=t7'+s*/ \
132
"paddw %%mm4,%%mm0\n\t" \
133
/*mm2=t6'''=t7'-s*/ \
134
"psubw %%mm4,%%mm2\n\t" \
135
/*Stage 4:*/ \
136
/*mm0=0, mm2=t0'' \
137
mm5:mm4=t1''*27146+0xB500*/ \
138
"movq %%mm1,%%mm4\n\t" \
139
"movq %%mm1,%%mm5\n\t" \
140
"punpcklwd %%mm6,%%mm4\n\t" \
141
"movq %%mm2,"_r3"(%[y])\n\t" \
142
"pmaddwd %%mm7,%%mm4\n\t" \
143
"movq "_r0"(%[y]),%%mm2\n\t" \
144
"punpckhwd %%mm6,%%mm5\n\t" \
145
"movq %%mm0,"_r7"(%[y])\n\t" \
146
"pmaddwd %%mm7,%%mm5\n\t" \
147
"pxor %%mm0,%%mm0\n\t" \
148
/*mm7={27146,0x4000>>1}x2 \
149
mm0=s=(t1''*27146+0xB500>>16)+t1''+(t1''!=0)*/ \
150
"psrad $16,%%mm4\n\t" \
151
"mov $0x20006A0A,%[a]\n\t" \
152
"pcmpeqw %%mm1,%%mm0\n\t" \
153
"movd %[a],%%mm7\n\t" \
154
"psrad $16,%%mm5\n\t" \
155
"psubw %%mm3,%%mm0\n\t" \
156
"packssdw %%mm5,%%mm4\n\t" \
157
"paddw %%mm1,%%mm0\n\t" \
158
"punpckldq %%mm7,%%mm7\n\t" \
159
"paddw %%mm4,%%mm0\n\t" \
160
/*mm6={0x00000E3D}x2 \
161
mm1=-(t0''==0), mm5:mm4=t0''*27146+0x4000*/ \
162
"movq %%mm2,%%mm4\n\t" \
163
"movq %%mm2,%%mm5\n\t" \
164
"punpcklwd %%mm6,%%mm4\n\t" \
165
"mov $0x0E3D,%[a]\n\t" \
166
"pmaddwd %%mm7,%%mm4\n\t" \
167
"punpckhwd %%mm6,%%mm5\n\t" \
168
"movd %[a],%%mm6\n\t" \
169
"pmaddwd %%mm7,%%mm5\n\t" \
170
"pxor %%mm1,%%mm1\n\t" \
171
"punpckldq %%mm6,%%mm6\n\t" \
172
"pcmpeqw %%mm2,%%mm1\n\t" \
173
/*mm4=r=(t0''*27146+0x4000>>16)+t0''+(t0''!=0)*/ \
174
"psrad $16,%%mm4\n\t" \
175
"psubw %%mm3,%%mm1\n\t" \
176
"psrad $16,%%mm5\n\t" \
177
"paddw %%mm1,%%mm2\n\t" \
178
"packssdw %%mm5,%%mm4\n\t" \
179
"movq "_r5"(%[y]),%%mm1\n\t" \
180
"paddw %%mm2,%%mm4\n\t" \
181
/*mm2=t6'', mm0=_y[0]=u=r+s>>1 \
182
The naive implementation could cause overflow, so we use \
183
u=(r&s)+((r^s)>>1).*/ \
184
"movq "_r3"(%[y]),%%mm2\n\t" \
185
"movq %%mm0,%%mm7\n\t" \
186
"pxor %%mm4,%%mm0\n\t" \
187
"pand %%mm4,%%mm7\n\t" \
188
"psraw $1,%%mm0\n\t" \
189
"mov $0x7FFF54DC,%[a]\n\t" \
190
"paddw %%mm7,%%mm0\n\t" \
191
"movd %[a],%%mm7\n\t" \
192
/*mm7={54491-0x7FFF,0x7FFF}x2 \
193
mm4=_y[4]=v=r-u*/ \
194
"psubw %%mm0,%%mm4\n\t" \
195
"punpckldq %%mm7,%%mm7\n\t" \
196
"movq %%mm4,"_r4"(%[y])\n\t" \
197
/*mm0=0, mm7={36410}x4 \
198
mm1=(t5'''!=0), mm5:mm4=54491*t5'''+0x0E3D*/ \
199
"movq %%mm1,%%mm4\n\t" \
200
"movq %%mm1,%%mm5\n\t" \
201
"punpcklwd %%mm1,%%mm4\n\t" \
202
"mov $0x8E3A8E3A,%[a]\n\t" \
203
"pmaddwd %%mm7,%%mm4\n\t" \
204
"movq %%mm0,"_r0"(%[y])\n\t" \
205
"punpckhwd %%mm1,%%mm5\n\t" \
206
"pxor %%mm0,%%mm0\n\t" \
207
"pmaddwd %%mm7,%%mm5\n\t" \
208
"pcmpeqw %%mm0,%%mm1\n\t" \
209
"movd %[a],%%mm7\n\t" \
210
"psubw %%mm3,%%mm1\n\t" \
211
"punpckldq %%mm7,%%mm7\n\t" \
212
"paddd %%mm6,%%mm4\n\t" \
213
"paddd %%mm6,%%mm5\n\t" \
214
/*mm0=0 \
215
mm3:mm1=36410*t6'''+((t5'''!=0)<<16)*/ \
216
"movq %%mm2,%%mm6\n\t" \
217
"movq %%mm2,%%mm3\n\t" \
218
"pmulhw %%mm7,%%mm6\n\t" \
219
"paddw %%mm2,%%mm1\n\t" \
220
"pmullw %%mm7,%%mm3\n\t" \
221
"pxor %%mm0,%%mm0\n\t" \
222
"paddw %%mm1,%%mm6\n\t" \
223
"movq %%mm3,%%mm1\n\t" \
224
"punpckhwd %%mm6,%%mm3\n\t" \
225
"punpcklwd %%mm6,%%mm1\n\t" \
226
/*mm3={-1}x4, mm6={1}x4 \
227
mm4=_y[5]=u=(54491*t5'''+36410*t6'''+0x0E3D>>16)+(t5'''!=0)*/ \
228
"paddd %%mm3,%%mm5\n\t" \
229
"paddd %%mm1,%%mm4\n\t" \
230
"psrad $16,%%mm5\n\t" \
231
"pxor %%mm6,%%mm6\n\t" \
232
"psrad $16,%%mm4\n\t" \
233
"pcmpeqb %%mm3,%%mm3\n\t" \
234
"packssdw %%mm5,%%mm4\n\t" \
235
"psubw %%mm3,%%mm6\n\t" \
236
/*mm1=t7'', mm7={26568,0x3400}x2 \
237
mm2=s=t6'''-(36410*u>>16)*/ \
238
"movq %%mm4,%%mm1\n\t" \
239
"mov $0x340067C8,%[a]\n\t" \
240
"pmulhw %%mm7,%%mm4\n\t" \
241
"movd %[a],%%mm7\n\t" \
242
"movq %%mm1,"_r5"(%[y])\n\t" \
243
"punpckldq %%mm7,%%mm7\n\t" \
244
"paddw %%mm1,%%mm4\n\t" \
245
"movq "_r7"(%[y]),%%mm1\n\t" \
246
"psubw %%mm4,%%mm2\n\t" \
247
/*mm6={0x00007B1B}x2 \
248
mm0=(s!=0), mm5:mm4=s*26568+0x3400*/ \
249
"movq %%mm2,%%mm4\n\t" \
250
"movq %%mm2,%%mm5\n\t" \
251
"punpcklwd %%mm6,%%mm4\n\t" \
252
"pcmpeqw %%mm2,%%mm0\n\t" \
253
"pmaddwd %%mm7,%%mm4\n\t" \
254
"mov $0x7B1B,%[a]\n\t" \
255
"punpckhwd %%mm6,%%mm5\n\t" \
256
"movd %[a],%%mm6\n\t" \
257
"pmaddwd %%mm7,%%mm5\n\t" \
258
"psubw %%mm3,%%mm0\n\t" \
259
"punpckldq %%mm6,%%mm6\n\t" \
260
/*mm7={64277-0x7FFF,0x7FFF}x2 \
261
mm2=_y[3]=v=(s*26568+0x3400>>17)+s+(s!=0)*/ \
262
"psrad $17,%%mm4\n\t" \
263
"paddw %%mm0,%%mm2\n\t" \
264
"psrad $17,%%mm5\n\t" \
265
"mov $0x7FFF7B16,%[a]\n\t" \
266
"packssdw %%mm5,%%mm4\n\t" \
267
"movd %[a],%%mm7\n\t" \
268
"paddw %%mm4,%%mm2\n\t" \
269
"punpckldq %%mm7,%%mm7\n\t" \
270
/*mm0=0, mm7={12785}x4 \
271
mm1=(t7''!=0), mm2=t4'', mm5:mm4=64277*t7''+0x7B1B*/ \
272
"movq %%mm1,%%mm4\n\t" \
273
"movq %%mm1,%%mm5\n\t" \
274
"movq %%mm2,"_r3"(%[y])\n\t" \
275
"punpcklwd %%mm1,%%mm4\n\t" \
276
"movq "_r1"(%[y]),%%mm2\n\t" \
277
"pmaddwd %%mm7,%%mm4\n\t" \
278
"mov $0x31F131F1,%[a]\n\t" \
279
"punpckhwd %%mm1,%%mm5\n\t" \
280
"pxor %%mm0,%%mm0\n\t" \
281
"pmaddwd %%mm7,%%mm5\n\t" \
282
"pcmpeqw %%mm0,%%mm1\n\t" \
283
"movd %[a],%%mm7\n\t" \
284
"psubw %%mm3,%%mm1\n\t" \
285
"punpckldq %%mm7,%%mm7\n\t" \
286
"paddd %%mm6,%%mm4\n\t" \
287
"paddd %%mm6,%%mm5\n\t" \
288
/*mm3:mm1=12785*t4''+((t7''!=0)<<16)*/ \
289
"movq %%mm2,%%mm6\n\t" \
290
"movq %%mm2,%%mm3\n\t" \
291
"pmulhw %%mm7,%%mm6\n\t" \
292
"pmullw %%mm7,%%mm3\n\t" \
293
"paddw %%mm1,%%mm6\n\t" \
294
"movq %%mm3,%%mm1\n\t" \
295
"punpckhwd %%mm6,%%mm3\n\t" \
296
"punpcklwd %%mm6,%%mm1\n\t" \
297
/*mm3={-1}x4, mm6={1}x4 \
298
mm4=_y[1]=u=(12785*t4''+64277*t7''+0x7B1B>>16)+(t7''!=0)*/ \
299
"paddd %%mm3,%%mm5\n\t" \
300
"paddd %%mm1,%%mm4\n\t" \
301
"psrad $16,%%mm5\n\t" \
302
"pxor %%mm6,%%mm6\n\t" \
303
"psrad $16,%%mm4\n\t" \
304
"pcmpeqb %%mm3,%%mm3\n\t" \
305
"packssdw %%mm5,%%mm4\n\t" \
306
"psubw %%mm3,%%mm6\n\t" \
307
/*mm1=t3'', mm7={20539,0x3000}x2 \
308
mm4=s=(12785*u>>16)-t4''*/ \
309
"movq %%mm4,"_r1"(%[y])\n\t" \
310
"pmulhw %%mm7,%%mm4\n\t" \
311
"mov $0x3000503B,%[a]\n\t" \
312
"movq "_r6"(%[y]),%%mm1\n\t" \
313
"movd %[a],%%mm7\n\t" \
314
"psubw %%mm2,%%mm4\n\t" \
315
"punpckldq %%mm7,%%mm7\n\t" \
316
/*mm6={0x00006CB7}x2 \
317
mm0=(s!=0), mm5:mm4=s*20539+0x3000*/ \
318
"movq %%mm4,%%mm5\n\t" \
319
"movq %%mm4,%%mm2\n\t" \
320
"punpcklwd %%mm6,%%mm4\n\t" \
321
"pcmpeqw %%mm2,%%mm0\n\t" \
322
"pmaddwd %%mm7,%%mm4\n\t" \
323
"mov $0x6CB7,%[a]\n\t" \
324
"punpckhwd %%mm6,%%mm5\n\t" \
325
"movd %[a],%%mm6\n\t" \
326
"pmaddwd %%mm7,%%mm5\n\t" \
327
"psubw %%mm3,%%mm0\n\t" \
328
"punpckldq %%mm6,%%mm6\n\t" \
329
/*mm7={60547-0x7FFF,0x7FFF}x2 \
330
mm2=_y[7]=v=(s*20539+0x3000>>20)+s+(s!=0)*/ \
331
"psrad $20,%%mm4\n\t" \
332
"paddw %%mm0,%%mm2\n\t" \
333
"psrad $20,%%mm5\n\t" \
334
"mov $0x7FFF6C84,%[a]\n\t" \
335
"packssdw %%mm5,%%mm4\n\t" \
336
"movd %[a],%%mm7\n\t" \
337
"paddw %%mm4,%%mm2\n\t" \
338
"punpckldq %%mm7,%%mm7\n\t" \
339
/*mm0=0, mm7={25080}x4 \
340
mm2=t2'', mm5:mm4=60547*t3''+0x6CB7*/ \
341
"movq %%mm1,%%mm4\n\t" \
342
"movq %%mm1,%%mm5\n\t" \
343
"movq %%mm2,"_r7"(%[y])\n\t" \
344
"punpcklwd %%mm1,%%mm4\n\t" \
345
"movq "_r2"(%[y]),%%mm2\n\t" \
346
"pmaddwd %%mm7,%%mm4\n\t" \
347
"mov $0x61F861F8,%[a]\n\t" \
348
"punpckhwd %%mm1,%%mm5\n\t" \
349
"pxor %%mm0,%%mm0\n\t" \
350
"pmaddwd %%mm7,%%mm5\n\t" \
351
"movd %[a],%%mm7\n\t" \
352
"pcmpeqw %%mm0,%%mm1\n\t" \
353
"psubw %%mm3,%%mm1\n\t" \
354
"punpckldq %%mm7,%%mm7\n\t" \
355
"paddd %%mm6,%%mm4\n\t" \
356
"paddd %%mm6,%%mm5\n\t" \
357
/*mm3:mm1=25080*t2''+((t3''!=0)<<16)*/ \
358
"movq %%mm2,%%mm6\n\t" \
359
"movq %%mm2,%%mm3\n\t" \
360
"pmulhw %%mm7,%%mm6\n\t" \
361
"pmullw %%mm7,%%mm3\n\t" \
362
"paddw %%mm1,%%mm6\n\t" \
363
"movq %%mm3,%%mm1\n\t" \
364
"punpckhwd %%mm6,%%mm3\n\t" \
365
"punpcklwd %%mm6,%%mm1\n\t" \
366
/*mm1={-1}x4 \
367
mm4=u=(25080*t2''+60547*t3''+0x6CB7>>16)+(t3''!=0)*/ \
368
"paddd %%mm3,%%mm5\n\t" \
369
"paddd %%mm1,%%mm4\n\t" \
370
"psrad $16,%%mm5\n\t" \
371
"mov $0x28005460,%[a]\n\t" \
372
"psrad $16,%%mm4\n\t" \
373
"pcmpeqb %%mm1,%%mm1\n\t" \
374
"packssdw %%mm5,%%mm4\n\t" \
375
/*mm5={1}x4, mm6=_y[2]=u, mm7={21600,0x2800}x2 \
376
mm4=s=(25080*u>>16)-t2''*/ \
377
"movq %%mm4,%%mm6\n\t" \
378
"pmulhw %%mm7,%%mm4\n\t" \
379
"pxor %%mm5,%%mm5\n\t" \
380
"movd %[a],%%mm7\n\t" \
381
"psubw %%mm1,%%mm5\n\t" \
382
"punpckldq %%mm7,%%mm7\n\t" \
383
"psubw %%mm2,%%mm4\n\t" \
384
/*mm2=s+(s!=0) \
385
mm4:mm3=s*21600+0x2800*/ \
386
"movq %%mm4,%%mm3\n\t" \
387
"movq %%mm4,%%mm2\n\t" \
388
"punpckhwd %%mm5,%%mm4\n\t" \
389
"pcmpeqw %%mm2,%%mm0\n\t" \
390
"pmaddwd %%mm7,%%mm4\n\t" \
391
"psubw %%mm1,%%mm0\n\t" \
392
"punpcklwd %%mm5,%%mm3\n\t" \
393
"paddw %%mm0,%%mm2\n\t" \
394
"pmaddwd %%mm7,%%mm3\n\t" \
395
/*mm0=_y[4], mm1=_y[7], mm4=_y[0], mm5=_y[5] \
396
mm3=_y[6]=v=(s*21600+0x2800>>18)+s+(s!=0)*/ \
397
"movq "_r4"(%[y]),%%mm0\n\t" \
398
"psrad $18,%%mm4\n\t" \
399
"movq "_r5"(%[y]),%%mm5\n\t" \
400
"psrad $18,%%mm3\n\t" \
401
"movq "_r7"(%[y]),%%mm1\n\t" \
402
"packssdw %%mm4,%%mm3\n\t" \
403
"movq "_r0"(%[y]),%%mm4\n\t" \
404
"paddw %%mm2,%%mm3\n\t" \
405
406
/*Transposes the 8x4 block produced by OC_FDCT8x4 as two 4x4 blocks.
  On input, mm4=_y[0], mm6=_y[2], mm0=_y[4], mm5=_y[5], mm3=_y[6], mm1=_y[7];
   _y[1] and _y[3] are read from memory at _r1(%[y]) and _r3(%[y]).
  On output, {_r4(%[y]),mm1,mm2,mm3} contains the transpose of _y[4...7] (its
   first row is stored to memory rather than kept in a register) and
   {mm4,mm5,mm6,mm7} contains the transpose of _y[0...3].
  Only _r1, _r3 and _r4 are referenced; the other displacement arguments are
   unused by this macro.
  mm0 holds no result on output.*/
# define OC_TRANSPOSE8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) \
410
"#OC_TRANSPOSE8x4\n\t" \
411
/*First 4x4 transpose:*/ \
412
/*mm0 = e3 e2 e1 e0 \
413
mm5 = f3 f2 f1 f0 \
414
mm3 = g3 g2 g1 g0 \
415
mm1 = h3 h2 h1 h0*/ \
416
"movq %%mm0,%%mm2\n\t" \
417
"punpcklwd %%mm5,%%mm0\n\t" \
418
"punpckhwd %%mm5,%%mm2\n\t" \
419
"movq %%mm3,%%mm5\n\t" \
420
"punpcklwd %%mm1,%%mm3\n\t" \
421
"punpckhwd %%mm1,%%mm5\n\t" \
422
/*mm0 = f1 e1 f0 e0 \
423
mm2 = f3 e3 f2 e2 \
424
mm3 = h1 g1 h0 g0 \
425
mm5 = h3 g3 h2 g2*/ \
426
"movq %%mm0,%%mm1\n\t" \
427
"punpckldq %%mm3,%%mm0\n\t" \
428
"movq %%mm0,"_r4"(%[y])\n\t" \
429
"punpckhdq %%mm3,%%mm1\n\t" \
430
"movq "_r1"(%[y]),%%mm0\n\t" \
431
"movq %%mm2,%%mm3\n\t" \
432
"punpckldq %%mm5,%%mm2\n\t" \
433
"punpckhdq %%mm5,%%mm3\n\t" \
434
"movq "_r3"(%[y]),%%mm5\n\t" \
435
/*_y[4] = h0 g0 f0 e0 \
436
mm1 = h1 g1 f1 e1 \
437
mm2 = h2 g2 f2 e2 \
438
mm3 = h3 g3 f3 e3*/ \
439
/*Second 4x4 transpose:*/ \
440
/*mm4 = a3 a2 a1 a0 \
441
mm0 = b3 b2 b1 b0 \
442
mm6 = c3 c2 c1 c0 \
443
mm5 = d3 d2 d1 d0*/ \
444
"movq %%mm4,%%mm7\n\t" \
445
"punpcklwd %%mm0,%%mm4\n\t" \
446
"punpckhwd %%mm0,%%mm7\n\t" \
447
"movq %%mm6,%%mm0\n\t" \
448
"punpcklwd %%mm5,%%mm6\n\t" \
449
"punpckhwd %%mm5,%%mm0\n\t" \
450
/*mm4 = b1 a1 b0 a0 \
451
mm7 = b3 a3 b2 a2 \
452
mm6 = d1 c1 d0 c0 \
453
mm0 = d3 c3 d2 c2*/ \
454
"movq %%mm4,%%mm5\n\t" \
455
"punpckldq %%mm6,%%mm4\n\t" \
456
"punpckhdq %%mm6,%%mm5\n\t" \
457
"movq %%mm7,%%mm6\n\t" \
458
"punpckhdq %%mm0,%%mm7\n\t" \
459
"punpckldq %%mm0,%%mm6\n\t" \
460
/*mm4 = d0 c0 b0 a0 \
461
mm5 = d1 c1 b1 a1 \
462
mm6 = d2 c2 b2 a2 \
463
mm7 = d3 c3 b3 a3*/ \
464
465
/*MMX implementation of the fDCT.
  _y: The output buffer (64 values).
      It is also used as working storage between the two 1-D passes.
  _x: The input block (64 values), read 8 bytes at a time at a stride of 16
       bytes.
  The input is scaled up by 2 bits and biased, transformed and transposed in
   two 8x4 halves, transformed a second time, rounded back down with
   (v+2)>>2 into the local buffer buf, and finally handed to
   OC_TRANSPOSE_ZIG_ZAG_MMXEXT, which reads buf through the
   OC_ZZ_LOAD_ROW_LO/OC_ZZ_LOAD_ROW_HI macros defined just before it.
  OC_TRANSPOSE_ZIG_ZAG_MMXEXT is not defined in this file; presumably it comes
   from x86zigzag.h and performs the final stores to _y (confirm there).
  NOTE(review): no emms is issued and no MMX registers are listed as clobbers
   in the code visible here; presumably the caller is responsible for
   restoring the FPU state (confirm against the call site).*/
466
void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
467
OC_ALIGN8(ogg_int16_t buf[64]);
468
ptrdiff_t a;
469
__asm__ __volatile__(
470
/*Add two extra bits of working precision to improve accuracy; any more and
471
we could overflow.*/
472
/*We also add biases to correct for some systematic error that remains in
473
the full fDCT->iDCT round trip.*/
/*The net effect of the mm4/mm5/mm6 shuffling below is that (_x[0]!=0)+1 is
   added to the first scaled value of row 0, 1 is added to the second scaled
   value of row 0, and 1 is subtracted from the first scaled value of row 1.*/
474
"movq 0x00(%[x]),%%mm0\n\t"
475
"movq 0x10(%[x]),%%mm1\n\t"
476
"movq 0x20(%[x]),%%mm2\n\t"
477
"movq 0x30(%[x]),%%mm3\n\t"
478
"pcmpeqb %%mm4,%%mm4\n\t"
479
"pxor %%mm7,%%mm7\n\t"
480
"movq %%mm0,%%mm5\n\t"
481
"psllw $2,%%mm0\n\t"
482
"pcmpeqw %%mm7,%%mm5\n\t"
483
"movq 0x70(%[x]),%%mm7\n\t"
484
"psllw $2,%%mm1\n\t"
485
"psubw %%mm4,%%mm5\n\t"
486
"psllw $2,%%mm2\n\t"
487
"mov $1,%[a]\n\t"
488
"pslld $16,%%mm5\n\t"
489
"movd %[a],%%mm6\n\t"
490
"psllq $16,%%mm5\n\t"
491
"mov $0x10001,%[a]\n\t"
492
"psllw $2,%%mm3\n\t"
493
"movd %[a],%%mm4\n\t"
494
"punpckhwd %%mm6,%%mm5\n\t"
495
"psubw %%mm6,%%mm1\n\t"
496
"movq 0x60(%[x]),%%mm6\n\t"
497
"paddw %%mm5,%%mm0\n\t"
498
"movq 0x50(%[x]),%%mm5\n\t"
499
"paddw %%mm4,%%mm0\n\t"
500
"movq 0x40(%[x]),%%mm4\n\t"
501
/*We inline stage1 of the transform here so we can get better instruction
502
scheduling with the shifts.*/
503
/*mm0=t7'=t0-t7*/
504
"psllw $2,%%mm7\n\t"
505
"psubw %%mm7,%%mm0\n\t"
506
"psllw $2,%%mm6\n\t"
507
"paddw %%mm7,%%mm7\n\t"
508
/*mm1=t6'=t1-t6*/
509
"psllw $2,%%mm5\n\t"
510
"psubw %%mm6,%%mm1\n\t"
511
"psllw $2,%%mm4\n\t"
512
"paddw %%mm6,%%mm6\n\t"
513
/*mm2=t5'=t2-t5*/
514
"psubw %%mm5,%%mm2\n\t"
515
"paddw %%mm5,%%mm5\n\t"
516
/*mm3=t4'=t3-t4*/
517
"psubw %%mm4,%%mm3\n\t"
518
"paddw %%mm4,%%mm4\n\t"
519
/*mm7=t0'=t0+t7*/
520
"paddw %%mm0,%%mm7\n\t"
521
/*mm6=t1'=t1+t6*/
522
"paddw %%mm1,%%mm6\n\t"
523
/*mm5=t2'=t2+t5*/
524
"paddw %%mm2,%%mm5\n\t"
525
/*mm4=t3'=t3+t4*/
526
"paddw %%mm3,%%mm4\n\t"
527
OC_FDCT8x4("0x00","0x10","0x20","0x30","0x40","0x50","0x60","0x70")
528
OC_TRANSPOSE8x4("0x00","0x10","0x20","0x30","0x40","0x50","0x60","0x70")
529
/*Swap out this 8x4 block for the next one.*/
530
"movq 0x08(%[x]),%%mm0\n\t"
531
"movq %%mm7,0x30(%[y])\n\t"
532
"movq 0x78(%[x]),%%mm7\n\t"
533
"movq %%mm1,0x50(%[y])\n\t"
534
"movq 0x18(%[x]),%%mm1\n\t"
535
"movq %%mm6,0x20(%[y])\n\t"
536
"movq 0x68(%[x]),%%mm6\n\t"
537
"movq %%mm2,0x60(%[y])\n\t"
538
"movq 0x28(%[x]),%%mm2\n\t"
539
"movq %%mm5,0x10(%[y])\n\t"
540
"movq 0x58(%[x]),%%mm5\n\t"
541
"movq %%mm3,0x70(%[y])\n\t"
542
"movq 0x38(%[x]),%%mm3\n\t"
543
/*And increase its working precision, too.*/
544
"psllw $2,%%mm0\n\t"
545
"movq %%mm4,0x00(%[y])\n\t"
546
"psllw $2,%%mm7\n\t"
547
"movq 0x48(%[x]),%%mm4\n\t"
548
/*We inline stage1 of the transform here so we can get better instruction
549
scheduling with the shifts.*/
550
/*mm0=t7'=t0-t7*/
551
"psubw %%mm7,%%mm0\n\t"
552
"psllw $2,%%mm1\n\t"
553
"paddw %%mm7,%%mm7\n\t"
554
"psllw $2,%%mm6\n\t"
555
/*mm1=t6'=t1-t6*/
556
"psubw %%mm6,%%mm1\n\t"
557
"psllw $2,%%mm2\n\t"
558
"paddw %%mm6,%%mm6\n\t"
559
"psllw $2,%%mm5\n\t"
560
/*mm2=t5'=t2-t5*/
561
"psubw %%mm5,%%mm2\n\t"
562
"psllw $2,%%mm3\n\t"
563
"paddw %%mm5,%%mm5\n\t"
564
"psllw $2,%%mm4\n\t"
565
/*mm3=t4'=t3-t4*/
566
"psubw %%mm4,%%mm3\n\t"
567
"paddw %%mm4,%%mm4\n\t"
568
/*mm7=t0'=t0+t7*/
569
"paddw %%mm0,%%mm7\n\t"
570
/*mm6=t1'=t1+t6*/
571
"paddw %%mm1,%%mm6\n\t"
572
/*mm5=t2'=t2+t5*/
573
"paddw %%mm2,%%mm5\n\t"
574
/*mm4=t3'=t3+t4*/
575
"paddw %%mm3,%%mm4\n\t"
576
OC_FDCT8x4("0x08","0x18","0x28","0x38","0x48","0x58","0x68","0x78")
577
OC_TRANSPOSE8x4("0x08","0x18","0x28","0x38","0x48","0x58","0x68","0x78")
578
/*Here the first 4x4 block of output from the last transpose is the second
579
4x4 block of input for the next transform.
580
We have cleverly arranged that it already be in the appropriate place,
581
so we only have to do half the stores and loads.*/
582
"movq 0x00(%[y]),%%mm0\n\t"
583
"movq %%mm1,0x58(%[y])\n\t"
584
"movq 0x10(%[y]),%%mm1\n\t"
585
"movq %%mm2,0x68(%[y])\n\t"
586
"movq 0x20(%[y]),%%mm2\n\t"
587
"movq %%mm3,0x78(%[y])\n\t"
588
"movq 0x30(%[y]),%%mm3\n\t"
589
OC_FDCT_STAGE1_8x4
590
OC_FDCT8x4("0x00","0x10","0x20","0x30","0x08","0x18","0x28","0x38")
591
/*mm2={-2}x4*/
592
"pcmpeqw %%mm2,%%mm2\n\t"
593
"paddw %%mm2,%%mm2\n\t"
594
/*Round and store the results (no transpose).
  Subtracting mm2={-2}x4 adds the rounding offset 2 ahead of each >>2; the two
   results OC_FDCT8x4 left in memory are reloaded into mm7 and mm4 first.*/
595
"movq 0x10(%[y]),%%mm7\n\t"
596
"psubw %%mm2,%%mm4\n\t"
597
"psubw %%mm2,%%mm6\n\t"
598
"psraw $2,%%mm4\n\t"
599
"psubw %%mm2,%%mm0\n\t"
600
"movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t"
601
"movq 0x30(%[y]),%%mm4\n\t"
602
"psraw $2,%%mm6\n\t"
603
"psubw %%mm2,%%mm5\n\t"
604
"movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t"
605
"psraw $2,%%mm0\n\t"
606
"psubw %%mm2,%%mm3\n\t"
607
"movq %%mm0,"OC_MEM_OFFS(0x40,buf)"\n\t"
608
"psraw $2,%%mm5\n\t"
609
"psubw %%mm2,%%mm1\n\t"
610
"movq %%mm5,"OC_MEM_OFFS(0x50,buf)"\n\t"
611
"psraw $2,%%mm3\n\t"
612
"psubw %%mm2,%%mm7\n\t"
613
"movq %%mm3,"OC_MEM_OFFS(0x60,buf)"\n\t"
614
"psraw $2,%%mm1\n\t"
615
"psubw %%mm2,%%mm4\n\t"
616
"movq %%mm1,"OC_MEM_OFFS(0x70,buf)"\n\t"
617
"psraw $2,%%mm7\n\t"
618
"movq %%mm7,"OC_MEM_OFFS(0x10,buf)"\n\t"
619
"psraw $2,%%mm4\n\t"
620
"movq %%mm4,"OC_MEM_OFFS(0x30,buf)"\n\t"
621
/*Load the next block.*/
622
"movq 0x40(%[y]),%%mm0\n\t"
623
"movq 0x78(%[y]),%%mm7\n\t"
624
"movq 0x50(%[y]),%%mm1\n\t"
625
"movq 0x68(%[y]),%%mm6\n\t"
626
"movq 0x60(%[y]),%%mm2\n\t"
627
"movq 0x58(%[y]),%%mm5\n\t"
628
"movq 0x70(%[y]),%%mm3\n\t"
629
"movq 0x48(%[y]),%%mm4\n\t"
630
OC_FDCT_STAGE1_8x4
631
OC_FDCT8x4("0x40","0x50","0x60","0x70","0x48","0x58","0x68","0x78")
632
/*mm2={-2}x4*/
633
"pcmpeqw %%mm2,%%mm2\n\t"
634
"paddw %%mm2,%%mm2\n\t"
635
/*Round and store the results (no transpose).
  Same (v+2)>>2 rounding as for the first block, written to the upper 8 bytes
   of each 16-byte row of buf.*/
636
"movq 0x50(%[y]),%%mm7\n\t"
637
"psubw %%mm2,%%mm4\n\t"
638
"psubw %%mm2,%%mm6\n\t"
639
"psraw $2,%%mm4\n\t"
640
"psubw %%mm2,%%mm0\n\t"
641
"movq %%mm4,"OC_MEM_OFFS(0x08,buf)"\n\t"
642
"movq 0x70(%[y]),%%mm4\n\t"
643
"psraw $2,%%mm6\n\t"
644
"psubw %%mm2,%%mm5\n\t"
645
"movq %%mm6,"OC_MEM_OFFS(0x28,buf)"\n\t"
646
"psraw $2,%%mm0\n\t"
647
"psubw %%mm2,%%mm3\n\t"
648
"movq %%mm0,"OC_MEM_OFFS(0x48,buf)"\n\t"
649
"psraw $2,%%mm5\n\t"
650
"psubw %%mm2,%%mm1\n\t"
651
"movq %%mm5,"OC_MEM_OFFS(0x58,buf)"\n\t"
652
"psraw $2,%%mm3\n\t"
653
"psubw %%mm2,%%mm7\n\t"
654
"movq %%mm3,"OC_MEM_OFFS(0x68,buf)"\n\t"
655
"psraw $2,%%mm1\n\t"
656
"psubw %%mm2,%%mm4\n\t"
657
"movq %%mm1,"OC_MEM_OFFS(0x78,buf)"\n\t"
658
"psraw $2,%%mm7\n\t"
659
"movq %%mm7,"OC_MEM_OFFS(0x18,buf)"\n\t"
660
"psraw $2,%%mm4\n\t"
661
"movq %%mm4,"OC_MEM_OFFS(0x38,buf)"\n\t"
662
/*Final transpose and zig-zag.
  The two loader macros below tell OC_TRANSPOSE_ZIG_ZAG_MMXEXT how to fetch the
   low and high 8 bytes of a 16-byte row of buf; they are undefined again
   right after use.*/
663
#define OC_ZZ_LOAD_ROW_LO(_row,_reg) \
664
"movq "OC_MEM_OFFS(16*_row,buf)","_reg"\n\t" \
665
666
#define OC_ZZ_LOAD_ROW_HI(_row,_reg) \
667
"movq "OC_MEM_OFFS(16*_row+8,buf)","_reg"\n\t" \
668
669
OC_TRANSPOSE_ZIG_ZAG_MMXEXT
670
#undef OC_ZZ_LOAD_ROW_LO
671
#undef OC_ZZ_LOAD_ROW_HI
672
/*Outputs: a is an early-clobbered scratch register; buf is a memory output.*/
:[a]"=&r"(a),[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))
673
/*Inputs: the two block pointers, each in a general-purpose register.*/
:[y]"r"(_y),[x]"r"(_x)
674
/*_y and _x are accessed through pointers the compiler cannot see into.*/
:"memory"
675
);
676
}
677
678
#endif
679
680