/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation https://www.xiph.org/                 *
 *                                                                  *
 ********************************************************************

  function:

 ********************************************************************/
#include <stddef.h>
#include "x86enc.h"

#if defined(OC_X86_ASM)

unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  ptrdiff_t ystride3;
  ptrdiff_t ret;
  __asm__ __volatile__(
    /*Load the first 4 rows of each block.*/
    "movq (%[src]),%%mm0\n\t"
    "movq (%[ref]),%%mm1\n\t"
    "movq (%[src],%[ystride]),%%mm2\n\t"
    "movq (%[ref],%[ystride]),%%mm3\n\t"
    "lea (%[ystride],%[ystride],2),%[ystride3]\n\t"
    "movq (%[src],%[ystride],2),%%mm4\n\t"
    "movq (%[ref],%[ystride],2),%%mm5\n\t"
    "movq (%[src],%[ystride3]),%%mm6\n\t"
    "movq (%[ref],%[ystride3]),%%mm7\n\t"
    /*Compute their SADs and add them in %%mm0.*/
    "psadbw %%mm1,%%mm0\n\t"
    "psadbw %%mm3,%%mm2\n\t"
    "lea (%[src],%[ystride],4),%[src]\n\t"
    "paddw %%mm2,%%mm0\n\t"
    "lea (%[ref],%[ystride],4),%[ref]\n\t"
    /*Load the next 3 rows as registers become available.*/
    "movq (%[src]),%%mm2\n\t"
    "movq (%[ref]),%%mm3\n\t"
    "psadbw %%mm5,%%mm4\n\t"
    "psadbw %%mm7,%%mm6\n\t"
    "paddw %%mm4,%%mm0\n\t"
    "movq (%[ref],%[ystride]),%%mm5\n\t"
    "movq (%[src],%[ystride]),%%mm4\n\t"
    "paddw %%mm6,%%mm0\n\t"
    "movq (%[ref],%[ystride],2),%%mm7\n\t"
    "movq (%[src],%[ystride],2),%%mm6\n\t"
    /*Start adding their SADs to %%mm0.*/
    "psadbw %%mm3,%%mm2\n\t"
    "psadbw %%mm5,%%mm4\n\t"
    "paddw %%mm2,%%mm0\n\t"
    "psadbw %%mm7,%%mm6\n\t"
    /*Load the last row as registers become available.*/
    "movq (%[src],%[ystride3]),%%mm2\n\t"
    "movq (%[ref],%[ystride3]),%%mm3\n\t"
    /*And finish adding up their SADs.*/
    "paddw %%mm4,%%mm0\n\t"
    "psadbw %%mm3,%%mm2\n\t"
    "paddw %%mm6,%%mm0\n\t"
    "paddw %%mm2,%%mm0\n\t"
    "movd %%mm0,%[ret]\n\t"
    :[ret]"=a"(ret),[src]"+r"(_src),[ref]"+r"(_ref),[ystride3]"=&r"(ystride3)
    :[ystride]"r"((ptrdiff_t)_ystride)
  );
  return (unsigned)ret;
}
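
/*A plain-C reference for the routine above (an illustrative sketch, not part
   of the original source): each psadbw sums the absolute differences of 8
   unsigned bytes, and the asm accumulates all 8 rows into %%mm0, so the whole
   block computes an ordinary 8x8 sum of absolute differences.*/
static unsigned oc_enc_frag_sad_ref_sketch(const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  unsigned sad;
  int      i;
  int      j;
  sad=0;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      int d;
      d=_src[j]-_ref[j];
      sad+=d<0?-d:d;
    }
    _src+=_ystride;
    _ref+=_ystride;
  }
  return sad;
}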

unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
 const unsigned char *_ref,int _ystride,unsigned _thresh){
  /*Early termination is for suckers.*/
  return oc_enc_frag_sad_mmxext(_src,_ref,_ystride);
}
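
/*What the ignored _thresh parameter could be used for (a hypothetical sketch,
   not part of the original source): callers only need the exact SAD when it
   is below _thresh, so a scalar version may stop as soon as the partial sum
   exceeds it.*/
static unsigned oc_enc_frag_sad_thresh_sketch(const unsigned char *_src,
 const unsigned char *_ref,int _ystride,unsigned _thresh){
  unsigned sad;
  int      i;
  int      j;
  sad=0;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      int d;
      d=_src[j]-_ref[j];
      sad+=d<0?-d:d;
    }
    /*Early out: once we're over _thresh, the exact value no longer matters.*/
    if(sad>_thresh)break;
    _src+=_ystride;
    _ref+=_ystride;
  }
  return sad;
}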

/*Assumes the first two rows of %[ref1] and %[ref2] are in %%mm0...%%mm3, the
   first two rows of %[src] are in %%mm4,%%mm5, and {1}x8 is in %%mm7.
  We pre-load the next two rows of data as registers become available.*/
#define OC_SAD2_LOOP \
 "#OC_SAD2_LOOP\n\t" \
 /*We want to compute ((%%mm0+%%mm1)>>1) on unsigned bytes without overflow, \
    but pavgb computes ((%%mm0+%%mm1+1)>>1). \
   The latter is exactly 1 too large when the low bit of two corresponding \
    bytes is only set in one of them. \
   Therefore we pxor the operands, pand to mask out the low bits, and psubb to \
    correct the output of pavgb. \
   TODO: This should be rewritten to compute ~pavgb(~a,~b) instead, which \
    schedules better; currently, however, this function is unused.*/ \
 "movq %%mm0,%%mm6\n\t" \
 "lea (%[ref1],%[ystride],2),%[ref1]\n\t" \
 "pxor %%mm1,%%mm0\n\t" \
 "pavgb %%mm1,%%mm6\n\t" \
 "lea (%[ref2],%[ystride],2),%[ref2]\n\t" \
 "movq %%mm2,%%mm1\n\t" \
 "pand %%mm7,%%mm0\n\t" \
 "pavgb %%mm3,%%mm2\n\t" \
 "pxor %%mm3,%%mm1\n\t" \
 "movq (%[ref2],%[ystride]),%%mm3\n\t" \
 "psubb %%mm0,%%mm6\n\t" \
 "movq (%[ref1]),%%mm0\n\t" \
 "pand %%mm7,%%mm1\n\t" \
 "psadbw %%mm6,%%mm4\n\t" \
 "movd %[ret],%%mm6\n\t" \
 "psubb %%mm1,%%mm2\n\t" \
 "movq (%[ref2]),%%mm1\n\t" \
 "lea (%[src],%[ystride],2),%[src]\n\t" \
 "psadbw %%mm2,%%mm5\n\t" \
 "movq (%[ref1],%[ystride]),%%mm2\n\t" \
 "paddw %%mm4,%%mm5\n\t" \
 "movq (%[src]),%%mm4\n\t" \
 "paddw %%mm5,%%mm6\n\t" \
 "movq (%[src],%[ystride]),%%mm5\n\t" \
 "movd %%mm6,%[ret]\n\t"

/*Same as above, but does not pre-load the next two rows.*/
#define OC_SAD2_TAIL \
 "#OC_SAD2_TAIL\n\t" \
 "movq %%mm0,%%mm6\n\t" \
 "pavgb %%mm1,%%mm0\n\t" \
 "pxor %%mm1,%%mm6\n\t" \
 "movq %%mm2,%%mm1\n\t" \
 "pand %%mm7,%%mm6\n\t" \
 "pavgb %%mm3,%%mm2\n\t" \
 "pxor %%mm3,%%mm1\n\t" \
 "psubb %%mm6,%%mm0\n\t" \
 "pand %%mm7,%%mm1\n\t" \
 "psadbw %%mm0,%%mm4\n\t" \
 "psubb %%mm1,%%mm2\n\t" \
 "movd %[ret],%%mm6\n\t" \
 "psadbw %%mm2,%%mm5\n\t" \
 "paddw %%mm4,%%mm5\n\t" \
 "paddw %%mm5,%%mm6\n\t" \
 "movd %%mm6,%[ret]\n\t"
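
/*Scalar illustration (not from the original source) of the rounding
   correction used in OC_SAD2_LOOP and OC_SAD2_TAIL.
  pavgb computes ((a+b+1)>>1), which is 1 too large exactly when a and b have
   different low bits; subtracting ((a^b)&1) yields the truncating average
   ((a+b)>>1).*/
static unsigned char oc_avg_no_round_sketch(unsigned char _a,unsigned char _b){
  unsigned avg_round;
  /*This is what pavgb computes in each byte lane...*/
  avg_round=(unsigned)(_a+_b+1)>>1;
  /*...and this is the pxor/pand/psubb correction.*/
  return (unsigned char)(avg_round-((_a^_b)&1));
}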

unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
 const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
 unsigned _thresh){
  ptrdiff_t ret;
  __asm__ __volatile__(
    "movq (%[ref1]),%%mm0\n\t"
    "movq (%[ref2]),%%mm1\n\t"
    "movq (%[ref1],%[ystride]),%%mm2\n\t"
    "movq (%[ref2],%[ystride]),%%mm3\n\t"
    "xor %[ret],%[ret]\n\t"
    "movq (%[src]),%%mm4\n\t"
    "pxor %%mm7,%%mm7\n\t"
    "pcmpeqb %%mm6,%%mm6\n\t"
    "movq (%[src],%[ystride]),%%mm5\n\t"
    "psubb %%mm6,%%mm7\n\t"
    OC_SAD2_LOOP
    OC_SAD2_LOOP
    OC_SAD2_LOOP
    OC_SAD2_TAIL
    :[ret]"=&a"(ret),[src]"+r"(_src),[ref1]"+r"(_ref1),[ref2]"+r"(_ref2)
    :[ystride]"r"((ptrdiff_t)_ystride)
  );
  return (unsigned)ret;
}
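
/*Reference sketch (illustrative, not part of the original source) of what the
   routine above computes: the SAD of the block against the truncating average
   of two reference blocks, matching the corrected-pavgb arithmetic in
   OC_SAD2_LOOP/OC_SAD2_TAIL.*/
static unsigned oc_enc_frag_sad2_ref_sketch(const unsigned char *_src,
 const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
  unsigned sad;
  int      i;
  int      j;
  sad=0;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      int d;
      d=_src[j]-((_ref1[j]+_ref2[j])>>1);
      sad+=d<0?-d:d;
    }
    _src+=_ystride;
    _ref1+=_ystride;
    _ref2+=_ystride;
  }
  return sad;
}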

/*Load an 8x4 array of pixel values from %[src] and %[ref] and compute their
   16-bit difference in %%mm0...%%mm7.*/
#define OC_LOAD_SUB_8x4(_off) \
 "#OC_LOAD_SUB_8x4\n\t" \
 "movd "#_off"(%[src]),%%mm0\n\t" \
 "movd "#_off"(%[ref]),%%mm4\n\t" \
 "movd "#_off"(%[src],%[src_ystride]),%%mm1\n\t" \
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
 "movd "#_off"(%[ref],%[ref_ystride]),%%mm5\n\t" \
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
 "movd "#_off"(%[src]),%%mm2\n\t" \
 "movd "#_off"(%[ref]),%%mm7\n\t" \
 "movd "#_off"(%[src],%[src_ystride]),%%mm3\n\t" \
 "movd "#_off"(%[ref],%[ref_ystride]),%%mm6\n\t" \
 "punpcklbw %%mm4,%%mm0\n\t" \
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
 "punpcklbw %%mm4,%%mm4\n\t" \
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
 "psubw %%mm4,%%mm0\n\t" \
 "movd "#_off"(%[src]),%%mm4\n\t" \
 "movq %%mm0,"OC_MEM_OFFS(_off*2,buf)"\n\t" \
 "movd "#_off"(%[ref]),%%mm0\n\t" \
 "punpcklbw %%mm5,%%mm1\n\t" \
 "punpcklbw %%mm5,%%mm5\n\t" \
 "psubw %%mm5,%%mm1\n\t" \
 "movd "#_off"(%[src],%[src_ystride]),%%mm5\n\t" \
 "punpcklbw %%mm7,%%mm2\n\t" \
 "punpcklbw %%mm7,%%mm7\n\t" \
 "psubw %%mm7,%%mm2\n\t" \
 "movd "#_off"(%[ref],%[ref_ystride]),%%mm7\n\t" \
 "punpcklbw %%mm6,%%mm3\n\t" \
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
 "punpcklbw %%mm6,%%mm6\n\t" \
 "psubw %%mm6,%%mm3\n\t" \
 "movd "#_off"(%[src]),%%mm6\n\t" \
 "punpcklbw %%mm0,%%mm4\n\t" \
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
 "punpcklbw %%mm0,%%mm0\n\t" \
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
 "psubw %%mm0,%%mm4\n\t" \
 "movd "#_off"(%[ref]),%%mm0\n\t" \
 "punpcklbw %%mm7,%%mm5\n\t" \
 "neg %[src_ystride]\n\t" \
 "punpcklbw %%mm7,%%mm7\n\t" \
 "psubw %%mm7,%%mm5\n\t" \
 "movd "#_off"(%[src],%[src_ystride]),%%mm7\n\t" \
 "punpcklbw %%mm0,%%mm6\n\t" \
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
 "punpcklbw %%mm0,%%mm0\n\t" \
 "neg %[ref_ystride]\n\t" \
 "psubw %%mm0,%%mm6\n\t" \
 "movd "#_off"(%[ref],%[ref_ystride]),%%mm0\n\t" \
 "lea (%[src],%[src_ystride],8),%[src]\n\t" \
 "punpcklbw %%mm0,%%mm7\n\t" \
 "neg %[src_ystride]\n\t" \
 "punpcklbw %%mm0,%%mm0\n\t" \
 "lea (%[ref],%[ref_ystride],8),%[ref]\n\t" \
 "psubw %%mm0,%%mm7\n\t" \
 "neg %[ref_ystride]\n\t" \
 "movq "OC_MEM_OFFS(_off*2,buf)",%%mm0\n\t"

/*Load an 8x4 array of pixel values from %[src] into %%mm0...%%mm7.*/
#define OC_LOAD_8x4(_off) \
 "#OC_LOAD_8x4\n\t" \
 "movd "#_off"(%[src]),%%mm0\n\t" \
 "movd "#_off"(%[src],%[ystride]),%%mm1\n\t" \
 "movd "#_off"(%[src],%[ystride],2),%%mm2\n\t" \
 "pxor %%mm7,%%mm7\n\t" \
 "movd "#_off"(%[src],%[ystride3]),%%mm3\n\t" \
 "punpcklbw %%mm7,%%mm0\n\t" \
 "movd "#_off"(%[src4]),%%mm4\n\t" \
 "punpcklbw %%mm7,%%mm1\n\t" \
 "movd "#_off"(%[src4],%[ystride]),%%mm5\n\t" \
 "punpcklbw %%mm7,%%mm2\n\t" \
 "movd "#_off"(%[src4],%[ystride],2),%%mm6\n\t" \
 "punpcklbw %%mm7,%%mm3\n\t" \
 "movd "#_off"(%[src4],%[ystride3]),%%mm7\n\t" \
 "punpcklbw %%mm4,%%mm4\n\t" \
 "punpcklbw %%mm5,%%mm5\n\t" \
 "psrlw $8,%%mm4\n\t" \
 "psrlw $8,%%mm5\n\t" \
 "punpcklbw %%mm6,%%mm6\n\t" \
 "punpcklbw %%mm7,%%mm7\n\t" \
 "psrlw $8,%%mm6\n\t" \
 "psrlw $8,%%mm7\n\t"

/*Performs the first two stages of an 8-point 1-D Hadamard transform.
  The transform is performed in place, except that outputs 0-3 are swapped with
   outputs 4-7.
  Outputs 2, 3, 6, and 7 from the second stage are negated (which allows us to
   perform this stage in place with no temporary registers).*/
#define OC_HADAMARD_AB_8x4 \
 "#OC_HADAMARD_AB_8x4\n\t" \
 /*Stage A: \
   Outputs 0-3 are swapped with 4-7 here.*/ \
 "paddw %%mm1,%%mm5\n\t" \
 "paddw %%mm2,%%mm6\n\t" \
 "paddw %%mm1,%%mm1\n\t" \
 "paddw %%mm2,%%mm2\n\t" \
 "psubw %%mm5,%%mm1\n\t" \
 "psubw %%mm6,%%mm2\n\t" \
 "paddw %%mm3,%%mm7\n\t" \
 "paddw %%mm0,%%mm4\n\t" \
 "paddw %%mm3,%%mm3\n\t" \
 "paddw %%mm0,%%mm0\n\t" \
 "psubw %%mm7,%%mm3\n\t" \
 "psubw %%mm4,%%mm0\n\t" \
 /*Stage B:*/ \
 "paddw %%mm2,%%mm0\n\t" \
 "paddw %%mm3,%%mm1\n\t" \
 "paddw %%mm6,%%mm4\n\t" \
 "paddw %%mm7,%%mm5\n\t" \
 "paddw %%mm2,%%mm2\n\t" \
 "paddw %%mm3,%%mm3\n\t" \
 "paddw %%mm6,%%mm6\n\t" \
 "paddw %%mm7,%%mm7\n\t" \
 "psubw %%mm0,%%mm2\n\t" \
 "psubw %%mm1,%%mm3\n\t" \
 "psubw %%mm4,%%mm6\n\t" \
 "psubw %%mm5,%%mm7\n\t"

/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
  Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
   place with no temporary registers).*/
#define OC_HADAMARD_C_8x4 \
 "#OC_HADAMARD_C_8x4\n\t" \
 /*Stage C:*/ \
 "paddw %%mm1,%%mm0\n\t" \
 "paddw %%mm3,%%mm2\n\t" \
 "paddw %%mm5,%%mm4\n\t" \
 "paddw %%mm7,%%mm6\n\t" \
 "paddw %%mm1,%%mm1\n\t" \
 "paddw %%mm3,%%mm3\n\t" \
 "paddw %%mm5,%%mm5\n\t" \
 "paddw %%mm7,%%mm7\n\t" \
 "psubw %%mm0,%%mm1\n\t" \
 "psubw %%mm2,%%mm3\n\t" \
 "psubw %%mm4,%%mm5\n\t" \
 "psubw %%mm6,%%mm7\n\t"

/*Performs an 8-point 1-D Hadamard transform.
  The transform is performed in place, except that outputs 0-3 are swapped with
   outputs 4-7.
  Outputs 1, 2, 5 and 6 are negated (which allows us to perform the transform
   in place with no temporary registers).*/
#define OC_HADAMARD_8x4 \
 OC_HADAMARD_AB_8x4 \
 OC_HADAMARD_C_8x4
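
/*Plain-C sketch (not from the original source) of the 8-point transform the
   butterfly stages above implement, without the in-place output swaps and
   sign flips.
  In the asm, each paddw/paddw/psubw triplet computes x+y and x-y in place,
   using 2*x-(x+y)=x-y to avoid a temporary register; each MMX register holds
   one of the 8 points for four columns at once.*/
static void oc_hadamard_8_sketch(short _x[8]){
  short a[8];
  short b[8];
  int   i;
  /*Stage A: stride-4 butterflies.*/
  for(i=0;i<4;i++){
    a[i]=(short)(_x[i]+_x[i+4]);
    a[i+4]=(short)(_x[i]-_x[i+4]);
  }
  /*Stage B: stride-2 butterflies.*/
  for(i=0;i<2;i++){
    b[i]=(short)(a[i]+a[i+2]);
    b[i+2]=(short)(a[i]-a[i+2]);
    b[i+4]=(short)(a[i+4]+a[i+6]);
    b[i+6]=(short)(a[i+4]-a[i+6]);
  }
  /*Stage C: stride-1 butterflies.*/
  for(i=0;i<8;i+=2){
    _x[i]=(short)(b[i]+b[i+1]);
    _x[i+1]=(short)(b[i]-b[i+1]);
  }
}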

/*Performs the first part of the final stage of the Hadamard transform and
   summing of absolute values.
  At the end of this part, %%mm1 will contain the DC coefficient of the
   transform.*/
#define OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \
 /*We use the fact that \
    (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
   to merge the final butterfly with the abs and the first stage of \
    accumulation. \
   Thus we can avoid using pabsw, which is not available until SSSE3. \
   Emulating pabsw takes 3 instructions, so the straightforward MMXEXT \
    implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
    registers). \
   Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
   This implementation is only 26 (+4 for spilling registers).*/ \
 "#OC_HADAMARD_C_ABS_ACCUM_A_8x4\n\t" \
 "movq %%mm7,"OC_MEM_OFFS(_r7,buf)"\n\t" \
 "movq %%mm6,"OC_MEM_OFFS(_r6,buf)"\n\t" \
 /*mm7={0x7FFF}x4 \
   mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \
 "pcmpeqb %%mm7,%%mm7\n\t" \
 "movq %%mm0,%%mm6\n\t" \
 "psrlw $1,%%mm7\n\t" \
 "paddw %%mm1,%%mm6\n\t" \
 "pmaxsw %%mm1,%%mm0\n\t" \
 "paddsw %%mm7,%%mm6\n\t" \
 "psubw %%mm6,%%mm0\n\t" \
 /*mm2=max(abs(mm2),abs(mm3))-0x7FFF \
   mm4=max(abs(mm4),abs(mm5))-0x7FFF*/ \
 "movq %%mm2,%%mm6\n\t" \
 "movq %%mm4,%%mm1\n\t" \
 "pmaxsw %%mm3,%%mm2\n\t" \
 "pmaxsw %%mm5,%%mm4\n\t" \
 "paddw %%mm3,%%mm6\n\t" \
 "paddw %%mm5,%%mm1\n\t" \
 "movq "OC_MEM_OFFS(_r7,buf)",%%mm3\n\t"

/*Performs the second part of the final stage of the Hadamard transform and
   summing of absolute values.*/
#define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \
 "#OC_HADAMARD_C_ABS_ACCUM_B_8x4\n\t" \
 "paddsw %%mm7,%%mm6\n\t" \
 "movq "OC_MEM_OFFS(_r6,buf)",%%mm5\n\t" \
 "paddsw %%mm7,%%mm1\n\t" \
 "psubw %%mm6,%%mm2\n\t" \
 "psubw %%mm1,%%mm4\n\t" \
 /*mm7={1}x4 (needed for the horizontal add that follows) \
   mm0+=mm2+mm4+max(abs(mm3),abs(mm5))-0x7FFF*/ \
 "movq %%mm3,%%mm6\n\t" \
 "pmaxsw %%mm5,%%mm3\n\t" \
 "paddw %%mm2,%%mm0\n\t" \
 "paddw %%mm5,%%mm6\n\t" \
 "paddw %%mm4,%%mm0\n\t" \
 "paddsw %%mm7,%%mm6\n\t" \
 "paddw %%mm3,%%mm0\n\t" \
 "psrlw $14,%%mm7\n\t" \
 "psubw %%mm6,%%mm0\n\t"

/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
   absolute value of each component, and accumulates everything into mm0.
  This is the only portion of SATD which requires MMXEXT (we could use plain
   MMX, but it takes 4 instructions and an extra register to work around the
   lack of a pmaxsw, which is a pretty serious penalty).*/
#define OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) \
 OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \
 OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7)

/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
   component, and accumulates everything into mm0.
  Note that mm0 will have an extra 4 added to each column, and that after
   removing this value, the remainder will be half the conventional value.*/
#define OC_HADAMARD_ABS_ACCUM_8x4(_r6,_r7) \
 OC_HADAMARD_AB_8x4 \
 OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7)

/*Performs two 4x4 transposes (mostly) in place.
  On input, {mm0,mm1,mm2,mm3} contains rows {e,f,g,h}, and {mm4,mm5,mm6,mm7}
   contains rows {a,b,c,d}.
  On output, {0x40,0x50,0x60,0x70}+_off(%[buf]) contains {e,f,g,h}^T, and
   {mm4,mm5,mm6,mm7} contains the transposed rows {a,b,c,d}^T.*/
#define OC_TRANSPOSE_4x4x2(_off) \
 "#OC_TRANSPOSE_4x4x2\n\t" \
 /*First 4x4 transpose:*/ \
 "movq %%mm5,"OC_MEM_OFFS(0x10+(_off),buf)"\n\t" \
 /*mm0 = e3 e2 e1 e0 \
   mm1 = f3 f2 f1 f0 \
   mm2 = g3 g2 g1 g0 \
   mm3 = h3 h2 h1 h0*/ \
 "movq %%mm2,%%mm5\n\t" \
 "punpcklwd %%mm3,%%mm2\n\t" \
 "punpckhwd %%mm3,%%mm5\n\t" \
 "movq %%mm0,%%mm3\n\t" \
 "punpcklwd %%mm1,%%mm0\n\t" \
 "punpckhwd %%mm1,%%mm3\n\t" \
 /*mm0 = f1 e1 f0 e0 \
   mm3 = f3 e3 f2 e2 \
   mm2 = h1 g1 h0 g0 \
   mm5 = h3 g3 h2 g2*/ \
 "movq %%mm0,%%mm1\n\t" \
 "punpckldq %%mm2,%%mm0\n\t" \
 "punpckhdq %%mm2,%%mm1\n\t" \
 "movq %%mm3,%%mm2\n\t" \
 "punpckhdq %%mm5,%%mm3\n\t" \
 "movq %%mm0,"OC_MEM_OFFS(0x40+(_off),buf)"\n\t" \
 "punpckldq %%mm5,%%mm2\n\t" \
 /*mm0 = h0 g0 f0 e0 \
   mm1 = h1 g1 f1 e1 \
   mm2 = h2 g2 f2 e2 \
   mm3 = h3 g3 f3 e3*/ \
 "movq "OC_MEM_OFFS(0x10+(_off),buf)",%%mm5\n\t" \
 /*Second 4x4 transpose:*/ \
 /*mm4 = a3 a2 a1 a0 \
   mm5 = b3 b2 b1 b0 \
   mm6 = c3 c2 c1 c0 \
   mm7 = d3 d2 d1 d0*/ \
 "movq %%mm6,%%mm0\n\t" \
 "punpcklwd %%mm7,%%mm6\n\t" \
 "movq %%mm1,"OC_MEM_OFFS(0x50+(_off),buf)"\n\t" \
 "punpckhwd %%mm7,%%mm0\n\t" \
 "movq %%mm4,%%mm7\n\t" \
 "punpcklwd %%mm5,%%mm4\n\t" \
 "movq %%mm2,"OC_MEM_OFFS(0x60+(_off),buf)"\n\t" \
 "punpckhwd %%mm5,%%mm7\n\t" \
 /*mm4 = b1 a1 b0 a0 \
   mm7 = b3 a3 b2 a2 \
   mm6 = d1 c1 d0 c0 \
   mm0 = d3 c3 d2 c2*/ \
 "movq %%mm4,%%mm5\n\t" \
 "punpckldq %%mm6,%%mm4\n\t" \
 "movq %%mm3,"OC_MEM_OFFS(0x70+(_off),buf)"\n\t" \
 "punpckhdq %%mm6,%%mm5\n\t" \
 "movq %%mm7,%%mm6\n\t" \
 "punpckhdq %%mm0,%%mm7\n\t" \
 "punpckldq %%mm0,%%mm6\n\t" \
 /*mm4 = d0 c0 b0 a0 \
   mm5 = d1 c1 b1 a1 \
   mm6 = d2 c2 b2 a2 \
   mm7 = d3 c3 b3 a3*/
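
/*Scalar sketch (illustration only): the punpck{l,h}wd/punpck{l,h}dq ladder
   above produces, for each 4x4 block of words, the same result as this plain
   element-swap transpose, but entirely in SIMD registers.*/
static void oc_transpose_4x4_sketch(short _m[4][4]){
  int i;
  int j;
  for(i=0;i<4;i++){
    for(j=i+1;j<4;j++){
      short t;
      t=_m[i][j];
      _m[i][j]=_m[j][i];
      _m[j][i]=t;
    }
  }
}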

static unsigned oc_int_frag_satd_mmxext(int *_dc,
 const unsigned char *_src,int _src_ystride,
 const unsigned char *_ref,int _ref_ystride){
  OC_ALIGN8(ogg_int16_t buf[64]);
  unsigned ret;
  unsigned ret2;
  int dc;
  __asm__ __volatile__(
    OC_LOAD_SUB_8x4(0x00)
    OC_HADAMARD_8x4
    OC_TRANSPOSE_4x4x2(0x00)
    /*Finish swapping out this 8x4 block to make room for the next one.
      mm0...mm3 have been swapped out already.*/
    "movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t"
    "movq %%mm5,"OC_MEM_OFFS(0x10,buf)"\n\t"
    "movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t"
    "movq %%mm7,"OC_MEM_OFFS(0x30,buf)"\n\t"
    OC_LOAD_SUB_8x4(0x04)
    OC_HADAMARD_8x4
    OC_TRANSPOSE_4x4x2(0x08)
    /*Here the first 4x4 block of output from the last transpose is the second
       4x4 block of input for the next transform.
      We have cleverly arranged that it already be in the appropriate place, so
       we only have to do half the loads.*/
    "movq "OC_MEM_OFFS(0x10,buf)",%%mm1\n\t"
    "movq "OC_MEM_OFFS(0x20,buf)",%%mm2\n\t"
    "movq "OC_MEM_OFFS(0x30,buf)",%%mm3\n\t"
    "movq "OC_MEM_OFFS(0x00,buf)",%%mm0\n\t"
    /*We split out the stages here so we can save the DC coefficient in the
       middle.*/
    OC_HADAMARD_AB_8x4
    OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
    "movd %%mm1,%[dc]\n\t"
    OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
       for the factor of two we dropped + 3 for the vertical accumulation).
      Now we finally have to promote things to dwords.
      We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
       latency of pmaddwd by starting the next series of loads now.*/
    "pmaddwd %%mm7,%%mm0\n\t"
    "movq "OC_MEM_OFFS(0x50,buf)",%%mm1\n\t"
    "movq "OC_MEM_OFFS(0x58,buf)",%%mm5\n\t"
    "movq %%mm0,%%mm4\n\t"
    "movq "OC_MEM_OFFS(0x60,buf)",%%mm2\n\t"
    "punpckhdq %%mm0,%%mm0\n\t"
    "movq "OC_MEM_OFFS(0x68,buf)",%%mm6\n\t"
    "paddd %%mm0,%%mm4\n\t"
    "movq "OC_MEM_OFFS(0x70,buf)",%%mm3\n\t"
    "movd %%mm4,%[ret2]\n\t"
    "movq "OC_MEM_OFFS(0x78,buf)",%%mm7\n\t"
    "movq "OC_MEM_OFFS(0x40,buf)",%%mm0\n\t"
    "movq "OC_MEM_OFFS(0x48,buf)",%%mm4\n\t"
    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
    "pmaddwd %%mm7,%%mm0\n\t"
    /*Subtract abs(dc) from 2*ret2.*/
    "movsx %w[dc],%[dc]\n\t"
    "cdq\n\t"
    "lea (%[ret],%[ret2],2),%[ret2]\n\t"
    "movq %%mm0,%%mm4\n\t"
    "punpckhdq %%mm0,%%mm0\n\t"
    "xor %[dc],%[ret]\n\t"
    "paddd %%mm0,%%mm4\n\t"
    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
       added to them, a factor of two removed, and the DC value included;
       correct the final sum here.*/
    "sub %[ret],%[ret2]\n\t"
    "movd %%mm4,%[ret]\n\t"
    "lea -64(%[ret2],%[ret],2),%[ret]\n\t"
    /*Although it looks like we're using 8 registers here, gcc can alias %[ret]
       and %[ret2] with some of the inputs, since for once we don't write to
       them until after we're done using everything but %[buf].*/
    /*Note that _src_ystride and _ref_ystride must be given non-overlapping
       constraints, otherwise if gcc can prove they're equal it will allocate
       them to the same register (which is bad); _src and _ref face a similar
       problem, though those are never actually the same.*/
    :[ret]"=d"(ret),[ret2]"=r"(ret2),[dc]"=a"(dc),
     [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))
    :[src]"r"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
     [ref]"r"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
    /*We have to use neg, so we actually clobber the condition codes for once
       (not to mention cmp, sub, and add).*/
    :"cc"
  );
  *_dc=dc;
  return ret;
}
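
/*Reference sketch (illustrative, not part of the original source) of the
   value the routine above produces, using the oc_hadamard_8_sketch helper
   sketched earlier: transform the 8x8 difference block with an unnormalized
   8-point Hadamard down the columns and then across the rows, sum the
   absolute coefficients, and report the DC term separately (it is subtracted
   from the returned sum).
  The asm reaches the same total by working with a transform halved by the
   merged abs/max stage and rescaling at the end.*/
static unsigned oc_int_frag_satd_ref_sketch(int *_dc,
 const unsigned char *_src,int _src_ystride,
 const unsigned char *_ref,int _ref_ystride){
  short    buf[8][8];
  short    col[8];
  unsigned satd;
  int      i;
  int      j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++)buf[i][j]=(short)(_src[j]-_ref[j]);
    _src+=_src_ystride;
    _ref+=_ref_ystride;
  }
  for(j=0;j<8;j++){
    for(i=0;i<8;i++)col[i]=buf[i][j];
    oc_hadamard_8_sketch(col);
    for(i=0;i<8;i++)buf[i][j]=col[i];
  }
  satd=0;
  for(i=0;i<8;i++){
    oc_hadamard_8_sketch(buf[i]);
    for(j=0;j<8;j++)satd+=buf[i][j]<0?-buf[i][j]:buf[i][j];
  }
  *_dc=buf[0][0];
  return satd-(*_dc<0?-*_dc:*_dc);
}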

unsigned oc_enc_frag_satd_mmxext(int *_dc,const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  return oc_int_frag_satd_mmxext(_dc,_src,_ystride,_ref,_ystride);
}

/*Our internal implementation of frag_copy2 takes an extra stride parameter so
   we can share code with oc_enc_frag_satd2_mmxext().*/
void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
 const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
  __asm__ __volatile__(
    /*Load the first 3 rows.*/
    "movq (%[src1]),%%mm0\n\t"
    "movq (%[src2]),%%mm1\n\t"
    "movq (%[src1],%[src_ystride]),%%mm2\n\t"
    "lea (%[src1],%[src_ystride],2),%[src1]\n\t"
    "movq (%[src2],%[src_ystride]),%%mm3\n\t"
    "lea (%[src2],%[src_ystride],2),%[src2]\n\t"
    "pxor %%mm7,%%mm7\n\t"
    "movq (%[src1]),%%mm4\n\t"
    "pcmpeqb %%mm6,%%mm6\n\t"
    "movq (%[src2]),%%mm5\n\t"
    /*mm7={1}x8.*/
    "psubb %%mm6,%%mm7\n\t"
    /*Start averaging %%mm0 and %%mm1 into %%mm6.*/
    "movq %%mm0,%%mm6\n\t"
    "pxor %%mm1,%%mm0\n\t"
    "pavgb %%mm1,%%mm6\n\t"
    /*%%mm1 is free, start averaging %%mm3 into %%mm2 using %%mm1.*/
    "movq %%mm2,%%mm1\n\t"
    "pand %%mm7,%%mm0\n\t"
    "pavgb %%mm3,%%mm2\n\t"
    "pxor %%mm3,%%mm1\n\t"
    /*%%mm3 is free.*/
    "psubb %%mm0,%%mm6\n\t"
    /*%%mm0 is free, start loading the next row.*/
    "movq (%[src1],%[src_ystride]),%%mm0\n\t"
    /*Start averaging %%mm5 and %%mm4 using %%mm3.*/
    "movq %%mm4,%%mm3\n\t"
    /*%%mm6 (row 0) is done; write it out.*/
    "movq %%mm6,(%[dst])\n\t"
    "pand %%mm7,%%mm1\n\t"
    "pavgb %%mm5,%%mm4\n\t"
    "psubb %%mm1,%%mm2\n\t"
    /*%%mm1 is free, continue loading the next row.*/
    "movq (%[src2],%[src_ystride]),%%mm1\n\t"
    "pxor %%mm5,%%mm3\n\t"
    "lea (%[src1],%[src_ystride],2),%[src1]\n\t"
    /*%%mm2 (row 1) is done; write it out.*/
    "movq %%mm2,(%[dst],%[dst_ystride])\n\t"
    "pand %%mm7,%%mm3\n\t"
    /*Start loading the next row.*/
    "movq (%[src1]),%%mm2\n\t"
    "lea (%[dst],%[dst_ystride],2),%[dst]\n\t"
    "psubb %%mm3,%%mm4\n\t"
    "lea (%[src2],%[src_ystride],2),%[src2]\n\t"
    /*%%mm4 (row 2) is done; write it out.*/
    "movq %%mm4,(%[dst])\n\t"
    /*Continue loading the next row.*/
    "movq (%[src2]),%%mm3\n\t"
    /*Start averaging %%mm0 and %%mm1 into %%mm6.*/
    "movq %%mm0,%%mm6\n\t"
    "pxor %%mm1,%%mm0\n\t"
    /*Start loading the next row.*/
    "movq (%[src1],%[src_ystride]),%%mm4\n\t"
    "pavgb %%mm1,%%mm6\n\t"
    /*%%mm1 is free; start averaging %%mm3 into %%mm2 using %%mm1.*/
    "movq %%mm2,%%mm1\n\t"
    "pand %%mm7,%%mm0\n\t"
    /*Continue loading the next row.*/
    "movq (%[src2],%[src_ystride]),%%mm5\n\t"
    "pavgb %%mm3,%%mm2\n\t"
    "lea (%[src1],%[src_ystride],2),%[src1]\n\t"
    "pxor %%mm3,%%mm1\n\t"
    /*%%mm3 is free.*/
    "psubb %%mm0,%%mm6\n\t"
    /*%%mm0 is free, start loading the next row.*/
    "movq (%[src1]),%%mm0\n\t"
    /*Start averaging %%mm5 into %%mm4 using %%mm3.*/
    "movq %%mm4,%%mm3\n\t"
    /*%%mm6 (row 3) is done; write it out.*/
    "movq %%mm6,(%[dst],%[dst_ystride])\n\t"
    "pand %%mm7,%%mm1\n\t"
    "lea (%[src2],%[src_ystride],2),%[src2]\n\t"
    "pavgb %%mm5,%%mm4\n\t"
    "lea (%[dst],%[dst_ystride],2),%[dst]\n\t"
    "psubb %%mm1,%%mm2\n\t"
    /*%%mm1 is free; continue loading the next row.*/
    "movq (%[src2]),%%mm1\n\t"
    "pxor %%mm5,%%mm3\n\t"
    /*%%mm2 (row 4) is done; write it out.*/
    "movq %%mm2,(%[dst])\n\t"
    "pand %%mm7,%%mm3\n\t"
    /*Start loading the next row.*/
    "movq (%[src1],%[src_ystride]),%%mm2\n\t"
    "psubb %%mm3,%%mm4\n\t"
    /*Start averaging %%mm0 and %%mm1 into %%mm6.*/
    "movq %%mm0,%%mm6\n\t"
    /*Continue loading the next row.*/
    "movq (%[src2],%[src_ystride]),%%mm3\n\t"
    /*%%mm4 (row 5) is done; write it out.*/
    "movq %%mm4,(%[dst],%[dst_ystride])\n\t"
    "pxor %%mm1,%%mm0\n\t"
    "pavgb %%mm1,%%mm6\n\t"
    /*%%mm4 is free; start averaging %%mm3 into %%mm2 using %%mm4.*/
    "movq %%mm2,%%mm4\n\t"
    "pand %%mm7,%%mm0\n\t"
    "pavgb %%mm3,%%mm2\n\t"
    "pxor %%mm3,%%mm4\n\t"
    "lea (%[dst],%[dst_ystride],2),%[dst]\n\t"
    "psubb %%mm0,%%mm6\n\t"
    "pand %%mm7,%%mm4\n\t"
    /*%%mm6 (row 6) is done, write it out.*/
    "movq %%mm6,(%[dst])\n\t"
    "psubb %%mm4,%%mm2\n\t"
    /*%%mm2 (row 7) is done, write it out.*/
    "movq %%mm2,(%[dst],%[dst_ystride])\n\t"
    :[dst]"+r"(_dst),[src1]"+r"(_src1),[src2]"+r"(_src2)
    :[dst_ystride]"r"((ptrdiff_t)_dst_ystride),
     [src_ystride]"r"((ptrdiff_t)_src_ystride)
    :"memory"
  );
}

unsigned oc_enc_frag_satd2_mmxext(int *_dc,const unsigned char *_src,
 const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
  OC_ALIGN8(unsigned char ref[64]);
  oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
  return oc_int_frag_satd_mmxext(_dc,_src,_ystride,ref,8);
}

unsigned oc_enc_frag_intra_satd_mmxext(int *_dc,
 const unsigned char *_src,int _ystride){
  OC_ALIGN8(ogg_int16_t buf[64]);
  unsigned ret;
  unsigned ret2;
  int dc;
  __asm__ __volatile__(
    OC_LOAD_8x4(0x00)
    OC_HADAMARD_8x4
    OC_TRANSPOSE_4x4x2(0x00)
    /*Finish swapping out this 8x4 block to make room for the next one.
      mm0...mm3 have been swapped out already.*/
    "movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t"
    "movq %%mm5,"OC_MEM_OFFS(0x10,buf)"\n\t"
    "movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t"
    "movq %%mm7,"OC_MEM_OFFS(0x30,buf)"\n\t"
    OC_LOAD_8x4(0x04)
    OC_HADAMARD_8x4
    OC_TRANSPOSE_4x4x2(0x08)
    /*Here the first 4x4 block of output from the last transpose is the second
       4x4 block of input for the next transform.
      We have cleverly arranged that it already be in the appropriate place, so
       we only have to do half the loads.*/
    "movq "OC_MEM_OFFS(0x10,buf)",%%mm1\n\t"
    "movq "OC_MEM_OFFS(0x20,buf)",%%mm2\n\t"
    "movq "OC_MEM_OFFS(0x30,buf)",%%mm3\n\t"
    "movq "OC_MEM_OFFS(0x00,buf)",%%mm0\n\t"
    /*We split out the stages here so we can save the DC coefficient in the
       middle.*/
    OC_HADAMARD_AB_8x4
    OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
    "movd %%mm1,%[dc]\n\t"
    OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
       for the factor of two we dropped + 3 for the vertical accumulation).
      Now we finally have to promote things to dwords.
      We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
       latency of pmaddwd by starting the next series of loads now.*/
    "pmaddwd %%mm7,%%mm0\n\t"
    "movq "OC_MEM_OFFS(0x50,buf)",%%mm1\n\t"
    "movq "OC_MEM_OFFS(0x58,buf)",%%mm5\n\t"
    "movq "OC_MEM_OFFS(0x60,buf)",%%mm2\n\t"
    "movq %%mm0,%%mm4\n\t"
    "movq "OC_MEM_OFFS(0x68,buf)",%%mm6\n\t"
    "punpckhdq %%mm0,%%mm0\n\t"
    "movq "OC_MEM_OFFS(0x70,buf)",%%mm3\n\t"
    "paddd %%mm0,%%mm4\n\t"
    "movq "OC_MEM_OFFS(0x78,buf)",%%mm7\n\t"
    "movd %%mm4,%[ret]\n\t"
    "movq "OC_MEM_OFFS(0x40,buf)",%%mm0\n\t"
    "movq "OC_MEM_OFFS(0x48,buf)",%%mm4\n\t"
    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
    "pmaddwd %%mm7,%%mm0\n\t"
    /*We assume that the DC coefficient is always positive (which is true,
       because the input to the INTRA transform was not a difference).*/
    "movzx %w[dc],%[dc]\n\t"
    "add %[ret],%[ret]\n\t"
    "sub %[dc],%[ret]\n\t"
    "movq %%mm0,%%mm4\n\t"
    "punpckhdq %%mm0,%%mm0\n\t"
    "paddd %%mm0,%%mm4\n\t"
    "movd %%mm4,%[ret2]\n\t"
    "lea -64(%[ret],%[ret2],2),%[ret]\n\t"
    /*Although it looks like we're using 8 registers here, gcc can alias %[ret]
       and %[ret2] with some of the inputs, since for once we don't write to
       them until after we're done using everything but %[buf] (which is also
       listed as an output to ensure gcc _doesn't_ alias them against it).*/
    :[ret]"=a"(ret),[ret2]"=r"(ret2),[dc]"=r"(dc),
     [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))
    :[src]"r"(_src),[src4]"r"(_src+4*_ystride),
     [ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)
    /*We have to use sub, so we actually clobber the condition codes for once
       (not to mention add).*/
    :"cc"
  );
  *_dc=dc;
  return ret;
}

void oc_enc_frag_sub_mmx(ogg_int16_t _residue[64],
 const unsigned char *_src,const unsigned char *_ref,int _ystride){
  int i;
  __asm__ __volatile__("pxor %%mm7,%%mm7\n\t"::);
  for(i=4;i-->0;){
    __asm__ __volatile__(
      /*mm0=[src]*/
      "movq (%[src]),%%mm0\n\t"
      /*mm1=[ref]*/
      "movq (%[ref]),%%mm1\n\t"
      /*mm4=[src+ystride]*/
      "movq (%[src],%[ystride]),%%mm4\n\t"
      /*mm5=[ref+ystride]*/
      "movq (%[ref],%[ystride]),%%mm5\n\t"
      /*Compute [src]-[ref].*/
      "movq %%mm0,%%mm2\n\t"
      "punpcklbw %%mm7,%%mm0\n\t"
      "movq %%mm1,%%mm3\n\t"
      "punpckhbw %%mm7,%%mm2\n\t"
      "punpcklbw %%mm7,%%mm1\n\t"
      "punpckhbw %%mm7,%%mm3\n\t"
      "psubw %%mm1,%%mm0\n\t"
      "psubw %%mm3,%%mm2\n\t"
      /*Compute [src+ystride]-[ref+ystride].*/
      "movq %%mm4,%%mm1\n\t"
      "punpcklbw %%mm7,%%mm4\n\t"
      "movq %%mm5,%%mm3\n\t"
      "punpckhbw %%mm7,%%mm1\n\t"
      "lea (%[src],%[ystride],2),%[src]\n\t"
      "punpcklbw %%mm7,%%mm5\n\t"
      "lea (%[ref],%[ystride],2),%[ref]\n\t"
      "punpckhbw %%mm7,%%mm3\n\t"
      "psubw %%mm5,%%mm4\n\t"
      "psubw %%mm3,%%mm1\n\t"
      /*Write the answer out.*/
      "movq %%mm0,0x00(%[residue])\n\t"
      "movq %%mm2,0x08(%[residue])\n\t"
      "movq %%mm4,0x10(%[residue])\n\t"
      "movq %%mm1,0x18(%[residue])\n\t"
      "lea 0x20(%[residue]),%[residue]\n\t"
      :[residue]"+r"(_residue),[src]"+r"(_src),[ref]"+r"(_ref)
      :[ystride]"r"((ptrdiff_t)_ystride)
      :"memory"
    );
  }
}
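
/*Reference sketch (illustration only) of the loop above: the asm unpacks two
   rows of bytes to 16 bits per iteration and subtracts them, which is just
   this per-pixel difference.*/
static void oc_enc_frag_sub_ref_sketch(ogg_int16_t _residue[64],
 const unsigned char *_src,const unsigned char *_ref,int _ystride){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++)_residue[i*8+j]=(ogg_int16_t)(_src[j]-_ref[j]);
    _src+=_ystride;
    _ref+=_ystride;
  }
}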

void oc_enc_frag_sub_128_mmx(ogg_int16_t _residue[64],
 const unsigned char *_src,int _ystride){
  ptrdiff_t ystride3;
  __asm__ __volatile__(
    /*mm0=[src]*/
    "movq (%[src]),%%mm0\n\t"
    /*mm1=[src+ystride]*/
    "movq (%[src],%[ystride]),%%mm1\n\t"
    /*mm6={-1}x4*/
    "pcmpeqw %%mm6,%%mm6\n\t"
    /*mm2=[src+2*ystride]*/
    "movq (%[src],%[ystride],2),%%mm2\n\t"
    /*[ystride3]=3*[ystride]*/
    "lea (%[ystride],%[ystride],2),%[ystride3]\n\t"
    /*mm6={0x8000}x4*/
    "psllw $15,%%mm6\n\t"
    /*mm3=[src+3*ystride]*/
    "movq (%[src],%[ystride3]),%%mm3\n\t"
    /*mm6={128}x4*/
    "psrlw $8,%%mm6\n\t"
    /*mm7=0*/
    "pxor %%mm7,%%mm7\n\t"
    /*[src]=[src]+4*[ystride]*/
    "lea (%[src],%[ystride],4),%[src]\n\t"
    /*Compute [src]-128 and [src+ystride]-128.*/
    "movq %%mm0,%%mm4\n\t"
    "punpcklbw %%mm7,%%mm0\n\t"
    "movq %%mm1,%%mm5\n\t"
    "punpckhbw %%mm7,%%mm4\n\t"
    "psubw %%mm6,%%mm0\n\t"
    "punpcklbw %%mm7,%%mm1\n\t"
    "psubw %%mm6,%%mm4\n\t"
    "punpckhbw %%mm7,%%mm5\n\t"
    "psubw %%mm6,%%mm1\n\t"
    "psubw %%mm6,%%mm5\n\t"
    /*Write the answer out.*/
    "movq %%mm0,0x00(%[residue])\n\t"
    "movq %%mm4,0x08(%[residue])\n\t"
    "movq %%mm1,0x10(%[residue])\n\t"
    "movq %%mm5,0x18(%[residue])\n\t"
    /*mm0=[src+4*ystride]*/
    "movq (%[src]),%%mm0\n\t"
    /*mm1=[src+5*ystride]*/
    "movq (%[src],%[ystride]),%%mm1\n\t"
    /*Compute [src+2*ystride]-128 and [src+3*ystride]-128.*/
    "movq %%mm2,%%mm4\n\t"
    "punpcklbw %%mm7,%%mm2\n\t"
    "movq %%mm3,%%mm5\n\t"
    "punpckhbw %%mm7,%%mm4\n\t"
    "psubw %%mm6,%%mm2\n\t"
    "punpcklbw %%mm7,%%mm3\n\t"
    "psubw %%mm6,%%mm4\n\t"
    "punpckhbw %%mm7,%%mm5\n\t"
    "psubw %%mm6,%%mm3\n\t"
    "psubw %%mm6,%%mm5\n\t"
    /*Write the answer out.*/
    "movq %%mm2,0x20(%[residue])\n\t"
    "movq %%mm4,0x28(%[residue])\n\t"
    "movq %%mm3,0x30(%[residue])\n\t"
    "movq %%mm5,0x38(%[residue])\n\t"
    /*mm2=[src+6*ystride]*/
    "movq (%[src],%[ystride],2),%%mm2\n\t"
    /*mm3=[src+7*ystride]*/
    "movq (%[src],%[ystride3]),%%mm3\n\t"
    /*Compute [src+4*ystride]-128 and [src+5*ystride]-128.*/
    "movq %%mm0,%%mm4\n\t"
    "punpcklbw %%mm7,%%mm0\n\t"
    "movq %%mm1,%%mm5\n\t"
    "punpckhbw %%mm7,%%mm4\n\t"
    "psubw %%mm6,%%mm0\n\t"
    "punpcklbw %%mm7,%%mm1\n\t"
    "psubw %%mm6,%%mm4\n\t"
    "punpckhbw %%mm7,%%mm5\n\t"
    "psubw %%mm6,%%mm1\n\t"
    "psubw %%mm6,%%mm5\n\t"
    /*Write the answer out.*/
    "movq %%mm0,0x40(%[residue])\n\t"
    "movq %%mm4,0x48(%[residue])\n\t"
    "movq %%mm1,0x50(%[residue])\n\t"
    "movq %%mm5,0x58(%[residue])\n\t"
    /*Compute [src+6*ystride]-128 and [src+7*ystride]-128.*/
    "movq %%mm2,%%mm4\n\t"
    "punpcklbw %%mm7,%%mm2\n\t"
    "movq %%mm3,%%mm5\n\t"
    "punpckhbw %%mm7,%%mm4\n\t"
    "psubw %%mm6,%%mm2\n\t"
    "punpcklbw %%mm7,%%mm3\n\t"
    "psubw %%mm6,%%mm4\n\t"
    "punpckhbw %%mm7,%%mm5\n\t"
    "psubw %%mm6,%%mm3\n\t"
    "psubw %%mm6,%%mm5\n\t"
    /*Write the answer out.*/
    "movq %%mm2,0x60(%[residue])\n\t"
    "movq %%mm4,0x68(%[residue])\n\t"
    "movq %%mm3,0x70(%[residue])\n\t"
    "movq %%mm5,0x78(%[residue])\n\t"
    :[src]"+r"(_src),[ystride3]"=&r"(ystride3)
    :[residue]"r"(_residue),[ystride]"r"((ptrdiff_t)_ystride)
    :"memory"
  );
}
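
/*Side note (illustrative sketch, not part of the original source): the
   {128}x4 constant above is built without a memory load.
  pcmpeqw sets every word to 0xFFFF, psllw $15 leaves just the sign bit
   (0x8000), and psrlw $8 shifts it down to 0x0080=128, as this scalar version
   of the same steps shows.*/
static unsigned short oc_make_128_sketch(void){
  unsigned short w;
  w=0xFFFF;                  /*pcmpeqw %%mm6,%%mm6*/
  w=(unsigned short)(w<<15); /*psllw $15 -> 0x8000*/
  w=(unsigned short)(w>>8);  /*psrlw $8  -> 0x0080*/
  return w;
}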

void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
 const unsigned char *_src1,const unsigned char *_src2,int _ystride){
  oc_int_frag_copy2_mmxext(_dst,_ystride,_src1,_src2,_ystride);
}

#endif