/* Path: thirdparty/libtheora/x86/sse2encfrag.c */
/********************************************************************1* *2* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *3* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *4* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *5* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *6* *7* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *8* by the Xiph.Org Foundation https://www.xiph.org/ *9* *10********************************************************************1112function:1314********************************************************************/15#include <stddef.h>16#include "x86enc.h"17#include "sse2trans.h"1819#if defined(OC_X86_ASM)2021/*Load a 4x8 array of pixels values from %[src] and %[ref] and compute their2216-bit differences.23On output, these are stored in _m0, xmm1, xmm2, and xmm3.24xmm4 and xmm5 are clobbered.*/25#define OC_LOAD_SUB_4x8(_m0) \26"#OC_LOAD_SUB_4x8\n\t" \27/*Load the first three rows.*/ \28"movq (%[src]),"_m0"\n\t" \29"movq (%[ref]),%%xmm4\n\t" \30"movq (%[src],%[ystride]),%%xmm1\n\t" \31"movq (%[ref],%[ystride]),%%xmm3\n\t" \32"movq (%[src],%[ystride],2),%%xmm2\n\t" \33"movq (%[ref],%[ystride],2),%%xmm5\n\t" \34/*Unpack and subtract.*/ \35"punpcklbw %%xmm4,"_m0"\n\t" \36"punpcklbw %%xmm4,%%xmm4\n\t" \37"punpcklbw %%xmm3,%%xmm1\n\t" \38"punpcklbw %%xmm3,%%xmm3\n\t" \39"psubw %%xmm4,"_m0"\n\t" \40"psubw %%xmm3,%%xmm1\n\t" \41/*Load the last row.*/ \42"movq (%[src],%[ystride3]),%%xmm3\n\t" \43"movq (%[ref],%[ystride3]),%%xmm4\n\t" \44/*Unpack, subtract, and advance the pointers.*/ \45"punpcklbw %%xmm5,%%xmm2\n\t" \46"punpcklbw %%xmm5,%%xmm5\n\t" \47"lea (%[src],%[ystride],4),%[src]\n\t" \48"psubw %%xmm5,%%xmm2\n\t" \49"punpcklbw %%xmm4,%%xmm3\n\t" \50"punpcklbw %%xmm4,%%xmm4\n\t" \51"lea (%[ref],%[ystride],4),%[ref]\n\t" \52"psubw %%xmm4,%%xmm3\n\t" \5354/*Square and accumulate four rows of differences in _m0, xmm1, xmm2, and xmm3.55On output, xmm0 contains the sum of two of the rows, 
and the other two are56added to xmm7.*/57#define OC_SSD_4x8(_m0) \58"pmaddwd "_m0","_m0"\n\t" \59"pmaddwd %%xmm1,%%xmm1\n\t" \60"pmaddwd %%xmm2,%%xmm2\n\t" \61"pmaddwd %%xmm3,%%xmm3\n\t" \62"paddd %%xmm1,"_m0"\n\t" \63"paddd %%xmm3,%%xmm2\n\t" \64"paddd %%xmm2,%%xmm7\n\t" \6566unsigned oc_enc_frag_ssd_sse2(const unsigned char *_src,67const unsigned char *_ref,int _ystride){68unsigned ret;69__asm__ __volatile__(70OC_LOAD_SUB_4x8("%%xmm7")71OC_SSD_4x8("%%xmm7")72OC_LOAD_SUB_4x8("%%xmm0")73OC_SSD_4x8("%%xmm0")74"paddd %%xmm0,%%xmm7\n\t"75"movdqa %%xmm7,%%xmm6\n\t"76"punpckhqdq %%xmm7,%%xmm7\n\t"77"paddd %%xmm6,%%xmm7\n\t"78"pshufd $1,%%xmm7,%%xmm6\n\t"79"paddd %%xmm6,%%xmm7\n\t"80"movd %%xmm7,%[ret]\n\t"81:[ret]"=a"(ret)82:[src]"r"(_src),[ref]"r"(_ref),[ystride]"r"((ptrdiff_t)_ystride),83[ystride3]"r"((ptrdiff_t)_ystride*3)84);85return ret;86}8788static const unsigned char __attribute__((aligned(16))) OC_MASK_CONSTS[8]={890x01,0x02,0x04,0x08,0x10,0x20,0x40,0x8090};9192/*Load a 2x8 array of pixels values from %[src] and %[ref] and compute their93horizontal sums as well as their 16-bit differences subject to a mask.94%%xmm5 must contain OC_MASK_CONSTS[0...7] and %%xmm6 must contain 0.*/95#define OC_LOAD_SUB_MASK_2x8 \96"#OC_LOAD_SUB_MASK_2x8\n\t" \97/*Start the loads and expand the next 8 bits of the mask.*/ \98"shl $8,%[m]\n\t" \99"movq (%[src]),%%xmm0\n\t" \100"mov %h[m],%b[m]\n\t" \101"movq (%[ref]),%%xmm2\n\t" \102"movd %[m],%%xmm4\n\t" \103"shr $8,%[m]\n\t" \104"pshuflw $0x00,%%xmm4,%%xmm4\n\t" \105"mov %h[m],%b[m]\n\t" \106"pand %%xmm6,%%xmm4\n\t" \107"pcmpeqb %%xmm6,%%xmm4\n\t" \108/*Perform the masking.*/ \109"pand %%xmm4,%%xmm0\n\t" \110"pand %%xmm4,%%xmm2\n\t" \111/*Finish the loads while unpacking the first set of rows, and expand the next1128 bits of the mask.*/ \113"movd %[m],%%xmm4\n\t" \114"movq (%[src],%[ystride]),%%xmm1\n\t" \115"pshuflw $0x00,%%xmm4,%%xmm4\n\t" \116"movq (%[ref],%[ystride]),%%xmm3\n\t" \117"pand %%xmm6,%%xmm4\n\t" \118"punpcklbw 
%%xmm2,%%xmm0\n\t" \119"pcmpeqb %%xmm6,%%xmm4\n\t" \120"punpcklbw %%xmm2,%%xmm2\n\t" \121/*Mask and unpack the second set of rows.*/ \122"pand %%xmm4,%%xmm1\n\t" \123"pand %%xmm4,%%xmm3\n\t" \124"punpcklbw %%xmm3,%%xmm1\n\t" \125"punpcklbw %%xmm3,%%xmm3\n\t" \126"psubw %%xmm2,%%xmm0\n\t" \127"psubw %%xmm3,%%xmm1\n\t" \128129unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,130const unsigned char *_ref,int _ystride,ogg_int64_t _mask){131ptrdiff_t ystride;132unsigned ret;133int i;134ystride=_ystride;135__asm__ __volatile__(136"pxor %%xmm7,%%xmm7\n\t"137"movq %[c],%%xmm6\n\t"138:139:[c]"m"(OC_CONST_ARRAY_OPERAND(unsigned char,OC_MASK_CONSTS,8))140);141for(i=0;i<4;i++){142unsigned m;143m=_mask&0xFFFF;144_mask>>=16;145if(m){146__asm__ __volatile__(147OC_LOAD_SUB_MASK_2x8148"pmaddwd %%xmm0,%%xmm0\n\t"149"pmaddwd %%xmm1,%%xmm1\n\t"150"paddd %%xmm0,%%xmm7\n\t"151"paddd %%xmm1,%%xmm7\n\t"152:[src]"+r"(_src),[ref]"+r"(_ref),[ystride]"+r"(ystride),[m]"+Q"(m)153);154}155_src+=2*ystride;156_ref+=2*ystride;157}158__asm__ __volatile__(159"movdqa %%xmm7,%%xmm6\n\t"160"punpckhqdq %%xmm7,%%xmm7\n\t"161"paddd %%xmm6,%%xmm7\n\t"162"pshufd $1,%%xmm7,%%xmm6\n\t"163"paddd %%xmm6,%%xmm7\n\t"164"movd %%xmm7,%[ret]\n\t"165:[ret]"=a"(ret)166);167return ret;168}169170171/*Load an 8x8 array of pixel values from %[src] and %[ref] and compute their17216-bit difference in %%xmm0...%%xmm7.*/173#define OC_LOAD_SUB_8x8 \174"#OC_LOAD_SUB_8x8\n\t" \175"movq (%[src]),%%xmm0\n\t" \176"movq (%[ref]),%%xmm4\n\t" \177"movq (%[src],%[src_ystride]),%%xmm1\n\t" \178"lea (%[src],%[src_ystride],2),%[src]\n\t" \179"movq (%[ref],%[ref_ystride]),%%xmm5\n\t" \180"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \181"movq (%[src]),%%xmm2\n\t" \182"movq (%[ref]),%%xmm7\n\t" \183"movq (%[src],%[src_ystride]),%%xmm3\n\t" \184"movq (%[ref],%[ref_ystride]),%%xmm6\n\t" \185"punpcklbw %%xmm4,%%xmm0\n\t" \186"lea (%[src],%[src_ystride],2),%[src]\n\t" \187"punpcklbw %%xmm4,%%xmm4\n\t" \188"lea 
(%[ref],%[ref_ystride],2),%[ref]\n\t" \189"psubw %%xmm4,%%xmm0\n\t" \190"movq (%[src]),%%xmm4\n\t" \191"movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \192"movq (%[ref]),%%xmm0\n\t" \193"punpcklbw %%xmm5,%%xmm1\n\t" \194"punpcklbw %%xmm5,%%xmm5\n\t" \195"psubw %%xmm5,%%xmm1\n\t" \196"movq (%[src],%[src_ystride]),%%xmm5\n\t" \197"punpcklbw %%xmm7,%%xmm2\n\t" \198"punpcklbw %%xmm7,%%xmm7\n\t" \199"psubw %%xmm7,%%xmm2\n\t" \200"movq (%[ref],%[ref_ystride]),%%xmm7\n\t" \201"punpcklbw %%xmm6,%%xmm3\n\t" \202"lea (%[src],%[src_ystride],2),%[src]\n\t" \203"punpcklbw %%xmm6,%%xmm6\n\t" \204"psubw %%xmm6,%%xmm3\n\t" \205"movq (%[src]),%%xmm6\n\t" \206"punpcklbw %%xmm0,%%xmm4\n\t" \207"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \208"punpcklbw %%xmm0,%%xmm0\n\t" \209"lea (%[src],%[src_ystride],2),%[src]\n\t" \210"psubw %%xmm0,%%xmm4\n\t" \211"movq (%[ref]),%%xmm0\n\t" \212"punpcklbw %%xmm7,%%xmm5\n\t" \213"neg %[src_ystride]\n\t" \214"punpcklbw %%xmm7,%%xmm7\n\t" \215"psubw %%xmm7,%%xmm5\n\t" \216"movq (%[src],%[src_ystride]),%%xmm7\n\t" \217"punpcklbw %%xmm0,%%xmm6\n\t" \218"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \219"punpcklbw %%xmm0,%%xmm0\n\t" \220"neg %[ref_ystride]\n\t" \221"psubw %%xmm0,%%xmm6\n\t" \222"movq (%[ref],%[ref_ystride]),%%xmm0\n\t" \223"punpcklbw %%xmm0,%%xmm7\n\t" \224"punpcklbw %%xmm0,%%xmm0\n\t" \225"psubw %%xmm0,%%xmm7\n\t" \226"movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t" \227228/*Load an 8x8 array of pixel values from %[src] into %%xmm0...%%xmm7.*/229#define OC_LOAD_8x8 \230"#OC_LOAD_8x8\n\t" \231"movq (%[src]),%%xmm0\n\t" \232"movq (%[src],%[ystride]),%%xmm1\n\t" \233"movq (%[src],%[ystride],2),%%xmm2\n\t" \234"pxor %%xmm7,%%xmm7\n\t" \235"movq (%[src],%[ystride3]),%%xmm3\n\t" \236"punpcklbw %%xmm7,%%xmm0\n\t" \237"movq (%[src4]),%%xmm4\n\t" \238"punpcklbw %%xmm7,%%xmm1\n\t" \239"movq (%[src4],%[ystride]),%%xmm5\n\t" \240"punpcklbw %%xmm7,%%xmm2\n\t" \241"movq (%[src4],%[ystride],2),%%xmm6\n\t" \242"punpcklbw %%xmm7,%%xmm3\n\t" \243"movq 
(%[src4],%[ystride3]),%%xmm7\n\t" \244"punpcklbw %%xmm4,%%xmm4\n\t" \245"punpcklbw %%xmm5,%%xmm5\n\t" \246"psrlw $8,%%xmm4\n\t" \247"psrlw $8,%%xmm5\n\t" \248"punpcklbw %%xmm6,%%xmm6\n\t" \249"punpcklbw %%xmm7,%%xmm7\n\t" \250"psrlw $8,%%xmm6\n\t" \251"psrlw $8,%%xmm7\n\t" \252253/*Performs the first two stages of an 8-point 1-D Hadamard transform in place.254Outputs 1, 3, 4, and 5 from the second stage are negated (which allows us to255perform this stage in place with no temporary registers).*/256#define OC_HADAMARD_AB_8x8 \257"#OC_HADAMARD_AB_8x8\n\t" \258/*Stage A:*/ \259"paddw %%xmm5,%%xmm1\n\t" \260"paddw %%xmm6,%%xmm2\n\t" \261"paddw %%xmm5,%%xmm5\n\t" \262"paddw %%xmm6,%%xmm6\n\t" \263"psubw %%xmm1,%%xmm5\n\t" \264"psubw %%xmm2,%%xmm6\n\t" \265"paddw %%xmm7,%%xmm3\n\t" \266"paddw %%xmm4,%%xmm0\n\t" \267"paddw %%xmm7,%%xmm7\n\t" \268"paddw %%xmm4,%%xmm4\n\t" \269"psubw %%xmm3,%%xmm7\n\t" \270"psubw %%xmm0,%%xmm4\n\t" \271/*Stage B:*/ \272"paddw %%xmm2,%%xmm0\n\t" \273"paddw %%xmm3,%%xmm1\n\t" \274"paddw %%xmm6,%%xmm4\n\t" \275"paddw %%xmm7,%%xmm5\n\t" \276"paddw %%xmm2,%%xmm2\n\t" \277"paddw %%xmm3,%%xmm3\n\t" \278"paddw %%xmm6,%%xmm6\n\t" \279"paddw %%xmm7,%%xmm7\n\t" \280"psubw %%xmm0,%%xmm2\n\t" \281"psubw %%xmm1,%%xmm3\n\t" \282"psubw %%xmm4,%%xmm6\n\t" \283"psubw %%xmm5,%%xmm7\n\t" \284285/*Performs the last stage of an 8-point 1-D Hadamard transform in place.286Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in287place with no temporary registers).*/288#define OC_HADAMARD_C_8x8 \289"#OC_HADAMARD_C_8x8\n\t" \290/*Stage C:*/ \291"paddw %%xmm1,%%xmm0\n\t" \292"paddw %%xmm3,%%xmm2\n\t" \293"paddw %%xmm5,%%xmm4\n\t" \294"paddw %%xmm7,%%xmm6\n\t" \295"paddw %%xmm1,%%xmm1\n\t" \296"paddw %%xmm3,%%xmm3\n\t" \297"paddw %%xmm5,%%xmm5\n\t" \298"paddw %%xmm7,%%xmm7\n\t" \299"psubw %%xmm0,%%xmm1\n\t" \300"psubw %%xmm2,%%xmm3\n\t" \301"psubw %%xmm4,%%xmm5\n\t" \302"psubw %%xmm6,%%xmm7\n\t" \303304/*Performs an 8-point 1-D Hadamard transform 
in place.305Outputs 1, 2, 4, and 7 are negated (which allows us to perform the transform306in place with no temporary registers).*/307#define OC_HADAMARD_8x8 \308OC_HADAMARD_AB_8x8 \309OC_HADAMARD_C_8x8 \310311/*Performs the first part of the final stage of the Hadamard transform and312summing of absolute values.313At the end of this part, %%xmm1 will contain the DC coefficient of the314transform.*/315#define OC_HADAMARD_C_ABS_ACCUM_A_8x8 \316/*We use the fact that \317(abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \318to merge the final butterfly with the abs and the first stage of \319accumulation. \320Thus we can avoid using pabsw, which is not available until SSSE3. \321Emulating pabsw takes 3 instructions, so the straightforward SSE2 \322implementation would be (3+3)*8+7=55 instructions (+4 for spilling \323registers). \324Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \325This implementation is only 26 (+4 for spilling registers).*/ \326"#OC_HADAMARD_C_ABS_ACCUM_A_8x8\n\t" \327"movdqa %%xmm7,"OC_MEM_OFFS(0x10,buf)"\n\t" \328"movdqa %%xmm6,"OC_MEM_OFFS(0x00,buf)"\n\t" \329/*xmm7={0x7FFF}x4 \330xmm4=max(abs(xmm4),abs(xmm5))-0x7FFF*/ \331"pcmpeqb %%xmm7,%%xmm7\n\t" \332"movdqa %%xmm4,%%xmm6\n\t" \333"psrlw $1,%%xmm7\n\t" \334"paddw %%xmm5,%%xmm6\n\t" \335"pmaxsw %%xmm5,%%xmm4\n\t" \336"paddsw %%xmm7,%%xmm6\n\t" \337"psubw %%xmm6,%%xmm4\n\t" \338/*xmm2=max(abs(xmm2),abs(xmm3))-0x7FFF \339xmm0=max(abs(xmm0),abs(xmm1))-0x7FFF*/ \340"movdqa %%xmm2,%%xmm6\n\t" \341"movdqa %%xmm0,%%xmm5\n\t" \342"pmaxsw %%xmm3,%%xmm2\n\t" \343"pmaxsw %%xmm1,%%xmm0\n\t" \344"paddw %%xmm3,%%xmm6\n\t" \345"movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm3\n\t" \346"paddw %%xmm5,%%xmm1\n\t" \347"movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm5\n\t" \348349/*Performs the second part of the final stage of the Hadamard transform and350summing of absolute values.*/351#define OC_HADAMARD_C_ABS_ACCUM_B_8x8 \352"#OC_HADAMARD_C_ABS_ACCUM_B_8x8\n\t" \353"paddsw %%xmm7,%%xmm6\n\t" \354"paddsw 
%%xmm7,%%xmm1\n\t" \355"psubw %%xmm6,%%xmm2\n\t" \356"psubw %%xmm1,%%xmm0\n\t" \357/*xmm7={1}x4 (needed for the horizontal add that follows) \358xmm0+=xmm2+xmm4+max(abs(xmm3),abs(xmm5))-0x7FFF*/ \359"movdqa %%xmm3,%%xmm6\n\t" \360"pmaxsw %%xmm5,%%xmm3\n\t" \361"paddw %%xmm2,%%xmm0\n\t" \362"paddw %%xmm5,%%xmm6\n\t" \363"paddw %%xmm4,%%xmm0\n\t" \364"paddsw %%xmm7,%%xmm6\n\t" \365"paddw %%xmm3,%%xmm0\n\t" \366"psrlw $14,%%xmm7\n\t" \367"psubw %%xmm6,%%xmm0\n\t" \368369/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the370absolute value of each component, and accumulates everything into xmm0.*/371#define OC_HADAMARD_C_ABS_ACCUM_8x8 \372OC_HADAMARD_C_ABS_ACCUM_A_8x8 \373OC_HADAMARD_C_ABS_ACCUM_B_8x8 \374375/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each376component, and accumulates everything into xmm0.377Note that xmm0 will have an extra 4 added to each column, and that after378removing this value, the remainder will be half the conventional value.*/379#define OC_HADAMARD_ABS_ACCUM_8x8 \380OC_HADAMARD_AB_8x8 \381OC_HADAMARD_C_ABS_ACCUM_8x8382383static unsigned oc_int_frag_satd_sse2(int *_dc,384const unsigned char *_src,int _src_ystride,385const unsigned char *_ref,int _ref_ystride){386OC_ALIGN16(ogg_int16_t buf[16]);387unsigned ret;388unsigned ret2;389int dc;390__asm__ __volatile__(391OC_LOAD_SUB_8x8392OC_HADAMARD_8x8393OC_TRANSPOSE_8x8394/*We split out the stages here so we can save the DC coefficient in the395middle.*/396OC_HADAMARD_AB_8x8397OC_HADAMARD_C_ABS_ACCUM_A_8x8398"movd %%xmm1,%[dc]\n\t"399OC_HADAMARD_C_ABS_ACCUM_B_8x8400/*Up to this point, everything fit in 16 bits (8 input + 1 for the401difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1402for the factor of two we dropped + 3 for the vertical accumulation).403Now we finally have to promote things to dwords.404We break this part out of OC_HADAMARD_ABS_ACCUM_8x8 to hide the long405latency of pmaddwd by starting to compute abs(dc) 
here.*/406"pmaddwd %%xmm7,%%xmm0\n\t"407"movsx %w[dc],%[dc]\n\t"408"cdq\n\t"409"movdqa %%xmm0,%%xmm1\n\t"410"punpckhqdq %%xmm0,%%xmm0\n\t"411"paddd %%xmm1,%%xmm0\n\t"412"pshuflw $0xE,%%xmm0,%%xmm1\n\t"413"paddd %%xmm1,%%xmm0\n\t"414"movd %%xmm0,%[ret]\n\t"415/*The sums produced by OC_HADAMARD_ABS_ACCUM_8x8 each have an extra 4416added to them, a factor of two removed, and the DC value included;417correct the final sum here.*/418"lea -64(%[ret2],%[ret],2),%[ret]\n\t"419"xor %[dc],%[ret2]\n\t"420"sub %[ret2],%[ret]\n\t"421/*Although it looks like we're using 7 registers here, gcc can alias %[ret]422and %[dc] with some of the inputs, since for once we don't write to423them until after we're done using everything but %[buf].*/424/*Note that _src_ystride and _ref_ystride must be given non-overlapping425constraints, otherwise if gcc can prove they're equal it will allocate426them to the same register (which is bad); _src and _ref face a similar427problem.428All four are destructively modified, but if we list them as output429constraints, gcc can't alias them with other outputs.*/430:[ret]"=r"(ret),[ret2]"=d"(ret2),[dc]"=a"(dc),431[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))432:[src]"S"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),433[ref]"a"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)434/*We have to use neg, so we actually clobber the condition codes for once435(not to mention sub, and add).*/436:"cc"437);438*_dc=dc;439return ret;440}441442unsigned oc_enc_frag_satd_sse2(int *_dc,const unsigned char *_src,443const unsigned char *_ref,int _ystride){444return oc_int_frag_satd_sse2(_dc,_src,_ystride,_ref,_ystride);445}446447unsigned oc_enc_frag_satd2_sse2(int *_dc,const unsigned char *_src,448const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){449OC_ALIGN8(unsigned char ref[64]);450oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);451return oc_int_frag_satd_sse2(_dc,_src,_ystride,ref,8);452}453454unsigned oc_enc_frag_intra_satd_sse2(int 
*_dc,455const unsigned char *_src,int _ystride){456OC_ALIGN16(ogg_int16_t buf[16]);457unsigned ret;458int dc;459__asm__ __volatile__(460OC_LOAD_8x8461OC_HADAMARD_8x8462OC_TRANSPOSE_8x8463/*We split out the stages here so we can save the DC coefficient in the464middle.*/465OC_HADAMARD_AB_8x8466OC_HADAMARD_C_ABS_ACCUM_A_8x8467"movd %%xmm1,%[dc]\n\t"468OC_HADAMARD_C_ABS_ACCUM_B_8x8469/*Up to this point, everything fit in 16 bits (8 input + 1 for the470difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1471for the factor of two we dropped + 3 for the vertical accumulation).472Now we finally have to promote things to dwords.*/473"pmaddwd %%xmm7,%%xmm0\n\t"474/*We assume that the DC coefficient is always positive (which is true,475because the input to the INTRA transform was not a difference).*/476"movzx %w[dc],%[dc]\n\t"477"movdqa %%xmm0,%%xmm1\n\t"478"punpckhqdq %%xmm0,%%xmm0\n\t"479"paddd %%xmm1,%%xmm0\n\t"480"pshuflw $0xE,%%xmm0,%%xmm1\n\t"481"paddd %%xmm1,%%xmm0\n\t"482"movd %%xmm0,%[ret]\n\t"483"lea -64(%[ret],%[ret]),%[ret]\n\t"484"sub %[dc],%[ret]\n\t"485/*Although it looks like we're using 7 registers here, gcc can alias %[ret]486and %[dc] with some of the inputs, since for once we don't write to487them until after we're done using everything but %[buf].*/488:[ret]"=a"(ret),[dc]"=r"(dc),489[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))490:[src]"r"(_src),[src4]"r"(_src+4*_ystride),491[ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)492/*We have to use sub, so we actually clobber the condition codes for once.*/493:"cc"494);495*_dc=dc;496return ret;497}498499#endif500501502