Path: thirdparty/libtheora/x86_vc/mmxencfrag.c
/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.  *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS    *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.      *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation https://www.xiph.org/                 *
 *                                                                  *
 ********************************************************************

  function:

 ********************************************************************/
#include <stddef.h>
#include "x86enc.h"

#if defined(OC_X86_ASM)

unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  ptrdiff_t ret;
  __asm{
#define SRC esi
#define REF edx
#define YSTRIDE ecx
#define YSTRIDE3 edi
    mov YSTRIDE,_ystride
    mov SRC,_src
    mov REF,_ref
    /*Load the first 4 rows of each block.*/
    movq mm0,[SRC]
    movq mm1,[REF]
    movq mm2,[SRC][YSTRIDE]
    movq mm3,[REF][YSTRIDE]
    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
    movq mm4,[SRC+YSTRIDE*2]
    movq mm5,[REF+YSTRIDE*2]
    movq mm6,[SRC+YSTRIDE3]
    movq mm7,[REF+YSTRIDE3]
    /*Compute their SADs and add them in mm0*/
    psadbw mm0,mm1
    psadbw mm2,mm3
    lea SRC,[SRC+YSTRIDE*4]
    paddw mm0,mm2
    lea REF,[REF+YSTRIDE*4]
    /*Load the next 3 rows as registers become available.*/
    movq mm2,[SRC]
    movq mm3,[REF]
    psadbw mm4,mm5
    psadbw mm6,mm7
    paddw mm0,mm4
    movq mm5,[REF+YSTRIDE]
    movq mm4,[SRC+YSTRIDE]
    paddw mm0,mm6
    movq mm7,[REF+YSTRIDE*2]
    movq mm6,[SRC+YSTRIDE*2]
    /*Start adding their SADs to mm0*/
    psadbw mm2,mm3
    psadbw mm4,mm5
    paddw mm0,mm2
    psadbw mm6,mm7
    /*Load last row as registers become available.*/
    movq mm2,[SRC+YSTRIDE3]
    movq mm3,[REF+YSTRIDE3]
    /*And finish adding up their SADs.*/
    paddw mm0,mm4
    psadbw mm2,mm3
    paddw mm0,mm6
    paddw mm0,mm2
    movd [ret],mm0
#undef SRC
#undef REF
#undef YSTRIDE
#undef YSTRIDE3
  }
  return (unsigned)ret;
}

unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
 const unsigned char *_ref,int _ystride,unsigned _thresh){
  /*Early termination is for suckers.*/
  return oc_enc_frag_sad_mmxext(_src,_ref,_ystride);
}
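
/*An illustrative plain-C sketch of what oc_enc_frag_sad_mmxext() above
   computes: the sum of absolute differences over an 8x8 block with a common
   stride.
  The helper name is hypothetical and the block is kept under #if 0 so it is
   never compiled.*/
#if 0
static unsigned oc_enc_frag_sad_c_sketch(const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  unsigned sad;
  int      i;
  int      j;
  sad=0;
  for(i=0;i<8;i++){
    /*psadbw handles one whole 8-byte row of this inner loop at a time.*/
    for(j=0;j<8;j++){
      int d;
      d=_src[j]-_ref[j];
      sad+=d<0?-d:d;
    }
    _src+=_ystride;
    _ref+=_ystride;
  }
  return sad;
}
#endif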

#define OC_SAD2_LOOP __asm{ \
  /*We want to compute (mm0+mm1>>1) on unsigned bytes without overflow, but \
     pavgb computes (mm0+mm1+1>>1). \
    The latter is exactly 1 too large when the low bit of two corresponding \
     bytes is only set in one of them. \
    Therefore we pxor the operands, pand to mask out the low bits, and psubb to \
     correct the output of pavgb.*/ \
  __asm movq mm6,mm0 \
  __asm lea REF1,[REF1+YSTRIDE*2] \
  __asm pxor mm0,mm1 \
  __asm pavgb mm6,mm1 \
  __asm lea REF2,[REF2+YSTRIDE*2] \
  __asm movq mm1,mm2 \
  __asm pand mm0,mm7 \
  __asm pavgb mm2,mm3 \
  __asm pxor mm1,mm3 \
  __asm movq mm3,[REF2+YSTRIDE] \
  __asm psubb mm6,mm0 \
  __asm movq mm0,[REF1] \
  __asm pand mm1,mm7 \
  __asm psadbw mm4,mm6 \
  __asm movd mm6,RET \
  __asm psubb mm2,mm1 \
  __asm movq mm1,[REF2] \
  __asm lea SRC,[SRC+YSTRIDE*2] \
  __asm psadbw mm5,mm2 \
  __asm movq mm2,[REF1+YSTRIDE] \
  __asm paddw mm5,mm4 \
  __asm movq mm4,[SRC] \
  __asm paddw mm6,mm5 \
  __asm movq mm5,[SRC+YSTRIDE] \
  __asm movd RET,mm6 \
}

/*Same as above, but does not pre-load the next two rows.*/
#define OC_SAD2_TAIL __asm{ \
  __asm movq mm6,mm0 \
  __asm pavgb mm0,mm1 \
  __asm pxor mm6,mm1 \
  __asm movq mm1,mm2 \
  __asm pand mm6,mm7 \
  __asm pavgb mm2,mm3 \
  __asm pxor mm1,mm3 \
  __asm psubb mm0,mm6 \
  __asm pand mm1,mm7 \
  __asm psadbw mm4,mm0 \
  __asm psubb mm2,mm1 \
  __asm movd mm6,RET \
  __asm psadbw mm5,mm2 \
  __asm paddw mm5,mm4 \
  __asm paddw mm6,mm5 \
  __asm movd RET,mm6 \
}

unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
 const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
 unsigned _thresh){
  ptrdiff_t ret;
  __asm{
#define REF1 ecx
#define REF2 edi
#define YSTRIDE esi
#define SRC edx
#define RET eax
    mov YSTRIDE,_ystride
    mov SRC,_src
    mov REF1,_ref1
    mov REF2,_ref2
    movq mm0,[REF1]
    movq mm1,[REF2]
    movq mm2,[REF1+YSTRIDE]
    movq mm3,[REF2+YSTRIDE]
    xor RET,RET
    movq mm4,[SRC]
    pxor mm7,mm7
    pcmpeqb mm6,mm6
    movq mm5,[SRC+YSTRIDE]
    psubb mm7,mm6
    OC_SAD2_LOOP
    OC_SAD2_LOOP
    OC_SAD2_LOOP
    OC_SAD2_TAIL
    mov [ret],RET
#undef REF1
#undef REF2
#undef YSTRIDE
#undef SRC
#undef RET
  }
  return (unsigned)ret;
}
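
/*An illustrative sketch of the identity behind the correction used by
   OC_SAD2_LOOP and OC_SAD2_TAIL above, written for a single pair of bytes:
   (a+b>>1)==(a+b+1>>1)-((a^b)&1), so the rounding-up pavgb result can be
   fixed up with pxor/pand/psubb without ever forming a+b in 8 bits.
  The helper name is hypothetical and the block is kept under #if 0 so it is
   never compiled.*/
#if 0
static unsigned char oc_avg_down_sketch(unsigned char _a,unsigned char _b){
  unsigned char avg_up;
  /*What pavgb computes (the intermediate sum is widened here, which the MMX
     code cannot afford to do).*/
  avg_up=(unsigned char)(_a+_b+1>>1);
  /*Subtract 1 exactly when the low bit is set in only one of the inputs.*/
  return (unsigned char)(avg_up-((_a^_b)&1));
}
#endif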

/*Load an 8x4 array of pixel values from %[src] and %[ref] and compute their
   16-bit difference in mm0...mm7.*/
#define OC_LOAD_SUB_8x4(_off) __asm{ \
  __asm movd mm0,[_off+SRC] \
  __asm movd mm4,[_off+REF] \
  __asm movd mm1,[_off+SRC+SRC_YSTRIDE] \
  __asm lea SRC,[SRC+SRC_YSTRIDE*2] \
  __asm movd mm5,[_off+REF+REF_YSTRIDE] \
  __asm lea REF,[REF+REF_YSTRIDE*2] \
  __asm movd mm2,[_off+SRC] \
  __asm movd mm7,[_off+REF] \
  __asm movd mm3,[_off+SRC+SRC_YSTRIDE] \
  __asm movd mm6,[_off+REF+REF_YSTRIDE] \
  __asm punpcklbw mm0,mm4 \
  __asm lea SRC,[SRC+SRC_YSTRIDE*2] \
  __asm punpcklbw mm4,mm4 \
  __asm lea REF,[REF+REF_YSTRIDE*2] \
  __asm psubw mm0,mm4 \
  __asm movd mm4,[_off+SRC] \
  __asm movq [_off*2+BUF],mm0 \
  __asm movd mm0,[_off+REF] \
  __asm punpcklbw mm1,mm5 \
  __asm punpcklbw mm5,mm5 \
  __asm psubw mm1,mm5 \
  __asm movd mm5,[_off+SRC+SRC_YSTRIDE] \
  __asm punpcklbw mm2,mm7 \
  __asm punpcklbw mm7,mm7 \
  __asm psubw mm2,mm7 \
  __asm movd mm7,[_off+REF+REF_YSTRIDE] \
  __asm punpcklbw mm3,mm6 \
  __asm lea SRC,[SRC+SRC_YSTRIDE*2] \
  __asm punpcklbw mm6,mm6 \
  __asm psubw mm3,mm6 \
  __asm movd mm6,[_off+SRC] \
  __asm punpcklbw mm4,mm0 \
  __asm lea REF,[REF+REF_YSTRIDE*2] \
  __asm punpcklbw mm0,mm0 \
  __asm lea SRC,[SRC+SRC_YSTRIDE*2] \
  __asm psubw mm4,mm0 \
  __asm movd mm0,[_off+REF] \
  __asm punpcklbw mm5,mm7 \
  __asm neg SRC_YSTRIDE \
  __asm punpcklbw mm7,mm7 \
  __asm psubw mm5,mm7 \
  __asm movd mm7,[_off+SRC+SRC_YSTRIDE] \
  __asm punpcklbw mm6,mm0 \
  __asm lea REF,[REF+REF_YSTRIDE*2] \
  __asm punpcklbw mm0,mm0 \
  __asm neg REF_YSTRIDE \
  __asm psubw mm6,mm0 \
  __asm movd mm0,[_off+REF+REF_YSTRIDE] \
  __asm lea SRC,[SRC+SRC_YSTRIDE*8] \
  __asm punpcklbw mm7,mm0 \
  __asm neg SRC_YSTRIDE \
  __asm punpcklbw mm0,mm0 \
  __asm lea REF,[REF+REF_YSTRIDE*8] \
  __asm psubw mm7,mm0 \
  __asm neg REF_YSTRIDE \
  __asm movq mm0,[_off*2+BUF] \
}

/*Load an 8x4 array of pixel values from %[src] into %%mm0...%%mm7.*/
#define OC_LOAD_8x4(_off) __asm{ \
  __asm movd mm0,[_off+SRC] \
  __asm movd mm1,[_off+SRC+YSTRIDE] \
  __asm movd mm2,[_off+SRC+YSTRIDE*2] \
  __asm pxor mm7,mm7 \
  __asm movd mm3,[_off+SRC+YSTRIDE3] \
  __asm punpcklbw mm0,mm7 \
  __asm movd mm4,[_off+SRC4] \
  __asm punpcklbw mm1,mm7 \
  __asm movd mm5,[_off+SRC4+YSTRIDE] \
  __asm punpcklbw mm2,mm7 \
  __asm movd mm6,[_off+SRC4+YSTRIDE*2] \
  __asm punpcklbw mm3,mm7 \
  __asm movd mm7,[_off+SRC4+YSTRIDE3] \
  __asm punpcklbw mm4,mm4 \
  __asm punpcklbw mm5,mm5 \
  __asm psrlw mm4,8 \
  __asm psrlw mm5,8 \
  __asm punpcklbw mm6,mm6 \
  __asm punpcklbw mm7,mm7 \
  __asm psrlw mm6,8 \
  __asm psrlw mm7,8 \
}

/*Performs the first two stages of an 8-point 1-D Hadamard transform.
  The transform is performed in place, except that outputs 0-3 are swapped with
   outputs 4-7.
  Outputs 2, 3, 6, and 7 from the second stage are negated (which allows us to
   perform this stage in place with no temporary registers).*/
#define OC_HADAMARD_AB_8x4 __asm{ \
  /*Stage A: \
     Outputs 0-3 are swapped with 4-7 here.*/ \
  __asm paddw mm5,mm1 \
  __asm paddw mm6,mm2 \
  __asm paddw mm1,mm1 \
  __asm paddw mm2,mm2 \
  __asm psubw mm1,mm5 \
  __asm psubw mm2,mm6 \
  __asm paddw mm7,mm3 \
  __asm paddw mm4,mm0 \
  __asm paddw mm3,mm3 \
  __asm paddw mm0,mm0 \
  __asm psubw mm3,mm7 \
  __asm psubw mm0,mm4 \
  /*Stage B:*/ \
  __asm paddw mm0,mm2 \
  __asm paddw mm1,mm3 \
  __asm paddw mm4,mm6 \
  __asm paddw mm5,mm7 \
  __asm paddw mm2,mm2 \
  __asm paddw mm3,mm3 \
  __asm paddw mm6,mm6 \
  __asm paddw mm7,mm7 \
  __asm psubw mm2,mm0 \
  __asm psubw mm3,mm1 \
  __asm psubw mm6,mm4 \
  __asm psubw mm7,mm5 \
}

/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
  Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
   place with no temporary registers).*/
#define OC_HADAMARD_C_8x4 __asm{ \
  /*Stage C:*/ \
  __asm paddw mm0,mm1 \
  __asm paddw mm2,mm3 \
  __asm paddw mm4,mm5 \
  __asm paddw mm6,mm7 \
  __asm paddw mm1,mm1 \
  __asm paddw mm3,mm3 \
  __asm paddw mm5,mm5 \
  __asm paddw mm7,mm7 \
  __asm psubw mm1,mm0 \
  __asm psubw mm3,mm2 \
  __asm psubw mm5,mm4 \
  __asm psubw mm7,mm6 \
}

/*Performs an 8-point 1-D Hadamard transform.
  The transform is performed in place, except that outputs 0-3 are swapped with
   outputs 4-7.
  Outputs 1, 2, 5 and 6 are negated (which allows us to perform the transform
   in place with no temporary registers).*/
#define OC_HADAMARD_8x4 __asm{ \
  OC_HADAMARD_AB_8x4 \
  OC_HADAMARD_C_8x4 \
}
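
/*An illustrative sketch of the two-register, in-place butterfly used by the
   Hadamard macros above: after these three operations b holds a0+b0 and a
   holds a0-b0, so no temporaries are needed, at the cost of some outputs
   coming back negated and/or swapped (as noted in the macro comments).
  The helper name is hypothetical and the block is kept under #if 0 so it is
   never compiled.*/
#if 0
static void oc_butterfly_sketch(int *_a,int *_b){
  *_b+=*_a;
  *_a+=*_a;
  *_a-=*_b;
}
#endif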

/*Performs the first part of the final stage of the Hadamard transform and
   summing of absolute values.
  At the end of this part, mm1 will contain the DC coefficient of the
   transform.*/
#define OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) __asm{ \
  /*We use the fact that \
     (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
    to merge the final butterfly with the abs and the first stage of \
     accumulation. \
    Thus we can avoid using pabsw, which is not available until SSSE3. \
    Emulating pabsw takes 3 instructions, so the straightforward MMXEXT \
     implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
     registers). \
    Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
    This implementation is only 26 (+4 for spilling registers).*/ \
  __asm movq [_r7+BUF],mm7 \
  __asm movq [_r6+BUF],mm6 \
  /*mm7={0x7FFF}x4 \
    mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \
  __asm pcmpeqb mm7,mm7 \
  __asm movq mm6,mm0 \
  __asm psrlw mm7,1 \
  __asm paddw mm6,mm1 \
  __asm pmaxsw mm0,mm1 \
  __asm paddsw mm6,mm7 \
  __asm psubw mm0,mm6 \
  /*mm2=max(abs(mm2),abs(mm3))-0x7FFF \
    mm4=max(abs(mm4),abs(mm5))-0x7FFF*/ \
  __asm movq mm6,mm2 \
  __asm movq mm1,mm4 \
  __asm pmaxsw mm2,mm3 \
  __asm pmaxsw mm4,mm5 \
  __asm paddw mm6,mm3 \
  __asm paddw mm1,mm5 \
  __asm movq mm3,[_r7+BUF] \
}

/*Performs the second part of the final stage of the Hadamard transform and
   summing of absolute values.*/
#define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) __asm{ \
  __asm paddsw mm6,mm7 \
  __asm movq mm5,[_r6+BUF] \
  __asm paddsw mm1,mm7 \
  __asm psubw mm2,mm6 \
  __asm psubw mm4,mm1 \
  /*mm7={1}x4 (needed for the horizontal add that follows) \
    mm0+=mm2+mm4+max(abs(mm3),abs(mm5))-0x7FFF*/ \
  __asm movq mm6,mm3 \
  __asm pmaxsw mm3,mm5 \
  __asm paddw mm0,mm2 \
  __asm paddw mm6,mm5 \
  __asm paddw mm0,mm4 \
  __asm paddsw mm6,mm7 \
  __asm paddw mm0,mm3 \
  __asm psrlw mm7,14 \
  __asm psubw mm0,mm6 \
}

/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
   absolute value of each component, and accumulates everything into mm0.
  This is the only portion of SATD which requires MMXEXT (we could use plain
   MMX, but it takes 4 instructions and an extra register to work around the
   lack of a pmaxsw, which is a pretty serious penalty).*/
#define OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) __asm{ \
  OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \
  OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \
}

/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
   component, and accumulates everything into mm0.
  Note that mm0 will have an extra 4 added to each column, and that after
   removing this value, the remainder will be half the conventional value.*/
#define OC_HADAMARD_ABS_ACCUM_8x4(_r6,_r7) __asm{ \
  OC_HADAMARD_AB_8x4 \
  OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) \
}
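
/*An illustrative sketch of the identity quoted in
   OC_HADAMARD_C_ABS_ACCUM_A_8x4 above:
   (abs(a+b)+abs(a-b))/2==max(abs(a),abs(b)).
  Because the final butterfly already produces a+b and a-b, the absolute
   values of a whole pair can be folded into a single per-lane max (pmaxsw)
   instead of emulating pabsw twice.
  The helper name is hypothetical and the block is kept under #if 0 so it is
   never compiled.*/
#if 0
static int oc_max_abs_sketch(int _a,int _b){
  int sum;
  int diff;
  sum=_a+_b;
  sum=sum<0?-sum:sum;
  diff=_a-_b;
  diff=diff<0?-diff:diff;
  /*Equals the larger of abs(_a) and abs(_b).*/
  return sum+diff>>1;
}
#endif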

/*Performs two 4x4 transposes (mostly) in place.
  On input, {mm0,mm1,mm2,mm3} contains rows {e,f,g,h}, and {mm4,mm5,mm6,mm7}
   contains rows {a,b,c,d}.
  On output, {0x40,0x50,0x60,0x70}+_off+BUF contains {e,f,g,h}^T, and
   {mm4,mm5,mm6,mm7} contains the transposed rows {a,b,c,d}^T.*/
#define OC_TRANSPOSE_4x4x2(_off) __asm{ \
  /*First 4x4 transpose:*/ \
  __asm movq [0x10+_off+BUF],mm5 \
  /*mm0 = e3 e2 e1 e0 \
    mm1 = f3 f2 f1 f0 \
    mm2 = g3 g2 g1 g0 \
    mm3 = h3 h2 h1 h0*/ \
  __asm movq mm5,mm2 \
  __asm punpcklwd mm2,mm3 \
  __asm punpckhwd mm5,mm3 \
  __asm movq mm3,mm0 \
  __asm punpcklwd mm0,mm1 \
  __asm punpckhwd mm3,mm1 \
  /*mm0 = f1 e1 f0 e0 \
    mm3 = f3 e3 f2 e2 \
    mm2 = h1 g1 h0 g0 \
    mm5 = h3 g3 h2 g2*/ \
  __asm movq mm1,mm0 \
  __asm punpckldq mm0,mm2 \
  __asm punpckhdq mm1,mm2 \
  __asm movq mm2,mm3 \
  __asm punpckhdq mm3,mm5 \
  __asm movq [0x40+_off+BUF],mm0 \
  __asm punpckldq mm2,mm5 \
  /*mm0 = h0 g0 f0 e0 \
    mm1 = h1 g1 f1 e1 \
    mm2 = h2 g2 f2 e2 \
    mm3 = h3 g3 f3 e3*/ \
  __asm movq mm5,[0x10+_off+BUF] \
  /*Second 4x4 transpose:*/ \
  /*mm4 = a3 a2 a1 a0 \
    mm5 = b3 b2 b1 b0 \
    mm6 = c3 c2 c1 c0 \
    mm7 = d3 d2 d1 d0*/ \
  __asm movq mm0,mm6 \
  __asm punpcklwd mm6,mm7 \
  __asm movq [0x50+_off+BUF],mm1 \
  __asm punpckhwd mm0,mm7 \
  __asm movq mm7,mm4 \
  __asm punpcklwd mm4,mm5 \
  __asm movq [0x60+_off+BUF],mm2 \
  __asm punpckhwd mm7,mm5 \
  /*mm4 = b1 a1 b0 a0 \
    mm7 = b3 a3 b2 a2 \
    mm6 = d1 c1 d0 c0 \
    mm0 = d3 c3 d2 c2*/ \
  __asm movq mm5,mm4 \
  __asm punpckldq mm4,mm6 \
  __asm movq [0x70+_off+BUF],mm3 \
  __asm punpckhdq mm5,mm6 \
  __asm movq mm6,mm7 \
  __asm punpckhdq mm7,mm0 \
  __asm punpckldq mm6,mm0 \
  /*mm4 = d0 c0 b0 a0 \
    mm5 = d1 c1 b1 a1 \
    mm6 = d2 c2 b2 a2 \
    mm7 = d3 c3 b3 a3*/ \
}

static unsigned oc_int_frag_satd_mmxext(int *_dc,
 const unsigned char *_src,int _src_ystride,
 const unsigned char *_ref,int _ref_ystride){
  OC_ALIGN8(ogg_int16_t buf[64]);
  ogg_int16_t *bufp;
  unsigned ret;
  unsigned ret2;
  int dc;
  bufp=buf;
  __asm{
#define SRC esi
#define REF eax
#define SRC_YSTRIDE ecx
#define REF_YSTRIDE edx
#define BUF edi
#define RET edx
#define RET2 ecx
#define DC eax
#define DC_WORD ax
    mov SRC,_src
    mov SRC_YSTRIDE,_src_ystride
    mov REF,_ref
    mov REF_YSTRIDE,_ref_ystride
    mov BUF,bufp
    OC_LOAD_SUB_8x4(0x00)
    OC_HADAMARD_8x4
    OC_TRANSPOSE_4x4x2(0x00)
    /*Finish swapping out this 8x4 block to make room for the next one.
      mm0...mm3 have been swapped out already.*/
    movq [0x00+BUF],mm4
    movq [0x10+BUF],mm5
    movq [0x20+BUF],mm6
    movq [0x30+BUF],mm7
    OC_LOAD_SUB_8x4(0x04)
    OC_HADAMARD_8x4
    OC_TRANSPOSE_4x4x2(0x08)
    /*Here the first 4x4 block of output from the last transpose is the second
       4x4 block of input for the next transform.
      We have cleverly arranged that it already be in the appropriate place, so
       we only have to do half the loads.*/
    movq mm1,[0x10+BUF]
    movq mm2,[0x20+BUF]
    movq mm3,[0x30+BUF]
    movq mm0,[0x00+BUF]
    /*We split out the stages here so we can save the DC coefficient in the
       middle.*/
    OC_HADAMARD_AB_8x4
    OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
    movd DC,mm1
    OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
       for the factor of two we dropped + 3 for the vertical accumulation).
      Now we finally have to promote things to dwords.
      We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
       latency of pmaddwd by starting the next series of loads now.*/
    pmaddwd mm0,mm7
    movq mm1,[0x50+BUF]
    movq mm5,[0x58+BUF]
    movq mm4,mm0
    movq mm2,[0x60+BUF]
    punpckhdq mm0,mm0
    movq mm6,[0x68+BUF]
    paddd mm4,mm0
    movq mm3,[0x70+BUF]
    movd RET2,mm4
    movq mm7,[0x78+BUF]
    movq mm0,[0x40+BUF]
    movq mm4,[0x48+BUF]
    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
    pmaddwd mm0,mm7
    /*Subtract abs(dc) from 2*ret2.*/
    movsx DC,DC_WORD
    cdq
    lea RET2,[RET+RET2*2]
    movq mm4,mm0
    punpckhdq mm0,mm0
    xor RET,DC
    paddd mm4,mm0
    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
       added to them, a factor of two removed, and the DC value included;
       correct the final sum here.*/
    sub RET2,RET
    movd RET,mm4
    lea RET,[RET2+RET*2-64]
    mov ret,RET
    mov dc,DC
#undef SRC
#undef REF
#undef SRC_YSTRIDE
#undef REF_YSTRIDE
#undef BUF
#undef RET
#undef RET2
#undef DC
#undef DC_WORD
  }
  *_dc=dc;
  return ret;
}

unsigned oc_enc_frag_satd_mmxext(int *_dc,const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  return oc_int_frag_satd_mmxext(_dc,_src,_ystride,_ref,_ystride);
}
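
/*An illustrative sketch of the final correction done at the end of
   oc_int_frag_satd_mmxext() above, assuming sum1 and sum2 are the two dword
   totals produced from the two 8x4 halves and dc is the sign-extended DC
   word: each total is half the conventional value and includes an extra 4 per
   word lane (16 per total, hence the -64 after doubling), and abs(dc) is
   removed from the total (the DC coefficient is returned separately through
   *_dc).
  The helper name is hypothetical and the block is kept under #if 0 so it is
   never compiled.*/
#if 0
static unsigned oc_satd_correct_sketch(unsigned _sum1,unsigned _sum2,int _dc){
  return 2*(_sum1+_sum2)-(_dc<0?-_dc:_dc)-64;
}
#endif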

/*Our internal implementation of frag_copy2 takes an extra stride parameter so
   we can share code with oc_enc_frag_satd2_mmxext().*/
static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
 const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
  __asm{
    /*Load the first 3 rows.*/
#define DST_YSTRIDE edi
#define SRC_YSTRIDE esi
#define DST eax
#define SRC1 edx
#define SRC2 ecx
    mov DST_YSTRIDE,_dst_ystride
    mov SRC_YSTRIDE,_src_ystride
    mov DST,_dst
    mov SRC1,_src1
    mov SRC2,_src2
    movq mm0,[SRC1]
    movq mm1,[SRC2]
    movq mm2,[SRC1+SRC_YSTRIDE]
    lea SRC1,[SRC1+SRC_YSTRIDE*2]
    movq mm3,[SRC2+SRC_YSTRIDE]
    lea SRC2,[SRC2+SRC_YSTRIDE*2]
    pxor mm7,mm7
    movq mm4,[SRC1]
    pcmpeqb mm6,mm6
    movq mm5,[SRC2]
    /*mm7={1}x8.*/
    psubb mm7,mm6
    /*Start averaging mm0 and mm1 into mm6.*/
    movq mm6,mm0
    pxor mm0,mm1
    pavgb mm6,mm1
    /*mm1 is free, start averaging mm3 into mm2 using mm1.*/
    movq mm1,mm2
    pand mm0,mm7
    pavgb mm2,mm3
    pxor mm1,mm3
    /*mm3 is free.*/
    psubb mm6,mm0
    /*mm0 is free, start loading the next row.*/
    movq mm0,[SRC1+SRC_YSTRIDE]
    /*Start averaging mm5 and mm4 using mm3.*/
    movq mm3,mm4
    /*mm6 [row 0] is done; write it out.*/
    movq [DST],mm6
    pand mm1,mm7
    pavgb mm4,mm5
    psubb mm2,mm1
    /*mm1 is free, continue loading the next row.*/
    movq mm1,[SRC2+SRC_YSTRIDE]
    pxor mm3,mm5
    lea SRC1,[SRC1+SRC_YSTRIDE*2]
    /*mm2 [row 1] is done; write it out.*/
    movq [DST+DST_YSTRIDE],mm2
    pand mm3,mm7
    /*Start loading the next row.*/
    movq mm2,[SRC1]
    lea DST,[DST+DST_YSTRIDE*2]
    psubb mm4,mm3
    lea SRC2,[SRC2+SRC_YSTRIDE*2]
    /*mm4 [row 2] is done; write it out.*/
    movq [DST],mm4
    /*Continue loading the next row.*/
    movq mm3,[SRC2]
    /*Start averaging mm0 and mm1 into mm6.*/
    movq mm6,mm0
    pxor mm0,mm1
    /*Start loading the next row.*/
    movq mm4,[SRC1+SRC_YSTRIDE]
    pavgb mm6,mm1
    /*mm1 is free; start averaging mm3 into mm2 using mm1.*/
    movq mm1,mm2
    pand mm0,mm7
    /*Continue loading the next row.*/
    movq mm5,[SRC2+SRC_YSTRIDE]
    pavgb mm2,mm3
    lea SRC1,[SRC1+SRC_YSTRIDE*2]
    pxor mm1,mm3
    /*mm3 is free.*/
    psubb mm6,mm0
    /*mm0 is free, start loading the next row.*/
    movq mm0,[SRC1]
    /*Start averaging mm5 into mm4 using mm3.*/
    movq mm3,mm4
    /*mm6 [row 3] is done; write it out.*/
    movq [DST+DST_YSTRIDE],mm6
    pand mm1,mm7
    lea SRC2,[SRC2+SRC_YSTRIDE*2]
    pavgb mm4,mm5
    lea DST,[DST+DST_YSTRIDE*2]
    psubb mm2,mm1
    /*mm1 is free; continue loading the next row.*/
    movq mm1,[SRC2]
    pxor mm3,mm5
    /*mm2 [row 4] is done; write it out.*/
    movq [DST],mm2
    pand mm3,mm7
    /*Start loading the next row.*/
    movq mm2,[SRC1+SRC_YSTRIDE]
    psubb mm4,mm3
    /*Start averaging mm0 and mm1 into mm6.*/
    movq mm6,mm0
    /*Continue loading the next row.*/
    movq mm3,[SRC2+SRC_YSTRIDE]
    /*mm4 [row 5] is done; write it out.*/
    movq [DST+DST_YSTRIDE],mm4
    pxor mm0,mm1
    pavgb mm6,mm1
    /*mm4 is free; start averaging mm3 into mm2 using mm4.*/
    movq mm4,mm2
    pand mm0,mm7
    pavgb mm2,mm3
    pxor mm4,mm3
    lea DST,[DST+DST_YSTRIDE*2]
    psubb mm6,mm0
    pand mm4,mm7
    /*mm6 [row 6] is done, write it out.*/
    movq [DST],mm6
    psubb mm2,mm4
    /*mm2 [row 7] is done, write it out.*/
    movq [DST+DST_YSTRIDE],mm2
#undef SRC1
#undef SRC2
#undef SRC_YSTRIDE
#undef DST_YSTRIDE
#undef DST
  }
}

unsigned oc_enc_frag_satd2_mmxext(int *_dc,const unsigned char *_src,
 const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
  OC_ALIGN8(unsigned char ref[64]);
  oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
  return oc_int_frag_satd_mmxext(_dc,_src,_ystride,ref,8);
}
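
/*An illustrative plain-C sketch of what oc_int_frag_copy2_mmxext() above
   computes: the truncating average of two 8x8 predictors, written to a
   destination with its own stride (oc_enc_frag_satd2_mmxext() uses this to
   average into a small on-stack buffer).
  The helper name is hypothetical and the block is kept under #if 0 so it is
   never compiled.*/
#if 0
static void oc_int_frag_copy2_c_sketch(unsigned char *_dst,int _dst_ystride,
 const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++)_dst[j]=(unsigned char)(_src1[j]+_src2[j]>>1);
    _dst+=_dst_ystride;
    _src1+=_src_ystride;
    _src2+=_src_ystride;
  }
}
#endif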

unsigned oc_enc_frag_intra_satd_mmxext(int *_dc,const unsigned char *_src,
 int _ystride){
  OC_ALIGN8(ogg_int16_t buf[64]);
  ogg_int16_t *bufp;
  unsigned ret1;
  unsigned ret2;
  int dc;
  bufp=buf;
  __asm{
#define SRC eax
#define SRC4 esi
#define BUF edi
#define YSTRIDE edx
#define YSTRIDE3 ecx
#define RET eax
#define RET2 ecx
#define DC edx
#define DC_WORD dx
    mov SRC,_src
    mov BUF,bufp
    mov YSTRIDE,_ystride
    /* src4 = src+4*ystride */
    lea SRC4,[SRC+YSTRIDE*4]
    /* ystride3 = 3*ystride */
    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
    OC_LOAD_8x4(0x00)
    OC_HADAMARD_8x4
    OC_TRANSPOSE_4x4x2(0x00)
    /*Finish swapping out this 8x4 block to make room for the next one.
      mm0...mm3 have been swapped out already.*/
    movq [0x00+BUF],mm4
    movq [0x10+BUF],mm5
    movq [0x20+BUF],mm6
    movq [0x30+BUF],mm7
    OC_LOAD_8x4(0x04)
    OC_HADAMARD_8x4
    OC_TRANSPOSE_4x4x2(0x08)
    /*Here the first 4x4 block of output from the last transpose is the second
       4x4 block of input for the next transform.
      We have cleverly arranged that it already be in the appropriate place, so
       we only have to do half the loads.*/
    movq mm1,[0x10+BUF]
    movq mm2,[0x20+BUF]
    movq mm3,[0x30+BUF]
    movq mm0,[0x00+BUF]
    /*We split out the stages here so we can save the DC coefficient in the
       middle.*/
    OC_HADAMARD_AB_8x4
    OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
    movd DC,mm1
    OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
       for the factor of two we dropped + 3 for the vertical accumulation).
      Now we finally have to promote things to dwords.
      We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
       latency of pmaddwd by starting the next series of loads now.*/
    pmaddwd mm0,mm7
    movq mm1,[0x50+BUF]
    movq mm5,[0x58+BUF]
    movq mm2,[0x60+BUF]
    movq mm4,mm0
    movq mm6,[0x68+BUF]
    punpckhdq mm0,mm0
    movq mm3,[0x70+BUF]
    paddd mm4,mm0
    movq mm7,[0x78+BUF]
    movd RET,mm4
    movq mm0,[0x40+BUF]
    movq mm4,[0x48+BUF]
    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
    pmaddwd mm0,mm7
    /*We assume that the DC coefficient is always positive (which is true,
       because the input to the INTRA transform was not a difference).*/
    movzx DC,DC_WORD
    add RET,RET
    sub RET,DC
    movq mm4,mm0
    punpckhdq mm0,mm0
    paddd mm4,mm0
    movd RET2,mm4
    lea RET,[-64+RET+RET2*2]
    mov [dc],DC
    mov [ret1],RET
#undef SRC
#undef SRC4
#undef BUF
#undef YSTRIDE
#undef YSTRIDE3
#undef RET
#undef RET2
#undef DC
#undef DC_WORD
  }
  *_dc=dc;
  return ret1;
}

void oc_enc_frag_sub_mmx(ogg_int16_t _residue[64],
 const unsigned char *_src, const unsigned char *_ref,int _ystride){
  int i;
  __asm pxor mm7,mm7
  for(i=4;i-->0;){
    __asm{
#define SRC edx
#define YSTRIDE esi
#define RESIDUE eax
#define REF ecx
      mov YSTRIDE,_ystride
      mov RESIDUE,_residue
      mov SRC,_src
      mov REF,_ref
      /*mm0=[src]*/
      movq mm0,[SRC]
      /*mm1=[ref]*/
      movq mm1,[REF]
      /*mm4=[src+ystride]*/
      movq mm4,[SRC+YSTRIDE]
      /*mm5=[ref+ystride]*/
      movq mm5,[REF+YSTRIDE]
      /*Compute [src]-[ref].*/
      movq mm2,mm0
      punpcklbw mm0,mm7
      movq mm3,mm1
      punpckhbw mm2,mm7
      punpcklbw mm1,mm7
      punpckhbw mm3,mm7
      psubw mm0,mm1
      psubw mm2,mm3
      /*Compute [src+ystride]-[ref+ystride].*/
      movq mm1,mm4
      punpcklbw mm4,mm7
      movq mm3,mm5
      punpckhbw mm1,mm7
      lea SRC,[SRC+YSTRIDE*2]
      punpcklbw mm5,mm7
      lea REF,[REF+YSTRIDE*2]
      punpckhbw mm3,mm7
      psubw mm4,mm5
      psubw mm1,mm3
      /*Write the answer out.*/
      movq [RESIDUE+0x00],mm0
      movq [RESIDUE+0x08],mm2
      movq [RESIDUE+0x10],mm4
      movq [RESIDUE+0x18],mm1
      lea RESIDUE,[RESIDUE+0x20]
      mov _residue,RESIDUE
      mov _src,SRC
      mov _ref,REF
#undef SRC
#undef YSTRIDE
#undef RESIDUE
#undef REF
    }
  }
}
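
/*An illustrative plain-C sketch of what oc_enc_frag_sub_mmx() above computes:
   the 16-bit residual between an 8x8 source block and its reference.
  The MMX version handles two rows per loop iteration and keeps mm7 zeroed
   across iterations for the byte-to-word unpacking.
  The helper name is hypothetical and the block is kept under #if 0 so it is
   never compiled.*/
#if 0
static void oc_enc_frag_sub_c_sketch(ogg_int16_t _residue[64],
 const unsigned char *_src,const unsigned char *_ref,int _ystride){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++)_residue[i*8+j]=(ogg_int16_t)(_src[j]-_ref[j]);
    _src+=_ystride;
    _ref+=_ystride;
  }
}
#endif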

void oc_enc_frag_sub_128_mmx(ogg_int16_t _residue[64],
 const unsigned char *_src,int _ystride){
  __asm{
#define YSTRIDE edx
#define YSTRIDE3 edi
#define RESIDUE ecx
#define SRC eax
    mov YSTRIDE,_ystride
    mov RESIDUE,_residue
    mov SRC,_src
    /*mm0=[src]*/
    movq mm0,[SRC]
    /*mm1=[src+ystride]*/
    movq mm1,[SRC+YSTRIDE]
    /*mm6={-1}x4*/
    pcmpeqw mm6,mm6
    /*mm2=[src+2*ystride]*/
    movq mm2,[SRC+YSTRIDE*2]
    /*[ystride3]=3*[ystride]*/
    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
    /*mm6={1}x4*/
    psllw mm6,15
    /*mm3=[src+3*ystride]*/
    movq mm3,[SRC+YSTRIDE3]
    /*mm6={128}x4*/
    psrlw mm6,8
    /*mm7=0*/
    pxor mm7,mm7
    /*[src]=[src]+4*[ystride]*/
    lea SRC,[SRC+YSTRIDE*4]
    /*Compute [src]-128 and [src+ystride]-128*/
    movq mm4,mm0
    punpcklbw mm0,mm7
    movq mm5,mm1
    punpckhbw mm4,mm7
    psubw mm0,mm6
    punpcklbw mm1,mm7
    psubw mm4,mm6
    punpckhbw mm5,mm7
    psubw mm1,mm6
    psubw mm5,mm6
    /*Write the answer out.*/
    movq [RESIDUE+0x00],mm0
    movq [RESIDUE+0x08],mm4
    movq [RESIDUE+0x10],mm1
    movq [RESIDUE+0x18],mm5
    /*mm0=[src+4*ystride]*/
    movq mm0,[SRC]
    /*mm1=[src+5*ystride]*/
    movq mm1,[SRC+YSTRIDE]
    /*Compute [src+2*ystride]-128 and [src+3*ystride]-128*/
    movq mm4,mm2
    punpcklbw mm2,mm7
    movq mm5,mm3
    punpckhbw mm4,mm7
    psubw mm2,mm6
    punpcklbw mm3,mm7
    psubw mm4,mm6
    punpckhbw mm5,mm7
    psubw mm3,mm6
    psubw mm5,mm6
    /*Write the answer out.*/
    movq [RESIDUE+0x20],mm2
    movq [RESIDUE+0x28],mm4
    movq [RESIDUE+0x30],mm3
    movq [RESIDUE+0x38],mm5
    /*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/
    movq mm2,[SRC+YSTRIDE*2]
    movq mm3,[SRC+YSTRIDE3]
    movq mm4,mm0
    punpcklbw mm0,mm7
    movq mm5,mm1
    punpckhbw mm4,mm7
    psubw mm0,mm6
    punpcklbw mm1,mm7
    psubw mm4,mm6
    punpckhbw mm5,mm7
    psubw mm1,mm6
    psubw mm5,mm6
    /*Write the answer out.*/
    movq [RESIDUE+0x40],mm0
    movq [RESIDUE+0x48],mm4
    movq [RESIDUE+0x50],mm1
    movq [RESIDUE+0x58],mm5
    /*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/
    movq mm4,mm2
    punpcklbw mm2,mm7
    movq mm5,mm3
    punpckhbw mm4,mm7
    psubw mm2,mm6
    punpcklbw mm3,mm7
    psubw mm4,mm6
    punpckhbw mm5,mm7
    psubw mm3,mm6
    psubw mm5,mm6
    /*Write the answer out.*/
    movq [RESIDUE+0x60],mm2
    movq [RESIDUE+0x68],mm4
    movq [RESIDUE+0x70],mm3
    movq [RESIDUE+0x78],mm5
#undef YSTRIDE
#undef YSTRIDE3
#undef RESIDUE
#undef SRC
  }
}

void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
 const unsigned char *_src1,const unsigned char *_src2,int _ystride){
  oc_int_frag_copy2_mmxext(_dst,_ystride,_src1,_src2,_ystride);
}

#endif