/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation https://www.xiph.org/                 *
 *                                                                  *
 ********************************************************************

  function:

 ********************************************************************/
#include <stddef.h>
#include "x86enc.h"

#if defined(OC_X86_ASM)

unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  ptrdiff_t ret;
  __asm{
#define SRC esi
#define REF edx
#define YSTRIDE ecx
#define YSTRIDE3 edi
    mov YSTRIDE,_ystride
    mov SRC,_src
    mov REF,_ref
    /*Load the first 4 rows of each block.*/
    movq mm0,[SRC]
    movq mm1,[REF]
    movq mm2,[SRC][YSTRIDE]
    movq mm3,[REF][YSTRIDE]
    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
    movq mm4,[SRC+YSTRIDE*2]
    movq mm5,[REF+YSTRIDE*2]
    movq mm6,[SRC+YSTRIDE3]
    movq mm7,[REF+YSTRIDE3]
    /*Compute their SADs and add them in mm0*/
    psadbw mm0,mm1
    psadbw mm2,mm3
    lea SRC,[SRC+YSTRIDE*4]
    paddw mm0,mm2
    lea REF,[REF+YSTRIDE*4]
    /*Load the next 3 rows as registers become available.*/
    movq mm2,[SRC]
    movq mm3,[REF]
    psadbw mm4,mm5
    psadbw mm6,mm7
    paddw mm0,mm4
    movq mm5,[REF+YSTRIDE]
    movq mm4,[SRC+YSTRIDE]
    paddw mm0,mm6
    movq mm7,[REF+YSTRIDE*2]
    movq mm6,[SRC+YSTRIDE*2]
    /*Start adding their SADs to mm0*/
    psadbw mm2,mm3
    psadbw mm4,mm5
    paddw mm0,mm2
    psadbw mm6,mm7
    /*Load last row as registers become available.*/
    movq mm2,[SRC+YSTRIDE3]
    movq mm3,[REF+YSTRIDE3]
    /*And finish adding up their SADs.*/
    paddw mm0,mm4
    psadbw mm2,mm3
    paddw mm0,mm6
    paddw mm0,mm2
    movd [ret],mm0
#undef SRC
#undef REF
#undef YSTRIDE
#undef YSTRIDE3
  }
  return (unsigned)ret;
}
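
/*Reference only: a plain-C sketch of the 8x8 SAD computed above (psadbw sums
   the absolute byte differences of each 8-byte row; the paddw's accumulate
   the per-row sums into mm0).  The function name below is purely
   illustrative and the block is not compiled.*/
#if 0
static unsigned oc_enc_frag_sad_ref(const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  unsigned sad;
  int      i;
  int      j;
  sad=0;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      int d;
      d=_src[j]-_ref[j];
      sad+=d<0?-d:d;
    }
    _src+=_ystride;
    _ref+=_ystride;
  }
  return sad;
}
#endif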

unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
 const unsigned char *_ref,int _ystride,unsigned _thresh){
  /*Early termination is for suckers.*/
  return oc_enc_frag_sad_mmxext(_src,_ref,_ystride);
}

#define OC_SAD2_LOOP __asm{ \
  /*We want to compute (mm0+mm1>>1) on unsigned bytes without overflow, but \
    pavgb computes (mm0+mm1+1>>1). \
    The latter is exactly 1 too large when the low bit of two corresponding \
    bytes is only set in one of them. \
    Therefore we pxor the operands, pand to mask out the low bits, and psubb to \
    correct the output of pavgb.*/ \
  __asm movq mm6,mm0 \
  __asm lea REF1,[REF1+YSTRIDE*2] \
  __asm pxor mm0,mm1 \
  __asm pavgb mm6,mm1 \
  __asm lea REF2,[REF2+YSTRIDE*2] \
  __asm movq mm1,mm2 \
  __asm pand mm0,mm7 \
  __asm pavgb mm2,mm3 \
  __asm pxor mm1,mm3 \
  __asm movq mm3,[REF2+YSTRIDE] \
  __asm psubb mm6,mm0 \
  __asm movq mm0,[REF1] \
  __asm pand mm1,mm7 \
  __asm psadbw mm4,mm6 \
  __asm movd mm6,RET \
  __asm psubb mm2,mm1 \
  __asm movq mm1,[REF2] \
  __asm lea SRC,[SRC+YSTRIDE*2] \
  __asm psadbw mm5,mm2 \
  __asm movq mm2,[REF1+YSTRIDE] \
  __asm paddw mm5,mm4 \
  __asm movq mm4,[SRC] \
  __asm paddw mm6,mm5 \
  __asm movq mm5,[SRC+YSTRIDE] \
  __asm movd RET,mm6 \
}
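
/*A scalar sketch of the rounding fix used in OC_SAD2_LOOP above, for
   reference only: pavgb computes (a+b+1)>>1, while the average wanted here is
   (a+b)>>1.  The two differ by exactly (a^b)&1, so subtracting that bit gives
   the truncating average without ever needing more than 8 bits.
   E.g., a=1, b=2: pavgb gives (1+2+1)>>1=2, (1^2)&1=1, and 2-1=1=(1+2)>>1.
   The helper name below is illustrative only and the block is not compiled.*/
#if 0
static unsigned char oc_avg_trunc(unsigned char _a,unsigned char _b){
  unsigned char avg_round;
  /*Rounding average, the value pavgb would produce.*/
  avg_round=(unsigned char)((_a|_b)-((_a^_b)>>1));
  /*Subtract the low bit of a^b to turn it into the truncating average.*/
  return (unsigned char)(avg_round-((_a^_b)&1));
}
#endif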

/*Same as OC_SAD2_LOOP above, but does not pre-load the next two rows.*/
#define OC_SAD2_TAIL __asm{ \
  __asm movq mm6,mm0 \
  __asm pavgb mm0,mm1 \
  __asm pxor mm6,mm1 \
  __asm movq mm1,mm2 \
  __asm pand mm6,mm7 \
  __asm pavgb mm2,mm3 \
  __asm pxor mm1,mm3 \
  __asm psubb mm0,mm6 \
  __asm pand mm1,mm7 \
  __asm psadbw mm4,mm0 \
  __asm psubb mm2,mm1 \
  __asm movd mm6,RET \
  __asm psadbw mm5,mm2 \
  __asm paddw mm5,mm4 \
  __asm paddw mm6,mm5 \
  __asm movd RET,mm6 \
}

unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
 const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
 unsigned _thresh){
  ptrdiff_t ret;
  __asm{
#define REF1 ecx
#define REF2 edi
#define YSTRIDE esi
#define SRC edx
#define RET eax
    mov YSTRIDE,_ystride
    mov SRC,_src
    mov REF1,_ref1
    mov REF2,_ref2
    movq mm0,[REF1]
    movq mm1,[REF2]
    movq mm2,[REF1+YSTRIDE]
    movq mm3,[REF2+YSTRIDE]
    xor RET,RET
    movq mm4,[SRC]
    pxor mm7,mm7
    pcmpeqb mm6,mm6
    movq mm5,[SRC+YSTRIDE]
    psubb mm7,mm6
    OC_SAD2_LOOP
    OC_SAD2_LOOP
    OC_SAD2_LOOP
    OC_SAD2_TAIL
    mov [ret],RET
#undef REF1
#undef REF2
#undef YSTRIDE
#undef SRC
#undef RET
  }
  return (unsigned)ret;
}

/*Load an 8x4 array of pixel values from SRC and REF and compute their
   16-bit difference in mm0...mm7.*/
#define OC_LOAD_SUB_8x4(_off) __asm{ \
  __asm movd mm0,[_off+SRC] \
  __asm movd mm4,[_off+REF] \
  __asm movd mm1,[_off+SRC+SRC_YSTRIDE] \
  __asm lea SRC,[SRC+SRC_YSTRIDE*2] \
  __asm movd mm5,[_off+REF+REF_YSTRIDE] \
  __asm lea REF,[REF+REF_YSTRIDE*2] \
  __asm movd mm2,[_off+SRC] \
  __asm movd mm7,[_off+REF] \
  __asm movd mm3,[_off+SRC+SRC_YSTRIDE] \
  __asm movd mm6,[_off+REF+REF_YSTRIDE] \
  __asm punpcklbw mm0,mm4 \
  __asm lea SRC,[SRC+SRC_YSTRIDE*2] \
  __asm punpcklbw mm4,mm4 \
  __asm lea REF,[REF+REF_YSTRIDE*2] \
  __asm psubw mm0,mm4 \
  __asm movd mm4,[_off+SRC] \
  __asm movq [_off*2+BUF],mm0 \
  __asm movd mm0,[_off+REF] \
  __asm punpcklbw mm1,mm5 \
  __asm punpcklbw mm5,mm5 \
  __asm psubw mm1,mm5 \
  __asm movd mm5,[_off+SRC+SRC_YSTRIDE] \
  __asm punpcklbw mm2,mm7 \
  __asm punpcklbw mm7,mm7 \
  __asm psubw mm2,mm7 \
  __asm movd mm7,[_off+REF+REF_YSTRIDE] \
  __asm punpcklbw mm3,mm6 \
  __asm lea SRC,[SRC+SRC_YSTRIDE*2] \
  __asm punpcklbw mm6,mm6 \
  __asm psubw mm3,mm6 \
  __asm movd mm6,[_off+SRC] \
  __asm punpcklbw mm4,mm0 \
  __asm lea REF,[REF+REF_YSTRIDE*2] \
  __asm punpcklbw mm0,mm0 \
  __asm lea SRC,[SRC+SRC_YSTRIDE*2] \
  __asm psubw mm4,mm0 \
  __asm movd mm0,[_off+REF] \
  __asm punpcklbw mm5,mm7 \
  __asm neg SRC_YSTRIDE \
  __asm punpcklbw mm7,mm7 \
  __asm psubw mm5,mm7 \
  __asm movd mm7,[_off+SRC+SRC_YSTRIDE] \
  __asm punpcklbw mm6,mm0 \
  __asm lea REF,[REF+REF_YSTRIDE*2] \
  __asm punpcklbw mm0,mm0 \
  __asm neg REF_YSTRIDE \
  __asm psubw mm6,mm0 \
  __asm movd mm0,[_off+REF+REF_YSTRIDE] \
  __asm lea SRC,[SRC+SRC_YSTRIDE*8] \
  __asm punpcklbw mm7,mm0 \
  __asm neg SRC_YSTRIDE \
  __asm punpcklbw mm0,mm0 \
  __asm lea REF,[REF+REF_YSTRIDE*8] \
  __asm psubw mm7,mm0 \
  __asm neg REF_YSTRIDE \
  __asm movq mm0,[_off*2+BUF] \
}

/*Load an 8x4 array of pixel values from SRC into mm0...mm7.*/
#define OC_LOAD_8x4(_off) __asm{ \
  __asm movd mm0,[_off+SRC] \
  __asm movd mm1,[_off+SRC+YSTRIDE] \
  __asm movd mm2,[_off+SRC+YSTRIDE*2] \
  __asm pxor mm7,mm7 \
  __asm movd mm3,[_off+SRC+YSTRIDE3] \
  __asm punpcklbw mm0,mm7 \
  __asm movd mm4,[_off+SRC4] \
  __asm punpcklbw mm1,mm7 \
  __asm movd mm5,[_off+SRC4+YSTRIDE] \
  __asm punpcklbw mm2,mm7 \
  __asm movd mm6,[_off+SRC4+YSTRIDE*2] \
  __asm punpcklbw mm3,mm7 \
  __asm movd mm7,[_off+SRC4+YSTRIDE3] \
  __asm punpcklbw mm4,mm4 \
  __asm punpcklbw mm5,mm5 \
  __asm psrlw mm4,8 \
  __asm psrlw mm5,8 \
  __asm punpcklbw mm6,mm6 \
  __asm punpcklbw mm7,mm7 \
  __asm psrlw mm6,8 \
  __asm psrlw mm7,8 \
}
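
/*Note on the last four rows above: mm7 already holds pixel data by the time
   rows 4-7 are unpacked, so instead of unpacking against a zero register each
   dword is unpacked against itself and then shifted right by 8, which likewise
   zero-extends the bytes to 16-bit words.*/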

/*Performs the first two stages of an 8-point 1-D Hadamard transform.
   The transform is performed in place, except that outputs 0-3 are swapped with
   outputs 4-7.
   Outputs 2, 3, 6, and 7 from the second stage are negated (which allows us to
   perform this stage in place with no temporary registers).*/
#define OC_HADAMARD_AB_8x4 __asm{ \
  /*Stage A: \
    Outputs 0-3 are swapped with 4-7 here.*/ \
  __asm paddw mm5,mm1 \
  __asm paddw mm6,mm2 \
  __asm paddw mm1,mm1 \
  __asm paddw mm2,mm2 \
  __asm psubw mm1,mm5 \
  __asm psubw mm2,mm6 \
  __asm paddw mm7,mm3 \
  __asm paddw mm4,mm0 \
  __asm paddw mm3,mm3 \
  __asm paddw mm0,mm0 \
  __asm psubw mm3,mm7 \
  __asm psubw mm0,mm4 \
  /*Stage B:*/ \
  __asm paddw mm0,mm2 \
  __asm paddw mm1,mm3 \
  __asm paddw mm4,mm6 \
  __asm paddw mm5,mm7 \
  __asm paddw mm2,mm2 \
  __asm paddw mm3,mm3 \
  __asm paddw mm6,mm6 \
  __asm paddw mm7,mm7 \
  __asm psubw mm2,mm0 \
  __asm psubw mm3,mm1 \
  __asm psubw mm6,mm4 \
  __asm psubw mm7,mm5 \
}

/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
   Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
   place with no temporary registers).*/
#define OC_HADAMARD_C_8x4 __asm{ \
  /*Stage C:*/ \
  __asm paddw mm0,mm1 \
  __asm paddw mm2,mm3 \
  __asm paddw mm4,mm5 \
  __asm paddw mm6,mm7 \
  __asm paddw mm1,mm1 \
  __asm paddw mm3,mm3 \
  __asm paddw mm5,mm5 \
  __asm paddw mm7,mm7 \
  __asm psubw mm1,mm0 \
  __asm psubw mm3,mm2 \
  __asm psubw mm5,mm4 \
  __asm psubw mm7,mm6 \
}

/*Performs an 8-point 1-D Hadamard transform.
   The transform is performed in place, except that outputs 0-3 are swapped with
   outputs 4-7.
   Outputs 1, 2, 5 and 6 are negated (which allows us to perform the transform
   in place with no temporary registers).*/
#define OC_HADAMARD_8x4 __asm{ \
  OC_HADAMARD_AB_8x4 \
  OC_HADAMARD_C_8x4 \
}
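
/*Reference only: a straightforward scalar version of the 8-point butterflies
   the three macros above implement, ignoring the output permutation and sign
   changes described in their comments (the SATD accumulation below does not
   depend on either).  The function name is illustrative and the block is not
   compiled.*/
#if 0
static void oc_hadamard_8(ogg_int16_t _x[8]){
  int i;
  int j;
  int k;
  /*Three stages of add/subtract butterflies over 8 values.*/
  for(i=1;i<8;i<<=1){
    for(j=0;j<8;j+=i<<1){
      for(k=j;k<j+i;k++){
        ogg_int16_t a;
        ogg_int16_t b;
        a=_x[k];
        b=_x[k+i];
        _x[k]=(ogg_int16_t)(a+b);
        _x[k+i]=(ogg_int16_t)(a-b);
      }
    }
  }
}
#endif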

/*Performs the first part of the final stage of the Hadamard transform and
   summing of absolute values.
   At the end of this part, mm1 will contain the DC coefficient of the
   transform.*/
#define OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) __asm{ \
  /*We use the fact that \
      (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
    to merge the final butterfly with the abs and the first stage of \
    accumulation. \
    Thus we can avoid using pabsw, which is not available until SSSE3. \
    Emulating pabsw takes 3 instructions, so the straightforward MMXEXT \
    implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
    registers). \
    Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
    This implementation is only 26 (+4 for spilling registers).*/ \
  __asm movq [_r7+BUF],mm7 \
  __asm movq [_r6+BUF],mm6 \
  /*mm7={0x7FFF}x4 \
    mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \
  __asm pcmpeqb mm7,mm7 \
  __asm movq mm6,mm0 \
  __asm psrlw mm7,1 \
  __asm paddw mm6,mm1 \
  __asm pmaxsw mm0,mm1 \
  __asm paddsw mm6,mm7 \
  __asm psubw mm0,mm6 \
  /*mm2=max(abs(mm2),abs(mm3))-0x7FFF \
    mm4=max(abs(mm4),abs(mm5))-0x7FFF*/ \
  __asm movq mm6,mm2 \
  __asm movq mm1,mm4 \
  __asm pmaxsw mm2,mm3 \
  __asm pmaxsw mm4,mm5 \
  __asm paddw mm6,mm3 \
  __asm paddw mm1,mm5 \
  __asm movq mm3,[_r7+BUF] \
}
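
/*A quick numeric check of the identity used above, for reference: with a=3
   and b=-5, abs(a+b)=2 and abs(a-b)=8, so (abs(a+b)+abs(a-b))/2=(2+8)/2=5=
   max(abs(3),abs(-5)).  In other words, keeping a and b and taking
   max(abs(a),abs(b)) yields half the sum of the absolute values the final
   butterfly would have produced; the dropped factor of two is restored by the
   callers when they fold the accumulated half-sums together.*/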

/*Performs the second part of the final stage of the Hadamard transform and
   summing of absolute values.*/
#define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) __asm{ \
  __asm paddsw mm6,mm7 \
  __asm movq mm5,[_r6+BUF] \
  __asm paddsw mm1,mm7 \
  __asm psubw mm2,mm6 \
  __asm psubw mm4,mm1 \
  /*mm7={1}x4 (needed for the horizontal add that follows) \
    mm0+=mm2+mm4+max(abs(mm3),abs(mm5))-0x7FFF*/ \
  __asm movq mm6,mm3 \
  __asm pmaxsw mm3,mm5 \
  __asm paddw mm0,mm2 \
  __asm paddw mm6,mm5 \
  __asm paddw mm0,mm4 \
  __asm paddsw mm6,mm7 \
  __asm paddw mm0,mm3 \
  __asm psrlw mm7,14 \
  __asm psubw mm0,mm6 \
}

/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
   absolute value of each component, and accumulates everything into mm0.
   This is the only portion of SATD which requires MMXEXT (we could use plain
   MMX, but it takes 4 instructions and an extra register to work around the
   lack of a pmaxsw, which is a pretty serious penalty).*/
#define OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) __asm{ \
  OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \
  OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \
}

/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
   component, and accumulates everything into mm0.
   Note that mm0 will have an extra 4 added to each column, and that after
   removing this value, the remainder will be half the conventional value.*/
#define OC_HADAMARD_ABS_ACCUM_8x4(_r6,_r7) __asm{ \
  OC_HADAMARD_AB_8x4 \
  OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) \
}

/*Performs two 4x4 transposes (mostly) in place.
   On input, {mm0,mm1,mm2,mm3} contains rows {e,f,g,h}, and {mm4,mm5,mm6,mm7}
   contains rows {a,b,c,d}.
   On output, {0x40,0x50,0x60,0x70}+_off+BUF contains {e,f,g,h}^T, and
   {mm4,mm5,mm6,mm7} contains the transposed rows {a,b,c,d}^T.*/
#define OC_TRANSPOSE_4x4x2(_off) __asm{ \
  /*First 4x4 transpose:*/ \
  __asm movq [0x10+_off+BUF],mm5 \
  /*mm0 = e3 e2 e1 e0 \
    mm1 = f3 f2 f1 f0 \
    mm2 = g3 g2 g1 g0 \
    mm3 = h3 h2 h1 h0*/ \
  __asm movq mm5,mm2 \
  __asm punpcklwd mm2,mm3 \
  __asm punpckhwd mm5,mm3 \
  __asm movq mm3,mm0 \
  __asm punpcklwd mm0,mm1 \
  __asm punpckhwd mm3,mm1 \
  /*mm0 = f1 e1 f0 e0 \
    mm3 = f3 e3 f2 e2 \
    mm2 = h1 g1 h0 g0 \
    mm5 = h3 g3 h2 g2*/ \
  __asm movq mm1,mm0 \
  __asm punpckldq mm0,mm2 \
  __asm punpckhdq mm1,mm2 \
  __asm movq mm2,mm3 \
  __asm punpckhdq mm3,mm5 \
  __asm movq [0x40+_off+BUF],mm0 \
  __asm punpckldq mm2,mm5 \
  /*mm0 = h0 g0 f0 e0 \
    mm1 = h1 g1 f1 e1 \
    mm2 = h2 g2 f2 e2 \
    mm3 = h3 g3 f3 e3*/ \
  __asm movq mm5,[0x10+_off+BUF] \
  /*Second 4x4 transpose:*/ \
  /*mm4 = a3 a2 a1 a0 \
    mm5 = b3 b2 b1 b0 \
    mm6 = c3 c2 c1 c0 \
    mm7 = d3 d2 d1 d0*/ \
  __asm movq mm0,mm6 \
  __asm punpcklwd mm6,mm7 \
  __asm movq [0x50+_off+BUF],mm1 \
  __asm punpckhwd mm0,mm7 \
  __asm movq mm7,mm4 \
  __asm punpcklwd mm4,mm5 \
  __asm movq [0x60+_off+BUF],mm2 \
  __asm punpckhwd mm7,mm5 \
  /*mm4 = b1 a1 b0 a0 \
    mm7 = b3 a3 b2 a2 \
    mm6 = d1 c1 d0 c0 \
    mm0 = d3 c3 d2 c2*/ \
  __asm movq mm5,mm4 \
  __asm punpckldq mm4,mm6 \
  __asm movq [0x70+_off+BUF],mm3 \
  __asm punpckhdq mm5,mm6 \
  __asm movq mm6,mm7 \
  __asm punpckhdq mm7,mm0 \
  __asm punpckldq mm6,mm0 \
  /*mm4 = d0 c0 b0 a0 \
    mm5 = d1 c1 b1 a1 \
    mm6 = d2 c2 b2 a2 \
    mm7 = d3 c3 b3 a3*/ \
}
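
/*Reference only: the effect of one of the 4x4 word transposes performed by
   the punpck sequences above, written as plain C.  The real macro does two of
   them at once and spills one result set to BUF.  Illustrative sketch; not
   compiled.*/
#if 0
static void oc_transpose_4x4(ogg_int16_t _m[4][4]){
  int i;
  int j;
  for(i=0;i<4;i++)for(j=i+1;j<4;j++){
    ogg_int16_t t;
    t=_m[i][j];
    _m[i][j]=_m[j][i];
    _m[j][i]=t;
  }
}
#endif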

static unsigned oc_int_frag_satd_mmxext(int *_dc,
 const unsigned char *_src,int _src_ystride,
 const unsigned char *_ref,int _ref_ystride){
  OC_ALIGN8(ogg_int16_t buf[64]);
  ogg_int16_t *bufp;
  unsigned ret;
  unsigned ret2;
  int dc;
  bufp=buf;
  __asm{
#define SRC esi
#define REF eax
#define SRC_YSTRIDE ecx
#define REF_YSTRIDE edx
#define BUF edi
#define RET edx
#define RET2 ecx
#define DC eax
#define DC_WORD ax
    mov SRC,_src
    mov SRC_YSTRIDE,_src_ystride
    mov REF,_ref
    mov REF_YSTRIDE,_ref_ystride
    mov BUF,bufp
    OC_LOAD_SUB_8x4(0x00)
    OC_HADAMARD_8x4
    OC_TRANSPOSE_4x4x2(0x00)
    /*Finish swapping out this 8x4 block to make room for the next one.
      mm0...mm3 have been swapped out already.*/
    movq [0x00+BUF],mm4
    movq [0x10+BUF],mm5
    movq [0x20+BUF],mm6
    movq [0x30+BUF],mm7
    OC_LOAD_SUB_8x4(0x04)
    OC_HADAMARD_8x4
    OC_TRANSPOSE_4x4x2(0x08)
    /*Here the first 4x4 block of output from the last transpose is the second
      4x4 block of input for the next transform.
      We have cleverly arranged that it already be in the appropriate place, so
      we only have to do half the loads.*/
    movq mm1,[0x10+BUF]
    movq mm2,[0x20+BUF]
    movq mm3,[0x30+BUF]
    movq mm0,[0x00+BUF]
    /*We split out the stages here so we can save the DC coefficient in the
      middle.*/
    OC_HADAMARD_AB_8x4
    OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
    movd DC,mm1
    OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
      difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
      for the factor of two we dropped + 3 for the vertical accumulation).
      Now we finally have to promote things to dwords.
      We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
      latency of pmaddwd by starting the next series of loads now.*/
    pmaddwd mm0,mm7
    movq mm1,[0x50+BUF]
    movq mm5,[0x58+BUF]
    movq mm4,mm0
    movq mm2,[0x60+BUF]
    punpckhdq mm0,mm0
    movq mm6,[0x68+BUF]
    paddd mm4,mm0
    movq mm3,[0x70+BUF]
    movd RET2,mm4
    movq mm7,[0x78+BUF]
    movq mm0,[0x40+BUF]
    movq mm4,[0x48+BUF]
    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
    pmaddwd mm0,mm7
    /*Subtract abs(dc) from 2*ret2.*/
    movsx DC,DC_WORD
    cdq
    lea RET2,[RET+RET2*2]
    movq mm4,mm0
    punpckhdq mm0,mm0
    xor RET,DC
    paddd mm4,mm0
    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
      added to them, a factor of two removed, and the DC value included;
      correct the final sum here.*/
    sub RET2,RET
    movd RET,mm4
    lea RET,[RET2+RET*2-64]
    mov ret,RET
    mov dc,DC
#undef SRC
#undef REF
#undef SRC_YSTRIDE
#undef REF_YSTRIDE
#undef BUF
#undef RET
#undef RET2
#undef DC
#undef DC_WORD
  }
  *_dc=dc;
  return ret;
}
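
/*Reading of the final correction above, for reference: each
   OC_HADAMARD_ABS_ACCUM_8x4 yields half the conventional sum of absolute
   values with an extra 4 in each of its four word lanes, so doubling the two
   half-sums restores the dropped factor of two and leaves 2*4*4*2=64 extra,
   which the -64 removes; subtracting abs(dc) takes the DC coefficient back
   out of the total, since it is reported separately through *_dc.*/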

unsigned oc_enc_frag_satd_mmxext(int *_dc,const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  return oc_int_frag_satd_mmxext(_dc,_src,_ystride,_ref,_ystride);
}


/*Our internal implementation of frag_copy2 takes an extra stride parameter so
   we can share code with oc_enc_frag_satd2_mmxext().*/
static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
 const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
  __asm{
    /*Load the first 3 rows.*/
#define DST_YSTRIDE edi
#define SRC_YSTRIDE esi
#define DST eax
#define SRC1 edx
#define SRC2 ecx
    mov DST_YSTRIDE,_dst_ystride
    mov SRC_YSTRIDE,_src_ystride
    mov DST,_dst
    mov SRC1,_src1
    mov SRC2,_src2
    movq mm0,[SRC1]
    movq mm1,[SRC2]
    movq mm2,[SRC1+SRC_YSTRIDE]
    lea SRC1,[SRC1+SRC_YSTRIDE*2]
    movq mm3,[SRC2+SRC_YSTRIDE]
    lea SRC2,[SRC2+SRC_YSTRIDE*2]
    pxor mm7,mm7
    movq mm4,[SRC1]
    pcmpeqb mm6,mm6
    movq mm5,[SRC2]
    /*mm7={1}x8.*/
    psubb mm7,mm6
    /*Start averaging mm0 and mm1 into mm6.*/
    movq mm6,mm0
    pxor mm0,mm1
    pavgb mm6,mm1
    /*mm1 is free, start averaging mm3 into mm2 using mm1.*/
    movq mm1,mm2
    pand mm0,mm7
    pavgb mm2,mm3
    pxor mm1,mm3
    /*mm3 is free.*/
    psubb mm6,mm0
    /*mm0 is free, start loading the next row.*/
    movq mm0,[SRC1+SRC_YSTRIDE]
    /*Start averaging mm5 and mm4 using mm3.*/
    movq mm3,mm4
    /*mm6 [row 0] is done; write it out.*/
    movq [DST],mm6
    pand mm1,mm7
    pavgb mm4,mm5
    psubb mm2,mm1
    /*mm1 is free, continue loading the next row.*/
    movq mm1,[SRC2+SRC_YSTRIDE]
    pxor mm3,mm5
    lea SRC1,[SRC1+SRC_YSTRIDE*2]
    /*mm2 [row 1] is done; write it out.*/
    movq [DST+DST_YSTRIDE],mm2
    pand mm3,mm7
    /*Start loading the next row.*/
    movq mm2,[SRC1]
    lea DST,[DST+DST_YSTRIDE*2]
    psubb mm4,mm3
    lea SRC2,[SRC2+SRC_YSTRIDE*2]
    /*mm4 [row 2] is done; write it out.*/
    movq [DST],mm4
    /*Continue loading the next row.*/
    movq mm3,[SRC2]
    /*Start averaging mm0 and mm1 into mm6.*/
    movq mm6,mm0
    pxor mm0,mm1
    /*Start loading the next row.*/
    movq mm4,[SRC1+SRC_YSTRIDE]
    pavgb mm6,mm1
    /*mm1 is free; start averaging mm3 into mm2 using mm1.*/
    movq mm1,mm2
    pand mm0,mm7
    /*Continue loading the next row.*/
    movq mm5,[SRC2+SRC_YSTRIDE]
    pavgb mm2,mm3
    lea SRC1,[SRC1+SRC_YSTRIDE*2]
    pxor mm1,mm3
    /*mm3 is free.*/
    psubb mm6,mm0
    /*mm0 is free, start loading the next row.*/
    movq mm0,[SRC1]
    /*Start averaging mm5 into mm4 using mm3.*/
    movq mm3,mm4
    /*mm6 [row 3] is done; write it out.*/
    movq [DST+DST_YSTRIDE],mm6
    pand mm1,mm7
    lea SRC2,[SRC2+SRC_YSTRIDE*2]
    pavgb mm4,mm5
    lea DST,[DST+DST_YSTRIDE*2]
    psubb mm2,mm1
    /*mm1 is free; continue loading the next row.*/
    movq mm1,[SRC2]
    pxor mm3,mm5
    /*mm2 [row 4] is done; write it out.*/
    movq [DST],mm2
    pand mm3,mm7
    /*Start loading the next row.*/
    movq mm2,[SRC1+SRC_YSTRIDE]
    psubb mm4,mm3
    /*Start averaging mm0 and mm1 into mm6.*/
    movq mm6,mm0
    /*Continue loading the next row.*/
    movq mm3,[SRC2+SRC_YSTRIDE]
    /*mm4 [row 5] is done; write it out.*/
    movq [DST+DST_YSTRIDE],mm4
    pxor mm0,mm1
    pavgb mm6,mm1
    /*mm4 is free; start averaging mm3 into mm2 using mm4.*/
    movq mm4,mm2
    pand mm0,mm7
    pavgb mm2,mm3
    pxor mm4,mm3
    lea DST,[DST+DST_YSTRIDE*2]
    psubb mm6,mm0
    pand mm4,mm7
    /*mm6 [row 6] is done, write it out.*/
    movq [DST],mm6
    psubb mm2,mm4
    /*mm2 [row 7] is done, write it out.*/
    movq [DST+DST_YSTRIDE],mm2
#undef SRC1
#undef SRC2
#undef SRC_YSTRIDE
#undef DST_YSTRIDE
#undef DST
  }
}
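
/*Reference only: a plain-C sketch of what the routine above produces, namely
   the truncating average of two predictors, dst[j]=(src1[j]+src2[j])>>1, over
   an 8x8 block.  The MMX version gets the same result from pavgb plus the
   low-bit correction described in OC_SAD2_LOOP.  The "_c" name is
   illustrative only; the block is not compiled.*/
#if 0
static void oc_int_frag_copy2_c(unsigned char *_dst,int _dst_ystride,
 const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++)_dst[j]=(unsigned char)((_src1[j]+_src2[j])>>1);
    _dst+=_dst_ystride;
    _src1+=_src_ystride;
    _src2+=_src_ystride;
  }
}
#endif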

unsigned oc_enc_frag_satd2_mmxext(int *_dc,const unsigned char *_src,
 const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
  OC_ALIGN8(unsigned char ref[64]);
  oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
  return oc_int_frag_satd_mmxext(_dc,_src,_ystride,ref,8);
}

unsigned oc_enc_frag_intra_satd_mmxext(int *_dc,const unsigned char *_src,
 int _ystride){
  OC_ALIGN8(ogg_int16_t buf[64]);
  ogg_int16_t *bufp;
  unsigned ret1;
  unsigned ret2;
  int dc;
  bufp=buf;
  __asm{
#define SRC eax
#define SRC4 esi
#define BUF edi
#define YSTRIDE edx
#define YSTRIDE3 ecx
#define RET eax
#define RET2 ecx
#define DC edx
#define DC_WORD dx
    mov SRC,_src
    mov BUF,bufp
    mov YSTRIDE,_ystride
    /* src4 = src+4*ystride */
    lea SRC4,[SRC+YSTRIDE*4]
    /* ystride3 = 3*ystride */
    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
    OC_LOAD_8x4(0x00)
    OC_HADAMARD_8x4
    OC_TRANSPOSE_4x4x2(0x00)
    /*Finish swapping out this 8x4 block to make room for the next one.
      mm0...mm3 have been swapped out already.*/
    movq [0x00+BUF],mm4
    movq [0x10+BUF],mm5
    movq [0x20+BUF],mm6
    movq [0x30+BUF],mm7
    OC_LOAD_8x4(0x04)
    OC_HADAMARD_8x4
    OC_TRANSPOSE_4x4x2(0x08)
    /*Here the first 4x4 block of output from the last transpose is the second
      4x4 block of input for the next transform.
      We have cleverly arranged that it already be in the appropriate place, so
      we only have to do half the loads.*/
    movq mm1,[0x10+BUF]
    movq mm2,[0x20+BUF]
    movq mm3,[0x30+BUF]
    movq mm0,[0x00+BUF]
    /*We split out the stages here so we can save the DC coefficient in the
      middle.*/
    OC_HADAMARD_AB_8x4
    OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
    movd DC,mm1
    OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
      difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
      for the factor of two we dropped + 3 for the vertical accumulation).
      Now we finally have to promote things to dwords.
      We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
      latency of pmaddwd by starting the next series of loads now.*/
    pmaddwd mm0,mm7
    movq mm1,[0x50+BUF]
    movq mm5,[0x58+BUF]
    movq mm2,[0x60+BUF]
    movq mm4,mm0
    movq mm6,[0x68+BUF]
    punpckhdq mm0,mm0
    movq mm3,[0x70+BUF]
    paddd mm4,mm0
    movq mm7,[0x78+BUF]
    movd RET,mm4
    movq mm0,[0x40+BUF]
    movq mm4,[0x48+BUF]
    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
    pmaddwd mm0,mm7
    /*We assume that the DC coefficient is always positive (which is true,
      because the input to the INTRA transform was not a difference).*/
    movzx DC,DC_WORD
    add RET,RET
    sub RET,DC
    movq mm4,mm0
    punpckhdq mm0,mm0
    paddd mm4,mm0
    movd RET2,mm4
    lea RET,[-64+RET+RET2*2]
    mov [dc],DC
    mov [ret1],RET
#undef SRC
#undef SRC4
#undef BUF
#undef YSTRIDE
#undef YSTRIDE3
#undef RET
#undef RET2
#undef DC
#undef DC_WORD
  }
  *_dc=dc;
  return ret1;
}

void oc_enc_frag_sub_mmx(ogg_int16_t _residue[64],
 const unsigned char *_src, const unsigned char *_ref,int _ystride){
  int i;
  __asm pxor mm7,mm7
  for(i=4;i-->0;){
    __asm{
#define SRC edx
#define YSTRIDE esi
#define RESIDUE eax
#define REF ecx
      mov YSTRIDE,_ystride
      mov RESIDUE,_residue
      mov SRC,_src
      mov REF,_ref
      /*mm0=[src]*/
      movq mm0,[SRC]
      /*mm1=[ref]*/
      movq mm1,[REF]
      /*mm4=[src+ystride]*/
      movq mm4,[SRC+YSTRIDE]
      /*mm5=[ref+ystride]*/
      movq mm5,[REF+YSTRIDE]
      /*Compute [src]-[ref].*/
      movq mm2,mm0
      punpcklbw mm0,mm7
      movq mm3,mm1
      punpckhbw mm2,mm7
      punpcklbw mm1,mm7
      punpckhbw mm3,mm7
      psubw mm0,mm1
      psubw mm2,mm3
      /*Compute [src+ystride]-[ref+ystride].*/
      movq mm1,mm4
      punpcklbw mm4,mm7
      movq mm3,mm5
      punpckhbw mm1,mm7
      lea SRC,[SRC+YSTRIDE*2]
      punpcklbw mm5,mm7
      lea REF,[REF+YSTRIDE*2]
      punpckhbw mm3,mm7
      psubw mm4,mm5
      psubw mm1,mm3
      /*Write the answer out.*/
      movq [RESIDUE+0x00],mm0
      movq [RESIDUE+0x08],mm2
      movq [RESIDUE+0x10],mm4
      movq [RESIDUE+0x18],mm1
      lea RESIDUE,[RESIDUE+0x20]
      mov _residue,RESIDUE
      mov _src,SRC
      mov _ref,REF
#undef SRC
#undef YSTRIDE
#undef RESIDUE
#undef REF
    }
  }
}
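
/*Reference only: the plain-C equivalent of the residual computation above,
   residue[i*8+j]=src[j]-ref[j] over an 8x8 block (the MMX loop handles two
   rows per iteration).  The function name is illustrative and the block is
   not compiled.*/
#if 0
static void oc_enc_frag_sub_ref(ogg_int16_t _residue[64],
 const unsigned char *_src,const unsigned char *_ref,int _ystride){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++)_residue[i*8+j]=(ogg_int16_t)(_src[j]-_ref[j]);
    _src+=_ystride;
    _ref+=_ystride;
  }
}
#endif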

void oc_enc_frag_sub_128_mmx(ogg_int16_t _residue[64],
 const unsigned char *_src,int _ystride){
  __asm{
#define YSTRIDE edx
#define YSTRIDE3 edi
#define RESIDUE ecx
#define SRC eax
    mov YSTRIDE,_ystride
    mov RESIDUE,_residue
    mov SRC,_src
    /*mm0=[src]*/
    movq mm0,[SRC]
    /*mm1=[src+ystride]*/
    movq mm1,[SRC+YSTRIDE]
    /*mm6={-1}x4*/
    pcmpeqw mm6,mm6
    /*mm2=[src+2*ystride]*/
    movq mm2,[SRC+YSTRIDE*2]
    /*[ystride3]=3*[ystride]*/
    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
    /*mm6={0x8000}x4*/
    psllw mm6,15
    /*mm3=[src+3*ystride]*/
    movq mm3,[SRC+YSTRIDE3]
    /*mm6={128}x4*/
    psrlw mm6,8
    /*mm7=0*/
    pxor mm7,mm7
    /*[src]=[src]+4*[ystride]*/
    lea SRC,[SRC+YSTRIDE*4]
    /*Compute [src]-128 and [src+ystride]-128*/
    movq mm4,mm0
    punpcklbw mm0,mm7
    movq mm5,mm1
    punpckhbw mm4,mm7
    psubw mm0,mm6
    punpcklbw mm1,mm7
    psubw mm4,mm6
    punpckhbw mm5,mm7
    psubw mm1,mm6
    psubw mm5,mm6
    /*Write the answer out.*/
    movq [RESIDUE+0x00],mm0
    movq [RESIDUE+0x08],mm4
    movq [RESIDUE+0x10],mm1
    movq [RESIDUE+0x18],mm5
    /*mm0=[src+4*ystride]*/
    movq mm0,[SRC]
    /*mm1=[src+5*ystride]*/
    movq mm1,[SRC+YSTRIDE]
    /*Compute [src+2*ystride]-128 and [src+3*ystride]-128*/
    movq mm4,mm2
    punpcklbw mm2,mm7
    movq mm5,mm3
    punpckhbw mm4,mm7
    psubw mm2,mm6
    punpcklbw mm3,mm7
    psubw mm4,mm6
    punpckhbw mm5,mm7
    psubw mm3,mm6
    psubw mm5,mm6
    /*Write the answer out.*/
    movq [RESIDUE+0x20],mm2
    movq [RESIDUE+0x28],mm4
    movq [RESIDUE+0x30],mm3
    movq [RESIDUE+0x38],mm5
    /*Load [src+6*ystride] and [src+7*ystride], and compute
      [src+4*ystride]-128 and [src+5*ystride]-128*/
    movq mm2,[SRC+YSTRIDE*2]
    movq mm3,[SRC+YSTRIDE3]
    movq mm4,mm0
    punpcklbw mm0,mm7
    movq mm5,mm1
    punpckhbw mm4,mm7
    psubw mm0,mm6
    punpcklbw mm1,mm7
    psubw mm4,mm6
    punpckhbw mm5,mm7
    psubw mm1,mm6
    psubw mm5,mm6
    /*Write the answer out.*/
    movq [RESIDUE+0x40],mm0
    movq [RESIDUE+0x48],mm4
    movq [RESIDUE+0x50],mm1
    movq [RESIDUE+0x58],mm5
    /*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/
    movq mm4,mm2
    punpcklbw mm2,mm7
    movq mm5,mm3
    punpckhbw mm4,mm7
    psubw mm2,mm6
    punpcklbw mm3,mm7
    psubw mm4,mm6
    punpckhbw mm5,mm7
    psubw mm3,mm6
    psubw mm5,mm6
    /*Write the answer out.*/
    movq [RESIDUE+0x60],mm2
    movq [RESIDUE+0x68],mm4
    movq [RESIDUE+0x70],mm3
    movq [RESIDUE+0x78],mm5
#undef YSTRIDE
#undef YSTRIDE3
#undef RESIDUE
#undef SRC
  }
}

void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
 const unsigned char *_src1,const unsigned char *_src2,int _ystride){
  oc_int_frag_copy2_mmxext(_dst,_ystride,_src1,_src2,_ystride);
}

#endif