/*Origin: thirdparty/libtheora/x86_vc/mmxfrag.c (as vendored in godotengine/godot).*/
/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation and contributors                      *
 * https://www.xiph.org/                                            *
 *                                                                  *
 ********************************************************************

  function:

 ********************************************************************/

/*MMX acceleration of fragment reconstruction for motion compensation.
  Originally written by Rudolf Marek.
  Additional optimization by Nils Pipenbrinck.
  Note: Loops are unrolled for best performance.
  The iteration each instruction belongs to is marked in the comments as #i.*/
#include <stddef.h>
#include "x86int.h"

#if defined(OC_X86_ASM)

/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
   between rows.
  The caller must #define SRC, DST, YSTRIDE and YSTRIDE3 to four distinct
   x86 register names before expanding this macro, and #undef them after.
  The copy is fully unrolled: two groups of four rows, 8 bytes (one MMX
   quadword) per row.*/
# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
  do{ \
    const unsigned char *src; \
    unsigned char *dst; \
    src=(_src); \
    dst=(_dst); \
    __asm mov SRC,src \
    __asm mov DST,dst \
    __asm mov YSTRIDE,_ystride \
    /*src+0*ystride*/ \
    __asm movq mm0,[SRC] \
    /*src+1*ystride*/ \
    __asm movq mm1,[SRC+YSTRIDE] \
    /*ystride3=ystride*3*/ \
    __asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
    /*src+2*ystride*/ \
    __asm movq mm2,[SRC+YSTRIDE*2] \
    /*src+3*ystride*/ \
    __asm movq mm3,[SRC+YSTRIDE3] \
    /*dst+0*ystride*/ \
    __asm movq [DST],mm0 \
    /*dst+1*ystride*/ \
    __asm movq [DST+YSTRIDE],mm1 \
    /*Pointer to next 4.*/ \
    __asm lea SRC,[SRC+YSTRIDE*4] \
    /*dst+2*ystride*/ \
    __asm movq [DST+YSTRIDE*2],mm2 \
    /*dst+3*ystride*/ \
    __asm movq [DST+YSTRIDE3],mm3 \
    /*Pointer to next 4.*/ \
    __asm lea DST,[DST+YSTRIDE*4] \
    /*Second group of four rows; same pattern as above.*/ \
    /*src+0*ystride*/ \
    __asm movq mm0,[SRC] \
    /*src+1*ystride*/ \
    __asm movq mm1,[SRC+YSTRIDE] \
    /*src+2*ystride*/ \
    __asm movq mm2,[SRC+YSTRIDE*2] \
    /*src+3*ystride*/ \
    __asm movq mm3,[SRC+YSTRIDE3] \
    /*dst+0*ystride*/ \
    __asm movq [DST],mm0 \
    /*dst+1*ystride*/ \
    __asm movq [DST+YSTRIDE],mm1 \
    /*dst+2*ystride*/ \
    __asm movq [DST+YSTRIDE*2],mm2 \
    /*dst+3*ystride*/ \
    __asm movq [DST+YSTRIDE3],mm3 \
  } \
  while(0)
/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
   between rows.
  _dst:     Destination block.
  _src:     Source block.
  _ystride: Bytes between the start of consecutive rows (same for both
             blocks).*/
void oc_frag_copy_mmx(unsigned char *_dst,
 const unsigned char *_src,int _ystride){
  /*Bind the register names required by OC_FRAG_COPY_MMX.
    NOTE(review): esi is callee-saved in the 32-bit MSVC ABI; this relies on
     the MSVC inline assembler preserving it — confirm against the build.*/
#define SRC edx
#define DST eax
#define YSTRIDE ecx
#define YSTRIDE3 esi
  OC_FRAG_COPY_MMX(_dst,_src,_ystride);
#undef SRC
#undef DST
#undef YSTRIDE
#undef YSTRIDE3
}
/*Copies the fragments specified by the lists of fragment indices from one
   frame to another.
  _dst_frame:     The reference frame to copy to.
  _src_frame:     The reference frame to copy from.
  _ystride:       The row stride of the reference frames.
  _fragis:        A pointer to a list of fragment indices.
  _nfragis:       The number of fragment indices to copy.
  _frag_buf_offs: The offsets of fragments in the reference frames.*/
void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
 const unsigned char *_src_frame,int _ystride,
 const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
  ptrdiff_t fragii;
  for(fragii=0;fragii<_nfragis;fragii++){
    ptrdiff_t frag_buf_off;
    /*Each fragment index selects a byte offset shared by both frames.*/
    frag_buf_off=_frag_buf_offs[_fragis[fragii]];
    /*Bind the register names required by OC_FRAG_COPY_MMX.
      Note: YSTRIDE3 uses edi here (oc_frag_copy_mmx uses esi).*/
#define SRC edx
#define DST eax
#define YSTRIDE ecx
#define YSTRIDE3 edi
    OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off,
     _src_frame+frag_buf_off,_ystride);
#undef SRC
#undef DST
#undef YSTRIDE
#undef YSTRIDE3
  }
}
/*Reconstructs an intra-coded 8x8 fragment: each of the 64 16-bit residue
   values is biased by +128 (with signed saturation) and packed to an
   unsigned byte (saturating to [0,255]), then written row by row to _dst.
  _dst:     Output block; rows are _ystride bytes apart.
  _ystride: Bytes between rows of _dst.
  _residue: 64 ogg_int16_t residue values in raster order (8 per row).
  Leaves the CPU in MMX state; oc_restore_fpu_mmx() must run before any
   x87 floating-point code.*/
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
 const ogg_int16_t *_residue){
  __asm{
#define DST edx
#define DST4 esi
#define YSTRIDE eax
#define YSTRIDE3 edi
#define RESIDUE ecx
    mov DST,_dst
    mov YSTRIDE,_ystride
    mov RESIDUE,_residue
    /*DST4 points at row 4 so rows 4-7 can reuse the same offsets.*/
    lea DST4,[DST+YSTRIDE*4]
    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
    /*Build the 0x0080 (=128) per-word bias in mm0 without a memory load:*/
    /*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/
    pcmpeqw mm0,mm0
    /*#0 Load low residue.*/
    movq mm1,[0*8+RESIDUE]
    /*#0 Load high residue.*/
    movq mm2,[1*8+RESIDUE]
    /*Set mm0 to 0x8000800080008000.*/
    psllw mm0,15
    /*#1 Load low residue.*/
    movq mm3,[2*8+RESIDUE]
    /*#1 Load high residue.*/
    movq mm4,[3*8+RESIDUE]
    /*Set mm0 to 0x0080008000800080.*/
    psrlw mm0,8
    /*#2 Load low residue.*/
    movq mm5,[4*8+RESIDUE]
    /*#2 Load high residue.*/
    movq mm6,[5*8+RESIDUE]
    /*#0 Bias low residue.*/
    paddsw mm1,mm0
    /*#0 Bias high residue.*/
    paddsw mm2,mm0
    /*#0 Pack to byte.*/
    packuswb mm1,mm2
    /*#1 Bias low residue.*/
    paddsw mm3,mm0
    /*#1 Bias high residue.*/
    paddsw mm4,mm0
    /*#1 Pack to byte.*/
    packuswb mm3,mm4
    /*#2 Bias low residue.*/
    paddsw mm5,mm0
    /*#2 Bias high residue.*/
    paddsw mm6,mm0
    /*#2 Pack to byte.*/
    packuswb mm5,mm6
    /*#0 Write row.*/
    movq [DST],mm1
    /*#1 Write row.*/
    movq [DST+YSTRIDE],mm3
    /*#2 Write row.*/
    movq [DST+YSTRIDE*2],mm5
    /*#3 Load low residue.*/
    movq mm1,[6*8+RESIDUE]
    /*#3 Load high residue.*/
    movq mm2,[7*8+RESIDUE]
    /*#4 Load low residue.*/
    movq mm3,[8*8+RESIDUE]
    /*#4 Load high residue.*/
    movq mm4,[9*8+RESIDUE]
    /*#5 Load low residue.*/
    movq mm5,[10*8+RESIDUE]
    /*#5 Load high residue.*/
    movq mm6,[11*8+RESIDUE]
    /*#3 Bias low residue.*/
    paddsw mm1,mm0
    /*#3 Bias high residue.*/
    paddsw mm2,mm0
    /*#3 Pack to byte.*/
    packuswb mm1,mm2
    /*#4 Bias low residue.*/
    paddsw mm3,mm0
    /*#4 Bias high residue.*/
    paddsw mm4,mm0
    /*#4 Pack to byte.*/
    packuswb mm3,mm4
    /*#5 Bias low residue.*/
    paddsw mm5,mm0
    /*#5 Bias high residue.*/
    paddsw mm6,mm0
    /*#5 Pack to byte.*/
    packuswb mm5,mm6
    /*#3 Write row.*/
    movq [DST+YSTRIDE3],mm1
    /*#4 Write row.*/
    movq [DST4],mm3
    /*#5 Write row.*/
    movq [DST4+YSTRIDE],mm5
    /*#6 Load low residue.*/
    movq mm1,[12*8+RESIDUE]
    /*#6 Load high residue.*/
    movq mm2,[13*8+RESIDUE]
    /*#7 Load low residue.*/
    movq mm3,[14*8+RESIDUE]
    /*#7 Load high residue.*/
    movq mm4,[15*8+RESIDUE]
    /*#6 Bias low residue.*/
    paddsw mm1,mm0
    /*#6 Bias high residue.*/
    paddsw mm2,mm0
    /*#6 Pack to byte.*/
    packuswb mm1,mm2
    /*#7 Bias low residue.*/
    paddsw mm3,mm0
    /*#7 Bias high residue.*/
    paddsw mm4,mm0
    /*#7 Pack to byte.*/
    packuswb mm3,mm4
    /*#6 Write row.*/
    movq [DST4+YSTRIDE*2],mm1
    /*#7 Write row.*/
    movq [DST4+YSTRIDE3],mm3
#undef DST
#undef DST4
#undef YSTRIDE
#undef YSTRIDE3
#undef RESIDUE
  }
}
/*Reconstructs an inter-coded 8x8 fragment: adds the 16-bit residue to the
   predictor block at _src (with unsigned saturation to [0,255]) and writes
   the result to _dst, two rows per loop iteration.
  _dst:     Output block; rows are _ystride bytes apart.
  _src:     Predictor block; rows are _ystride bytes apart.
  _ystride: Bytes between rows of _dst and _src.
  _residue: 64 ogg_int16_t residue values in raster order.*/
void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src,
 int _ystride,const ogg_int16_t *_residue){
  int i;
  /*Zero mm0 (used as the zero operand for byte->word unpacking below).*/
  __asm pxor mm0,mm0;
  /*Each iteration processes two rows; pointers are carried between
     iterations through the C parameters themselves (see the stores at the
     end of the asm block).*/
  for(i=4;i-->0;){
    __asm{
#define DST edx
#define SRC ecx
#define YSTRIDE edi
#define RESIDUE eax
      mov DST,_dst
      mov SRC,_src
      mov YSTRIDE,_ystride
      mov RESIDUE,_residue
      /*#0 Load source.*/
      movq mm3,[SRC]
      /*#1 Load source.*/
      movq mm7,[SRC+YSTRIDE]
      /*#0 Get copy of src.*/
      movq mm4,mm3
      /*#0 Expand high source.*/
      punpckhbw mm4,mm0
      /*#0 Expand low source.*/
      punpcklbw mm3,mm0
      /*#0 Add residue high.*/
      paddsw mm4,[8+RESIDUE]
      /*#1 Get copy of src.*/
      movq mm2,mm7
      /*#0 Add residue low.*/
      paddsw mm3,[RESIDUE]
      /*#1 Expand high source.*/
      punpckhbw mm2,mm0
      /*#0 Pack final row pixels (saturating to [0,255]).*/
      packuswb mm3,mm4
      /*#1 Expand low source.*/
      punpcklbw mm7,mm0
      /*#1 Add residue low.*/
      paddsw mm7,[16+RESIDUE]
      /*#1 Add residue high.*/
      paddsw mm2,[24+RESIDUE]
      /*Advance residue past the two rows consumed (2*8 words = 32 bytes).*/
      lea RESIDUE,[32+RESIDUE]
      /*#1 Pack final row pixels.*/
      packuswb mm7,mm2
      /*Advance src.*/
      lea SRC,[SRC+YSTRIDE*2]
      /*#0 Write row.*/
      movq [DST],mm3
      /*#1 Write row.*/
      movq [DST+YSTRIDE],mm7
      /*Advance dst.*/
      lea DST,[DST+YSTRIDE*2]
      /*Store the advanced pointers back into the parameters so the next
         loop iteration resumes where this one left off.*/
      mov _residue,RESIDUE
      mov _dst,DST
      mov _src,SRC
#undef DST
#undef SRC
#undef YSTRIDE
#undef RESIDUE
    }
  }
}
/*Reconstructs a bi-predicted 8x8 fragment: averages the two predictor
   blocks ((src1+src2)>>1, computed per pixel in 16-bit precision), adds the
   residue, saturates to [0,255], and writes the result to _dst, two rows per
   loop iteration.
  _dst:     Output block; rows are _ystride bytes apart.
  _src1:    First predictor block.
  _src2:    Second predictor block.
  _ystride: Bytes between rows of all three blocks.
  _residue: 64 ogg_int16_t residue values in raster order.*/
void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
 const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){
  int i;
  /*Zero mm7 (used as the zero operand for byte->word unpacking below).*/
  __asm pxor mm7,mm7;
  /*Each iteration processes two rows; pointers are carried between
     iterations through the C parameters themselves (see the stores at the
     end of the asm block).*/
  for(i=4;i-->0;){
    __asm{
#define SRC1 ecx
#define SRC2 edi
#define YSTRIDE esi
#define RESIDUE edx
#define DST eax
      mov YSTRIDE,_ystride
      mov DST,_dst
      mov RESIDUE,_residue
      mov SRC1,_src1
      mov SRC2,_src2
      /*#0 Load src1.*/
      movq mm0,[SRC1]
      /*#0 Load src2.*/
      movq mm2,[SRC2]
      /*#0 Copy src1.*/
      movq mm1,mm0
      /*#0 Copy src2.*/
      movq mm3,mm2
      /*#1 Load src1.*/
      movq mm4,[SRC1+YSTRIDE]
      /*#0 Unpack lower src1.*/
      punpcklbw mm0,mm7
      /*#1 Load src2.*/
      movq mm5,[SRC2+YSTRIDE]
      /*#0 Unpack higher src1.*/
      punpckhbw mm1,mm7
      /*#0 Unpack lower src2.*/
      punpcklbw mm2,mm7
      /*#0 Unpack higher src2.*/
      punpckhbw mm3,mm7
      /*Advance src1 ptr.*/
      lea SRC1,[SRC1+YSTRIDE*2]
      /*Advance src2 ptr.*/
      lea SRC2,[SRC2+YSTRIDE*2]
      /*#0 Lower src1+src2.*/
      paddsw mm0,mm2
      /*#0 Higher src1+src2.*/
      paddsw mm1,mm3
      /*#1 Copy src1.*/
      movq mm2,mm4
      /*#0 Build lo average ((src1+src2)>>1; sums fit in 9 bits, so the
         arithmetic shift never sees a negative value).*/
      psraw mm0,1
      /*#1 Copy src2.*/
      movq mm3,mm5
      /*#1 Unpack lower src1.*/
      punpcklbw mm4,mm7
      /*#0 Build hi average.*/
      psraw mm1,1
      /*#1 Unpack higher src1.*/
      punpckhbw mm2,mm7
      /*#0 low+=residue.*/
      paddsw mm0,[RESIDUE]
      /*#1 Unpack lower src2.*/
      punpcklbw mm5,mm7
      /*#0 high+=residue.*/
      paddsw mm1,[8+RESIDUE]
      /*#1 Unpack higher src2.*/
      punpckhbw mm3,mm7
      /*#1 Lower src1+src2.*/
      paddsw mm5,mm4
      /*#0 Pack and saturate.*/
      packuswb mm0,mm1
      /*#1 Higher src1+src2.*/
      paddsw mm3,mm2
      /*#0 Write row.*/
      movq [DST],mm0
      /*#1 Build lo average.*/
      psraw mm5,1
      /*#1 Build hi average.*/
      psraw mm3,1
      /*#1 low+=residue.*/
      paddsw mm5,[16+RESIDUE]
      /*#1 high+=residue.*/
      paddsw mm3,[24+RESIDUE]
      /*#1 Pack and saturate.*/
      packuswb mm5,mm3
      /*#1 Write row.*/
      movq [DST+YSTRIDE],mm5
      /*Advance residue ptr past the two rows consumed (32 bytes).*/
      add RESIDUE,32
      /*Advance dest ptr.*/
      lea DST,[DST+YSTRIDE*2]
      /*Store the advanced pointers back into the parameters so the next
         loop iteration resumes where this one left off.*/
      mov _dst,DST
      mov _residue,RESIDUE
      mov _src1,SRC1
      mov _src2,SRC2
#undef SRC1
#undef SRC2
#undef YSTRIDE
#undef RESIDUE
#undef DST
    }
  }
}
/*Restores the FPU state after MMX code has run: emms clears the MMX state
   that aliases the x87 register stack, so subsequent floating-point code
   works correctly.  Must be called after the reconstruction routines above
   before any x87 arithmetic.*/
void oc_restore_fpu_mmx(void){
  __asm emms;
}
#endif