/*Source: GitHub repository godotengine/godot,
   path blob/master/thirdparty/libtheora/x86/mmxstate.c.*/
/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation and contributors                      *
 * https://www.xiph.org/                                            *
 *                                                                  *
 ********************************************************************

  function:

 ********************************************************************/
17
18
/*MMX acceleration of complete fragment reconstruction algorithm.
  Originally written by Rudolf Marek.*/
20
#include <string.h>
21
#include "x86int.h"
22
#include "mmxloop.h"
23
24
#if defined(OC_X86_ASM)
25
26
/*Reconstructs one fragment into the OC_FRAME_SELF reference frame.
  The residual is produced in _dct_coeffs[64..127] and then handed, together
   with the destination (and, for non-self references, the predictor), to the
   matching oc_frag_recon_*_mmx() routine.
  _state:      The state that buffer offsets, strides, reference frame
                pointers and frag_mvs are read from.
  _fragi:      The index of the fragment to reconstruct.
  _pli:        The color plane index; selects the stride and is passed to
                oc_state_get_mv_offsets().
  _dct_coeffs: The coefficients live in the first 64 entries; the last 64
                entries receive the residual.
  _last_zzi:   Values below 2 select the DC-only path; otherwise the value is
                passed on to oc_idct8x8().
  _dc_quant:   The multiplier used to dequantize _dct_coeffs[0].*/
void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
 int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
  unsigned char *dst;
  ptrdiff_t      frag_buf_off;
  int            ystride;
  int            refi;
  /*Apply the inverse transform.*/
  /*Special case only having a DC component.*/
  if(_last_zzi<2){
    /*Note that this value must be unsigned, to keep the __asm__ block from
       sign-extending it when it puts it in a register.*/
    ogg_uint16_t p;
    int          i;
    /*We round this dequant product (and not any of the others) because there's
       no iDCT rounding.*/
    p=(ogg_int16_t)((_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15)>>5);
    /*Fill _dct_coeffs with p.*/
    __asm__ __volatile__(
      /*mm0=0000 0000 0000 AAAA*/
      "movd %[p],%%mm0\n\t"
      /*mm0=0000 0000 AAAA AAAA*/
      "punpcklwd %%mm0,%%mm0\n\t"
      /*mm0=AAAA AAAA AAAA AAAA*/
      "punpckldq %%mm0,%%mm0\n\t"
      :
      :[p]"r"((unsigned)p)
    );
    /*Each iteration stores 4 quadwords (16 coefficients), so the 4 iterations
       cover the 64 entries starting at _dct_coeffs+64.
      The stores rely on mm0 still holding the value loaded above.*/
    for(i=0;i<4;i++){
      __asm__ __volatile__(
        "movq %%mm0,"OC_MEM_OFFS(0x00,y)"\n\t"
        "movq %%mm0,"OC_MEM_OFFS(0x08,y)"\n\t"
        "movq %%mm0,"OC_MEM_OFFS(0x10,y)"\n\t"
        "movq %%mm0,"OC_MEM_OFFS(0x18,y)"\n\t"
        :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_dct_coeffs+64+16*i,16)
      );
    }
  }
  else{
    /*Dequantize the DC coefficient.*/
    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
    oc_idct8x8(_state,_dct_coeffs+64,_dct_coeffs,_last_zzi);
  }
  /*Fill in the target buffer.*/
  frag_buf_off=_state->frag_buf_offs[_fragi];
  refi=_state->frags[_fragi].refi;
  ystride=_state->ref_ystride[_pli];
  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
  if(refi==OC_FRAME_SELF)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64);
  else{
    const unsigned char *ref;
    int                  mvoffsets[2];
    ref=_state->ref_frame_data[refi]+frag_buf_off;
    /*mvoffsets[1] is only consumed when more than one offset is reported.*/
    if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
     _state->frag_mvs[_fragi])>1){
      oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
       _dct_coeffs+64);
    }
    else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
  }
}
86
87
/*We copy this entire function to inline the actual MMX routines so that we
   use only a single indirect call.*/
89
90
/*Initializes the bounding values for the MMX loop filter.
  Only the first 8 bytes of _bv are written; each is set to _flimit (converted
   to unsigned char by memset()).
  _bv:     The bounding values array to initialize.
  _flimit: The loop filter limit to replicate.*/
void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){
  memset(_bv,_flimit,8);
}
93
94
/*Apply the loop filter to a given set of fragment rows in the given plane.
95
The filter may be run on the bottom edge, affecting pixels in the next row of
96
fragments, so this row also needs to be available.
97
_bv: The bounding values array.
98
_refi: The index of the frame buffer to filter.
99
_pli: The color plane to filter.
100
_fragy0: The Y coordinate of the first fragment row to filter.
101
_fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
102
void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
103
signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
104
OC_ALIGN8(unsigned char ll[8]);
105
const oc_fragment_plane *fplane;
106
const oc_fragment *frags;
107
const ptrdiff_t *frag_buf_offs;
108
unsigned char *ref_frame_data;
109
ptrdiff_t fragi_top;
110
ptrdiff_t fragi_bot;
111
ptrdiff_t fragi0;
112
ptrdiff_t fragi0_end;
113
int ystride;
114
int nhfrags;
115
memset(ll,_state->loop_filter_limits[_state->qis[0]],sizeof(ll));
116
fplane=_state->fplanes+_pli;
117
nhfrags=fplane->nhfrags;
118
fragi_top=fplane->froffset;
119
fragi_bot=fragi_top+fplane->nfrags;
120
fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
121
fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
122
ystride=_state->ref_ystride[_pli];
123
frags=_state->frags;
124
frag_buf_offs=_state->frag_buf_offs;
125
ref_frame_data=_state->ref_frame_data[_refi];
126
/*The following loops are constructed somewhat non-intuitively on purpose.
127
The main idea is: if a block boundary has at least one coded fragment on
128
it, the filter is applied to it.
129
However, the order that the filters are applied in matters, and VP3 chose
130
the somewhat strange ordering used below.*/
131
while(fragi0<fragi0_end){
132
ptrdiff_t fragi;
133
ptrdiff_t fragi_end;
134
fragi=fragi0;
135
fragi_end=fragi+nhfrags;
136
while(fragi<fragi_end){
137
if(frags[fragi].coded){
138
unsigned char *ref;
139
ref=ref_frame_data+frag_buf_offs[fragi];
140
if(fragi>fragi0){
141
OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref,ystride,ll);
142
}
143
if(fragi0>fragi_top){
144
OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref,ystride,ll);
145
}
146
if(fragi+1<fragi_end&&!frags[fragi+1].coded){
147
OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref+8,ystride,ll);
148
}
149
if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
150
OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref+(ystride*8),ystride,ll);
151
}
152
}
153
fragi++;
154
}
155
fragi0+=nhfrags;
156
}
157
}
158
159
/*Initializes the bounding values for the MMXEXT loop filter.
  Only the first 8 bytes of _bv are written; each is set to ~(_flimit<<1)
   (converted to unsigned char by memset()).
  _bv:     The bounding values array to initialize.
  _flimit: The loop filter limit the stored value is derived from.*/
void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit){
  memset(_bv,~(_flimit<<1),8);
}
162
163
/*Apply the loop filter to a given set of fragment rows in the given plane.
164
The filter may be run on the bottom edge, affecting pixels in the next row of
165
fragments, so this row also needs to be available.
166
_bv: The bounding values array.
167
_refi: The index of the frame buffer to filter.
168
_pli: The color plane to filter.
169
_fragy0: The Y coordinate of the first fragment row to filter.
170
_fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
171
void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state,
172
signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
173
const oc_fragment_plane *fplane;
174
const oc_fragment *frags;
175
const ptrdiff_t *frag_buf_offs;
176
unsigned char *ref_frame_data;
177
ptrdiff_t fragi_top;
178
ptrdiff_t fragi_bot;
179
ptrdiff_t fragi0;
180
ptrdiff_t fragi0_end;
181
int ystride;
182
int nhfrags;
183
fplane=_state->fplanes+_pli;
184
nhfrags=fplane->nhfrags;
185
fragi_top=fplane->froffset;
186
fragi_bot=fragi_top+fplane->nfrags;
187
fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
188
fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
189
ystride=_state->ref_ystride[_pli];
190
frags=_state->frags;
191
frag_buf_offs=_state->frag_buf_offs;
192
ref_frame_data=_state->ref_frame_data[_refi];
193
/*The following loops are constructed somewhat non-intuitively on purpose.
194
The main idea is: if a block boundary has at least one coded fragment on
195
it, the filter is applied to it.
196
However, the order that the filters are applied in matters, and VP3 chose
197
the somewhat strange ordering used below.*/
198
while(fragi0<fragi0_end){
199
ptrdiff_t fragi;
200
ptrdiff_t fragi_end;
201
fragi=fragi0;
202
fragi_end=fragi+nhfrags;
203
while(fragi<fragi_end){
204
if(frags[fragi].coded){
205
unsigned char *ref;
206
ref=ref_frame_data+frag_buf_offs[fragi];
207
if(fragi>fragi0){
208
OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);
209
}
210
if(fragi0>fragi_top){
211
OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);
212
}
213
if(fragi+1<fragi_end&&!frags[fragi+1].coded){
214
OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref+8,ystride,_bv);
215
}
216
if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
217
OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref+(ystride*8),ystride,_bv);
218
}
219
}
220
fragi++;
221
}
222
fragi0+=nhfrags;
223
}
224
}
225
226
#endif
227
228