CoCalc -- mmxstate.c

GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/libtheora/x86_vc/mmxstate.c
²²¹⁸⁸ views
1
/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation and contributors                      *
 * https://www.xiph.org/                                            *
 *                                                                  *
 ********************************************************************

  function:

 ********************************************************************/
17

18
/*MMX acceleration of the complete fragment reconstruction algorithm.
  Originally written by Rudolf Marek.*/
20
#include <string.h>
21
#include "x86int.h"
22
#include "mmxloop.h"
23

24
#if defined(OC_X86_ASM)
25

26
/*Reconstructs a single fragment and writes it into the OC_FRAME_SELF
   reference buffer.
  The DC coefficient is dequantized, the residual is produced in the second
   half of _dct_coeffs (_dct_coeffs+64), and the residual is then combined
   with zero, one or two predictors depending on the fragment's reference
   frame and motion vector.
  _state:      The decoder state; supplies the fragment table, the fragment
                buffer offsets, the plane strides, the motion vectors and the
                reference frame pointers.
  _fragi:      The index of the fragment to reconstruct.
  _pli:        The color plane the fragment belongs to.
  _dct_coeffs: The fragment's coefficients.
               The upper 64 entries are overwritten with the residual, and
                _dct_coeffs[0] is overwritten when the full iDCT is used.
  _last_zzi:   When this is less than 2 only the DC coefficient is used and
                the iDCT is skipped.
  _dc_quant:   The quantizer that _dct_coeffs[0] is multiplied by.*/
void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
 int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
  unsigned char *dst;
  ptrdiff_t      frag_buf_off;
  int            ystride;
  int            refi;
  /*Apply the inverse transform.*/
  /*Special case only having a DC component.*/
  if(_last_zzi<2){
    /*Note that this value must be unsigned, to keep the __asm__ block from
       sign-extending it when it puts it in a register.*/
    ogg_uint16_t p;
    /*We round this dequant product (and not any of the others) because there's
       no iDCT rounding.*/
    p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
    /*Fill _dct_coeffs with p.*/
    __asm{
#define Y eax
#define P ecx
      mov Y,_dct_coeffs
      movzx P,p
      /*Advance 128 bytes, i.e., to _dct_coeffs+64 for 16-bit coefficients, so
         that the stores below fill the residual half of the buffer.*/
      lea Y,[Y+128]
      /*mm0=0000 0000 0000 AAAA*/
      movd mm0,P
      /*mm0=0000 0000 AAAA AAAA*/
      punpcklwd mm0,mm0
      /*mm0=AAAA AAAA AAAA AAAA*/
      punpckldq mm0,mm0
      /*16 stores of 8 bytes each cover the 128 bytes starting at Y.*/
      movq [Y],mm0
      movq [8+Y],mm0
      movq [16+Y],mm0
      movq [24+Y],mm0
      movq [32+Y],mm0
      movq [40+Y],mm0
      movq [48+Y],mm0
      movq [56+Y],mm0
      movq [64+Y],mm0
      movq [72+Y],mm0
      movq [80+Y],mm0
      movq [88+Y],mm0
      movq [96+Y],mm0
      movq [104+Y],mm0
      movq [112+Y],mm0
      movq [120+Y],mm0
#undef Y
#undef P
    }
  }
  else{
    /*Dequantize the DC coefficient.*/
    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
    oc_idct8x8_mmx(_dct_coeffs+64,_dct_coeffs,_last_zzi);
  }
  /*Fill in the target buffer.*/
  frag_buf_off=_state->frag_buf_offs[_fragi];
  refi=_state->frags[_fragi].refi;
  ystride=_state->ref_ystride[_pli];
  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
  /*A fragment that references the frame being decoded has no predictor.*/
  if(refi==OC_FRAME_SELF)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64);
  else{
    const unsigned char *ref;
    int                  mvoffsets[2];
    ref=_state->ref_frame_data[refi]+frag_buf_off;
    /*A return value greater than 1 means both entries of mvoffsets are used,
       so the two-predictor reconstruction is required.*/
    if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
     _state->frag_mvs[_fragi])>1){
      oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
       _dct_coeffs+64);
    }
    else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
  }
}
97

98
/*We copy this entire function to inline the actual MMX routines so that we
   use only a single indirect call.*/
100

101
/*Initializes the bounding values used by the MMX loop filter.
  Only the first 8 bytes of _bv are written; each one is set to the low byte
   of ~(_flimit<<1).
  _bv:     The bounding values array to initialize.
  _flimit: The value the bounding values are derived from (presumably the
            loop filter limit; confirm against the caller).*/
void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){
  memset(_bv,~(_flimit<<1),8);
}
104

105
/*Apply the loop filter to a given set of fragment rows in the given plane.
106
  The filter may be run on the bottom edge, affecting pixels in the next row of
107
   fragments, so this row also needs to be available.
108
  _bv:        The bounding values array.
109
  _refi:      The index of the frame buffer to filter.
110
  _pli:       The color plane to filter.
111
  _fragy0:    The Y coordinate of the first fragment row to filter.
112
  _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
113
void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
114
 signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
115
  const oc_fragment_plane *fplane;
116
  const oc_fragment       *frags;
117
  const ptrdiff_t         *frag_buf_offs;
118
  unsigned char           *ref_frame_data;
119
  ptrdiff_t                fragi_top;
120
  ptrdiff_t                fragi_bot;
121
  ptrdiff_t                fragi0;
122
  ptrdiff_t                fragi0_end;
123
  int                      ystride;
124
  int                      nhfrags;
125
  fplane=_state->fplanes+_pli;
126
  nhfrags=fplane->nhfrags;
127
  fragi_top=fplane->froffset;
128
  fragi_bot=fragi_top+fplane->nfrags;
129
  fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
130
  fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
131
  ystride=_state->ref_ystride[_pli];
132
  frags=_state->frags;
133
  frag_buf_offs=_state->frag_buf_offs;
134
  ref_frame_data=_state->ref_frame_data[_refi];
135
  /*The following loops are constructed somewhat non-intuitively on purpose.
136
    The main idea is: if a block boundary has at least one coded fragment on
137
     it, the filter is applied to it.
138
    However, the order that the filters are applied in matters, and VP3 chose
139
     the somewhat strange ordering used below.*/
140
  while(fragi0<fragi0_end){
141
    ptrdiff_t fragi;
142
    ptrdiff_t fragi_end;
143
    fragi=fragi0;
144
    fragi_end=fragi+nhfrags;
145
    while(fragi<fragi_end){
146
      if(frags[fragi].coded){
147
        unsigned char *ref;
148
        ref=ref_frame_data+frag_buf_offs[fragi];
149
#define PIX eax
150
#define YSTRIDE3 edi
151
#define YSTRIDE ecx
152
#define LL edx
153
#define D esi
154
#define D_WORD si
155
        if(fragi>fragi0)OC_LOOP_FILTER_H_MMX(ref,ystride,_bv);
156
        if(fragi0>fragi_top)OC_LOOP_FILTER_V_MMX(ref,ystride,_bv);
157
        if(fragi+1<fragi_end&&!frags[fragi+1].coded){
158
          OC_LOOP_FILTER_H_MMX(ref+8,ystride,_bv);
159
        }
160
        if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
161
          OC_LOOP_FILTER_V_MMX(ref+(ystride*8),ystride,_bv);
162
        }
163
#undef PIX
164
#undef YSTRIDE3
165
#undef YSTRIDE
166
#undef LL
167
#undef D
168
#undef D_WORD
169
      }
170
      fragi++;
171
    }
172
    fragi0+=nhfrags;
173
  }
174
}
175

176
#endif
177

178
Product

Resources

Company