/*Path: blob/master/thirdparty/libtheora/x86/mmxstate.c
  9898 views*/
/********************************************************************1* *2* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *3* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *4* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *5* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *6* *7* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *8* by the Xiph.Org Foundation and contributors *9* https://www.xiph.org/ *10* *11********************************************************************1213function:1415********************************************************************/1617/*MMX acceleration of complete fragment reconstruction algorithm.18Originally written by Rudolf Marek.*/19#include <string.h>20#include "x86int.h"21#include "mmxloop.h"2223#if defined(OC_X86_ASM)2425void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,26int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){27unsigned char *dst;28ptrdiff_t frag_buf_off;29int ystride;30int refi;31/*Apply the inverse transform.*/32/*Special case only having a DC component.*/33if(_last_zzi<2){34/*Note that this value must be unsigned, to keep the __asm__ block from35sign-extending it when it puts it in a register.*/36ogg_uint16_t p;37int i;38/*We round this dequant product (and not any of the others) because there's39no iDCT rounding.*/40p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);41/*Fill _dct_coeffs with p.*/42__asm__ __volatile__(43/*mm0=0000 0000 0000 AAAA*/44"movd %[p],%%mm0\n\t"45/*mm0=0000 0000 AAAA AAAA*/46"punpcklwd %%mm0,%%mm0\n\t"47/*mm0=AAAA AAAA AAAA AAAA*/48"punpckldq %%mm0,%%mm0\n\t"49:50:[p]"r"((unsigned)p)51);52for(i=0;i<4;i++){53__asm__ __volatile__(54"movq %%mm0,"OC_MEM_OFFS(0x00,y)"\n\t"55"movq %%mm0,"OC_MEM_OFFS(0x08,y)"\n\t"56"movq %%mm0,"OC_MEM_OFFS(0x10,y)"\n\t"57"movq 
%%mm0,"OC_MEM_OFFS(0x18,y)"\n\t"58:[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_dct_coeffs+64+16*i,16)59);60}61}62else{63/*Dequantize the DC coefficient.*/64_dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);65oc_idct8x8(_state,_dct_coeffs+64,_dct_coeffs,_last_zzi);66}67/*Fill in the target buffer.*/68frag_buf_off=_state->frag_buf_offs[_fragi];69refi=_state->frags[_fragi].refi;70ystride=_state->ref_ystride[_pli];71dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;72if(refi==OC_FRAME_SELF)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64);73else{74const unsigned char *ref;75int mvoffsets[2];76ref=_state->ref_frame_data[refi]+frag_buf_off;77if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,78_state->frag_mvs[_fragi])>1){79oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,80_dct_coeffs+64);81}82else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);83}84}8586/*We copy these entire function to inline the actual MMX routines so that we87use only a single indirect call.*/8889void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){90memset(_bv,_flimit,8);91}9293/*Apply the loop filter to a given set of fragment rows in the given plane.94The filter may be run on the bottom edge, affecting pixels in the next row of95fragments, so this row also needs to be available.96_bv: The bounding values array.97_refi: The index of the frame buffer to filter.98_pli: The color plane to filter.99_fragy0: The Y coordinate of the first fragment row to filter.100_fragy_end: The Y coordinate of the fragment row to stop filtering at.*/101void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,102signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){103OC_ALIGN8(unsigned char ll[8]);104const oc_fragment_plane *fplane;105const oc_fragment *frags;106const ptrdiff_t *frag_buf_offs;107unsigned char *ref_frame_data;108ptrdiff_t fragi_top;109ptrdiff_t fragi_bot;110ptrdiff_t fragi0;111ptrdiff_t fragi0_end;112int 
ystride;113int nhfrags;114memset(ll,_state->loop_filter_limits[_state->qis[0]],sizeof(ll));115fplane=_state->fplanes+_pli;116nhfrags=fplane->nhfrags;117fragi_top=fplane->froffset;118fragi_bot=fragi_top+fplane->nfrags;119fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;120fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;121ystride=_state->ref_ystride[_pli];122frags=_state->frags;123frag_buf_offs=_state->frag_buf_offs;124ref_frame_data=_state->ref_frame_data[_refi];125/*The following loops are constructed somewhat non-intuitively on purpose.126The main idea is: if a block boundary has at least one coded fragment on127it, the filter is applied to it.128However, the order that the filters are applied in matters, and VP3 chose129the somewhat strange ordering used below.*/130while(fragi0<fragi0_end){131ptrdiff_t fragi;132ptrdiff_t fragi_end;133fragi=fragi0;134fragi_end=fragi+nhfrags;135while(fragi<fragi_end){136if(frags[fragi].coded){137unsigned char *ref;138ref=ref_frame_data+frag_buf_offs[fragi];139if(fragi>fragi0){140OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref,ystride,ll);141}142if(fragi0>fragi_top){143OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref,ystride,ll);144}145if(fragi+1<fragi_end&&!frags[fragi+1].coded){146OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref+8,ystride,ll);147}148if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){149OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref+(ystride*8),ystride,ll);150}151}152fragi++;153}154fragi0+=nhfrags;155}156}157158void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit){159memset(_bv,~(_flimit<<1),8);160}161162/*Apply the loop filter to a given set of fragment rows in the given plane.163The filter may be run on the bottom edge, affecting pixels in the next row of164fragments, so this row also needs to be available.165_bv: The bounding values array.166_refi: The index of the frame buffer to filter.167_pli: The color plane to filter.168_fragy0: The Y coordinate of the first fragment row to filter.169_fragy_end: The Y coordinate 
of the fragment row to stop filtering at.*/170void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state,171signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){172const oc_fragment_plane *fplane;173const oc_fragment *frags;174const ptrdiff_t *frag_buf_offs;175unsigned char *ref_frame_data;176ptrdiff_t fragi_top;177ptrdiff_t fragi_bot;178ptrdiff_t fragi0;179ptrdiff_t fragi0_end;180int ystride;181int nhfrags;182fplane=_state->fplanes+_pli;183nhfrags=fplane->nhfrags;184fragi_top=fplane->froffset;185fragi_bot=fragi_top+fplane->nfrags;186fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;187fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;188ystride=_state->ref_ystride[_pli];189frags=_state->frags;190frag_buf_offs=_state->frag_buf_offs;191ref_frame_data=_state->ref_frame_data[_refi];192/*The following loops are constructed somewhat non-intuitively on purpose.193The main idea is: if a block boundary has at least one coded fragment on194it, the filter is applied to it.195However, the order that the filters are applied in matters, and VP3 chose196the somewhat strange ordering used below.*/197while(fragi0<fragi0_end){198ptrdiff_t fragi;199ptrdiff_t fragi_end;200fragi=fragi0;201fragi_end=fragi+nhfrags;202while(fragi<fragi_end){203if(frags[fragi].coded){204unsigned char *ref;205ref=ref_frame_data+frag_buf_offs[fragi];206if(fragi>fragi0){207OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);208}209if(fragi0>fragi_top){210OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);211}212if(fragi+1<fragi_end&&!frags[fragi+1].coded){213OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref+8,ystride,_bv);214}215if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){216OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref+(ystride*8),ystride,_bv);217}218}219fragi++;220}221fragi0+=nhfrags;222}223}224225#endif226227228