/* Path: blob/master/thirdparty/libtheora/x86_vc/mmxfrag.c */
/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation and contributors                      *
 * https://www.xiph.org/                                            *
 *                                                                  *
 ********************************************************************

  function:

 ********************************************************************/

/*MMX acceleration of fragment reconstruction for motion compensation.
  Originally written by Rudolf Marek.
  Additional optimization by Nils Pipenbrinck.
  Note: Loops are unrolled for best performance.
  The iteration each instruction belongs to is marked in the comments as #i.
  This file uses MSVC-style __asm inline assembly (x86_vc); register names
   for each block are bound via SRC/DST/YSTRIDE/... #defines so the same
   macro body can be reused with different register assignments.*/
#include <stddef.h>
#include "x86int.h"

#if defined(OC_X86_ASM)

/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
   between rows.
  The caller must #define SRC, DST, YSTRIDE and YSTRIDE3 to the registers to
   use before expanding this macro, and #undef them afterwards.
  YSTRIDE3 is loaded with 3*_ystride so rows 0..3 can be addressed without
   extra pointer updates; the block is copied as two groups of four rows.*/
# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
  do{ \
    const unsigned char *src; \
    unsigned char       *dst; \
    src=(_src); \
    dst=(_dst); \
    __asm  mov SRC,src \
    __asm  mov DST,dst \
    __asm  mov YSTRIDE,_ystride \
    /*src+0*ystride*/ \
    __asm  movq mm0,[SRC] \
    /*src+1*ystride*/ \
    __asm  movq mm1,[SRC+YSTRIDE] \
    /*ystride3=ystride*3*/ \
    __asm  lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
    /*src+2*ystride*/ \
    __asm  movq mm2,[SRC+YSTRIDE*2] \
    /*src+3*ystride*/ \
    __asm  movq mm3,[SRC+YSTRIDE3] \
    /*dst+0*ystride*/ \
    __asm  movq [DST],mm0 \
    /*dst+1*ystride*/ \
    __asm  movq [DST+YSTRIDE],mm1 \
    /*Pointer to next 4.*/ \
    __asm  lea SRC,[SRC+YSTRIDE*4] \
    /*dst+2*ystride*/ \
    __asm  movq [DST+YSTRIDE*2],mm2 \
    /*dst+3*ystride*/ \
    __asm  movq [DST+YSTRIDE3],mm3 \
    /*Pointer to next 4.*/ \
    __asm  lea DST,[DST+YSTRIDE*4] \
    /*src+0*ystride*/ \
    __asm  movq mm0,[SRC] \
    /*src+1*ystride*/ \
    __asm  movq mm1,[SRC+YSTRIDE] \
    /*src+2*ystride*/ \
    __asm  movq mm2,[SRC+YSTRIDE*2] \
    /*src+3*ystride*/ \
    __asm  movq mm3,[SRC+YSTRIDE3] \
    /*dst+0*ystride*/ \
    __asm  movq [DST],mm0 \
    /*dst+1*ystride*/ \
    __asm  movq [DST+YSTRIDE],mm1 \
    /*dst+2*ystride*/ \
    __asm  movq [DST+YSTRIDE*2],mm2 \
    /*dst+3*ystride*/ \
    __asm  movq [DST+YSTRIDE3],mm3 \
  } \
  while(0)

/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
   between rows.*/
void oc_frag_copy_mmx(unsigned char *_dst,
 const unsigned char *_src,int _ystride){
#define SRC edx
#define DST eax
#define YSTRIDE ecx
#define YSTRIDE3 esi
  OC_FRAG_COPY_MMX(_dst,_src,_ystride);
#undef SRC
#undef DST
#undef YSTRIDE
#undef YSTRIDE3
}

/*Copies the fragments specified by the lists of fragment indices from one
   frame to another.
  _dst_frame:     The reference frame to copy to.
  _src_frame:     The reference frame to copy from.
  _ystride:       The row stride of the reference frames.
  _fragis:        A pointer to a list of fragment indices.
  _nfragis:       The number of fragment indices to copy.
  _frag_buf_offs: The offsets of fragments in the reference frames.*/
void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
 const unsigned char *_src_frame,int _ystride,
 const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
  ptrdiff_t fragii;
  for(fragii=0;fragii<_nfragis;fragii++){
    ptrdiff_t frag_buf_off;
    /*Each fragment index selects a byte offset into both frames; the same
       offset is used for source and destination.*/
    frag_buf_off=_frag_buf_offs[_fragis[fragii]];
#define SRC edx
#define DST eax
#define YSTRIDE ecx
#define YSTRIDE3 edi
    OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off,
     _src_frame+frag_buf_off,_ystride);
#undef SRC
#undef DST
#undef YSTRIDE
#undef YSTRIDE3
  }
}

/*Reconstructs an INTRA-coded 8x8 fragment: each 16-bit residue value is
   biased by +128 (via saturating add of 0x0080 words) and packed to bytes
   with unsigned saturation, so results are clamped to [0,255].
  _dst:     The destination fragment (8x8 bytes).
  _ystride: The distance in bytes between rows of _dst.
  _residue: 64 16-bit residue values, in raster order.
  DST4 points at row 4 so all eight rows are addressable from two bases
   with the YSTRIDE/YSTRIDE*2/YSTRIDE3 offsets; no pointer updates needed.*/
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
 const ogg_int16_t *_residue){
  __asm{
#define DST edx
#define DST4 esi
#define YSTRIDE eax
#define YSTRIDE3 edi
#define RESIDUE ecx
    mov DST,_dst
    mov YSTRIDE,_ystride
    mov RESIDUE,_residue
    lea DST4,[DST+YSTRIDE*4]
    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
    /*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/
    pcmpeqw mm0,mm0
    /*#0 Load low residue.*/
    movq mm1,[0*8+RESIDUE]
    /*#0 Load high residue.*/
    movq mm2,[1*8+RESIDUE]
    /*Set mm0 to 0x8000800080008000.*/
    psllw mm0,15
    /*#1 Load low residue.*/
    movq mm3,[2*8+RESIDUE]
    /*#1 Load high residue.*/
    movq mm4,[3*8+RESIDUE]
    /*Set mm0 to 0x0080008000800080 (the +128 bias for each word).*/
    psrlw mm0,8
    /*#2 Load low residue.*/
    movq mm5,[4*8+RESIDUE]
    /*#2 Load high residue.*/
    movq mm6,[5*8+RESIDUE]
    /*#0 Bias low residue.*/
    paddsw mm1,mm0
    /*#0 Bias high residue.*/
    paddsw mm2,mm0
    /*#0 Pack to byte.*/
    packuswb mm1,mm2
    /*#1 Bias low residue.*/
    paddsw mm3,mm0
    /*#1 Bias high residue.*/
    paddsw mm4,mm0
    /*#1 Pack to byte.*/
    packuswb mm3,mm4
    /*#2 Bias low residue.*/
    paddsw mm5,mm0
    /*#2 Bias high residue.*/
    paddsw mm6,mm0
    /*#2 Pack to byte.*/
    packuswb mm5,mm6
    /*#0 Write row.*/
    movq [DST],mm1
    /*#1 Write row.*/
    movq [DST+YSTRIDE],mm3
    /*#2 Write row.*/
    movq [DST+YSTRIDE*2],mm5
    /*#3 Load low residue.*/
    movq mm1,[6*8+RESIDUE]
    /*#3 Load high residue.*/
    movq mm2,[7*8+RESIDUE]
    /*#4 Load low residue.*/
    movq mm3,[8*8+RESIDUE]
    /*#4 Load high residue.*/
    movq mm4,[9*8+RESIDUE]
    /*#5 Load low residue.*/
    movq mm5,[10*8+RESIDUE]
    /*#5 Load high residue.*/
    movq mm6,[11*8+RESIDUE]
    /*#3 Bias low residue.*/
    paddsw mm1,mm0
    /*#3 Bias high residue.*/
    paddsw mm2,mm0
    /*#3 Pack to byte.*/
    packuswb mm1,mm2
    /*#4 Bias low residue.*/
    paddsw mm3,mm0
    /*#4 Bias high residue.*/
    paddsw mm4,mm0
    /*#4 Pack to byte.*/
    packuswb mm3,mm4
    /*#5 Bias low residue.*/
    paddsw mm5,mm0
    /*#5 Bias high residue.*/
    paddsw mm6,mm0
    /*#5 Pack to byte.*/
    packuswb mm5,mm6
    /*#3 Write row.*/
    movq [DST+YSTRIDE3],mm1
    /*#4 Write row.*/
    movq [DST4],mm3
    /*#5 Write row.*/
    movq [DST4+YSTRIDE],mm5
    /*#6 Load low residue.*/
    movq mm1,[12*8+RESIDUE]
    /*#6 Load high residue.*/
    movq mm2,[13*8+RESIDUE]
    /*#7 Load low residue.*/
    movq mm3,[14*8+RESIDUE]
    /*#7 Load high residue.*/
    movq mm4,[15*8+RESIDUE]
    /*#6 Bias low residue.*/
    paddsw mm1,mm0
    /*#6 Bias high residue.*/
    paddsw mm2,mm0
    /*#6 Pack to byte.*/
    packuswb mm1,mm2
    /*#7 Bias low residue.*/
    paddsw mm3,mm0
    /*#7 Bias high residue.*/
    paddsw mm4,mm0
    /*#7 Pack to byte.*/
    packuswb mm3,mm4
    /*#6 Write row.*/
    movq [DST4+YSTRIDE*2],mm1
    /*#7 Write row.*/
    movq [DST4+YSTRIDE3],mm3
#undef DST
#undef DST4
#undef YSTRIDE
#undef YSTRIDE3
#undef RESIDUE
  }
}

/*Reconstructs an INTER-coded 8x8 fragment: adds the 16-bit residues to the
   predictor block at _src with signed saturation and packs the sums to
   bytes with unsigned saturation (clamped to [0,255]).
  _dst:     The destination fragment (8x8 bytes).
  _src:     The predictor fragment (8x8 bytes).
  _ystride: The distance in bytes between rows of _dst and _src.
  _residue: 64 16-bit residue values, in raster order.
  Two rows are processed per iteration; the updated pointers are stored back
   into the C locals at the end of each __asm block because register contents
   are not guaranteed to survive between iterations.*/
void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src,
 int _ystride,const ogg_int16_t *_residue){
  int i;
  /*Zero mm0 (used to unpack bytes to words).*/
  __asm pxor mm0,mm0;
  for(i=4;i-->0;){
    __asm{
#define DST edx
#define SRC ecx
#define YSTRIDE edi
#define RESIDUE eax
      mov DST,_dst
      mov SRC,_src
      mov YSTRIDE,_ystride
      mov RESIDUE,_residue
      /*#0 Load source.*/
      movq mm3,[SRC]
      /*#1 Load source.*/
      movq mm7,[SRC+YSTRIDE]
      /*#0 Get copy of src.*/
      movq mm4,mm3
      /*#0 Expand high source.*/
      punpckhbw mm4,mm0
      /*#0 Expand low source.*/
      punpcklbw mm3,mm0
      /*#0 Add residue high.*/
      paddsw mm4,[8+RESIDUE]
      /*#1 Get copy of src.*/
      movq mm2,mm7
      /*#0 Add residue low.*/
      paddsw mm3,[RESIDUE]
      /*#1 Expand high source.*/
      punpckhbw mm2,mm0
      /*#0 Pack final row pixels.*/
      packuswb mm3,mm4
      /*#1 Expand low source.*/
      punpcklbw mm7,mm0
      /*#1 Add residue low.*/
      paddsw mm7,[16+RESIDUE]
      /*#1 Add residue high.*/
      paddsw mm2,[24+RESIDUE]
      /*Advance residue.*/
      lea RESIDUE,[32+RESIDUE]
      /*#1 Pack final row pixels.*/
      packuswb mm7,mm2
      /*Advance src.*/
      lea SRC,[SRC+YSTRIDE*2]
      /*#0 Write row.*/
      movq [DST],mm3
      /*#1 Write row.*/
      movq [DST+YSTRIDE],mm7
      /*Advance dst.*/
      lea DST,[DST+YSTRIDE*2]
      /*Save the advanced pointers for the next iteration.*/
      mov _residue,RESIDUE
      mov _dst,DST
      mov _src,SRC
#undef DST
#undef SRC
#undef YSTRIDE
#undef RESIDUE
    }
  }
}

/*Reconstructs a bi-predicted INTER-coded 8x8 fragment: averages the two
   predictor blocks ((src1+src2)>>1, computed per pixel in 16-bit precision),
   adds the residues with signed saturation, and packs to bytes with unsigned
   saturation (clamped to [0,255]).
  _dst:     The destination fragment (8x8 bytes).
  _src1:    The first predictor fragment (8x8 bytes).
  _src2:    The second predictor fragment (8x8 bytes).
  _ystride: The distance in bytes between rows of each buffer.
  _residue: 64 16-bit residue values, in raster order.
  Two rows are processed per iteration; pointers are written back to the C
   locals so they survive between __asm blocks.*/
void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
 const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){
  int i;
  /*Zero mm7 (used to unpack bytes to words).*/
  __asm pxor mm7,mm7;
  for(i=4;i-->0;){
    __asm{
#define SRC1 ecx
#define SRC2 edi
#define YSTRIDE esi
#define RESIDUE edx
#define DST eax
      mov YSTRIDE,_ystride
      mov DST,_dst
      mov RESIDUE,_residue
      mov SRC1,_src1
      mov SRC2,_src2
      /*#0 Load src1.*/
      movq mm0,[SRC1]
      /*#0 Load src2.*/
      movq mm2,[SRC2]
      /*#0 Copy src1.*/
      movq mm1,mm0
      /*#0 Copy src2.*/
      movq mm3,mm2
      /*#1 Load src1.*/
      movq mm4,[SRC1+YSTRIDE]
      /*#0 Unpack lower src1.*/
      punpcklbw mm0,mm7
      /*#1 Load src2.*/
      movq mm5,[SRC2+YSTRIDE]
      /*#0 Unpack higher src1.*/
      punpckhbw mm1,mm7
      /*#0 Unpack lower src2.*/
      punpcklbw mm2,mm7
      /*#0 Unpack higher src2.*/
      punpckhbw mm3,mm7
      /*Advance src1 ptr.*/
      lea SRC1,[SRC1+YSTRIDE*2]
      /*Advance src2 ptr.*/
      lea SRC2,[SRC2+YSTRIDE*2]
      /*#0 Lower src1+src2.*/
      paddsw mm0,mm2
      /*#0 Higher src1+src2.*/
      paddsw mm1,mm3
      /*#1 Copy src1.*/
      movq mm2,mm4
      /*#0 Build lo average.*/
      psraw mm0,1
      /*#1 Copy src2.*/
      movq mm3,mm5
      /*#1 Unpack lower src1.*/
      punpcklbw mm4,mm7
      /*#0 Build hi average.*/
      psraw mm1,1
      /*#1 Unpack higher src1.*/
      punpckhbw mm2,mm7
      /*#0 low+=residue.*/
      paddsw mm0,[RESIDUE]
      /*#1 Unpack lower src2.*/
      punpcklbw mm5,mm7
      /*#0 high+=residue.*/
      paddsw mm1,[8+RESIDUE]
      /*#1 Unpack higher src2.*/
      punpckhbw mm3,mm7
      /*#1 Lower src1+src2.*/
      paddsw mm5,mm4
      /*#0 Pack and saturate.*/
      packuswb mm0,mm1
      /*#1 Higher src1+src2.*/
      paddsw mm3,mm2
      /*#0 Write row.*/
      movq [DST],mm0
      /*#1 Build lo average.*/
      psraw mm5,1
      /*#1 Build hi average.*/
      psraw mm3,1
      /*#1 low+=residue.*/
      paddsw mm5,[16+RESIDUE]
      /*#1 high+=residue.*/
      paddsw mm3,[24+RESIDUE]
      /*#1 Pack and saturate.*/
      packuswb mm5,mm3
      /*#1 Write row.*/
      movq [DST+YSTRIDE],mm5
      /*Advance residue ptr.*/
      add RESIDUE,32
      /*Advance dest ptr.*/
      lea DST,[DST+YSTRIDE*2]
      /*Save the advanced pointers for the next iteration.*/
      mov _dst,DST
      mov _residue,RESIDUE
      mov _src1,SRC1
      mov _src2,SRC2
#undef SRC1
#undef SRC2
#undef YSTRIDE
#undef RESIDUE
#undef DST
    }
  }
}

/*Restores the FPU state after MMX use: emms clears the MMX state so
   subsequent x87 floating-point instructions work correctly.*/
void oc_restore_fpu_mmx(void){
  __asm emms;
}

#endif