Path: thirdparty/libtheora/x86_vc/mmxencfrag.c
/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.  *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS    *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.      *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation https://www.xiph.org/                 *
 *                                                                  *
 ********************************************************************

  function:

 ********************************************************************/
#include <stddef.h>
#include "x86enc.h"

#if defined(OC_X86_ASM)

unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  ptrdiff_t ret;
  __asm{
#define SRC esi
#define REF edx
#define YSTRIDE ecx
#define YSTRIDE3 edi
    mov YSTRIDE,_ystride
    mov SRC,_src
    mov REF,_ref
    /*Load the first 4 rows of each block.*/
    movq mm0,[SRC]
    movq mm1,[REF]
    movq mm2,[SRC][YSTRIDE]
    movq mm3,[REF][YSTRIDE]
    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
    movq mm4,[SRC+YSTRIDE*2]
    movq mm5,[REF+YSTRIDE*2]
    movq mm6,[SRC+YSTRIDE3]
    movq mm7,[REF+YSTRIDE3]
    /*Compute their SADs and add them in mm0*/
    psadbw mm0,mm1
    psadbw mm2,mm3
    lea SRC,[SRC+YSTRIDE*4]
    paddw mm0,mm2
    lea REF,[REF+YSTRIDE*4]
    /*Load the next 3 rows as registers become available.*/
    movq mm2,[SRC]
    movq mm3,[REF]
    psadbw mm4,mm5
    psadbw mm6,mm7
    paddw mm0,mm4
    movq mm5,[REF+YSTRIDE]
    movq mm4,[SRC+YSTRIDE]
    paddw mm0,mm6
    movq mm7,[REF+YSTRIDE*2]
    movq mm6,[SRC+YSTRIDE*2]
    /*Start adding their SADs to mm0*/
    psadbw mm2,mm3
    psadbw mm4,mm5
    paddw mm0,mm2
    psadbw mm6,mm7
    /*Load last row as registers become available.*/
    movq mm2,[SRC+YSTRIDE3]
    movq mm3,[REF+YSTRIDE3]
    /*And finish adding up their SADs.*/
    paddw mm0,mm4
    psadbw mm2,mm3
    paddw mm0,mm6
    paddw mm0,mm2
    movd [ret],mm0
#undef SRC
#undef REF
#undef YSTRIDE
#undef YSTRIDE3
  }
  return (unsigned)ret;
}

unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
 const unsigned char *_ref,int _ystride,unsigned _thresh){
  /*Early termination is for suckers.*/
  return oc_enc_frag_sad_mmxext(_src,_ref,_ystride);
}
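
/*An illustrative plain-C sketch of what oc_enc_frag_sad_mmxext() above
   computes: the sum of absolute differences over an 8x8 block with a common
   stride.
  The helper name is hypothetical and the block is kept under #if 0 so it is
   never compiled.*/
#if 0
static unsigned oc_enc_frag_sad_c_sketch(const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  unsigned sad;
  int      i;
  int      j;
  sad=0;
  for(i=0;i<8;i++){
    /*psadbw handles one whole 8-byte row of this inner loop at a time.*/
    for(j=0;j<8;j++){
      int d;
      d=_src[j]-_ref[j];
      sad+=d<0?-d:d;
    }
    _src+=_ystride;
    _ref+=_ystride;
  }
  return sad;
}
#endif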

#define OC_SAD2_LOOP __asm{ \
  /*We want to compute (mm0+mm1>>1) on unsigned bytes without overflow, but \
     pavgb computes (mm0+mm1+1>>1). \
    The latter is exactly 1 too large when the low bit of two corresponding \
     bytes is only set in one of them. \
    Therefore we pxor the operands, pand to mask out the low bits, and psubb to \
     correct the output of pavgb.*/ \
  __asm movq mm6,mm0 \
  __asm lea REF1,[REF1+YSTRIDE*2] \
  __asm pxor mm0,mm1 \
  __asm pavgb mm6,mm1 \
  __asm lea REF2,[REF2+YSTRIDE*2] \
  __asm movq mm1,mm2 \
  __asm pand mm0,mm7 \
  __asm pavgb mm2,mm3 \
  __asm pxor mm1,mm3 \
  __asm movq mm3,[REF2+YSTRIDE] \
  __asm psubb mm6,mm0 \
  __asm movq mm0,[REF1] \
  __asm pand mm1,mm7 \
  __asm psadbw mm4,mm6 \
  __asm movd mm6,RET \
  __asm psubb mm2,mm1 \
  __asm movq mm1,[REF2] \
  __asm lea SRC,[SRC+YSTRIDE*2] \
  __asm psadbw mm5,mm2 \
  __asm movq mm2,[REF1+YSTRIDE] \
  __asm paddw mm5,mm4 \
  __asm movq mm4,[SRC] \
  __asm paddw mm6,mm5 \
  __asm movq mm5,[SRC+YSTRIDE] \
  __asm movd RET,mm6 \
}

/*Same as above, but does not pre-load the next two rows.*/
#define OC_SAD2_TAIL __asm{ \
  __asm movq mm6,mm0 \
  __asm pavgb mm0,mm1 \
  __asm pxor mm6,mm1 \
  __asm movq mm1,mm2 \
  __asm pand mm6,mm7 \
  __asm pavgb mm2,mm3 \
  __asm pxor mm1,mm3 \
  __asm psubb mm0,mm6 \
  __asm pand mm1,mm7 \
  __asm psadbw mm4,mm0 \
  __asm psubb mm2,mm1 \
  __asm movd mm6,RET \
  __asm psadbw mm5,mm2 \
  __asm paddw mm5,mm4 \
  __asm paddw mm6,mm5 \
  __asm movd RET,mm6 \
}

unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
 const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
 unsigned _thresh){
  ptrdiff_t ret;
  __asm{
#define REF1 ecx
#define REF2 edi
#define YSTRIDE esi
#define SRC edx
#define RET eax
    mov YSTRIDE,_ystride
    mov SRC,_src
    mov REF1,_ref1
    mov REF2,_ref2
    movq mm0,[REF1]
    movq mm1,[REF2]
    movq mm2,[REF1+YSTRIDE]
    movq mm3,[REF2+YSTRIDE]
    xor RET,RET
    movq mm4,[SRC]
    pxor mm7,mm7
    pcmpeqb mm6,mm6
    movq mm5,[SRC+YSTRIDE]
    psubb mm7,mm6
    OC_SAD2_LOOP
    OC_SAD2_LOOP
    OC_SAD2_LOOP
    OC_SAD2_TAIL
    mov [ret],RET
#undef REF1
#undef REF2
#undef YSTRIDE
#undef SRC
#undef RET
  }
  return (unsigned)ret;
}
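
/*An illustrative sketch of the identity behind the correction used by
   OC_SAD2_LOOP and OC_SAD2_TAIL above, written for a single pair of bytes:
   (a+b>>1)==(a+b+1>>1)-((a^b)&1), so the rounding-up pavgb result can be
   fixed up with pxor/pand/psubb without ever forming a+b in 8 bits.
  The helper name is hypothetical and the block is kept under #if 0 so it is
   never compiled.*/
#if 0
static unsigned char oc_avg_down_sketch(unsigned char _a,unsigned char _b){
  unsigned char avg_up;
  /*What pavgb computes (the intermediate sum is widened here, which the MMX
     code cannot afford to do).*/
  avg_up=(unsigned char)(_a+_b+1>>1);
  /*Subtract 1 exactly when the low bit is set in only one of the inputs.*/
  return (unsigned char)(avg_up-((_a^_b)&1));
}
#endif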

/*Load an 8x4 array of pixel values from %[src] and %[ref] and compute their
   16-bit difference in mm0...mm7.*/
#define OC_LOAD_SUB_8x4(_off) __asm{ \
  __asm movd mm0,[_off+SRC] \
  __asm movd mm4,[_off+REF] \
  __asm movd mm1,[_off+SRC+SRC_YSTRIDE] \
  __asm lea SRC,[SRC+SRC_YSTRIDE*2] \
  __asm movd mm5,[_off+REF+REF_YSTRIDE] \
  __asm lea REF,[REF+REF_YSTRIDE*2] \
  __asm movd mm2,[_off+SRC] \
  __asm movd mm7,[_off+REF] \
  __asm movd mm3,[_off+SRC+SRC_YSTRIDE] \
  __asm movd mm6,[_off+REF+REF_YSTRIDE] \
  __asm punpcklbw mm0,mm4 \
  __asm lea SRC,[SRC+SRC_YSTRIDE*2] \
  __asm punpcklbw mm4,mm4 \
  __asm lea REF,[REF+REF_YSTRIDE*2] \
  __asm psubw mm0,mm4 \
  __asm movd mm4,[_off+SRC] \
  __asm movq [_off*2+BUF],mm0 \
  __asm movd mm0,[_off+REF] \
  __asm punpcklbw mm1,mm5 \
  __asm punpcklbw mm5,mm5 \
  __asm psubw mm1,mm5 \
  __asm movd mm5,[_off+SRC+SRC_YSTRIDE] \
  __asm punpcklbw mm2,mm7 \
  __asm punpcklbw mm7,mm7 \
  __asm psubw mm2,mm7 \
  __asm movd mm7,[_off+REF+REF_YSTRIDE] \
  __asm punpcklbw mm3,mm6 \
  __asm lea SRC,[SRC+SRC_YSTRIDE*2] \
  __asm punpcklbw mm6,mm6 \
  __asm psubw mm3,mm6 \
  __asm movd mm6,[_off+SRC] \
  __asm punpcklbw mm4,mm0 \
  __asm lea REF,[REF+REF_YSTRIDE*2] \
  __asm punpcklbw mm0,mm0 \
  __asm lea SRC,[SRC+SRC_YSTRIDE*2] \
  __asm psubw mm4,mm0 \
  __asm movd mm0,[_off+REF] \
  __asm punpcklbw mm5,mm7 \
  __asm neg SRC_YSTRIDE \
  __asm punpcklbw mm7,mm7 \
  __asm psubw mm5,mm7 \
  __asm movd mm7,[_off+SRC+SRC_YSTRIDE] \
  __asm punpcklbw mm6,mm0 \
  __asm lea REF,[REF+REF_YSTRIDE*2] \
  __asm punpcklbw mm0,mm0 \
  __asm neg REF_YSTRIDE \
  __asm psubw mm6,mm0 \
  __asm movd mm0,[_off+REF+REF_YSTRIDE] \
  __asm lea SRC,[SRC+SRC_YSTRIDE*8] \
  __asm punpcklbw mm7,mm0 \
  __asm neg SRC_YSTRIDE \
  __asm punpcklbw mm0,mm0 \
  __asm lea REF,[REF+REF_YSTRIDE*8] \
  __asm psubw mm7,mm0 \
  __asm neg REF_YSTRIDE \
  __asm movq mm0,[_off*2+BUF] \
}

/*Load an 8x4 array of pixel values from %[src] into %%mm0...%%mm7.*/
#define OC_LOAD_8x4(_off) __asm{ \
  __asm movd mm0,[_off+SRC] \
  __asm movd mm1,[_off+SRC+YSTRIDE] \
  __asm movd mm2,[_off+SRC+YSTRIDE*2] \
  __asm pxor mm7,mm7 \
  __asm movd mm3,[_off+SRC+YSTRIDE3] \
  __asm punpcklbw mm0,mm7 \
  __asm movd mm4,[_off+SRC4] \
  __asm punpcklbw mm1,mm7 \
  __asm movd mm5,[_off+SRC4+YSTRIDE] \
  __asm punpcklbw mm2,mm7 \
  __asm movd mm6,[_off+SRC4+YSTRIDE*2] \
  __asm punpcklbw mm3,mm7 \
  __asm movd mm7,[_off+SRC4+YSTRIDE3] \
  __asm punpcklbw mm4,mm4 \
  __asm punpcklbw mm5,mm5 \
  __asm psrlw mm4,8 \
  __asm psrlw mm5,8 \
  __asm punpcklbw mm6,mm6 \
  __asm punpcklbw mm7,mm7 \
  __asm psrlw mm6,8 \
  __asm psrlw mm7,8 \
}

/*Performs the first two stages of an 8-point 1-D Hadamard transform.
  The transform is performed in place, except that outputs 0-3 are swapped with
   outputs 4-7.
  Outputs 2, 3, 6, and 7 from the second stage are negated (which allows us to
   perform this stage in place with no temporary registers).*/
#define OC_HADAMARD_AB_8x4 __asm{ \
  /*Stage A: \
     Outputs 0-3 are swapped with 4-7 here.*/ \
  __asm paddw mm5,mm1 \
  __asm paddw mm6,mm2 \
  __asm paddw mm1,mm1 \
  __asm paddw mm2,mm2 \
  __asm psubw mm1,mm5 \
  __asm psubw mm2,mm6 \
  __asm paddw mm7,mm3 \
  __asm paddw mm4,mm0 \
  __asm paddw mm3,mm3 \
  __asm paddw mm0,mm0 \
  __asm psubw mm3,mm7 \
  __asm psubw mm0,mm4 \
  /*Stage B:*/ \
  __asm paddw mm0,mm2 \
  __asm paddw mm1,mm3 \
  __asm paddw mm4,mm6 \
  __asm paddw mm5,mm7 \
  __asm paddw mm2,mm2 \
  __asm paddw mm3,mm3 \
  __asm paddw mm6,mm6 \
  __asm paddw mm7,mm7 \
  __asm psubw mm2,mm0 \
  __asm psubw mm3,mm1 \
  __asm psubw mm6,mm4 \
  __asm psubw mm7,mm5 \
}

/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
  Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
   place with no temporary registers).*/
#define OC_HADAMARD_C_8x4 __asm{ \
  /*Stage C:*/ \
  __asm paddw mm0,mm1 \
  __asm paddw mm2,mm3 \
  __asm paddw mm4,mm5 \
  __asm paddw mm6,mm7 \
  __asm paddw mm1,mm1 \
  __asm paddw mm3,mm3 \
  __asm paddw mm5,mm5 \
  __asm paddw mm7,mm7 \
  __asm psubw mm1,mm0 \
  __asm psubw mm3,mm2 \
  __asm psubw mm5,mm4 \
  __asm psubw mm7,mm6 \
}

/*Performs an 8-point 1-D Hadamard transform.
  The transform is performed in place, except that outputs 0-3 are swapped with
   outputs 4-7.
  Outputs 1, 2, 5 and 6 are negated (which allows us to perform the transform
   in place with no temporary registers).*/
#define OC_HADAMARD_8x4 __asm{ \
  OC_HADAMARD_AB_8x4 \
  OC_HADAMARD_C_8x4 \
}
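
/*An illustrative sketch of the two-register, in-place butterfly used by the
   Hadamard macros above: after these three operations b holds a0+b0 and a
   holds a0-b0, so no temporaries are needed, at the cost of some outputs
   coming back negated and/or swapped (as noted in the macro comments).
  The helper name is hypothetical and the block is kept under #if 0 so it is
   never compiled.*/
#if 0
static void oc_butterfly_sketch(int *_a,int *_b){
  *_b+=*_a;
  *_a+=*_a;
  *_a-=*_b;
}
#endif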

/*Performs the first part of the final stage of the Hadamard transform and
   summing of absolute values.
  At the end of this part, mm1 will contain the DC coefficient of the
   transform.*/
#define OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) __asm{ \
  /*We use the fact that \
     (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
    to merge the final butterfly with the abs and the first stage of \
     accumulation. \
    Thus we can avoid using pabsw, which is not available until SSSE3. \
    Emulating pabsw takes 3 instructions, so the straightforward MMXEXT \
     implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
     registers). \
    Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
    This implementation is only 26 (+4 for spilling registers).*/ \
  __asm movq [_r7+BUF],mm7 \
  __asm movq [_r6+BUF],mm6 \
  /*mm7={0x7FFF}x4 \
    mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \
  __asm pcmpeqb mm7,mm7 \
  __asm movq mm6,mm0 \
  __asm psrlw mm7,1 \
  __asm paddw mm6,mm1 \
  __asm pmaxsw mm0,mm1 \
  __asm paddsw mm6,mm7 \
  __asm psubw mm0,mm6 \
  /*mm2=max(abs(mm2),abs(mm3))-0x7FFF \
    mm4=max(abs(mm4),abs(mm5))-0x7FFF*/ \
  __asm movq mm6,mm2 \
  __asm movq mm1,mm4 \
  __asm pmaxsw mm2,mm3 \
  __asm pmaxsw mm4,mm5 \
  __asm paddw mm6,mm3 \
  __asm paddw mm1,mm5 \
  __asm movq mm3,[_r7+BUF] \
}

/*Performs the second part of the final stage of the Hadamard transform and
   summing of absolute values.*/
#define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) __asm{ \
  __asm paddsw mm6,mm7 \
  __asm movq mm5,[_r6+BUF] \
  __asm paddsw mm1,mm7 \
  __asm psubw mm2,mm6 \
  __asm psubw mm4,mm1 \
  /*mm7={1}x4 (needed for the horizontal add that follows) \
    mm0+=mm2+mm4+max(abs(mm3),abs(mm5))-0x7FFF*/ \
  __asm movq mm6,mm3 \
  __asm pmaxsw mm3,mm5 \
  __asm paddw mm0,mm2 \
  __asm paddw mm6,mm5 \
  __asm paddw mm0,mm4 \
  __asm paddsw mm6,mm7 \
  __asm paddw mm0,mm3 \
  __asm psrlw mm7,14 \
  __asm psubw mm0,mm6 \
}

/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
   absolute value of each component, and accumulates everything into mm0.
  This is the only portion of SATD which requires MMXEXT (we could use plain
   MMX, but it takes 4 instructions and an extra register to work around the
   lack of a pmaxsw, which is a pretty serious penalty).*/
#define OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) __asm{ \
  OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \
  OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \
}

/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
   component, and accumulates everything into mm0.
  Note that mm0 will have an extra 4 added to each column, and that after
   removing this value, the remainder will be half the conventional value.*/
#define OC_HADAMARD_ABS_ACCUM_8x4(_r6,_r7) __asm{ \
  OC_HADAMARD_AB_8x4 \
  OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) \
}
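
/*An illustrative sketch of the identity quoted in
   OC_HADAMARD_C_ABS_ACCUM_A_8x4 above:
   (abs(a+b)+abs(a-b))/2==max(abs(a),abs(b)).
  Because the final butterfly already produces a+b and a-b, the absolute
   values of a whole pair can be folded into a single per-lane max (pmaxsw)
   instead of emulating pabsw twice.
  The helper name is hypothetical and the block is kept under #if 0 so it is
   never compiled.*/
#if 0
static int oc_max_abs_sketch(int _a,int _b){
  int sum;
  int diff;
  sum=_a+_b;
  sum=sum<0?-sum:sum;
  diff=_a-_b;
  diff=diff<0?-diff:diff;
  /*Equals the larger of abs(_a) and abs(_b).*/
  return sum+diff>>1;
}
#endif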

/*Performs two 4x4 transposes (mostly) in place.
  On input, {mm0,mm1,mm2,mm3} contains rows {e,f,g,h}, and {mm4,mm5,mm6,mm7}
   contains rows {a,b,c,d}.
  On output, {0x40,0x50,0x60,0x70}+_off+BUF contains {e,f,g,h}^T, and
   {mm4,mm5,mm6,mm7} contains the transposed rows {a,b,c,d}^T.*/
#define OC_TRANSPOSE_4x4x2(_off) __asm{ \
  /*First 4x4 transpose:*/ \
  __asm movq [0x10+_off+BUF],mm5 \
  /*mm0 = e3 e2 e1 e0 \
    mm1 = f3 f2 f1 f0 \
    mm2 = g3 g2 g1 g0 \
    mm3 = h3 h2 h1 h0*/ \
  __asm movq mm5,mm2 \
  __asm punpcklwd mm2,mm3 \
  __asm punpckhwd mm5,mm3 \
  __asm movq mm3,mm0 \
  __asm punpcklwd mm0,mm1 \
  __asm punpckhwd mm3,mm1 \
  /*mm0 = f1 e1 f0 e0 \
    mm3 = f3 e3 f2 e2 \
    mm2 = h1 g1 h0 g0 \
    mm5 = h3 g3 h2 g2*/ \
  __asm movq mm1,mm0 \
  __asm punpckldq mm0,mm2 \
  __asm punpckhdq mm1,mm2 \
  __asm movq mm2,mm3 \
  __asm punpckhdq mm3,mm5 \
  __asm movq [0x40+_off+BUF],mm0 \
  __asm punpckldq mm2,mm5 \
  /*mm0 = h0 g0 f0 e0 \
    mm1 = h1 g1 f1 e1 \
    mm2 = h2 g2 f2 e2 \
    mm3 = h3 g3 f3 e3*/ \
  __asm movq mm5,[0x10+_off+BUF] \
  /*Second 4x4 transpose:*/ \
  /*mm4 = a3 a2 a1 a0 \
    mm5 = b3 b2 b1 b0 \
    mm6 = c3 c2 c1 c0 \
    mm7 = d3 d2 d1 d0*/ \
  __asm movq mm0,mm6 \
  __asm punpcklwd mm6,mm7 \
  __asm movq [0x50+_off+BUF],mm1 \
  __asm punpckhwd mm0,mm7 \
  __asm movq mm7,mm4 \
  __asm punpcklwd mm4,mm5 \
  __asm movq [0x60+_off+BUF],mm2 \
  __asm punpckhwd mm7,mm5 \
  /*mm4 = b1 a1 b0 a0 \
    mm7 = b3 a3 b2 a2 \
    mm6 = d1 c1 d0 c0 \
    mm0 = d3 c3 d2 c2*/ \
  __asm movq mm5,mm4 \
  __asm punpckldq mm4,mm6 \
  __asm movq [0x70+_off+BUF],mm3 \
  __asm punpckhdq mm5,mm6 \
  __asm movq mm6,mm7 \
  __asm punpckhdq mm7,mm0 \
  __asm punpckldq mm6,mm0 \
  /*mm4 = d0 c0 b0 a0 \
    mm5 = d1 c1 b1 a1 \
    mm6 = d2 c2 b2 a2 \
    mm7 = d3 c3 b3 a3*/ \
}

static unsigned oc_int_frag_satd_mmxext(int *_dc,
 const unsigned char *_src,int _src_ystride,
 const unsigned char *_ref,int _ref_ystride){
  OC_ALIGN8(ogg_int16_t buf[64]);
  ogg_int16_t *bufp;
  unsigned ret;
  unsigned ret2;
  int dc;
  bufp=buf;
  __asm{
#define SRC esi
#define REF eax
#define SRC_YSTRIDE ecx
#define REF_YSTRIDE edx
#define BUF edi
#define RET edx
#define RET2 ecx
#define DC eax
#define DC_WORD ax
    mov SRC,_src
    mov SRC_YSTRIDE,_src_ystride
    mov REF,_ref
    mov REF_YSTRIDE,_ref_ystride
    mov BUF,bufp
    OC_LOAD_SUB_8x4(0x00)
    OC_HADAMARD_8x4
    OC_TRANSPOSE_4x4x2(0x00)
    /*Finish swapping out this 8x4 block to make room for the next one.
      mm0...mm3 have been swapped out already.*/
    movq [0x00+BUF],mm4
    movq [0x10+BUF],mm5
    movq [0x20+BUF],mm6
    movq [0x30+BUF],mm7
    OC_LOAD_SUB_8x4(0x04)
    OC_HADAMARD_8x4
    OC_TRANSPOSE_4x4x2(0x08)
    /*Here the first 4x4 block of output from the last transpose is the second
       4x4 block of input for the next transform.
      We have cleverly arranged that it already be in the appropriate place, so
       we only have to do half the loads.*/
    movq mm1,[0x10+BUF]
    movq mm2,[0x20+BUF]
    movq mm3,[0x30+BUF]
    movq mm0,[0x00+BUF]
    /*We split out the stages here so we can save the DC coefficient in the
       middle.*/
    OC_HADAMARD_AB_8x4
    OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
    movd DC,mm1
    OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
       for the factor of two we dropped + 3 for the vertical accumulation).
      Now we finally have to promote things to dwords.
      We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
       latency of pmaddwd by starting the next series of loads now.*/
    pmaddwd mm0,mm7
    movq mm1,[0x50+BUF]
    movq mm5,[0x58+BUF]
    movq mm4,mm0
    movq mm2,[0x60+BUF]
    punpckhdq mm0,mm0
    movq mm6,[0x68+BUF]
    paddd mm4,mm0
    movq mm3,[0x70+BUF]
    movd RET2,mm4
    movq mm7,[0x78+BUF]
    movq mm0,[0x40+BUF]
    movq mm4,[0x48+BUF]
    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
    pmaddwd mm0,mm7
    /*Subtract abs(dc) from 2*ret2.*/
    movsx DC,DC_WORD
    cdq
    lea RET2,[RET+RET2*2]
    movq mm4,mm0
    punpckhdq mm0,mm0
    xor RET,DC
    paddd mm4,mm0
    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
       added to them, a factor of two removed, and the DC value included;
       correct the final sum here.*/
    sub RET2,RET
    movd RET,mm4
    lea RET,[RET2+RET*2-64]
    mov ret,RET
    mov dc,DC
#undef SRC
#undef REF
#undef SRC_YSTRIDE
#undef REF_YSTRIDE
#undef BUF
#undef RET
#undef RET2
#undef DC
#undef DC_WORD
  }
  *_dc=dc;
  return ret;
}

unsigned oc_enc_frag_satd_mmxext(int *_dc,const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  return oc_int_frag_satd_mmxext(_dc,_src,_ystride,_ref,_ystride);
}
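
/*An illustrative sketch of the final correction done at the end of
   oc_int_frag_satd_mmxext() above, assuming sum1 and sum2 are the two dword
   totals produced from the two 8x4 halves and dc is the sign-extended DC
   word: each total is half the conventional value and includes an extra 4 per
   word lane (16 per total, hence the -64 after doubling), and abs(dc) is
   removed from the total (the DC coefficient is returned separately through
   *_dc).
  The helper name is hypothetical and the block is kept under #if 0 so it is
   never compiled.*/
#if 0
static unsigned oc_satd_correct_sketch(unsigned _sum1,unsigned _sum2,int _dc){
  return 2*(_sum1+_sum2)-(_dc<0?-_dc:_dc)-64;
}
#endif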

/*Our internal implementation of frag_copy2 takes an extra stride parameter so
   we can share code with oc_enc_frag_satd2_mmxext().*/
static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
 const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
  __asm{
    /*Load the first 3 rows.*/
#define DST_YSTRIDE edi
#define SRC_YSTRIDE esi
#define DST eax
#define SRC1 edx
#define SRC2 ecx
    mov DST_YSTRIDE,_dst_ystride
    mov SRC_YSTRIDE,_src_ystride
    mov DST,_dst
    mov SRC1,_src1
    mov SRC2,_src2
    movq mm0,[SRC1]
    movq mm1,[SRC2]
    movq mm2,[SRC1+SRC_YSTRIDE]
    lea SRC1,[SRC1+SRC_YSTRIDE*2]
    movq mm3,[SRC2+SRC_YSTRIDE]
    lea SRC2,[SRC2+SRC_YSTRIDE*2]
    pxor mm7,mm7
    movq mm4,[SRC1]
    pcmpeqb mm6,mm6
    movq mm5,[SRC2]
    /*mm7={1}x8.*/
    psubb mm7,mm6
    /*Start averaging mm0 and mm1 into mm6.*/
    movq mm6,mm0
    pxor mm0,mm1
    pavgb mm6,mm1
    /*mm1 is free, start averaging mm3 into mm2 using mm1.*/
    movq mm1,mm2
    pand mm0,mm7
    pavgb mm2,mm3
    pxor mm1,mm3
    /*mm3 is free.*/
    psubb mm6,mm0
    /*mm0 is free, start loading the next row.*/
    movq mm0,[SRC1+SRC_YSTRIDE]
    /*Start averaging mm5 and mm4 using mm3.*/
    movq mm3,mm4
    /*mm6 [row 0] is done; write it out.*/
    movq [DST],mm6
    pand mm1,mm7
    pavgb mm4,mm5
    psubb mm2,mm1
    /*mm1 is free, continue loading the next row.*/
    movq mm1,[SRC2+SRC_YSTRIDE]
    pxor mm3,mm5
    lea SRC1,[SRC1+SRC_YSTRIDE*2]
    /*mm2 [row 1] is done; write it out.*/
    movq [DST+DST_YSTRIDE],mm2
    pand mm3,mm7
    /*Start loading the next row.*/
    movq mm2,[SRC1]
    lea DST,[DST+DST_YSTRIDE*2]
    psubb mm4,mm3
    lea SRC2,[SRC2+SRC_YSTRIDE*2]
    /*mm4 [row 2] is done; write it out.*/
    movq [DST],mm4
    /*Continue loading the next row.*/
    movq mm3,[SRC2]
    /*Start averaging mm0 and mm1 into mm6.*/
    movq mm6,mm0
    pxor mm0,mm1
    /*Start loading the next row.*/
    movq mm4,[SRC1+SRC_YSTRIDE]
    pavgb mm6,mm1
    /*mm1 is free; start averaging mm3 into mm2 using mm1.*/
    movq mm1,mm2
    pand mm0,mm7
    /*Continue loading the next row.*/
    movq mm5,[SRC2+SRC_YSTRIDE]
    pavgb mm2,mm3
    lea SRC1,[SRC1+SRC_YSTRIDE*2]
    pxor mm1,mm3
    /*mm3 is free.*/
    psubb mm6,mm0
    /*mm0 is free, start loading the next row.*/
    movq mm0,[SRC1]
    /*Start averaging mm5 into mm4 using mm3.*/
    movq mm3,mm4
    /*mm6 [row 3] is done; write it out.*/
    movq [DST+DST_YSTRIDE],mm6
    pand mm1,mm7
    lea SRC2,[SRC2+SRC_YSTRIDE*2]
    pavgb mm4,mm5
    lea DST,[DST+DST_YSTRIDE*2]
    psubb mm2,mm1
    /*mm1 is free; continue loading the next row.*/
    movq mm1,[SRC2]
    pxor mm3,mm5
    /*mm2 [row 4] is done; write it out.*/
    movq [DST],mm2
    pand mm3,mm7
    /*Start loading the next row.*/
    movq mm2,[SRC1+SRC_YSTRIDE]
    psubb mm4,mm3
    /*Start averaging mm0 and mm1 into mm6.*/
    movq mm6,mm0
    /*Continue loading the next row.*/
    movq mm3,[SRC2+SRC_YSTRIDE]
    /*mm4 [row 5] is done; write it out.*/
    movq [DST+DST_YSTRIDE],mm4
    pxor mm0,mm1
    pavgb mm6,mm1
    /*mm4 is free; start averaging mm3 into mm2 using mm4.*/
    movq mm4,mm2
    pand mm0,mm7
    pavgb mm2,mm3
    pxor mm4,mm3
    lea DST,[DST+DST_YSTRIDE*2]
    psubb mm6,mm0
    pand mm4,mm7
    /*mm6 [row 6] is done, write it out.*/
    movq [DST],mm6
    psubb mm2,mm4
    /*mm2 [row 7] is done, write it out.*/
    movq [DST+DST_YSTRIDE],mm2
#undef SRC1
#undef SRC2
#undef SRC_YSTRIDE
#undef DST_YSTRIDE
#undef DST
  }
}

unsigned oc_enc_frag_satd2_mmxext(int *_dc,const unsigned char *_src,
 const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
  OC_ALIGN8(unsigned char ref[64]);
  oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
  return oc_int_frag_satd_mmxext(_dc,_src,_ystride,ref,8);
}
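
/*An illustrative plain-C sketch of what oc_int_frag_copy2_mmxext() above
   computes: the truncating average of two 8x8 predictors, written to a
   destination with its own stride (oc_enc_frag_satd2_mmxext() uses this to
   average into a small on-stack buffer).
  The helper name is hypothetical and the block is kept under #if 0 so it is
   never compiled.*/
#if 0
static void oc_int_frag_copy2_c_sketch(unsigned char *_dst,int _dst_ystride,
 const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++)_dst[j]=(unsigned char)(_src1[j]+_src2[j]>>1);
    _dst+=_dst_ystride;
    _src1+=_src_ystride;
    _src2+=_src_ystride;
  }
}
#endif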

unsigned oc_enc_frag_intra_satd_mmxext(int *_dc,const unsigned char *_src,
 int _ystride){
  OC_ALIGN8(ogg_int16_t buf[64]);
  ogg_int16_t *bufp;
  unsigned ret1;
  unsigned ret2;
  int dc;
  bufp=buf;
  __asm{
#define SRC eax
#define SRC4 esi
#define BUF edi
#define YSTRIDE edx
#define YSTRIDE3 ecx
#define RET eax
#define RET2 ecx
#define DC edx
#define DC_WORD dx
    mov SRC,_src
    mov BUF,bufp
    mov YSTRIDE,_ystride
    /* src4 = src+4*ystride */
    lea SRC4,[SRC+YSTRIDE*4]
    /* ystride3 = 3*ystride */
    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
    OC_LOAD_8x4(0x00)
    OC_HADAMARD_8x4
    OC_TRANSPOSE_4x4x2(0x00)
    /*Finish swapping out this 8x4 block to make room for the next one.
      mm0...mm3 have been swapped out already.*/
    movq [0x00+BUF],mm4
    movq [0x10+BUF],mm5
    movq [0x20+BUF],mm6
    movq [0x30+BUF],mm7
    OC_LOAD_8x4(0x04)
    OC_HADAMARD_8x4
    OC_TRANSPOSE_4x4x2(0x08)
    /*Here the first 4x4 block of output from the last transpose is the second
       4x4 block of input for the next transform.
      We have cleverly arranged that it already be in the appropriate place, so
       we only have to do half the loads.*/
    movq mm1,[0x10+BUF]
    movq mm2,[0x20+BUF]
    movq mm3,[0x30+BUF]
    movq mm0,[0x00+BUF]
    /*We split out the stages here so we can save the DC coefficient in the
       middle.*/
    OC_HADAMARD_AB_8x4
    OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
    movd DC,mm1
    OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
       for the factor of two we dropped + 3 for the vertical accumulation).
      Now we finally have to promote things to dwords.
      We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
       latency of pmaddwd by starting the next series of loads now.*/
    pmaddwd mm0,mm7
    movq mm1,[0x50+BUF]
    movq mm5,[0x58+BUF]
    movq mm2,[0x60+BUF]
    movq mm4,mm0
    movq mm6,[0x68+BUF]
    punpckhdq mm0,mm0
    movq mm3,[0x70+BUF]
    paddd mm4,mm0
    movq mm7,[0x78+BUF]
    movd RET,mm4
    movq mm0,[0x40+BUF]
    movq mm4,[0x48+BUF]
    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
    pmaddwd mm0,mm7
    /*We assume that the DC coefficient is always positive (which is true,
       because the input to the INTRA transform was not a difference).*/
    movzx DC,DC_WORD
    add RET,RET
    sub RET,DC
    movq mm4,mm0
    punpckhdq mm0,mm0
    paddd mm4,mm0
    movd RET2,mm4
    lea RET,[-64+RET+RET2*2]
    mov [dc],DC
    mov [ret1],RET
#undef SRC
#undef SRC4
#undef BUF
#undef YSTRIDE
#undef YSTRIDE3
#undef RET
#undef RET2
#undef DC
#undef DC_WORD
  }
  *_dc=dc;
  return ret1;
}

void oc_enc_frag_sub_mmx(ogg_int16_t _residue[64],
 const unsigned char *_src, const unsigned char *_ref,int _ystride){
  int i;
  __asm pxor mm7,mm7
  for(i=4;i-->0;){
    __asm{
#define SRC edx
#define YSTRIDE esi
#define RESIDUE eax
#define REF ecx
      mov YSTRIDE,_ystride
      mov RESIDUE,_residue
      mov SRC,_src
      mov REF,_ref
      /*mm0=[src]*/
      movq mm0,[SRC]
      /*mm1=[ref]*/
      movq mm1,[REF]
      /*mm4=[src+ystride]*/
      movq mm4,[SRC+YSTRIDE]
      /*mm5=[ref+ystride]*/
      movq mm5,[REF+YSTRIDE]
      /*Compute [src]-[ref].*/
      movq mm2,mm0
      punpcklbw mm0,mm7
      movq mm3,mm1
      punpckhbw mm2,mm7
      punpcklbw mm1,mm7
      punpckhbw mm3,mm7
      psubw mm0,mm1
      psubw mm2,mm3
      /*Compute [src+ystride]-[ref+ystride].*/
      movq mm1,mm4
      punpcklbw mm4,mm7
      movq mm3,mm5
      punpckhbw mm1,mm7
      lea SRC,[SRC+YSTRIDE*2]
      punpcklbw mm5,mm7
      lea REF,[REF+YSTRIDE*2]
      punpckhbw mm3,mm7
      psubw mm4,mm5
      psubw mm1,mm3
      /*Write the answer out.*/
      movq [RESIDUE+0x00],mm0
      movq [RESIDUE+0x08],mm2
      movq [RESIDUE+0x10],mm4
      movq [RESIDUE+0x18],mm1
      lea RESIDUE,[RESIDUE+0x20]
      mov _residue,RESIDUE
      mov _src,SRC
      mov _ref,REF
#undef SRC
#undef YSTRIDE
#undef RESIDUE
#undef REF
    }
  }
}
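
/*An illustrative plain-C sketch of what oc_enc_frag_sub_mmx() above computes:
   the 16-bit residual between an 8x8 source block and its reference.
  The MMX version handles two rows per loop iteration and keeps mm7 zeroed
   across iterations for the byte-to-word unpacking.
  The helper name is hypothetical and the block is kept under #if 0 so it is
   never compiled.*/
#if 0
static void oc_enc_frag_sub_c_sketch(ogg_int16_t _residue[64],
 const unsigned char *_src,const unsigned char *_ref,int _ystride){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++)_residue[i*8+j]=(ogg_int16_t)(_src[j]-_ref[j]);
    _src+=_ystride;
    _ref+=_ystride;
  }
}
#endif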

void oc_enc_frag_sub_128_mmx(ogg_int16_t _residue[64],
 const unsigned char *_src,int _ystride){
  __asm{
#define YSTRIDE edx
#define YSTRIDE3 edi
#define RESIDUE ecx
#define SRC eax
    mov YSTRIDE,_ystride
    mov RESIDUE,_residue
    mov SRC,_src
    /*mm0=[src]*/
    movq mm0,[SRC]
    /*mm1=[src+ystride]*/
    movq mm1,[SRC+YSTRIDE]
    /*mm6={-1}x4*/
    pcmpeqw mm6,mm6
    /*mm2=[src+2*ystride]*/
    movq mm2,[SRC+YSTRIDE*2]
    /*[ystride3]=3*[ystride]*/
    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
    /*mm6={1}x4*/
    psllw mm6,15
    /*mm3=[src+3*ystride]*/
    movq mm3,[SRC+YSTRIDE3]
    /*mm6={128}x4*/
    psrlw mm6,8
    /*mm7=0*/
    pxor mm7,mm7
    /*[src]=[src]+4*[ystride]*/
    lea SRC,[SRC+YSTRIDE*4]
    /*Compute [src]-128 and [src+ystride]-128*/
    movq mm4,mm0
    punpcklbw mm0,mm7
    movq mm5,mm1
    punpckhbw mm4,mm7
    psubw mm0,mm6
    punpcklbw mm1,mm7
    psubw mm4,mm6
    punpckhbw mm5,mm7
    psubw mm1,mm6
    psubw mm5,mm6
    /*Write the answer out.*/
    movq [RESIDUE+0x00],mm0
    movq [RESIDUE+0x08],mm4
    movq [RESIDUE+0x10],mm1
    movq [RESIDUE+0x18],mm5
    /*mm0=[src+4*ystride]*/
    movq mm0,[SRC]
    /*mm1=[src+5*ystride]*/
    movq mm1,[SRC+YSTRIDE]
    /*Compute [src+2*ystride]-128 and [src+3*ystride]-128*/
    movq mm4,mm2
    punpcklbw mm2,mm7
    movq mm5,mm3
    punpckhbw mm4,mm7
    psubw mm2,mm6
    punpcklbw mm3,mm7
    psubw mm4,mm6
    punpckhbw mm5,mm7
    psubw mm3,mm6
    psubw mm5,mm6
    /*Write the answer out.*/
    movq [RESIDUE+0x20],mm2
    movq [RESIDUE+0x28],mm4
    movq [RESIDUE+0x30],mm3
    movq [RESIDUE+0x38],mm5
    /*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/
    movq mm2,[SRC+YSTRIDE*2]
    movq mm3,[SRC+YSTRIDE3]
    movq mm4,mm0
    punpcklbw mm0,mm7
    movq mm5,mm1
    punpckhbw mm4,mm7
    psubw mm0,mm6
    punpcklbw mm1,mm7
    psubw mm4,mm6
    punpckhbw mm5,mm7
    psubw mm1,mm6
    psubw mm5,mm6
    /*Write the answer out.*/
    movq [RESIDUE+0x40],mm0
    movq [RESIDUE+0x48],mm4
    movq [RESIDUE+0x50],mm1
    movq [RESIDUE+0x58],mm5
    /*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/
    movq mm4,mm2
    punpcklbw mm2,mm7
    movq mm5,mm3
    punpckhbw mm4,mm7
    psubw mm2,mm6
    punpcklbw mm3,mm7
    psubw mm4,mm6
    punpckhbw mm5,mm7
    psubw mm3,mm6
    psubw mm5,mm6
    /*Write the answer out.*/
    movq [RESIDUE+0x60],mm2
    movq [RESIDUE+0x68],mm4
    movq [RESIDUE+0x70],mm3
    movq [RESIDUE+0x78],mm5
#undef YSTRIDE
#undef YSTRIDE3
#undef RESIDUE
#undef SRC
  }
}

void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
 const unsigned char *_src1,const unsigned char *_src2,int _ystride){
  oc_int_frag_copy2_mmxext(_dst,_ystride,_src1,_src2,_ystride);
}

#endif