Path: blob/master/thirdparty/libtheora/x86/mmxencfrag.c
9898 views
/********************************************************************1* *2* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *3* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *4* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *5* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *6* *7* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *8* by the Xiph.Org Foundation https://www.xiph.org/ *9* *10********************************************************************1112function:1314********************************************************************/15#include <stddef.h>16#include "x86enc.h"1718#if defined(OC_X86_ASM)1920unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,21const unsigned char *_ref,int _ystride){22ptrdiff_t ystride3;23ptrdiff_t ret;24__asm__ __volatile__(25/*Load the first 4 rows of each block.*/26"movq (%[src]),%%mm0\n\t"27"movq (%[ref]),%%mm1\n\t"28"movq (%[src],%[ystride]),%%mm2\n\t"29"movq (%[ref],%[ystride]),%%mm3\n\t"30"lea (%[ystride],%[ystride],2),%[ystride3]\n\t"31"movq (%[src],%[ystride],2),%%mm4\n\t"32"movq (%[ref],%[ystride],2),%%mm5\n\t"33"movq (%[src],%[ystride3]),%%mm6\n\t"34"movq (%[ref],%[ystride3]),%%mm7\n\t"35/*Compute their SADs and add them in %%mm0*/36"psadbw %%mm1,%%mm0\n\t"37"psadbw %%mm3,%%mm2\n\t"38"lea (%[src],%[ystride],4),%[src]\n\t"39"paddw %%mm2,%%mm0\n\t"40"lea (%[ref],%[ystride],4),%[ref]\n\t"41/*Load the next 3 rows as registers become available.*/42"movq (%[src]),%%mm2\n\t"43"movq (%[ref]),%%mm3\n\t"44"psadbw %%mm5,%%mm4\n\t"45"psadbw %%mm7,%%mm6\n\t"46"paddw %%mm4,%%mm0\n\t"47"movq (%[ref],%[ystride]),%%mm5\n\t"48"movq (%[src],%[ystride]),%%mm4\n\t"49"paddw %%mm6,%%mm0\n\t"50"movq (%[ref],%[ystride],2),%%mm7\n\t"51"movq (%[src],%[ystride],2),%%mm6\n\t"52/*Start adding their SADs to %%mm0*/53"psadbw %%mm3,%%mm2\n\t"54"psadbw %%mm5,%%mm4\n\t"55"paddw %%mm2,%%mm0\n\t"56"psadbw %%mm7,%%mm6\n\t"57/*Load last row as registers become available.*/58"movq (%[src],%[ystride3]),%%mm2\n\t"59"movq (%[ref],%[ystride3]),%%mm3\n\t"60/*And finish adding up their SADs.*/61"paddw %%mm4,%%mm0\n\t"62"psadbw %%mm3,%%mm2\n\t"63"paddw %%mm6,%%mm0\n\t"64"paddw %%mm2,%%mm0\n\t"65"movd %%mm0,%[ret]\n\t"66:[ret]"=a"(ret),[src]"+r"(_src),[ref]"+r"(_ref),[ystride3]"=&r"(ystride3)67:[ystride]"r"((ptrdiff_t)_ystride)68);69return (unsigned)ret;70}7172unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,73const unsigned char *_ref,int _ystride,unsigned _thresh){74/*Early termination is for suckers.*/75return oc_enc_frag_sad_mmxext(_src,_ref,_ystride);76}7778/*Assumes the first two rows of %[ref1] and %[ref2] are in %%mm0...%%mm3, the79first two rows of %[src] are in %%mm4,%%mm5, and {1}x8 is in %%mm7.80We pre-load the next two rows of data as registers become available.*/81#define OC_SAD2_LOOP \82"#OC_SAD2_LOOP\n\t" \83/*We want to compute (%%mm0+%%mm1>>1) on unsigned bytes without overflow, but \84pavgb computes (%%mm0+%%mm1+1>>1). \85The latter is exactly 1 too large when the low bit of two corresponding \86bytes is only set in one of them. \87Therefore we pxor the operands, pand to mask out the low bits, and psubb to \88correct the output of pavgb. \89TODO: This should be rewritten to compute ~pavgb(~a,~b) instead, which \90schedules better; currently, however, this function is unused.*/ \91"movq %%mm0,%%mm6\n\t" \92"lea (%[ref1],%[ystride],2),%[ref1]\n\t" \93"pxor %%mm1,%%mm0\n\t" \94"pavgb %%mm1,%%mm6\n\t" \95"lea (%[ref2],%[ystride],2),%[ref2]\n\t" \96"movq %%mm2,%%mm1\n\t" \97"pand %%mm7,%%mm0\n\t" \98"pavgb %%mm3,%%mm2\n\t" \99"pxor %%mm3,%%mm1\n\t" \100"movq (%[ref2],%[ystride]),%%mm3\n\t" \101"psubb %%mm0,%%mm6\n\t" \102"movq (%[ref1]),%%mm0\n\t" \103"pand %%mm7,%%mm1\n\t" \104"psadbw %%mm6,%%mm4\n\t" \105"movd %[ret],%%mm6\n\t" \106"psubb %%mm1,%%mm2\n\t" \107"movq (%[ref2]),%%mm1\n\t" \108"lea (%[src],%[ystride],2),%[src]\n\t" \109"psadbw %%mm2,%%mm5\n\t" \110"movq (%[ref1],%[ystride]),%%mm2\n\t" \111"paddw %%mm4,%%mm5\n\t" \112"movq (%[src]),%%mm4\n\t" \113"paddw %%mm5,%%mm6\n\t" \114"movq (%[src],%[ystride]),%%mm5\n\t" \115"movd %%mm6,%[ret]\n\t" \116117/*Same as above, but does not pre-load the next two rows.*/118#define OC_SAD2_TAIL \119"#OC_SAD2_TAIL\n\t" \120"movq %%mm0,%%mm6\n\t" \121"pavgb %%mm1,%%mm0\n\t" \122"pxor %%mm1,%%mm6\n\t" \123"movq %%mm2,%%mm1\n\t" \124"pand %%mm7,%%mm6\n\t" \125"pavgb %%mm3,%%mm2\n\t" \126"pxor %%mm3,%%mm1\n\t" \127"psubb %%mm6,%%mm0\n\t" \128"pand %%mm7,%%mm1\n\t" \129"psadbw %%mm0,%%mm4\n\t" \130"psubb %%mm1,%%mm2\n\t" \131"movd %[ret],%%mm6\n\t" \132"psadbw %%mm2,%%mm5\n\t" \133"paddw %%mm4,%%mm5\n\t" \134"paddw %%mm5,%%mm6\n\t" \135"movd %%mm6,%[ret]\n\t" \136137unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,138const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,139unsigned _thresh){140ptrdiff_t ret;141__asm__ __volatile__(142"movq (%[ref1]),%%mm0\n\t"143"movq (%[ref2]),%%mm1\n\t"144"movq (%[ref1],%[ystride]),%%mm2\n\t"145"movq (%[ref2],%[ystride]),%%mm3\n\t"146"xor %[ret],%[ret]\n\t"147"movq (%[src]),%%mm4\n\t"148"pxor %%mm7,%%mm7\n\t"149"pcmpeqb %%mm6,%%mm6\n\t"150"movq (%[src],%[ystride]),%%mm5\n\t"151"psubb %%mm6,%%mm7\n\t"152OC_SAD2_LOOP153OC_SAD2_LOOP154OC_SAD2_LOOP155OC_SAD2_TAIL156:[ret]"=&a"(ret),[src]"+r"(_src),[ref1]"+r"(_ref1),[ref2]"+r"(_ref2)157:[ystride]"r"((ptrdiff_t)_ystride)158);159return (unsigned)ret;160}161162/*Load an 8x4 array of pixel values from %[src] and %[ref] and compute their16316-bit difference in %%mm0...%%mm7.*/164#define OC_LOAD_SUB_8x4(_off) \165"#OC_LOAD_SUB_8x4\n\t" \166"movd "#_off"(%[src]),%%mm0\n\t" \167"movd "#_off"(%[ref]),%%mm4\n\t" \168"movd "#_off"(%[src],%[src_ystride]),%%mm1\n\t" \169"lea (%[src],%[src_ystride],2),%[src]\n\t" \170"movd "#_off"(%[ref],%[ref_ystride]),%%mm5\n\t" \171"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \172"movd "#_off"(%[src]),%%mm2\n\t" \173"movd "#_off"(%[ref]),%%mm7\n\t" \174"movd "#_off"(%[src],%[src_ystride]),%%mm3\n\t" \175"movd "#_off"(%[ref],%[ref_ystride]),%%mm6\n\t" \176"punpcklbw %%mm4,%%mm0\n\t" \177"lea (%[src],%[src_ystride],2),%[src]\n\t" \178"punpcklbw %%mm4,%%mm4\n\t" \179"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \180"psubw %%mm4,%%mm0\n\t" \181"movd "#_off"(%[src]),%%mm4\n\t" \182"movq %%mm0,"OC_MEM_OFFS(_off*2,buf)"\n\t" \183"movd "#_off"(%[ref]),%%mm0\n\t" \184"punpcklbw %%mm5,%%mm1\n\t" \185"punpcklbw %%mm5,%%mm5\n\t" \186"psubw %%mm5,%%mm1\n\t" \187"movd "#_off"(%[src],%[src_ystride]),%%mm5\n\t" \188"punpcklbw %%mm7,%%mm2\n\t" \189"punpcklbw %%mm7,%%mm7\n\t" \190"psubw %%mm7,%%mm2\n\t" \191"movd "#_off"(%[ref],%[ref_ystride]),%%mm7\n\t" \192"punpcklbw %%mm6,%%mm3\n\t" \193"lea (%[src],%[src_ystride],2),%[src]\n\t" \194"punpcklbw %%mm6,%%mm6\n\t" \195"psubw %%mm6,%%mm3\n\t" \196"movd "#_off"(%[src]),%%mm6\n\t" \197"punpcklbw %%mm0,%%mm4\n\t" \198"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \199"punpcklbw %%mm0,%%mm0\n\t" \200"lea (%[src],%[src_ystride],2),%[src]\n\t" \201"psubw %%mm0,%%mm4\n\t" \202"movd "#_off"(%[ref]),%%mm0\n\t" \203"punpcklbw %%mm7,%%mm5\n\t" \204"neg %[src_ystride]\n\t" \205"punpcklbw %%mm7,%%mm7\n\t" \206"psubw %%mm7,%%mm5\n\t" \207"movd "#_off"(%[src],%[src_ystride]),%%mm7\n\t" \208"punpcklbw %%mm0,%%mm6\n\t" \209"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \210"punpcklbw %%mm0,%%mm0\n\t" \211"neg %[ref_ystride]\n\t" \212"psubw %%mm0,%%mm6\n\t" \213"movd "#_off"(%[ref],%[ref_ystride]),%%mm0\n\t" \214"lea (%[src],%[src_ystride],8),%[src]\n\t" \215"punpcklbw %%mm0,%%mm7\n\t" \216"neg %[src_ystride]\n\t" \217"punpcklbw %%mm0,%%mm0\n\t" \218"lea (%[ref],%[ref_ystride],8),%[ref]\n\t" \219"psubw %%mm0,%%mm7\n\t" \220"neg %[ref_ystride]\n\t" \221"movq "OC_MEM_OFFS(_off*2,buf)",%%mm0\n\t" \222223/*Load an 8x4 array of pixel values from %[src] into %%mm0...%%mm7.*/224#define OC_LOAD_8x4(_off) \225"#OC_LOAD_8x4\n\t" \226"movd "#_off"(%[src]),%%mm0\n\t" \227"movd "#_off"(%[src],%[ystride]),%%mm1\n\t" \228"movd "#_off"(%[src],%[ystride],2),%%mm2\n\t" \229"pxor %%mm7,%%mm7\n\t" \230"movd "#_off"(%[src],%[ystride3]),%%mm3\n\t" \231"punpcklbw %%mm7,%%mm0\n\t" \232"movd "#_off"(%[src4]),%%mm4\n\t" \233"punpcklbw %%mm7,%%mm1\n\t" \234"movd "#_off"(%[src4],%[ystride]),%%mm5\n\t" \235"punpcklbw %%mm7,%%mm2\n\t" \236"movd "#_off"(%[src4],%[ystride],2),%%mm6\n\t" \237"punpcklbw %%mm7,%%mm3\n\t" \238"movd "#_off"(%[src4],%[ystride3]),%%mm7\n\t" \239"punpcklbw %%mm4,%%mm4\n\t" \240"punpcklbw %%mm5,%%mm5\n\t" \241"psrlw $8,%%mm4\n\t" \242"psrlw $8,%%mm5\n\t" \243"punpcklbw %%mm6,%%mm6\n\t" \244"punpcklbw %%mm7,%%mm7\n\t" \245"psrlw $8,%%mm6\n\t" \246"psrlw $8,%%mm7\n\t" \247248/*Performs the first two stages of an 8-point 1-D Hadamard transform.249The transform is performed in place, except that outputs 0-3 are swapped with250outputs 4-7.251Outputs 2, 3, 6, and 7 from the second stage are negated (which allows us to252perform this stage in place with no temporary registers).*/253#define OC_HADAMARD_AB_8x4 \254"#OC_HADAMARD_AB_8x4\n\t" \255/*Stage A: \256Outputs 0-3 are swapped with 4-7 here.*/ \257"paddw %%mm1,%%mm5\n\t" \258"paddw %%mm2,%%mm6\n\t" \259"paddw %%mm1,%%mm1\n\t" \260"paddw %%mm2,%%mm2\n\t" \261"psubw %%mm5,%%mm1\n\t" \262"psubw %%mm6,%%mm2\n\t" \263"paddw %%mm3,%%mm7\n\t" \264"paddw %%mm0,%%mm4\n\t" \265"paddw %%mm3,%%mm3\n\t" \266"paddw %%mm0,%%mm0\n\t" \267"psubw %%mm7,%%mm3\n\t" \268"psubw %%mm4,%%mm0\n\t" \269/*Stage B:*/ \270"paddw %%mm2,%%mm0\n\t" \271"paddw %%mm3,%%mm1\n\t" \272"paddw %%mm6,%%mm4\n\t" \273"paddw %%mm7,%%mm5\n\t" \274"paddw %%mm2,%%mm2\n\t" \275"paddw %%mm3,%%mm3\n\t" \276"paddw %%mm6,%%mm6\n\t" \277"paddw %%mm7,%%mm7\n\t" \278"psubw %%mm0,%%mm2\n\t" \279"psubw %%mm1,%%mm3\n\t" \280"psubw %%mm4,%%mm6\n\t" \281"psubw %%mm5,%%mm7\n\t" \282283/*Performs the last stage of an 8-point 1-D Hadamard transform in place.284Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in285place with no temporary registers).*/286#define OC_HADAMARD_C_8x4 \287"#OC_HADAMARD_C_8x4\n\t" \288/*Stage C:*/ \289"paddw %%mm1,%%mm0\n\t" \290"paddw %%mm3,%%mm2\n\t" \291"paddw %%mm5,%%mm4\n\t" \292"paddw %%mm7,%%mm6\n\t" \293"paddw %%mm1,%%mm1\n\t" \294"paddw %%mm3,%%mm3\n\t" \295"paddw %%mm5,%%mm5\n\t" \296"paddw %%mm7,%%mm7\n\t" \297"psubw %%mm0,%%mm1\n\t" \298"psubw %%mm2,%%mm3\n\t" \299"psubw %%mm4,%%mm5\n\t" \300"psubw %%mm6,%%mm7\n\t" \301302/*Performs an 8-point 1-D Hadamard transform.303The transform is performed in place, except that outputs 0-3 are swapped with304outputs 4-7.305Outputs 1, 2, 5 and 6 are negated (which allows us to perform the transform306in place with no temporary registers).*/307#define OC_HADAMARD_8x4 \308OC_HADAMARD_AB_8x4 \309OC_HADAMARD_C_8x4 \310311/*Performs the first part of the final stage of the Hadamard transform and312summing of absolute values.313At the end of this part, %%mm1 will contain the DC coefficient of the314transform.*/315#define OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \316/*We use the fact that \317(abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \318to merge the final butterfly with the abs and the first stage of \319accumulation. \320Thus we can avoid using pabsw, which is not available until SSSE3. \321Emulating pabsw takes 3 instructions, so the straightforward MMXEXT \322implementation would be (3+3)*8+7=55 instructions (+4 for spilling \323registers). \324Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \325This implementation is only 26 (+4 for spilling registers).*/ \326"#OC_HADAMARD_C_ABS_ACCUM_A_8x4\n\t" \327"movq %%mm7,"OC_MEM_OFFS(_r7,buf)"\n\t" \328"movq %%mm6,"OC_MEM_OFFS(_r6,buf)"\n\t" \329/*mm7={0x7FFF}x4 \330mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \331"pcmpeqb %%mm7,%%mm7\n\t" \332"movq %%mm0,%%mm6\n\t" \333"psrlw $1,%%mm7\n\t" \334"paddw %%mm1,%%mm6\n\t" \335"pmaxsw %%mm1,%%mm0\n\t" \336"paddsw %%mm7,%%mm6\n\t" \337"psubw %%mm6,%%mm0\n\t" \338/*mm2=max(abs(mm2),abs(mm3))-0x7FFF \339mm4=max(abs(mm4),abs(mm5))-0x7FFF*/ \340"movq %%mm2,%%mm6\n\t" \341"movq %%mm4,%%mm1\n\t" \342"pmaxsw %%mm3,%%mm2\n\t" \343"pmaxsw %%mm5,%%mm4\n\t" \344"paddw %%mm3,%%mm6\n\t" \345"paddw %%mm5,%%mm1\n\t" \346"movq "OC_MEM_OFFS(_r7,buf)",%%mm3\n\t" \347348/*Performs the second part of the final stage of the Hadamard transform and349summing of absolute values.*/350#define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \351"#OC_HADAMARD_C_ABS_ACCUM_B_8x4\n\t" \352"paddsw %%mm7,%%mm6\n\t" \353"movq "OC_MEM_OFFS(_r6,buf)",%%mm5\n\t" \354"paddsw %%mm7,%%mm1\n\t" \355"psubw %%mm6,%%mm2\n\t" \356"psubw %%mm1,%%mm4\n\t" \357/*mm7={1}x4 (needed for the horizontal add that follows) \358mm0+=mm2+mm4+max(abs(mm3),abs(mm5))-0x7FFF*/ \359"movq %%mm3,%%mm6\n\t" \360"pmaxsw %%mm5,%%mm3\n\t" \361"paddw %%mm2,%%mm0\n\t" \362"paddw %%mm5,%%mm6\n\t" \363"paddw %%mm4,%%mm0\n\t" \364"paddsw %%mm7,%%mm6\n\t" \365"paddw %%mm3,%%mm0\n\t" \366"psrlw $14,%%mm7\n\t" \367"psubw %%mm6,%%mm0\n\t" \368369/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the370absolute value of each component, and accumulates everything into mm0.371This is the only portion of SATD which requires MMXEXT (we could use plain372MMX, but it takes 4 instructions and an extra register to work around the373lack of a pmaxsw, which is a pretty serious penalty).*/374#define OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) \375OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \376OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \377378/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each379component, and accumulates everything into mm0.380Note that mm0 will have an extra 4 added to each column, and that after381removing this value, the remainder will be half the conventional value.*/382#define OC_HADAMARD_ABS_ACCUM_8x4(_r6,_r7) \383OC_HADAMARD_AB_8x4 \384OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7)385386/*Performs two 4x4 transposes (mostly) in place.387On input, {mm0,mm1,mm2,mm3} contains rows {e,f,g,h}, and {mm4,mm5,mm6,mm7}388contains rows {a,b,c,d}.389On output, {0x40,0x50,0x60,0x70}+_off(%[buf]) contains {e,f,g,h}^T, and390{mm4,mm5,mm6,mm7} contains the transposed rows {a,b,c,d}^T.*/391#define OC_TRANSPOSE_4x4x2(_off) \392"#OC_TRANSPOSE_4x4x2\n\t" \393/*First 4x4 transpose:*/ \394"movq %%mm5,"OC_MEM_OFFS(0x10+(_off),buf)"\n\t" \395/*mm0 = e3 e2 e1 e0 \396mm1 = f3 f2 f1 f0 \397mm2 = g3 g2 g1 g0 \398mm3 = h3 h2 h1 h0*/ \399"movq %%mm2,%%mm5\n\t" \400"punpcklwd %%mm3,%%mm2\n\t" \401"punpckhwd %%mm3,%%mm5\n\t" \402"movq %%mm0,%%mm3\n\t" \403"punpcklwd %%mm1,%%mm0\n\t" \404"punpckhwd %%mm1,%%mm3\n\t" \405/*mm0 = f1 e1 f0 e0 \406mm3 = f3 e3 f2 e2 \407mm2 = h1 g1 h0 g0 \408mm5 = h3 g3 h2 g2*/ \409"movq %%mm0,%%mm1\n\t" \410"punpckldq %%mm2,%%mm0\n\t" \411"punpckhdq %%mm2,%%mm1\n\t" \412"movq %%mm3,%%mm2\n\t" \413"punpckhdq %%mm5,%%mm3\n\t" \414"movq %%mm0,"OC_MEM_OFFS(0x40+(_off),buf)"\n\t" \415"punpckldq %%mm5,%%mm2\n\t" \416/*mm0 = h0 g0 f0 e0 \417mm1 = h1 g1 f1 e1 \418mm2 = h2 g2 f2 e2 \419mm3 = h3 g3 f3 e3*/ \420"movq "OC_MEM_OFFS(0x10+(_off),buf)",%%mm5\n\t" \421/*Second 4x4 transpose:*/ \422/*mm4 = a3 a2 a1 a0 \423mm5 = b3 b2 b1 b0 \424mm6 = c3 c2 c1 c0 \425mm7 = d3 d2 d1 d0*/ \426"movq %%mm6,%%mm0\n\t" \427"punpcklwd %%mm7,%%mm6\n\t" \428"movq %%mm1,"OC_MEM_OFFS(0x50+(_off),buf)"\n\t" \429"punpckhwd %%mm7,%%mm0\n\t" \430"movq %%mm4,%%mm7\n\t" \431"punpcklwd %%mm5,%%mm4\n\t" \432"movq %%mm2,"OC_MEM_OFFS(0x60+(_off),buf)"\n\t" \433"punpckhwd %%mm5,%%mm7\n\t" \434/*mm4 = b1 a1 b0 a0 \435mm7 = b3 a3 b2 a2 \436mm6 = d1 c1 d0 c0 \437mm0 = d3 c3 d2 c2*/ \438"movq %%mm4,%%mm5\n\t" \439"punpckldq %%mm6,%%mm4\n\t" \440"movq %%mm3,"OC_MEM_OFFS(0x70+(_off),buf)"\n\t" \441"punpckhdq %%mm6,%%mm5\n\t" \442"movq %%mm7,%%mm6\n\t" \443"punpckhdq %%mm0,%%mm7\n\t" \444"punpckldq %%mm0,%%mm6\n\t" \445/*mm4 = d0 c0 b0 a0 \446mm5 = d1 c1 b1 a1 \447mm6 = d2 c2 b2 a2 \448mm7 = d3 c3 b3 a3*/ \449450static unsigned oc_int_frag_satd_mmxext(int *_dc,451const unsigned char *_src,int _src_ystride,452const unsigned char *_ref,int _ref_ystride){453OC_ALIGN8(ogg_int16_t buf[64]);454unsigned ret;455unsigned ret2;456int dc;457__asm__ __volatile__(458OC_LOAD_SUB_8x4(0x00)459OC_HADAMARD_8x4460OC_TRANSPOSE_4x4x2(0x00)461/*Finish swapping out this 8x4 block to make room for the next one.462mm0...mm3 have been swapped out already.*/463"movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t"464"movq %%mm5,"OC_MEM_OFFS(0x10,buf)"\n\t"465"movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t"466"movq %%mm7,"OC_MEM_OFFS(0x30,buf)"\n\t"467OC_LOAD_SUB_8x4(0x04)468OC_HADAMARD_8x4469OC_TRANSPOSE_4x4x2(0x08)470/*Here the first 4x4 block of output from the last transpose is the second4714x4 block of input for the next transform.472We have cleverly arranged that it already be in the appropriate place, so473we only have to do half the loads.*/474"movq "OC_MEM_OFFS(0x10,buf)",%%mm1\n\t"475"movq "OC_MEM_OFFS(0x20,buf)",%%mm2\n\t"476"movq "OC_MEM_OFFS(0x30,buf)",%%mm3\n\t"477"movq "OC_MEM_OFFS(0x00,buf)",%%mm0\n\t"478/*We split out the stages here so we can save the DC coefficient in the479middle.*/480OC_HADAMARD_AB_8x4481OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)482"movd %%mm1,%[dc]\n\t"483OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)484/*Up to this point, everything fit in 16 bits (8 input + 1 for the485difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1486for the factor of two we dropped + 3 for the vertical accumulation).487Now we finally have to promote things to dwords.488We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long489latency of pmaddwd by starting the next series of loads now.*/490"pmaddwd %%mm7,%%mm0\n\t"491"movq "OC_MEM_OFFS(0x50,buf)",%%mm1\n\t"492"movq "OC_MEM_OFFS(0x58,buf)",%%mm5\n\t"493"movq %%mm0,%%mm4\n\t"494"movq "OC_MEM_OFFS(0x60,buf)",%%mm2\n\t"495"punpckhdq %%mm0,%%mm0\n\t"496"movq "OC_MEM_OFFS(0x68,buf)",%%mm6\n\t"497"paddd %%mm0,%%mm4\n\t"498"movq "OC_MEM_OFFS(0x70,buf)",%%mm3\n\t"499"movd %%mm4,%[ret2]\n\t"500"movq "OC_MEM_OFFS(0x78,buf)",%%mm7\n\t"501"movq "OC_MEM_OFFS(0x40,buf)",%%mm0\n\t"502"movq "OC_MEM_OFFS(0x48,buf)",%%mm4\n\t"503OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)504"pmaddwd %%mm7,%%mm0\n\t"505/*Subtract abs(dc) from 2*ret2.*/506"movsx %w[dc],%[dc]\n\t"507"cdq\n\t"508"lea (%[ret],%[ret2],2),%[ret2]\n\t"509"movq %%mm0,%%mm4\n\t"510"punpckhdq %%mm0,%%mm0\n\t"511"xor %[dc],%[ret]\n\t"512"paddd %%mm0,%%mm4\n\t"513/*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4514added to them, a factor of two removed, and the DC value included;515correct the final sum here.*/516"sub %[ret],%[ret2]\n\t"517"movd %%mm4,%[ret]\n\t"518"lea -64(%[ret2],%[ret],2),%[ret]\n\t"519/*Although it looks like we're using 8 registers here, gcc can alias %[ret]520and %[ret2] with some of the inputs, since for once we don't write to521them until after we're done using everything but %[buf].*/522/*Note that _src_ystride and _ref_ystride must be given non-overlapping523constraints, otherwise if gcc can prove they're equal it will allocate524them to the same register (which is bad); _src and _ref face a similar525problem, though those are never actually the same.*/526:[ret]"=d"(ret),[ret2]"=r"(ret2),[dc]"=a"(dc),527[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))528:[src]"r"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),529[ref]"r"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)530/*We have to use neg, so we actually clobber the condition codes for once531(not to mention cmp, sub, and add).*/532:"cc"533);534*_dc=dc;535return ret;536}537538unsigned oc_enc_frag_satd_mmxext(int *_dc,const unsigned char *_src,539const unsigned char *_ref,int _ystride){540return oc_int_frag_satd_mmxext(_dc,_src,_ystride,_ref,_ystride);541}542543/*Our internal implementation of frag_copy2 takes an extra stride parameter so544we can share code with oc_enc_frag_satd2_mmxext().*/545void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,546const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){547__asm__ __volatile__(548/*Load the first 3 rows.*/549"movq (%[src1]),%%mm0\n\t"550"movq (%[src2]),%%mm1\n\t"551"movq (%[src1],%[src_ystride]),%%mm2\n\t"552"lea (%[src1],%[src_ystride],2),%[src1]\n\t"553"movq (%[src2],%[src_ystride]),%%mm3\n\t"554"lea (%[src2],%[src_ystride],2),%[src2]\n\t"555"pxor %%mm7,%%mm7\n\t"556"movq (%[src1]),%%mm4\n\t"557"pcmpeqb %%mm6,%%mm6\n\t"558"movq (%[src2]),%%mm5\n\t"559/*mm7={1}x8.*/560"psubb %%mm6,%%mm7\n\t"561/*Start averaging %%mm0 and %%mm1 into %%mm6.*/562"movq %%mm0,%%mm6\n\t"563"pxor %%mm1,%%mm0\n\t"564"pavgb %%mm1,%%mm6\n\t"565/*%%mm1 is free, start averaging %%mm3 into %%mm2 using %%mm1.*/566"movq %%mm2,%%mm1\n\t"567"pand %%mm7,%%mm0\n\t"568"pavgb %%mm3,%%mm2\n\t"569"pxor %%mm3,%%mm1\n\t"570/*%%mm3 is free.*/571"psubb %%mm0,%%mm6\n\t"572/*%%mm0 is free, start loading the next row.*/573"movq (%[src1],%[src_ystride]),%%mm0\n\t"574/*Start averaging %%mm5 and %%mm4 using %%mm3.*/575"movq %%mm4,%%mm3\n\t"576/*%%mm6 (row 0) is done; write it out.*/577"movq %%mm6,(%[dst])\n\t"578"pand %%mm7,%%mm1\n\t"579"pavgb %%mm5,%%mm4\n\t"580"psubb %%mm1,%%mm2\n\t"581/*%%mm1 is free, continue loading the next row.*/582"movq (%[src2],%[src_ystride]),%%mm1\n\t"583"pxor %%mm5,%%mm3\n\t"584"lea (%[src1],%[src_ystride],2),%[src1]\n\t"585/*%%mm2 (row 1) is done; write it out.*/586"movq %%mm2,(%[dst],%[dst_ystride])\n\t"587"pand %%mm7,%%mm3\n\t"588/*Start loading the next row.*/589"movq (%[src1]),%%mm2\n\t"590"lea (%[dst],%[dst_ystride],2),%[dst]\n\t"591"psubb %%mm3,%%mm4\n\t"592"lea (%[src2],%[src_ystride],2),%[src2]\n\t"593/*%%mm4 (row 2) is done; write it out.*/594"movq %%mm4,(%[dst])\n\t"595/*Continue loading the next row.*/596"movq (%[src2]),%%mm3\n\t"597/*Start averaging %%mm0 and %%mm1 into %%mm6.*/598"movq %%mm0,%%mm6\n\t"599"pxor %%mm1,%%mm0\n\t"600/*Start loading the next row.*/601"movq (%[src1],%[src_ystride]),%%mm4\n\t"602"pavgb %%mm1,%%mm6\n\t"603/*%%mm1 is free; start averaging %%mm3 into %%mm2 using %%mm1.*/604"movq %%mm2,%%mm1\n\t"605"pand %%mm7,%%mm0\n\t"606/*Continue loading the next row.*/607"movq (%[src2],%[src_ystride]),%%mm5\n\t"608"pavgb %%mm3,%%mm2\n\t"609"lea (%[src1],%[src_ystride],2),%[src1]\n\t"610"pxor %%mm3,%%mm1\n\t"611/*%%mm3 is free.*/612"psubb %%mm0,%%mm6\n\t"613/*%%mm0 is free, start loading the next row.*/614"movq (%[src1]),%%mm0\n\t"615/*Start averaging %%mm5 into %%mm4 using %%mm3.*/616"movq %%mm4,%%mm3\n\t"617/*%%mm6 (row 3) is done; write it out.*/618"movq %%mm6,(%[dst],%[dst_ystride])\n\t"619"pand %%mm7,%%mm1\n\t"620"lea (%[src2],%[src_ystride],2),%[src2]\n\t"621"pavgb %%mm5,%%mm4\n\t"622"lea (%[dst],%[dst_ystride],2),%[dst]\n\t"623"psubb %%mm1,%%mm2\n\t"624/*%%mm1 is free; continue loading the next row.*/625"movq (%[src2]),%%mm1\n\t"626"pxor %%mm5,%%mm3\n\t"627/*%%mm2 (row 4) is done; write it out.*/628"movq %%mm2,(%[dst])\n\t"629"pand %%mm7,%%mm3\n\t"630/*Start loading the next row.*/631"movq (%[src1],%[src_ystride]),%%mm2\n\t"632"psubb %%mm3,%%mm4\n\t"633/*Start averaging %%mm0 and %%mm1 into %%mm6.*/634"movq %%mm0,%%mm6\n\t"635/*Continue loading the next row.*/636"movq (%[src2],%[src_ystride]),%%mm3\n\t"637/*%%mm4 (row 5) is done; write it out.*/638"movq %%mm4,(%[dst],%[dst_ystride])\n\t"639"pxor %%mm1,%%mm0\n\t"640"pavgb %%mm1,%%mm6\n\t"641/*%%mm4 is free; start averaging %%mm3 into %%mm2 using %%mm4.*/642"movq %%mm2,%%mm4\n\t"643"pand %%mm7,%%mm0\n\t"644"pavgb %%mm3,%%mm2\n\t"645"pxor %%mm3,%%mm4\n\t"646"lea (%[dst],%[dst_ystride],2),%[dst]\n\t"647"psubb %%mm0,%%mm6\n\t"648"pand %%mm7,%%mm4\n\t"649/*%%mm6 (row 6) is done, write it out.*/650"movq %%mm6,(%[dst])\n\t"651"psubb %%mm4,%%mm2\n\t"652/*%%mm2 (row 7) is done, write it out.*/653"movq %%mm2,(%[dst],%[dst_ystride])\n\t"654:[dst]"+r"(_dst),[src1]"+r"(_src1),[src2]"+r"(_src2)655:[dst_ystride]"r"((ptrdiff_t)_dst_ystride),656[src_ystride]"r"((ptrdiff_t)_src_ystride)657:"memory"658);659}660661unsigned oc_enc_frag_satd2_mmxext(int *_dc,const unsigned char *_src,662const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){663OC_ALIGN8(unsigned char ref[64]);664oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);665return oc_int_frag_satd_mmxext(_dc,_src,_ystride,ref,8);666}667668unsigned oc_enc_frag_intra_satd_mmxext(int *_dc,669const unsigned char *_src,int _ystride){670OC_ALIGN8(ogg_int16_t buf[64]);671unsigned ret;672unsigned ret2;673int dc;674__asm__ __volatile__(675OC_LOAD_8x4(0x00)676OC_HADAMARD_8x4677OC_TRANSPOSE_4x4x2(0x00)678/*Finish swapping out this 8x4 block to make room for the next one.679mm0...mm3 have been swapped out already.*/680"movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t"681"movq %%mm5,"OC_MEM_OFFS(0x10,buf)"\n\t"682"movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t"683"movq %%mm7,"OC_MEM_OFFS(0x30,buf)"\n\t"684OC_LOAD_8x4(0x04)685OC_HADAMARD_8x4686OC_TRANSPOSE_4x4x2(0x08)687/*Here the first 4x4 block of output from the last transpose is the second6884x4 block of input for the next transform.689We have cleverly arranged that it already be in the appropriate place, so690we only have to do half the loads.*/691"movq "OC_MEM_OFFS(0x10,buf)",%%mm1\n\t"692"movq "OC_MEM_OFFS(0x20,buf)",%%mm2\n\t"693"movq "OC_MEM_OFFS(0x30,buf)",%%mm3\n\t"694"movq "OC_MEM_OFFS(0x00,buf)",%%mm0\n\t"695/*We split out the stages here so we can save the DC coefficient in the696middle.*/697OC_HADAMARD_AB_8x4698OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)699"movd %%mm1,%[dc]\n\t"700OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)701/*Up to this point, everything fit in 16 bits (8 input + 1 for the702difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1703for the factor of two we dropped + 3 for the vertical accumulation).704Now we finally have to promote things to dwords.705We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long706latency of pmaddwd by starting the next series of loads now.*/707"pmaddwd %%mm7,%%mm0\n\t"708"movq "OC_MEM_OFFS(0x50,buf)",%%mm1\n\t"709"movq "OC_MEM_OFFS(0x58,buf)",%%mm5\n\t"710"movq "OC_MEM_OFFS(0x60,buf)",%%mm2\n\t"711"movq %%mm0,%%mm4\n\t"712"movq "OC_MEM_OFFS(0x68,buf)",%%mm6\n\t"713"punpckhdq %%mm0,%%mm0\n\t"714"movq "OC_MEM_OFFS(0x70,buf)",%%mm3\n\t"715"paddd %%mm0,%%mm4\n\t"716"movq "OC_MEM_OFFS(0x78,buf)",%%mm7\n\t"717"movd %%mm4,%[ret]\n\t"718"movq "OC_MEM_OFFS(0x40,buf)",%%mm0\n\t"719"movq "OC_MEM_OFFS(0x48,buf)",%%mm4\n\t"720OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)721"pmaddwd %%mm7,%%mm0\n\t"722/*We assume that the DC coefficient is always positive (which is true,723because the input to the INTRA transform was not a difference).*/724"movzx %w[dc],%[dc]\n\t"725"add %[ret],%[ret]\n\t"726"sub %[dc],%[ret]\n\t"727"movq %%mm0,%%mm4\n\t"728"punpckhdq %%mm0,%%mm0\n\t"729"paddd %%mm0,%%mm4\n\t"730"movd %%mm4,%[ret2]\n\t"731"lea -64(%[ret],%[ret2],2),%[ret]\n\t"732/*Although it looks like we're using 8 registers here, gcc can alias %[ret]733and %[ret2] with some of the inputs, since for once we don't write to734them until after we're done using everything but %[buf] (which is also735listed as an output to ensure gcc _doesn't_ alias them against it).*/736:[ret]"=a"(ret),[ret2]"=r"(ret2),[dc]"=r"(dc),737[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))738:[src]"r"(_src),[src4]"r"(_src+4*_ystride),739[ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)740/*We have to use sub, so we actually clobber the condition codes for once741(not to mention add).*/742:"cc"743);744*_dc=dc;745return ret;746}747748void oc_enc_frag_sub_mmx(ogg_int16_t _residue[64],749const unsigned char *_src,const unsigned char *_ref,int _ystride){750int i;751__asm__ __volatile__("pxor %%mm7,%%mm7\n\t"::);752for(i=4;i-->0;){753__asm__ __volatile__(754/*mm0=[src]*/755"movq (%[src]),%%mm0\n\t"756/*mm1=[ref]*/757"movq (%[ref]),%%mm1\n\t"758/*mm4=[src+ystride]*/759"movq (%[src],%[ystride]),%%mm4\n\t"760/*mm5=[ref+ystride]*/761"movq (%[ref],%[ystride]),%%mm5\n\t"762/*Compute [src]-[ref].*/763"movq %%mm0,%%mm2\n\t"764"punpcklbw %%mm7,%%mm0\n\t"765"movq %%mm1,%%mm3\n\t"766"punpckhbw %%mm7,%%mm2\n\t"767"punpcklbw %%mm7,%%mm1\n\t"768"punpckhbw %%mm7,%%mm3\n\t"769"psubw %%mm1,%%mm0\n\t"770"psubw %%mm3,%%mm2\n\t"771/*Compute [src+ystride]-[ref+ystride].*/772"movq %%mm4,%%mm1\n\t"773"punpcklbw %%mm7,%%mm4\n\t"774"movq %%mm5,%%mm3\n\t"775"punpckhbw %%mm7,%%mm1\n\t"776"lea (%[src],%[ystride],2),%[src]\n\t"777"punpcklbw %%mm7,%%mm5\n\t"778"lea (%[ref],%[ystride],2),%[ref]\n\t"779"punpckhbw %%mm7,%%mm3\n\t"780"psubw %%mm5,%%mm4\n\t"781"psubw %%mm3,%%mm1\n\t"782/*Write the answer out.*/783"movq %%mm0,0x00(%[residue])\n\t"784"movq %%mm2,0x08(%[residue])\n\t"785"movq %%mm4,0x10(%[residue])\n\t"786"movq %%mm1,0x18(%[residue])\n\t"787"lea 0x20(%[residue]),%[residue]\n\t"788:[residue]"+r"(_residue),[src]"+r"(_src),[ref]"+r"(_ref)789:[ystride]"r"((ptrdiff_t)_ystride)790:"memory"791);792}793}794795void oc_enc_frag_sub_128_mmx(ogg_int16_t _residue[64],796const unsigned char *_src,int _ystride){797ptrdiff_t ystride3;798__asm__ __volatile__(799/*mm0=[src]*/800"movq (%[src]),%%mm0\n\t"801/*mm1=[src+ystride]*/802"movq (%[src],%[ystride]),%%mm1\n\t"803/*mm6={-1}x4*/804"pcmpeqw %%mm6,%%mm6\n\t"805/*mm2=[src+2*ystride]*/806"movq (%[src],%[ystride],2),%%mm2\n\t"807/*[ystride3]=3*[ystride]*/808"lea (%[ystride],%[ystride],2),%[ystride3]\n\t"809/*mm6={1}x4*/810"psllw $15,%%mm6\n\t"811/*mm3=[src+3*ystride]*/812"movq (%[src],%[ystride3]),%%mm3\n\t"813/*mm6={128}x4*/814"psrlw $8,%%mm6\n\t"815/*mm7=0*/816"pxor %%mm7,%%mm7\n\t"817/*[src]=[src]+4*[ystride]*/818"lea (%[src],%[ystride],4),%[src]\n\t"819/*Compute [src]-128 and [src+ystride]-128*/820"movq %%mm0,%%mm4\n\t"821"punpcklbw %%mm7,%%mm0\n\t"822"movq %%mm1,%%mm5\n\t"823"punpckhbw %%mm7,%%mm4\n\t"824"psubw %%mm6,%%mm0\n\t"825"punpcklbw %%mm7,%%mm1\n\t"826"psubw %%mm6,%%mm4\n\t"827"punpckhbw %%mm7,%%mm5\n\t"828"psubw %%mm6,%%mm1\n\t"829"psubw %%mm6,%%mm5\n\t"830/*Write the answer out.*/831"movq %%mm0,0x00(%[residue])\n\t"832"movq %%mm4,0x08(%[residue])\n\t"833"movq %%mm1,0x10(%[residue])\n\t"834"movq %%mm5,0x18(%[residue])\n\t"835/*mm0=[src+4*ystride]*/836"movq (%[src]),%%mm0\n\t"837/*mm1=[src+5*ystride]*/838"movq (%[src],%[ystride]),%%mm1\n\t"839/*Compute [src+2*ystride]-128 and [src+3*ystride]-128*/840"movq %%mm2,%%mm4\n\t"841"punpcklbw %%mm7,%%mm2\n\t"842"movq %%mm3,%%mm5\n\t"843"punpckhbw %%mm7,%%mm4\n\t"844"psubw %%mm6,%%mm2\n\t"845"punpcklbw %%mm7,%%mm3\n\t"846"psubw %%mm6,%%mm4\n\t"847"punpckhbw %%mm7,%%mm5\n\t"848"psubw %%mm6,%%mm3\n\t"849"psubw %%mm6,%%mm5\n\t"850/*Write the answer out.*/851"movq %%mm2,0x20(%[residue])\n\t"852"movq %%mm4,0x28(%[residue])\n\t"853"movq %%mm3,0x30(%[residue])\n\t"854"movq %%mm5,0x38(%[residue])\n\t"855/*mm2=[src+6*ystride]*/856"movq (%[src],%[ystride],2),%%mm2\n\t"857/*mm3=[src+7*ystride]*/858"movq (%[src],%[ystride3]),%%mm3\n\t"859/*Compute [src+4*ystride]-128 and [src+5*ystride]-128*/860"movq %%mm0,%%mm4\n\t"861"punpcklbw %%mm7,%%mm0\n\t"862"movq %%mm1,%%mm5\n\t"863"punpckhbw %%mm7,%%mm4\n\t"864"psubw %%mm6,%%mm0\n\t"865"punpcklbw %%mm7,%%mm1\n\t"866"psubw %%mm6,%%mm4\n\t"867"punpckhbw %%mm7,%%mm5\n\t"868"psubw %%mm6,%%mm1\n\t"869"psubw %%mm6,%%mm5\n\t"870/*Write the answer out.*/871"movq %%mm0,0x40(%[residue])\n\t"872"movq %%mm4,0x48(%[residue])\n\t"873"movq %%mm1,0x50(%[residue])\n\t"874"movq %%mm5,0x58(%[residue])\n\t"875/*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/876"movq %%mm2,%%mm4\n\t"877"punpcklbw %%mm7,%%mm2\n\t"878"movq %%mm3,%%mm5\n\t"879"punpckhbw %%mm7,%%mm4\n\t"880"psubw %%mm6,%%mm2\n\t"881"punpcklbw %%mm7,%%mm3\n\t"882"psubw %%mm6,%%mm4\n\t"883"punpckhbw %%mm7,%%mm5\n\t"884"psubw %%mm6,%%mm3\n\t"885"psubw %%mm6,%%mm5\n\t"886/*Write the answer out.*/887"movq %%mm2,0x60(%[residue])\n\t"888"movq %%mm4,0x68(%[residue])\n\t"889"movq %%mm3,0x70(%[residue])\n\t"890"movq %%mm5,0x78(%[residue])\n\t"891:[src]"+r"(_src),[ystride3]"=&r"(ystride3)892:[residue]"r"(_residue),[ystride]"r"((ptrdiff_t)_ystride)893:"memory"894);895}896897void oc_enc_frag_copy2_mmxext(unsigned char *_dst,898const unsigned char *_src1,const unsigned char *_src2,int _ystride){899oc_int_frag_copy2_mmxext(_dst,_ystride,_src1,_src2,_ystride);900}901902#endif903904905