/* Path: thirdparty/libtheora/x86/sse2encfrag.c */
/********************************************************************1* *2* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *3* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *4* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *5* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *6* *7* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *8* by the Xiph.Org Foundation https://www.xiph.org/ *9* *10********************************************************************1112function:1314********************************************************************/15#include <stddef.h>16#include "x86enc.h"17#include "sse2trans.h"1819#if defined(OC_X86_ASM)2021/*Load a 4x8 array of pixels values from %[src] and %[ref] and compute their2216-bit differences.23On output, these are stored in _m0, xmm1, xmm2, and xmm3.24xmm4 and xmm5 are clobbered.*/25#define OC_LOAD_SUB_4x8(_m0) \26"#OC_LOAD_SUB_4x8\n\t" \27/*Load the first three rows.*/ \28"movq (%[src]),"_m0"\n\t" \29"movq (%[ref]),%%xmm4\n\t" \30"movq (%[src],%[ystride]),%%xmm1\n\t" \31"movq (%[ref],%[ystride]),%%xmm3\n\t" \32"movq (%[src],%[ystride],2),%%xmm2\n\t" \33"movq (%[ref],%[ystride],2),%%xmm5\n\t" \34/*Unpack and subtract.*/ \35"punpcklbw %%xmm4,"_m0"\n\t" \36"punpcklbw %%xmm4,%%xmm4\n\t" \37"punpcklbw %%xmm3,%%xmm1\n\t" \38"punpcklbw %%xmm3,%%xmm3\n\t" \39"psubw %%xmm4,"_m0"\n\t" \40"psubw %%xmm3,%%xmm1\n\t" \41/*Load the last row.*/ \42"movq (%[src],%[ystride3]),%%xmm3\n\t" \43"movq (%[ref],%[ystride3]),%%xmm4\n\t" \44/*Unpack, subtract, and advance the pointers.*/ \45"punpcklbw %%xmm5,%%xmm2\n\t" \46"punpcklbw %%xmm5,%%xmm5\n\t" \47"lea (%[src],%[ystride],4),%[src]\n\t" \48"psubw %%xmm5,%%xmm2\n\t" \49"punpcklbw %%xmm4,%%xmm3\n\t" \50"punpcklbw %%xmm4,%%xmm4\n\t" \51"lea (%[ref],%[ystride],4),%[ref]\n\t" \52"psubw %%xmm4,%%xmm3\n\t" \5354/*Square and accumulate four rows of differences in _m0, xmm1, xmm2, and xmm3.55On output, xmm0 contains the sum of two of the rows, 
and the other two are56added to xmm7.*/57#define OC_SSD_4x8(_m0) \58"pmaddwd "_m0","_m0"\n\t" \59"pmaddwd %%xmm1,%%xmm1\n\t" \60"pmaddwd %%xmm2,%%xmm2\n\t" \61"pmaddwd %%xmm3,%%xmm3\n\t" \62"paddd %%xmm1,"_m0"\n\t" \63"paddd %%xmm3,%%xmm2\n\t" \64"paddd %%xmm2,%%xmm7\n\t" \6566unsigned oc_enc_frag_ssd_sse2(const unsigned char *_src,67const unsigned char *_ref,int _ystride){68unsigned ret;69__asm__ __volatile__(70OC_LOAD_SUB_4x8("%%xmm7")71OC_SSD_4x8("%%xmm7")72OC_LOAD_SUB_4x8("%%xmm0")73OC_SSD_4x8("%%xmm0")74"paddd %%xmm0,%%xmm7\n\t"75"movdqa %%xmm7,%%xmm6\n\t"76"punpckhqdq %%xmm7,%%xmm7\n\t"77"paddd %%xmm6,%%xmm7\n\t"78"pshufd $1,%%xmm7,%%xmm6\n\t"79"paddd %%xmm6,%%xmm7\n\t"80"movd %%xmm7,%[ret]\n\t"81:[ret]"=a"(ret)82:[src]"r"(_src),[ref]"r"(_ref),[ystride]"r"((ptrdiff_t)_ystride),83[ystride3]"r"((ptrdiff_t)_ystride*3)84);85return ret;86}8788static const unsigned char __attribute__((aligned(16))) OC_MASK_CONSTS[8]={890x01,0x02,0x04,0x08,0x10,0x20,0x40,0x8090};9192/*Load a 2x8 array of pixels values from %[src] and %[ref] and compute their93horizontal sums as well as their 16-bit differences subject to a mask.94%%xmm5 must contain OC_MASK_CONSTS[0...7] and %%xmm6 must contain 0.*/95#define OC_LOAD_SUB_MASK_2x8 \96"#OC_LOAD_SUB_MASK_2x8\n\t" \97/*Start the loads and expand the next 8 bits of the mask.*/ \98"shl $8,%[m]\n\t" \99"movq (%[src]),%%xmm0\n\t" \100"mov %h[m],%b[m]\n\t" \101"movq (%[ref]),%%xmm2\n\t" \102"movd %[m],%%xmm4\n\t" \103"shr $8,%[m]\n\t" \104"pshuflw $0x00,%%xmm4,%%xmm4\n\t" \105"mov %h[m],%b[m]\n\t" \106"pand %%xmm6,%%xmm4\n\t" \107"pcmpeqb %%xmm6,%%xmm4\n\t" \108/*Perform the masking.*/ \109"pand %%xmm4,%%xmm0\n\t" \110"pand %%xmm4,%%xmm2\n\t" \111/*Finish the loads while unpacking the first set of rows, and expand the next1128 bits of the mask.*/ \113"movd %[m],%%xmm4\n\t" \114"movq (%[src],%[ystride]),%%xmm1\n\t" \115"pshuflw $0x00,%%xmm4,%%xmm4\n\t" \116"movq (%[ref],%[ystride]),%%xmm3\n\t" \117"pand %%xmm6,%%xmm4\n\t" \118"punpcklbw 
%%xmm2,%%xmm0\n\t" \119"pcmpeqb %%xmm6,%%xmm4\n\t" \120"punpcklbw %%xmm2,%%xmm2\n\t" \121/*Mask and unpack the second set of rows.*/ \122"pand %%xmm4,%%xmm1\n\t" \123"pand %%xmm4,%%xmm3\n\t" \124"punpcklbw %%xmm3,%%xmm1\n\t" \125"punpcklbw %%xmm3,%%xmm3\n\t" \126"psubw %%xmm2,%%xmm0\n\t" \127"psubw %%xmm3,%%xmm1\n\t" \128129unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,130const unsigned char *_ref,int _ystride,ogg_int64_t _mask){131ptrdiff_t ystride;132unsigned ret;133int i;134ystride=_ystride;135__asm__ __volatile__(136"pxor %%xmm7,%%xmm7\n\t"137"movq %[c],%%xmm6\n\t"138:139:[c]"m"(OC_CONST_ARRAY_OPERAND(unsigned char,OC_MASK_CONSTS,8))140);141for(i=0;i<4;i++){142unsigned m;143m=_mask&0xFFFF;144_mask>>=16;145if(m){146__asm__ __volatile__(147OC_LOAD_SUB_MASK_2x8148"pmaddwd %%xmm0,%%xmm0\n\t"149"pmaddwd %%xmm1,%%xmm1\n\t"150"paddd %%xmm0,%%xmm7\n\t"151"paddd %%xmm1,%%xmm7\n\t"152:[src]"+r"(_src),[ref]"+r"(_ref),[ystride]"+r"(ystride),[m]"+Q"(m)153);154}155_src+=2*ystride;156_ref+=2*ystride;157}158__asm__ __volatile__(159"movdqa %%xmm7,%%xmm6\n\t"160"punpckhqdq %%xmm7,%%xmm7\n\t"161"paddd %%xmm6,%%xmm7\n\t"162"pshufd $1,%%xmm7,%%xmm6\n\t"163"paddd %%xmm6,%%xmm7\n\t"164"movd %%xmm7,%[ret]\n\t"165:[ret]"=a"(ret)166);167return ret;168}169170171/*Load an 8x8 array of pixel values from %[src] and %[ref] and compute their17216-bit difference in %%xmm0...%%xmm7.*/173#define OC_LOAD_SUB_8x8 \174"#OC_LOAD_SUB_8x8\n\t" \175"movq (%[src]),%%xmm0\n\t" \176"movq (%[ref]),%%xmm4\n\t" \177"movq (%[src],%[src_ystride]),%%xmm1\n\t" \178"lea (%[src],%[src_ystride],2),%[src]\n\t" \179"movq (%[ref],%[ref_ystride]),%%xmm5\n\t" \180"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \181"movq (%[src]),%%xmm2\n\t" \182"movq (%[ref]),%%xmm7\n\t" \183"movq (%[src],%[src_ystride]),%%xmm3\n\t" \184"movq (%[ref],%[ref_ystride]),%%xmm6\n\t" \185"punpcklbw %%xmm4,%%xmm0\n\t" \186"lea (%[src],%[src_ystride],2),%[src]\n\t" \187"punpcklbw %%xmm4,%%xmm4\n\t" \188"lea 
(%[ref],%[ref_ystride],2),%[ref]\n\t" \189"psubw %%xmm4,%%xmm0\n\t" \190"movq (%[src]),%%xmm4\n\t" \191"movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \192"movq (%[ref]),%%xmm0\n\t" \193"punpcklbw %%xmm5,%%xmm1\n\t" \194"punpcklbw %%xmm5,%%xmm5\n\t" \195"psubw %%xmm5,%%xmm1\n\t" \196"movq (%[src],%[src_ystride]),%%xmm5\n\t" \197"punpcklbw %%xmm7,%%xmm2\n\t" \198"punpcklbw %%xmm7,%%xmm7\n\t" \199"psubw %%xmm7,%%xmm2\n\t" \200"movq (%[ref],%[ref_ystride]),%%xmm7\n\t" \201"punpcklbw %%xmm6,%%xmm3\n\t" \202"lea (%[src],%[src_ystride],2),%[src]\n\t" \203"punpcklbw %%xmm6,%%xmm6\n\t" \204"psubw %%xmm6,%%xmm3\n\t" \205"movq (%[src]),%%xmm6\n\t" \206"punpcklbw %%xmm0,%%xmm4\n\t" \207"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \208"punpcklbw %%xmm0,%%xmm0\n\t" \209"lea (%[src],%[src_ystride],2),%[src]\n\t" \210"psubw %%xmm0,%%xmm4\n\t" \211"movq (%[ref]),%%xmm0\n\t" \212"punpcklbw %%xmm7,%%xmm5\n\t" \213"neg %[src_ystride]\n\t" \214"punpcklbw %%xmm7,%%xmm7\n\t" \215"psubw %%xmm7,%%xmm5\n\t" \216"movq (%[src],%[src_ystride]),%%xmm7\n\t" \217"punpcklbw %%xmm0,%%xmm6\n\t" \218"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \219"punpcklbw %%xmm0,%%xmm0\n\t" \220"neg %[ref_ystride]\n\t" \221"psubw %%xmm0,%%xmm6\n\t" \222"movq (%[ref],%[ref_ystride]),%%xmm0\n\t" \223"punpcklbw %%xmm0,%%xmm7\n\t" \224"punpcklbw %%xmm0,%%xmm0\n\t" \225"psubw %%xmm0,%%xmm7\n\t" \226"movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t" \227228/*Load an 8x8 array of pixel values from %[src] into %%xmm0...%%xmm7.*/229#define OC_LOAD_8x8 \230"#OC_LOAD_8x8\n\t" \231"movq (%[src]),%%xmm0\n\t" \232"movq (%[src],%[ystride]),%%xmm1\n\t" \233"movq (%[src],%[ystride],2),%%xmm2\n\t" \234"pxor %%xmm7,%%xmm7\n\t" \235"movq (%[src],%[ystride3]),%%xmm3\n\t" \236"punpcklbw %%xmm7,%%xmm0\n\t" \237"movq (%[src4]),%%xmm4\n\t" \238"punpcklbw %%xmm7,%%xmm1\n\t" \239"movq (%[src4],%[ystride]),%%xmm5\n\t" \240"punpcklbw %%xmm7,%%xmm2\n\t" \241"movq (%[src4],%[ystride],2),%%xmm6\n\t" \242"punpcklbw %%xmm7,%%xmm3\n\t" \243"movq 
(%[src4],%[ystride3]),%%xmm7\n\t" \244"punpcklbw %%xmm4,%%xmm4\n\t" \245"punpcklbw %%xmm5,%%xmm5\n\t" \246"psrlw $8,%%xmm4\n\t" \247"psrlw $8,%%xmm5\n\t" \248"punpcklbw %%xmm6,%%xmm6\n\t" \249"punpcklbw %%xmm7,%%xmm7\n\t" \250"psrlw $8,%%xmm6\n\t" \251"psrlw $8,%%xmm7\n\t" \252253/*Performs the first two stages of an 8-point 1-D Hadamard transform in place.254Outputs 1, 3, 4, and 5 from the second stage are negated (which allows us to255perform this stage in place with no temporary registers).*/256#define OC_HADAMARD_AB_8x8 \257"#OC_HADAMARD_AB_8x8\n\t" \258/*Stage A:*/ \259"paddw %%xmm5,%%xmm1\n\t" \260"paddw %%xmm6,%%xmm2\n\t" \261"paddw %%xmm5,%%xmm5\n\t" \262"paddw %%xmm6,%%xmm6\n\t" \263"psubw %%xmm1,%%xmm5\n\t" \264"psubw %%xmm2,%%xmm6\n\t" \265"paddw %%xmm7,%%xmm3\n\t" \266"paddw %%xmm4,%%xmm0\n\t" \267"paddw %%xmm7,%%xmm7\n\t" \268"paddw %%xmm4,%%xmm4\n\t" \269"psubw %%xmm3,%%xmm7\n\t" \270"psubw %%xmm0,%%xmm4\n\t" \271/*Stage B:*/ \272"paddw %%xmm2,%%xmm0\n\t" \273"paddw %%xmm3,%%xmm1\n\t" \274"paddw %%xmm6,%%xmm4\n\t" \275"paddw %%xmm7,%%xmm5\n\t" \276"paddw %%xmm2,%%xmm2\n\t" \277"paddw %%xmm3,%%xmm3\n\t" \278"paddw %%xmm6,%%xmm6\n\t" \279"paddw %%xmm7,%%xmm7\n\t" \280"psubw %%xmm0,%%xmm2\n\t" \281"psubw %%xmm1,%%xmm3\n\t" \282"psubw %%xmm4,%%xmm6\n\t" \283"psubw %%xmm5,%%xmm7\n\t" \284285/*Performs the last stage of an 8-point 1-D Hadamard transform in place.286Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in287place with no temporary registers).*/288#define OC_HADAMARD_C_8x8 \289"#OC_HADAMARD_C_8x8\n\t" \290/*Stage C:*/ \291"paddw %%xmm1,%%xmm0\n\t" \292"paddw %%xmm3,%%xmm2\n\t" \293"paddw %%xmm5,%%xmm4\n\t" \294"paddw %%xmm7,%%xmm6\n\t" \295"paddw %%xmm1,%%xmm1\n\t" \296"paddw %%xmm3,%%xmm3\n\t" \297"paddw %%xmm5,%%xmm5\n\t" \298"paddw %%xmm7,%%xmm7\n\t" \299"psubw %%xmm0,%%xmm1\n\t" \300"psubw %%xmm2,%%xmm3\n\t" \301"psubw %%xmm4,%%xmm5\n\t" \302"psubw %%xmm6,%%xmm7\n\t" \303304/*Performs an 8-point 1-D Hadamard transform 
in place.305Outputs 1, 2, 4, and 7 are negated (which allows us to perform the transform306in place with no temporary registers).*/307#define OC_HADAMARD_8x8 \308OC_HADAMARD_AB_8x8 \309OC_HADAMARD_C_8x8 \310311/*Performs the first part of the final stage of the Hadamard transform and312summing of absolute values.313At the end of this part, %%xmm1 will contain the DC coefficient of the314transform.*/315#define OC_HADAMARD_C_ABS_ACCUM_A_8x8 \316/*We use the fact that \317(abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \318to merge the final butterfly with the abs and the first stage of \319accumulation. \320Thus we can avoid using pabsw, which is not available until SSSE3. \321Emulating pabsw takes 3 instructions, so the straightforward SSE2 \322implementation would be (3+3)*8+7=55 instructions (+4 for spilling \323registers). \324Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \325This implementation is only 26 (+4 for spilling registers).*/ \326"#OC_HADAMARD_C_ABS_ACCUM_A_8x8\n\t" \327"movdqa %%xmm7,"OC_MEM_OFFS(0x10,buf)"\n\t" \328"movdqa %%xmm6,"OC_MEM_OFFS(0x00,buf)"\n\t" \329/*xmm7={0x7FFF}x4 \330xmm4=max(abs(xmm4),abs(xmm5))-0x7FFF*/ \331"pcmpeqb %%xmm7,%%xmm7\n\t" \332"movdqa %%xmm4,%%xmm6\n\t" \333"psrlw $1,%%xmm7\n\t" \334"paddw %%xmm5,%%xmm6\n\t" \335"pmaxsw %%xmm5,%%xmm4\n\t" \336"paddsw %%xmm7,%%xmm6\n\t" \337"psubw %%xmm6,%%xmm4\n\t" \338/*xmm2=max(abs(xmm2),abs(xmm3))-0x7FFF \339xmm0=max(abs(xmm0),abs(xmm1))-0x7FFF*/ \340"movdqa %%xmm2,%%xmm6\n\t" \341"movdqa %%xmm0,%%xmm5\n\t" \342"pmaxsw %%xmm3,%%xmm2\n\t" \343"pmaxsw %%xmm1,%%xmm0\n\t" \344"paddw %%xmm3,%%xmm6\n\t" \345"movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm3\n\t" \346"paddw %%xmm5,%%xmm1\n\t" \347"movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm5\n\t" \348349/*Performs the second part of the final stage of the Hadamard transform and350summing of absolute values.*/351#define OC_HADAMARD_C_ABS_ACCUM_B_8x8 \352"#OC_HADAMARD_C_ABS_ACCUM_B_8x8\n\t" \353"paddsw %%xmm7,%%xmm6\n\t" \354"paddsw 
%%xmm7,%%xmm1\n\t" \355"psubw %%xmm6,%%xmm2\n\t" \356"psubw %%xmm1,%%xmm0\n\t" \357/*xmm7={1}x4 (needed for the horizontal add that follows) \358xmm0+=xmm2+xmm4+max(abs(xmm3),abs(xmm5))-0x7FFF*/ \359"movdqa %%xmm3,%%xmm6\n\t" \360"pmaxsw %%xmm5,%%xmm3\n\t" \361"paddw %%xmm2,%%xmm0\n\t" \362"paddw %%xmm5,%%xmm6\n\t" \363"paddw %%xmm4,%%xmm0\n\t" \364"paddsw %%xmm7,%%xmm6\n\t" \365"paddw %%xmm3,%%xmm0\n\t" \366"psrlw $14,%%xmm7\n\t" \367"psubw %%xmm6,%%xmm0\n\t" \368369/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the370absolute value of each component, and accumulates everything into xmm0.*/371#define OC_HADAMARD_C_ABS_ACCUM_8x8 \372OC_HADAMARD_C_ABS_ACCUM_A_8x8 \373OC_HADAMARD_C_ABS_ACCUM_B_8x8 \374375/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each376component, and accumulates everything into xmm0.377Note that xmm0 will have an extra 4 added to each column, and that after378removing this value, the remainder will be half the conventional value.*/379#define OC_HADAMARD_ABS_ACCUM_8x8 \380OC_HADAMARD_AB_8x8 \381OC_HADAMARD_C_ABS_ACCUM_8x8382383static unsigned oc_int_frag_satd_sse2(int *_dc,384const unsigned char *_src,int _src_ystride,385const unsigned char *_ref,int _ref_ystride){386OC_ALIGN16(ogg_int16_t buf[16]);387unsigned ret;388unsigned ret2;389int dc;390__asm__ __volatile__(391OC_LOAD_SUB_8x8392OC_HADAMARD_8x8393OC_TRANSPOSE_8x8394/*We split out the stages here so we can save the DC coefficient in the395middle.*/396OC_HADAMARD_AB_8x8397OC_HADAMARD_C_ABS_ACCUM_A_8x8398"movd %%xmm1,%[dc]\n\t"399OC_HADAMARD_C_ABS_ACCUM_B_8x8400/*Up to this point, everything fit in 16 bits (8 input + 1 for the401difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1402for the factor of two we dropped + 3 for the vertical accumulation).403Now we finally have to promote things to dwords.404We break this part out of OC_HADAMARD_ABS_ACCUM_8x8 to hide the long405latency of pmaddwd by starting to compute abs(dc) 
here.*/406"pmaddwd %%xmm7,%%xmm0\n\t"407"movsx %w[dc],%[dc]\n\t"408"cdq\n\t"409"movdqa %%xmm0,%%xmm1\n\t"410"punpckhqdq %%xmm0,%%xmm0\n\t"411"paddd %%xmm1,%%xmm0\n\t"412"pshuflw $0xE,%%xmm0,%%xmm1\n\t"413"paddd %%xmm1,%%xmm0\n\t"414"movd %%xmm0,%[ret]\n\t"415/*The sums produced by OC_HADAMARD_ABS_ACCUM_8x8 each have an extra 4416added to them, a factor of two removed, and the DC value included;417correct the final sum here.*/418"lea -64(%[ret2],%[ret],2),%[ret]\n\t"419"xor %[dc],%[ret2]\n\t"420"sub %[ret2],%[ret]\n\t"421/*Although it looks like we're using 7 registers here, gcc can alias %[ret]422and %[dc] with some of the inputs, since for once we don't write to423them until after we're done using everything but %[buf].*/424/*Note that _src_ystride and _ref_ystride must be given non-overlapping425constraints, otherwise if gcc can prove they're equal it will allocate426them to the same register (which is bad); _src and _ref face a similar427problem.428All four are destructively modified, but if we list them as output429constraints, gcc can't alias them with other outputs.*/430:[ret]"=r"(ret),[ret2]"=d"(ret2),[dc]"=a"(dc),431[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))432:[src]"S"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),433[ref]"a"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)434/*We have to use neg, so we actually clobber the condition codes for once435(not to mention sub, and add).*/436:"cc"437);438*_dc=dc;439return ret;440}441442unsigned oc_enc_frag_satd_sse2(int *_dc,const unsigned char *_src,443const unsigned char *_ref,int _ystride){444return oc_int_frag_satd_sse2(_dc,_src,_ystride,_ref,_ystride);445}446447unsigned oc_enc_frag_satd2_sse2(int *_dc,const unsigned char *_src,448const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){449OC_ALIGN8(unsigned char ref[64]);450oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);451return oc_int_frag_satd_sse2(_dc,_src,_ystride,ref,8);452}453454unsigned oc_enc_frag_intra_satd_sse2(int 
*_dc,455const unsigned char *_src,int _ystride){456OC_ALIGN16(ogg_int16_t buf[16]);457unsigned ret;458int dc;459__asm__ __volatile__(460OC_LOAD_8x8461OC_HADAMARD_8x8462OC_TRANSPOSE_8x8463/*We split out the stages here so we can save the DC coefficient in the464middle.*/465OC_HADAMARD_AB_8x8466OC_HADAMARD_C_ABS_ACCUM_A_8x8467"movd %%xmm1,%[dc]\n\t"468OC_HADAMARD_C_ABS_ACCUM_B_8x8469/*Up to this point, everything fit in 16 bits (8 input + 1 for the470difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1471for the factor of two we dropped + 3 for the vertical accumulation).472Now we finally have to promote things to dwords.*/473"pmaddwd %%xmm7,%%xmm0\n\t"474/*We assume that the DC coefficient is always positive (which is true,475because the input to the INTRA transform was not a difference).*/476"movzx %w[dc],%[dc]\n\t"477"movdqa %%xmm0,%%xmm1\n\t"478"punpckhqdq %%xmm0,%%xmm0\n\t"479"paddd %%xmm1,%%xmm0\n\t"480"pshuflw $0xE,%%xmm0,%%xmm1\n\t"481"paddd %%xmm1,%%xmm0\n\t"482"movd %%xmm0,%[ret]\n\t"483"lea -64(%[ret],%[ret]),%[ret]\n\t"484"sub %[dc],%[ret]\n\t"485/*Although it looks like we're using 7 registers here, gcc can alias %[ret]486and %[dc] with some of the inputs, since for once we don't write to487them until after we're done using everything but %[buf].*/488:[ret]"=a"(ret),[dc]"=r"(dc),489[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))490:[src]"r"(_src),[src4]"r"(_src+4*_ystride),491[ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)492/*We have to use sub, so we actually clobber the condition codes for once.*/493:"cc"494);495*_dc=dc;496return ret;497}498499#endif500501502