Path: blob/master/thirdparty/libtheora/x86/sse2fdct.c
9898 views
/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2006                *
 * by the Xiph.Org Foundation https://www.xiph.org/                 *
 *                                                                  *
 ********************************************************************/
/*SSE2 fDCT implementation for x86_64.*/
/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
#include <stddef.h>
#include "x86enc.h"
#include "x86zigzag.h"
#include "sse2trans.h"

#if defined(OC_X86_64_ASM)

/*One 8-point fDCT pass over 8 lanes of 16-bit coefficients in parallel.
  Register convention (established by the caller below):
    xmm0-xmm7 : the eight input rows/columns (clobbered; outputs are written
                back into xmm0-xmm7 in zig-zag-friendly order _y[0..7]).
    xmm8-xmm13: scratch; xmm13 repeatedly reloaded with packed multiplier
                constants via %[a].
    xmm14     : {-1}x8 (used as "subtract -1" == "+ (x!=0)" rounding bias).
    xmm15     : {0}x8  (used with pcmpeqw to test lanes for zero).
    %[a]      : scratch GPR for staging 32-bit immediates into movd.*/
# define OC_FDCT_8x8 \
 /*Note: xmm15={0}x8 and xmm14={-1}x8.*/ \
 "#OC_FDCT_8x8\n\t" \
 /*Stage 1: butterflies t0..t7 -> t0'+..t3'+ / t4'-..t7'- .*/ \
 "movdqa %%xmm0,%%xmm11\n\t" \
 "movdqa %%xmm1,%%xmm10\n\t" \
 "movdqa %%xmm2,%%xmm9\n\t" \
 "movdqa %%xmm3,%%xmm8\n\t" \
 /*xmm11=t7'=t0-t7*/ \
 "psubw %%xmm7,%%xmm11\n\t" \
 /*xmm10=t6'=t1-t6*/ \
 "psubw %%xmm6,%%xmm10\n\t" \
 /*xmm9=t5'=t2-t5*/ \
 "psubw %%xmm5,%%xmm9\n\t" \
 /*xmm8=t4'=t3-t4*/ \
 "psubw %%xmm4,%%xmm8\n\t" \
 /*xmm0=t0'=t0+t7*/ \
 "paddw %%xmm7,%%xmm0\n\t" \
 /*xmm1=t1'=t1+t6*/ \
 "paddw %%xmm6,%%xmm1\n\t" \
 /*xmm5=t2'=t2+t5*/ \
 "paddw %%xmm2,%%xmm5\n\t" \
 /*xmm4=t3'=t3+t4*/ \
 "paddw %%xmm3,%%xmm4\n\t" \
 /*xmm2,3,6,7 are now free.*/ \
 /*Stage 2:*/ \
 "movdqa %%xmm0,%%xmm3\n\t" \
 /*0x5A806A0A = packed words {27146,23168}; with the {2}x8 interleave below, \
   pmaddwd computes t*27146+0xB500 per 32-bit lane.*/ \
 "mov $0x5A806A0A,%[a]\n\t" \
 "movdqa %%xmm1,%%xmm2\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "movdqa %%xmm10,%%xmm6\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 /*xmm2=t2''=t1'-t2'*/ \
 "psubw %%xmm5,%%xmm2\n\t" \
 /*xmm12: 0 -> {1}x8 -> {2}x8, the rounding word paired with each input.*/ \
 "pxor %%xmm12,%%xmm12\n\t" \
 /*xmm3=t3''=t0'-t3'*/ \
 "psubw %%xmm4,%%xmm3\n\t" \
 "psubw %%xmm14,%%xmm12\n\t" \
 /*xmm10=t5''=t6'-t5'*/ \
 "psubw %%xmm9,%%xmm10\n\t" \
 "paddw %%xmm12,%%xmm12\n\t" \
 /*xmm4=t0''=t0'+t3'*/ \
 "paddw %%xmm0,%%xmm4\n\t" \
 /*xmm1=t1''=t1'+t2'*/ \
 "paddw %%xmm5,%%xmm1\n\t" \
 /*xmm6=t6''=t6'+t5'*/ \
 "paddw %%xmm9,%%xmm6\n\t" \
 /*xmm0,xmm5,xmm9 are now free.*/ \
 /*Stage 3:*/ \
 /*xmm10:xmm5=t5''*27146+0xB500 \
   xmm0=t5''*/ \
 "movdqa %%xmm10,%%xmm5\n\t" \
 "movdqa %%xmm10,%%xmm0\n\t" \
 "punpckhwd %%xmm12,%%xmm10\n\t" \
 "pmaddwd %%xmm13,%%xmm10\n\t" \
 "punpcklwd %%xmm12,%%xmm5\n\t" \
 "pmaddwd %%xmm13,%%xmm5\n\t" \
 /*xmm5=(t5''*27146+0xB500>>16)+t5''*/ \
 "psrad $16,%%xmm10\n\t" \
 "psrad $16,%%xmm5\n\t" \
 "packssdw %%xmm10,%%xmm5\n\t" \
 "paddw %%xmm0,%%xmm5\n\t" \
 /*xmm0=s=(t5''*27146+0xB500>>16)+t5''+(t5''!=0)>>1*/ \
 "pcmpeqw %%xmm15,%%xmm0\n\t" \
 "psubw %%xmm14,%%xmm0\n\t" \
 "paddw %%xmm5,%%xmm0\n\t" \
 "movdqa %%xmm8,%%xmm5\n\t" \
 "psraw $1,%%xmm0\n\t" \
 /*xmm5=t5'''=t4'-s*/ \
 "psubw %%xmm0,%%xmm5\n\t" \
 /*xmm8=t4''=t4'+s*/ \
 "paddw %%xmm0,%%xmm8\n\t" \
 /*xmm0,xmm7,xmm9,xmm10 are free.*/ \
 /*xmm7:xmm9=t6''*27146+0xB500*/ \
 "movdqa %%xmm6,%%xmm7\n\t" \
 "movdqa %%xmm6,%%xmm9\n\t" \
 "punpckhwd %%xmm12,%%xmm7\n\t" \
 "pmaddwd %%xmm13,%%xmm7\n\t" \
 "punpcklwd %%xmm12,%%xmm9\n\t" \
 "pmaddwd %%xmm13,%%xmm9\n\t" \
 /*xmm9=(t6''*27146+0xB500>>16)+t6''*/ \
 "psrad $16,%%xmm7\n\t" \
 "psrad $16,%%xmm9\n\t" \
 "packssdw %%xmm7,%%xmm9\n\t" \
 "paddw %%xmm6,%%xmm9\n\t" \
 /*xmm9=s=(t6''*27146+0xB500>>16)+t6''+(t6''!=0)>>1*/ \
 "pcmpeqw %%xmm15,%%xmm6\n\t" \
 "psubw %%xmm14,%%xmm6\n\t" \
 "paddw %%xmm6,%%xmm9\n\t" \
 "movdqa %%xmm11,%%xmm7\n\t" \
 "psraw $1,%%xmm9\n\t" \
 /*xmm7=t6'''=t7'-s*/ \
 "psubw %%xmm9,%%xmm7\n\t" \
 /*xmm9=t7''=t7'+s*/ \
 "paddw %%xmm11,%%xmm9\n\t" \
 /*xmm0,xmm6,xmm10,xmm11 are free.*/ \
 /*Stage 4:*/ \
 /*xmm10:xmm0=t1''*27146+0xB500*/ \
 "movdqa %%xmm1,%%xmm0\n\t" \
 "movdqa %%xmm1,%%xmm10\n\t" \
 "punpcklwd %%xmm12,%%xmm0\n\t" \
 "pmaddwd %%xmm13,%%xmm0\n\t" \
 "punpckhwd %%xmm12,%%xmm10\n\t" \
 "pmaddwd %%xmm13,%%xmm10\n\t" \
 /*xmm0=(t1''*27146+0xB500>>16)+t1''*/ \
 "psrad $16,%%xmm0\n\t" \
 "psrad $16,%%xmm10\n\t" \
 /*0x20006A0A: same 27146 multiplier, but bias word 0x2000*2=0x4000.*/ \
 "mov $0x20006A0A,%[a]\n\t" \
 "packssdw %%xmm10,%%xmm0\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "paddw %%xmm1,%%xmm0\n\t" \
 /*xmm0=s=(t1''*27146+0xB500>>16)+t1''+(t1''!=0)*/ \
 "pcmpeqw %%xmm15,%%xmm1\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 "psubw %%xmm14,%%xmm1\n\t" \
 "paddw %%xmm1,%%xmm0\n\t" \
 /*xmm10:xmm4=t0''*27146+0x4000*/ \
 "movdqa %%xmm4,%%xmm1\n\t" \
 "movdqa %%xmm4,%%xmm10\n\t" \
 "punpcklwd %%xmm12,%%xmm4\n\t" \
 "pmaddwd %%xmm13,%%xmm4\n\t" \
 "punpckhwd %%xmm12,%%xmm10\n\t" \
 "pmaddwd %%xmm13,%%xmm10\n\t" \
 /*xmm4=(t0''*27146+0x4000>>16)+t0''*/ \
 "psrad $16,%%xmm4\n\t" \
 "psrad $16,%%xmm10\n\t" \
 "mov $0x6CB7,%[a]\n\t" \
 "packssdw %%xmm10,%%xmm4\n\t" \
 "movd %[a],%%xmm12\n\t" \
 "paddw %%xmm1,%%xmm4\n\t" \
 /*xmm4=r=(t0''*27146+0x4000>>16)+t0''+(t0''!=0)*/ \
 "pcmpeqw %%xmm15,%%xmm1\n\t" \
 "pshufd $00,%%xmm12,%%xmm12\n\t" \
 "psubw %%xmm14,%%xmm1\n\t" \
 "mov $0x7FFF6C84,%[a]\n\t" \
 "paddw %%xmm1,%%xmm4\n\t" \
 /*xmm0=_y[0]=u=r+s>>1 \
   The naive implementation could cause overflow, so we use \
   u=(r&s)+((r^s)>>1).*/ \
 "movdqa %%xmm0,%%xmm6\n\t" \
 "pxor %%xmm4,%%xmm0\n\t" \
 "pand %%xmm4,%%xmm6\n\t" \
 "psraw $1,%%xmm0\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "paddw %%xmm6,%%xmm0\n\t" \
 /*xmm4=_y[4]=v=r-u*/ \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 "psubw %%xmm0,%%xmm4\n\t" \
 /*xmm1,xmm6,xmm10,xmm11 are free.*/ \
 /*xmm6:xmm10=60547*t3''+0x6CB7*/ \
 "movdqa %%xmm3,%%xmm10\n\t" \
 "movdqa %%xmm3,%%xmm6\n\t" \
 "punpcklwd %%xmm3,%%xmm10\n\t" \
 "pmaddwd %%xmm13,%%xmm10\n\t" \
 "mov $0x61F861F8,%[a]\n\t" \
 "punpckhwd %%xmm3,%%xmm6\n\t" \
 "pmaddwd %%xmm13,%%xmm6\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "paddd %%xmm12,%%xmm10\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 "paddd %%xmm12,%%xmm6\n\t" \
 /*xmm1:xmm2=25080*t2'' \
   xmm12=t2''*/ \
 "movdqa %%xmm2,%%xmm11\n\t" \
 "movdqa %%xmm2,%%xmm12\n\t" \
 "pmullw %%xmm13,%%xmm2\n\t" \
 "pmulhw %%xmm13,%%xmm11\n\t" \
 "movdqa %%xmm2,%%xmm1\n\t" \
 "punpcklwd %%xmm11,%%xmm2\n\t" \
 "punpckhwd %%xmm11,%%xmm1\n\t" \
 /*xmm10=u=(25080*t2''+60547*t3''+0x6CB7>>16)+(t3''!=0)*/ \
 "paddd %%xmm2,%%xmm10\n\t" \
 "paddd %%xmm1,%%xmm6\n\t" \
 "psrad $16,%%xmm10\n\t" \
 "pcmpeqw %%xmm15,%%xmm3\n\t" \
 "psrad $16,%%xmm6\n\t" \
 "psubw %%xmm14,%%xmm3\n\t" \
 "packssdw %%xmm6,%%xmm10\n\t" \
 "paddw %%xmm3,%%xmm10\n\t" \
 /*xmm2=_y[2]=u \
   xmm10=s=(25080*u>>16)-t2''*/ \
 "movdqa %%xmm10,%%xmm2\n\t" \
 "pmulhw %%xmm13,%%xmm10\n\t" \
 "psubw %%xmm12,%%xmm10\n\t" \
 /*xmm1:xmm6=s*21600+0x2800*/ \
 "pxor %%xmm12,%%xmm12\n\t" \
 "psubw %%xmm14,%%xmm12\n\t" \
 "mov $0x28005460,%[a]\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 "movdqa %%xmm10,%%xmm6\n\t" \
 "movdqa %%xmm10,%%xmm1\n\t" \
 "punpcklwd %%xmm12,%%xmm6\n\t" \
 "pmaddwd %%xmm13,%%xmm6\n\t" \
 "mov $0x0E3D,%[a]\n\t" \
 "punpckhwd %%xmm12,%%xmm1\n\t" \
 "pmaddwd %%xmm13,%%xmm1\n\t" \
 /*xmm6=(s*21600+0x2800>>18)+s*/ \
 "psrad $18,%%xmm6\n\t" \
 "psrad $18,%%xmm1\n\t" \
 "movd %[a],%%xmm12\n\t" \
 "packssdw %%xmm1,%%xmm6\n\t" \
 "pshufd $00,%%xmm12,%%xmm12\n\t" \
 "paddw %%xmm10,%%xmm6\n\t" \
 /*xmm6=_y[6]=v=(s*21600+0x2800>>18)+s+(s!=0)*/ \
 "mov $0x7FFF54DC,%[a]\n\t" \
 "pcmpeqw %%xmm15,%%xmm10\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "psubw %%xmm14,%%xmm10\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 "paddw %%xmm10,%%xmm6\n\t " \
 /*xmm1,xmm3,xmm10,xmm11 are free.*/ \
 /*xmm11:xmm10=54491*t5'''+0x0E3D*/ \
 "movdqa %%xmm5,%%xmm10\n\t" \
 "movdqa %%xmm5,%%xmm11\n\t" \
 "punpcklwd %%xmm5,%%xmm10\n\t" \
 "pmaddwd %%xmm13,%%xmm10\n\t" \
 "mov $0x8E3A8E3A,%[a]\n\t" \
 "punpckhwd %%xmm5,%%xmm11\n\t" \
 "pmaddwd %%xmm13,%%xmm11\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "paddd %%xmm12,%%xmm10\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 "paddd %%xmm12,%%xmm11\n\t" \
 /*xmm7:xmm12=36410*t6''' \
   xmm1=t6'''*/ \
 "movdqa %%xmm7,%%xmm3\n\t" \
 "movdqa %%xmm7,%%xmm1\n\t" \
 "pmulhw %%xmm13,%%xmm3\n\t" \
 "pmullw %%xmm13,%%xmm7\n\t" \
 "paddw %%xmm1,%%xmm3\n\t" \
 "movdqa %%xmm7,%%xmm12\n\t" \
 "punpckhwd %%xmm3,%%xmm7\n\t" \
 "punpcklwd %%xmm3,%%xmm12\n\t" \
 /*xmm10=u=(54491*t5'''+36410*t6'''+0x0E3D>>16)+(t5'''!=0)*/ \
 "paddd %%xmm12,%%xmm10\n\t" \
 "paddd %%xmm7,%%xmm11\n\t" \
 "psrad $16,%%xmm10\n\t" \
 "pcmpeqw %%xmm15,%%xmm5\n\t" \
 "psrad $16,%%xmm11\n\t" \
 "psubw %%xmm14,%%xmm5\n\t" \
 "packssdw %%xmm11,%%xmm10\n\t" \
 "pxor %%xmm12,%%xmm12\n\t" \
 "paddw %%xmm5,%%xmm10\n\t" \
 /*xmm5=_y[5]=u \
   xmm1=s=t6'''-(36410*u>>16)*/ \
 "psubw %%xmm14,%%xmm12\n\t" \
 "movdqa %%xmm10,%%xmm5\n\t" \
 "mov $0x340067C8,%[a]\n\t" \
 "pmulhw %%xmm13,%%xmm10\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "paddw %%xmm5,%%xmm10\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 "psubw %%xmm10,%%xmm1\n\t" \
 /*xmm11:xmm3=s*26568+0x3400*/ \
 "movdqa %%xmm1,%%xmm3\n\t" \
 "movdqa %%xmm1,%%xmm11\n\t" \
 "punpcklwd %%xmm12,%%xmm3\n\t" \
 "pmaddwd %%xmm13,%%xmm3\n\t" \
 "mov $0x7B1B,%[a]\n\t" \
 "punpckhwd %%xmm12,%%xmm11\n\t" \
 "pmaddwd %%xmm13,%%xmm11\n\t" \
 /*xmm3=(s*26568+0x3400>>17)+s*/ \
 "psrad $17,%%xmm3\n\t" \
 "psrad $17,%%xmm11\n\t" \
 "movd %[a],%%xmm12\n\t" \
 "packssdw %%xmm11,%%xmm3\n\t" \
 "pshufd $00,%%xmm12,%%xmm12\n\t" \
 "paddw %%xmm1,%%xmm3\n\t" \
 /*xmm3=_y[3]=v=(s*26568+0x3400>>17)+s+(s!=0)*/ \
 "mov $0x7FFF7B16,%[a]\n\t" \
 "pcmpeqw %%xmm15,%%xmm1\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "psubw %%xmm14,%%xmm1\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 "paddw %%xmm1,%%xmm3\n\t " \
 /*xmm1,xmm7,xmm10,xmm11 are free.*/ \
 /*xmm11:xmm10=64277*t7''+0x7B1B*/ \
 "movdqa %%xmm9,%%xmm10\n\t" \
 "movdqa %%xmm9,%%xmm11\n\t" \
 "punpcklwd %%xmm9,%%xmm10\n\t" \
 "pmaddwd %%xmm13,%%xmm10\n\t" \
 "mov $0x31F131F1,%[a]\n\t" \
 "punpckhwd %%xmm9,%%xmm11\n\t" \
 "pmaddwd %%xmm13,%%xmm11\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "paddd %%xmm12,%%xmm10\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 "paddd %%xmm12,%%xmm11\n\t" \
 /*xmm12:xmm7=12785*t4''*/ \
 "movdqa %%xmm8,%%xmm7\n\t" \
 "movdqa %%xmm8,%%xmm1\n\t" \
 "pmullw %%xmm13,%%xmm7\n\t" \
 "pmulhw %%xmm13,%%xmm1\n\t" \
 "movdqa %%xmm7,%%xmm12\n\t" \
 "punpcklwd %%xmm1,%%xmm7\n\t" \
 "punpckhwd %%xmm1,%%xmm12\n\t" \
 /*xmm10=u=(12785*t4''+64277*t7''+0x7B1B>>16)+(t7''!=0)*/ \
 "paddd %%xmm7,%%xmm10\n\t" \
 "paddd %%xmm12,%%xmm11\n\t" \
 "psrad $16,%%xmm10\n\t" \
 "pcmpeqw %%xmm15,%%xmm9\n\t" \
 "psrad $16,%%xmm11\n\t" \
 "psubw %%xmm14,%%xmm9\n\t" \
 "packssdw %%xmm11,%%xmm10\n\t" \
 "pxor %%xmm12,%%xmm12\n\t" \
 "paddw %%xmm9,%%xmm10\n\t" \
 /*xmm1=_y[1]=u \
   xmm10=s=(12785*u>>16)-t4''*/ \
 "psubw %%xmm14,%%xmm12\n\t" \
 "movdqa %%xmm10,%%xmm1\n\t" \
 "mov $0x3000503B,%[a]\n\t" \
 "pmulhw %%xmm13,%%xmm10\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "psubw %%xmm8,%%xmm10\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 /*xmm8:xmm7=s*20539+0x3000*/ \
 "movdqa %%xmm10,%%xmm7\n\t" \
 "movdqa %%xmm10,%%xmm8\n\t" \
 "punpcklwd %%xmm12,%%xmm7\n\t" \
 "pmaddwd %%xmm13,%%xmm7\n\t" \
 "punpckhwd %%xmm12,%%xmm8\n\t" \
 "pmaddwd %%xmm13,%%xmm8\n\t" \
 /*xmm7=(s*20539+0x3000>>20)+s*/ \
 "psrad $20,%%xmm7\n\t" \
 "psrad $20,%%xmm8\n\t" \
 "packssdw %%xmm8,%%xmm7\n\t" \
 "paddw %%xmm10,%%xmm7\n\t" \
 /*xmm7=_y[7]=v=(s*20539+0x3000>>20)+s+(s!=0)*/ \
 "pcmpeqw %%xmm15,%%xmm10\n\t" \
 "psubw %%xmm14,%%xmm10\n\t" \
 "paddw %%xmm10,%%xmm7\n\t "

/*SSE2 implementation of the fDCT for x86-64 only.
  Because of the 8 extra XMM registers on x86-64, this version can operate
  without any temporary stack access at all.
  _y: output block of 64 coefficients, in zig-zag order.
  _x: input block of 64 residuals, in raster order.
  NOTE(review): the movdqa loads/stores assume _x (and _y, inside the
  zig-zag store macro) are 16-byte aligned -- confirm callers guarantee
  this; an unaligned pointer would fault.*/
void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
  ptrdiff_t a;
  __asm__ __volatile__(
    /*Load the input.*/
    "movdqa 0x00(%[x]),%%xmm0\n\t"
    "movdqa 0x10(%[x]),%%xmm1\n\t"
    "movdqa 0x20(%[x]),%%xmm2\n\t"
    "movdqa 0x30(%[x]),%%xmm3\n\t"
    "movdqa 0x40(%[x]),%%xmm4\n\t"
    "movdqa 0x50(%[x]),%%xmm5\n\t"
    "movdqa 0x60(%[x]),%%xmm6\n\t"
    "movdqa 0x70(%[x]),%%xmm7\n\t"
    /*Add two extra bits of working precision to improve accuracy; any more
       and we could overflow.*/
    /*We also add a few biases to correct for some systematic error that
       remains in the full fDCT->iDCT round trip.*/
    /*xmm15={0}x8*/
    "pxor %%xmm15,%%xmm15\n\t"
    /*xmm14={-1}x8*/
    "pcmpeqb %%xmm14,%%xmm14\n\t"
    "psllw $2,%%xmm0\n\t"
    /*xmm8=xmm0*/
    "movdqa %%xmm0,%%xmm8\n\t"
    "psllw $2,%%xmm1\n\t"
    /*xmm8={_x[7...0]==0}*/
    "pcmpeqw %%xmm15,%%xmm8\n\t"
    "psllw $2,%%xmm2\n\t"
    /*xmm8={_x[7...0]!=0}*/
    "psubw %%xmm14,%%xmm8\n\t"
    "psllw $2,%%xmm3\n\t"
    /*%[a]=1*/
    "mov $1,%[a]\n\t"
    /*xmm8={_x[6]!=0,0,_x[4]!=0,0,_x[2]!=0,0,_x[0]!=0,0}*/
    "pslld $16,%%xmm8\n\t"
    "psllw $2,%%xmm4\n\t"
    /*xmm9={0,0,0,0,0,0,0,1}*/
    "movd %[a],%%xmm9\n\t"
    /*xmm8={0,0,_x[2]!=0,0,_x[0]!=0,0}*/
    "pshufhw $0x00,%%xmm8,%%xmm8\n\t"
    "psllw $2,%%xmm5\n\t"
    /*%[a]={1}x2*/
    "mov $0x10001,%[a]\n\t"
    /*xmm8={0,0,0,0,0,0,0,_x[0]!=0}*/
    "pshuflw $0x01,%%xmm8,%%xmm8\n\t"
    "psllw $2,%%xmm6\n\t"
    /*xmm10={0,0,0,0,0,0,1,1}*/
    "movd %[a],%%xmm10\n\t"
    /*xmm0=_x[7...0]+{0,0,0,0,0,0,0,_x[0]!=0}*/
    "paddw %%xmm8,%%xmm0\n\t"
    "psllw $2,%%xmm7\n\t"
    /*xmm0=_x[7...0]+{0,0,0,0,0,0,1,(_x[0]!=0)+1}*/
    "paddw %%xmm10,%%xmm0\n\t"
    /*xmm1=_x[15...8]-{0,0,0,0,0,0,0,1}*/
    "psubw %%xmm9,%%xmm1\n\t"
    /*Transform columns.*/
    OC_FDCT_8x8
    /*Transform rows.*/
    OC_TRANSPOSE_8x8
    OC_FDCT_8x8
    /*Round and undo the two extra bits of working precision added above.*/
    /*xmm14={-2,-2,-2,-2,-2,-2,-2,-2}*/
    "paddw %%xmm14,%%xmm14\n\t"
    "psubw %%xmm14,%%xmm0\n\t"
    "psubw %%xmm14,%%xmm1\n\t"
    "psraw $2,%%xmm0\n\t"
    "psubw %%xmm14,%%xmm2\n\t"
    "psraw $2,%%xmm1\n\t"
    "psubw %%xmm14,%%xmm3\n\t"
    "psraw $2,%%xmm2\n\t"
    "psubw %%xmm14,%%xmm4\n\t"
    "psraw $2,%%xmm3\n\t"
    "psubw %%xmm14,%%xmm5\n\t"
    "psraw $2,%%xmm4\n\t"
    "psubw %%xmm14,%%xmm6\n\t"
    "psraw $2,%%xmm5\n\t"
    "psubw %%xmm14,%%xmm7\n\t"
    "psraw $2,%%xmm6\n\t"
    "psraw $2,%%xmm7\n\t"
    /*Transpose, zig-zag, and store the result.*/
    /*We could probably do better using SSSE3's palignr, but re-using the
       MMXEXT version will do for now.*/
#define OC_ZZ_LOAD_ROW_LO(_row,_reg) \
    "movdq2q %%xmm"#_row","_reg"\n\t" \

#define OC_ZZ_LOAD_ROW_HI(_row,_reg) \
    "punpckhqdq %%xmm"#_row",%%xmm"#_row"\n\t" \
    "movdq2q %%xmm"#_row","_reg"\n\t" \

    OC_TRANSPOSE_ZIG_ZAG_MMXEXT
#undef OC_ZZ_LOAD_ROW_LO
#undef OC_ZZ_LOAD_ROW_HI
    :[a]"=&r"(a)
    :[y]"r"(_y),[x]"r"(_x)
    :"memory"
  );
}
#endif