#include "x86enc.h"
#include "x86zigzag.h"
#if defined(OC_X86_ASM)
/*Stage 1 of the 8-point forward DCT butterfly, applied to four 16-bit lanes
   at once.
  Writing tN for the packed words held in mmN on entry, on exit:
   mm0=t0-t7, mm1=t1-t6, mm2=t2-t5, mm3=t3-t4,
   mm7=t0+t7, mm6=t1+t6, mm5=t2+t5, mm4=t3+t4.
  Each sum is formed as 2*b+(a-b), so no scratch register is needed.
  All arithmetic is wrapping 16-bit (psubw/paddw); no memory is touched.
  The final line intentionally has no trailing backslash: a continuation there
   would splice the following preprocessor line into this macro.*/
# define OC_FDCT_STAGE1_8x4 \
 "#OC_FDCT_STAGE1_8x4\n\t" \
 /*Stage 1:*/ \
 /*mm0=t0-t7; mm7 is doubled so the sum can be rebuilt below.*/ \
 "psubw %%mm7,%%mm0\n\t" \
 "paddw %%mm7,%%mm7\n\t" \
 /*mm1=t1-t6*/ \
 "psubw %%mm6,%%mm1\n\t" \
 "paddw %%mm6,%%mm6\n\t" \
 /*mm2=t2-t5*/ \
 "psubw %%mm5,%%mm2\n\t" \
 "paddw %%mm5,%%mm5\n\t" \
 /*mm3=t3-t4*/ \
 "psubw %%mm4,%%mm3\n\t" \
 "paddw %%mm4,%%mm4\n\t" \
 /*mm7=2*t7+(t0-t7)=t0+t7*/ \
 "paddw %%mm0,%%mm7\n\t" \
 /*mm6=t1+t6*/ \
 "paddw %%mm1,%%mm6\n\t" \
 /*mm5=t2+t5*/ \
 "paddw %%mm2,%%mm5\n\t" \
 /*mm4=t3+t4*/ \
 "paddw %%mm3,%%mm4\n\t"
/*Stages 2-4 of an 8-point forward DCT on four 16-bit lanes (MMX).
  The macro expects the stage-1 butterfly to have run already (either
   OC_FDCT_STAGE1_8x4 or an inlined equivalent, as in
   oc_enc_fdct8x8_mmxext()).
  Entry registers, named after the stage-1 outputs:
   mm7=t0' mm6=t1' mm5=t2' mm4=t3' (sums),
   mm3=t4' mm2=t5' mm1=t6' mm0=t7' (differences).
  _r0.._r7 are string literals giving byte offsets from %[y]; the eight
   8-byte slots they name are used as spill space and for part of the output.
  %[a] is a scratch general-purpose register used to build constants.
  On exit (result k is the value associated with slot _rk):
   mm4=result 0 (also in _r0), result 1 is only in _r1, mm6=result 2,
   result 3 is only in _r3, mm0=result 4 (also in _r4),
   mm5=result 5 (also in _r5), mm3=result 6, mm1=result 7 (also in _r7).
  mm2 and mm7 hold scratch values; slots _r2 and _r6 hold stale spills.
  The final line intentionally has no trailing backslash: a continuation there
   would splice the following preprocessor line into this macro.*/
# define OC_FDCT8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) \
 "#OC_FDCT8x4\n\t" \
 /*Stage 2:*/ \
 /*mm7=t3''=t0'-t3' (spilled to _r6)*/ \
 "psubw %%mm4,%%mm7\n\t" \
 "paddw %%mm4,%%mm4\n\t" \
 /*mm6=t2''=t1'-t2' (spilled to _r2)*/ \
 "psubw %%mm5,%%mm6\n\t" \
 "movq %%mm7,"_r6"(%[y])\n\t" \
 "paddw %%mm5,%%mm5\n\t" \
 /*mm1=t5''=t6'-t5'*/ \
 "psubw %%mm2,%%mm1\n\t" \
 "movq %%mm6,"_r2"(%[y])\n\t" \
 /*mm4=t0''=t0'+t3' (spilled to _r0)*/ \
 "paddw %%mm7,%%mm4\n\t" \
 "paddw %%mm2,%%mm2\n\t" \
 /*mm5=t1''=t1'+t2' (spilled to _r4)*/ \
 "movq %%mm4,"_r0"(%[y])\n\t" \
 "paddw %%mm6,%%mm5\n\t" \
 /*mm2=t6''=t6'+t5'*/ \
 "paddw %%mm1,%%mm2\n\t" \
 "movq %%mm5,"_r4"(%[y])\n\t" \
 /*Now mm0=t7', mm1=t5'', mm2=t6'', mm3=t4'.*/ \
 /*mm4, mm5, mm6 and mm7 are free.*/ \
 /*Stage 3:*/ \
 /*mm6={2}x4, mm7={27146,0x5A80}x2 (0x5A80*2=0xB500 is the rounding term)*/ \
 "mov $0x5A806A0A,%[a]\n\t" \
 "pcmpeqb %%mm6,%%mm6\n\t" \
 "movd %[a],%%mm7\n\t" \
 "psrlw $15,%%mm6\n\t" \
 "punpckldq %%mm7,%%mm7\n\t" \
 "paddw %%mm6,%%mm6\n\t" \
 /*mm5:mm4=t5''*27146+0xB500 (words paired with 2 so pmaddwd adds the bias)*/ \
 /*t6'' and t7' are spilled to _r3 and _r7; then mm0=0, mm2={-1}x4.*/ \
 "movq %%mm1,%%mm4\n\t" \
 "movq %%mm1,%%mm5\n\t" \
 "punpcklwd %%mm6,%%mm4\n\t" \
 "movq %%mm2,"_r3"(%[y])\n\t" \
 "pmaddwd %%mm7,%%mm4\n\t" \
 "movq %%mm0,"_r7"(%[y])\n\t" \
 "punpckhwd %%mm6,%%mm5\n\t" \
 "pxor %%mm0,%%mm0\n\t" \
 "pmaddwd %%mm7,%%mm5\n\t" \
 "pcmpeqb %%mm2,%%mm2\n\t" \
 /*mm0=(t5''!=0), computed as -(t5''==0)-(-1); mm1=t5''+(t5''!=0)*/ \
 /*mm4=(t5''*27146+0xB500>>16); mm2=t6'' (reloaded)*/ \
 "pcmpeqw %%mm1,%%mm0\n\t" \
 "psrad $16,%%mm4\n\t" \
 "psubw %%mm2,%%mm0\n\t" \
 "movq "_r3"(%[y]),%%mm2\n\t" \
 "psrad $16,%%mm5\n\t" \
 "paddw %%mm0,%%mm1\n\t" \
 "packssdw %%mm5,%%mm4\n\t" \
 /*mm4=s=(t5''*27146+0xB500>>16)+t5''+(t5''!=0)>>1; mm0=t7' (reloaded)*/ \
 "paddw %%mm1,%%mm4\n\t" \
 "movq "_r7"(%[y]),%%mm0\n\t" \
 "psraw $1,%%mm4\n\t" \
 "movq %%mm3,%%mm1\n\t" \
 /*mm3=t4''=t4'+s*/ \
 "paddw %%mm4,%%mm3\n\t" \
 /*mm1=t5'''=t4'-s*/ \
 "psubw %%mm4,%%mm1\n\t" \
 /*mm5:mm4=t6''*27146+0xB500*/ \
 /*t5''' and t4'' are spilled to _r5 and _r1; then mm1=0, mm3={-1}x4.*/ \
 "movq %%mm2,%%mm4\n\t" \
 "movq %%mm2,%%mm5\n\t" \
 "punpcklwd %%mm6,%%mm4\n\t" \
 "movq %%mm1,"_r5"(%[y])\n\t" \
 "pmaddwd %%mm7,%%mm4\n\t" \
 "movq %%mm3,"_r1"(%[y])\n\t" \
 "punpckhwd %%mm6,%%mm5\n\t" \
 "pxor %%mm1,%%mm1\n\t" \
 "pmaddwd %%mm7,%%mm5\n\t" \
 "pcmpeqb %%mm3,%%mm3\n\t" \
 /*mm2=t6''+(t6''!=0), mm4=(t6''*27146+0xB500>>16)*/ \
 "psrad $16,%%mm4\n\t" \
 "pcmpeqw %%mm2,%%mm1\n\t" \
 "psrad $16,%%mm5\n\t" \
 "psubw %%mm3,%%mm1\n\t" \
 "packssdw %%mm5,%%mm4\n\t" \
 "paddw %%mm1,%%mm2\n\t" \
 /*mm4=s=(t6''*27146+0xB500>>16)+t6''+(t6''!=0)>>1; mm1=t1'' (reloaded)*/ \
 "paddw %%mm2,%%mm4\n\t" \
 "movq "_r4"(%[y]),%%mm1\n\t" \
 "psraw $1,%%mm4\n\t" \
 "movq %%mm0,%%mm2\n\t" \
 /*mm0=t7''=t7'+s*/ \
 "paddw %%mm4,%%mm0\n\t" \
 /*mm2=t6'''=t7'-s*/ \
 "psubw %%mm4,%%mm2\n\t" \
 /*Stage 4:*/ \
 /*mm5:mm4=t1''*27146+0xB500 (mm6={2}x4 and mm7 are unchanged from stage 3)*/ \
 /*t6''' and t7'' are spilled to _r3 and _r7; mm2=t0'' (reloaded), mm0=0.*/ \
 "movq %%mm1,%%mm4\n\t" \
 "movq %%mm1,%%mm5\n\t" \
 "punpcklwd %%mm6,%%mm4\n\t" \
 "movq %%mm2,"_r3"(%[y])\n\t" \
 "pmaddwd %%mm7,%%mm4\n\t" \
 "movq "_r0"(%[y]),%%mm2\n\t" \
 "punpckhwd %%mm6,%%mm5\n\t" \
 "movq %%mm0,"_r7"(%[y])\n\t" \
 "pmaddwd %%mm7,%%mm5\n\t" \
 "pxor %%mm0,%%mm0\n\t" \
 /*mm7={27146,0x2000}x2 (0x2000*2=0x4000)*/ \
 /*mm0=s=(t1''*27146+0xB500>>16)+t1''+(t1''!=0); mm3 is still {-1}x4*/ \
 "psrad $16,%%mm4\n\t" \
 "mov $0x20006A0A,%[a]\n\t" \
 "pcmpeqw %%mm1,%%mm0\n\t" \
 "movd %[a],%%mm7\n\t" \
 "psrad $16,%%mm5\n\t" \
 "psubw %%mm3,%%mm0\n\t" \
 "packssdw %%mm5,%%mm4\n\t" \
 "paddw %%mm1,%%mm0\n\t" \
 "punpckldq %%mm7,%%mm7\n\t" \
 "paddw %%mm4,%%mm0\n\t" \
 /*mm6={0x00000E3D}x2 (dword bias used further below)*/ \
 /*mm1=-(t0''==0), mm5:mm4=t0''*27146+0x4000*/ \
 "movq %%mm2,%%mm4\n\t" \
 "movq %%mm2,%%mm5\n\t" \
 "punpcklwd %%mm6,%%mm4\n\t" \
 "mov $0x0E3D,%[a]\n\t" \
 "pmaddwd %%mm7,%%mm4\n\t" \
 "punpckhwd %%mm6,%%mm5\n\t" \
 "movd %[a],%%mm6\n\t" \
 "pmaddwd %%mm7,%%mm5\n\t" \
 "pxor %%mm1,%%mm1\n\t" \
 "punpckldq %%mm6,%%mm6\n\t" \
 "pcmpeqw %%mm2,%%mm1\n\t" \
 /*mm4=r=(t0''*27146+0x4000>>16)+t0''+(t0''!=0); mm1=t5''' (reloaded)*/ \
 "psrad $16,%%mm4\n\t" \
 "psubw %%mm3,%%mm1\n\t" \
 "psrad $16,%%mm5\n\t" \
 "paddw %%mm1,%%mm2\n\t" \
 "packssdw %%mm5,%%mm4\n\t" \
 "movq "_r5"(%[y]),%%mm1\n\t" \
 "paddw %%mm2,%%mm4\n\t" \
 /*mm2=t6''' (reloaded); mm0=result 0=u=r+s>>1.*/ \
 /*The average is formed as (r&s)+((r^s)>>1), which cannot overflow the*/ \
 /* 16-bit lanes the way a direct r+s could.*/ \
 "movq "_r3"(%[y]),%%mm2\n\t" \
 "movq %%mm0,%%mm7\n\t" \
 "pxor %%mm4,%%mm0\n\t" \
 "pand %%mm4,%%mm7\n\t" \
 "psraw $1,%%mm0\n\t" \
 "mov $0x7FFF54DC,%[a]\n\t" \
 "paddw %%mm7,%%mm0\n\t" \
 "movd %[a],%%mm7\n\t" \
 /*mm7={54491-0x7FFF,0x7FFF}x2*/ \
 /*mm4=result 4=v=r-u (stored to _r4)*/ \
 "psubw %%mm0,%%mm4\n\t" \
 "punpckldq %%mm7,%%mm7\n\t" \
 "movq %%mm4,"_r4"(%[y])\n\t" \
 /*Each word of t5''' is paired with itself, so pmaddwd yields*/ \
 /* t5'''*(21724+32767)=t5'''*54491 without a 16-bit signed overflow.*/ \
 /*mm5:mm4=54491*t5'''+0x0E3D; result 0 is stored to _r0.*/ \
 /*Then mm0=0, mm7={36410}x4, mm1=(t5'''!=0).*/ \
 "movq %%mm1,%%mm4\n\t" \
 "movq %%mm1,%%mm5\n\t" \
 "punpcklwd %%mm1,%%mm4\n\t" \
 "mov $0x8E3A8E3A,%[a]\n\t" \
 "pmaddwd %%mm7,%%mm4\n\t" \
 "movq %%mm0,"_r0"(%[y])\n\t" \
 "punpckhwd %%mm1,%%mm5\n\t" \
 "pxor %%mm0,%%mm0\n\t" \
 "pmaddwd %%mm7,%%mm5\n\t" \
 "pcmpeqw %%mm0,%%mm1\n\t" \
 "movd %[a],%%mm7\n\t" \
 "psubw %%mm3,%%mm1\n\t" \
 "punpckldq %%mm7,%%mm7\n\t" \
 "paddd %%mm6,%%mm4\n\t" \
 "paddd %%mm6,%%mm5\n\t" \
 /*mm3:mm1=36410*t6'''+((t5'''!=0)<<16).*/ \
 /*pmulhw is signed and 0x8E3A reads as 36410-65536, so t6''' is added back*/ \
 /* into the high word to recover the product with 36410.*/ \
 "movq %%mm2,%%mm6\n\t" \
 "movq %%mm2,%%mm3\n\t" \
 "pmulhw %%mm7,%%mm6\n\t" \
 "paddw %%mm2,%%mm1\n\t" \
 "pmullw %%mm7,%%mm3\n\t" \
 "pxor %%mm0,%%mm0\n\t" \
 "paddw %%mm1,%%mm6\n\t" \
 "movq %%mm3,%%mm1\n\t" \
 "punpckhwd %%mm6,%%mm3\n\t" \
 "punpcklwd %%mm6,%%mm1\n\t" \
 /*mm3={-1}x4, mm6={1}x4*/ \
 /*mm4=result 5=u=(54491*t5'''+36410*t6'''+0x0E3D>>16)+(t5'''!=0)*/ \
 "paddd %%mm3,%%mm5\n\t" \
 "paddd %%mm1,%%mm4\n\t" \
 "psrad $16,%%mm5\n\t" \
 "pxor %%mm6,%%mm6\n\t" \
 "psrad $16,%%mm4\n\t" \
 "pcmpeqb %%mm3,%%mm3\n\t" \
 "packssdw %%mm5,%%mm4\n\t" \
 "psubw %%mm3,%%mm6\n\t" \
 /*Result 5 is stored to _r5; mm1=t7'' (reloaded); mm7={26568,0x3400}x2*/ \
 /*mm2=s=t6'''-(36410*u>>16), again correcting the signed pmulhw by +u*/ \
 "movq %%mm4,%%mm1\n\t" \
 "mov $0x340067C8,%[a]\n\t" \
 "pmulhw %%mm7,%%mm4\n\t" \
 "movd %[a],%%mm7\n\t" \
 "movq %%mm1,"_r5"(%[y])\n\t" \
 "punpckldq %%mm7,%%mm7\n\t" \
 "paddw %%mm1,%%mm4\n\t" \
 "movq "_r7"(%[y]),%%mm1\n\t" \
 "psubw %%mm4,%%mm2\n\t" \
 /*mm6={0x00007B1B}x2*/ \
 /*mm0=(s!=0), mm5:mm4=s*26568+0x3400 (words paired with 1)*/ \
 "movq %%mm2,%%mm4\n\t" \
 "movq %%mm2,%%mm5\n\t" \
 "punpcklwd %%mm6,%%mm4\n\t" \
 "pcmpeqw %%mm2,%%mm0\n\t" \
 "pmaddwd %%mm7,%%mm4\n\t" \
 "mov $0x7B1B,%[a]\n\t" \
 "punpckhwd %%mm6,%%mm5\n\t" \
 "movd %[a],%%mm6\n\t" \
 "pmaddwd %%mm7,%%mm5\n\t" \
 "psubw %%mm3,%%mm0\n\t" \
 "punpckldq %%mm6,%%mm6\n\t" \
 /*mm7={64277-0x7FFF,0x7FFF}x2*/ \
 /*mm2=result 3=v=(s*26568+0x3400>>17)+s+(s!=0)*/ \
 "psrad $17,%%mm4\n\t" \
 "paddw %%mm0,%%mm2\n\t" \
 "psrad $17,%%mm5\n\t" \
 "mov $0x7FFF7B16,%[a]\n\t" \
 "packssdw %%mm5,%%mm4\n\t" \
 "movd %[a],%%mm7\n\t" \
 "paddw %%mm4,%%mm2\n\t" \
 "punpckldq %%mm7,%%mm7\n\t" \
 /*Result 3 is stored to _r3; mm2=t4'' (reloaded from _r1)*/ \
 /*mm0=0, mm7={12785}x4, mm1=(t7''!=0), mm5:mm4=64277*t7''+0x7B1B*/ \
 "movq %%mm1,%%mm4\n\t" \
 "movq %%mm1,%%mm5\n\t" \
 "movq %%mm2,"_r3"(%[y])\n\t" \
 "punpcklwd %%mm1,%%mm4\n\t" \
 "movq "_r1"(%[y]),%%mm2\n\t" \
 "pmaddwd %%mm7,%%mm4\n\t" \
 "mov $0x31F131F1,%[a]\n\t" \
 "punpckhwd %%mm1,%%mm5\n\t" \
 "pxor %%mm0,%%mm0\n\t" \
 "pmaddwd %%mm7,%%mm5\n\t" \
 "pcmpeqw %%mm0,%%mm1\n\t" \
 "movd %[a],%%mm7\n\t" \
 "psubw %%mm3,%%mm1\n\t" \
 "punpckldq %%mm7,%%mm7\n\t" \
 "paddd %%mm6,%%mm4\n\t" \
 "paddd %%mm6,%%mm5\n\t" \
 /*mm3:mm1=12785*t4''+((t7''!=0)<<16)*/ \
 /*12785 fits in a signed word, so no high-word correction is needed.*/ \
 "movq %%mm2,%%mm6\n\t" \
 "movq %%mm2,%%mm3\n\t" \
 "pmulhw %%mm7,%%mm6\n\t" \
 "pmullw %%mm7,%%mm3\n\t" \
 "paddw %%mm1,%%mm6\n\t" \
 "movq %%mm3,%%mm1\n\t" \
 "punpckhwd %%mm6,%%mm3\n\t" \
 "punpcklwd %%mm6,%%mm1\n\t" \
 /*mm3={-1}x4, mm6={1}x4*/ \
 /*mm4=result 1=u=(12785*t4''+64277*t7''+0x7B1B>>16)+(t7''!=0)*/ \
 "paddd %%mm3,%%mm5\n\t" \
 "paddd %%mm1,%%mm4\n\t" \
 "psrad $16,%%mm5\n\t" \
 "pxor %%mm6,%%mm6\n\t" \
 "psrad $16,%%mm4\n\t" \
 "pcmpeqb %%mm3,%%mm3\n\t" \
 "packssdw %%mm5,%%mm4\n\t" \
 "psubw %%mm3,%%mm6\n\t" \
 /*Result 1 is stored to _r1; mm1=t3'' (reloaded); mm7={20539,0x3000}x2*/ \
 /*mm4=s=(12785*u>>16)-t4''*/ \
 "movq %%mm4,"_r1"(%[y])\n\t" \
 "pmulhw %%mm7,%%mm4\n\t" \
 "mov $0x3000503B,%[a]\n\t" \
 "movq "_r6"(%[y]),%%mm1\n\t" \
 "movd %[a],%%mm7\n\t" \
 "psubw %%mm2,%%mm4\n\t" \
 "punpckldq %%mm7,%%mm7\n\t" \
 /*mm6={0x00006CB7}x2*/ \
 /*mm0=(s!=0), mm5:mm4=s*20539+0x3000*/ \
 "movq %%mm4,%%mm5\n\t" \
 "movq %%mm4,%%mm2\n\t" \
 "punpcklwd %%mm6,%%mm4\n\t" \
 "pcmpeqw %%mm2,%%mm0\n\t" \
 "pmaddwd %%mm7,%%mm4\n\t" \
 "mov $0x6CB7,%[a]\n\t" \
 "punpckhwd %%mm6,%%mm5\n\t" \
 "movd %[a],%%mm6\n\t" \
 "pmaddwd %%mm7,%%mm5\n\t" \
 "psubw %%mm3,%%mm0\n\t" \
 "punpckldq %%mm6,%%mm6\n\t" \
 /*mm7={60547-0x7FFF,0x7FFF}x2*/ \
 /*mm2=result 7=v=(s*20539+0x3000>>20)+s+(s!=0)*/ \
 "psrad $20,%%mm4\n\t" \
 "paddw %%mm0,%%mm2\n\t" \
 "psrad $20,%%mm5\n\t" \
 "mov $0x7FFF6C84,%[a]\n\t" \
 "packssdw %%mm5,%%mm4\n\t" \
 "movd %[a],%%mm7\n\t" \
 "paddw %%mm4,%%mm2\n\t" \
 "punpckldq %%mm7,%%mm7\n\t" \
 /*Result 7 is stored to _r7; mm2=t2'' (reloaded from _r2)*/ \
 /*mm0=0, mm7={25080}x4, mm1=(t3''!=0), mm5:mm4=60547*t3''+0x6CB7*/ \
 "movq %%mm1,%%mm4\n\t" \
 "movq %%mm1,%%mm5\n\t" \
 "movq %%mm2,"_r7"(%[y])\n\t" \
 "punpcklwd %%mm1,%%mm4\n\t" \
 "movq "_r2"(%[y]),%%mm2\n\t" \
 "pmaddwd %%mm7,%%mm4\n\t" \
 "mov $0x61F861F8,%[a]\n\t" \
 "punpckhwd %%mm1,%%mm5\n\t" \
 "pxor %%mm0,%%mm0\n\t" \
 "pmaddwd %%mm7,%%mm5\n\t" \
 "movd %[a],%%mm7\n\t" \
 "pcmpeqw %%mm0,%%mm1\n\t" \
 "psubw %%mm3,%%mm1\n\t" \
 "punpckldq %%mm7,%%mm7\n\t" \
 "paddd %%mm6,%%mm4\n\t" \
 "paddd %%mm6,%%mm5\n\t" \
 /*mm3:mm1=25080*t2''+((t3''!=0)<<16)*/ \
 "movq %%mm2,%%mm6\n\t" \
 "movq %%mm2,%%mm3\n\t" \
 "pmulhw %%mm7,%%mm6\n\t" \
 "pmullw %%mm7,%%mm3\n\t" \
 "paddw %%mm1,%%mm6\n\t" \
 "movq %%mm3,%%mm1\n\t" \
 "punpckhwd %%mm6,%%mm3\n\t" \
 "punpcklwd %%mm6,%%mm1\n\t" \
 /*mm1={-1}x4*/ \
 /*mm4=u=(25080*t2''+60547*t3''+0x6CB7>>16)+(t3''!=0)*/ \
 "paddd %%mm3,%%mm5\n\t" \
 "paddd %%mm1,%%mm4\n\t" \
 "psrad $16,%%mm5\n\t" \
 "mov $0x28005460,%[a]\n\t" \
 "psrad $16,%%mm4\n\t" \
 "pcmpeqb %%mm1,%%mm1\n\t" \
 "packssdw %%mm5,%%mm4\n\t" \
 /*mm5={1}x4, mm6=result 2=u (kept in a register, not stored)*/ \
 /*mm7={21600,0x2800}x2, mm4=s=(25080*u>>16)-t2''*/ \
 "movq %%mm4,%%mm6\n\t" \
 "pmulhw %%mm7,%%mm4\n\t" \
 "pxor %%mm5,%%mm5\n\t" \
 "movd %[a],%%mm7\n\t" \
 "psubw %%mm1,%%mm5\n\t" \
 "punpckldq %%mm7,%%mm7\n\t" \
 "psubw %%mm2,%%mm4\n\t" \
 /*mm2=s+(s!=0)*/ \
 /*mm4:mm3=s*21600+0x2800 (high pair in mm4, low pair in mm3)*/ \
 "movq %%mm4,%%mm3\n\t" \
 "movq %%mm4,%%mm2\n\t" \
 "punpckhwd %%mm5,%%mm4\n\t" \
 "pcmpeqw %%mm2,%%mm0\n\t" \
 "pmaddwd %%mm7,%%mm4\n\t" \
 "psubw %%mm1,%%mm0\n\t" \
 "punpcklwd %%mm5,%%mm3\n\t" \
 "paddw %%mm0,%%mm2\n\t" \
 "pmaddwd %%mm7,%%mm3\n\t" \
 /*Reload mm0=result 4, mm5=result 5, mm1=result 7, mm4=result 0.*/ \
 /*mm3=result 6=v=(s*21600+0x2800>>18)+s+(s!=0)*/ \
 "movq "_r4"(%[y]),%%mm0\n\t" \
 "psrad $18,%%mm4\n\t" \
 "movq "_r5"(%[y]),%%mm5\n\t" \
 "psrad $18,%%mm3\n\t" \
 "movq "_r7"(%[y]),%%mm1\n\t" \
 "packssdw %%mm4,%%mm3\n\t" \
 "movq "_r0"(%[y]),%%mm4\n\t" \
 "paddw %%mm2,%%mm3\n\t"
/*Transposes two 4x4 blocks of 16-bit words held in the register/memory
   layout that OC_FDCT8x4 leaves behind.
  Inputs for the first block: rows A=mm0, B=mm5, C=mm3, D=mm1.
  Inputs for the second block: rows E=mm4, F=slot _r1, G=mm6, H=slot _r3
   (the two slots are read from %[y]).
  Outputs:
   slot _r4={A0,B0,C0,D0}, mm1={A1,B1,C1,D1}, mm2={A2,B2,C2,D2},
   mm3={A3,B3,C3,D3},
   mm4={E0,F0,G0,H0}, mm5={E1,F1,G1,H1}, mm6={E2,F2,G2,H2},
   mm7={E3,F3,G3,H3}.
  mm0 is clobbered.
  Only _r1, _r3 and _r4 are referenced; the other offset parameters are
   accepted (matching OC_FDCT8x4's parameter list) but unused.
  The final line intentionally has no trailing backslash: a continuation there
   would splice the following source line into this macro.*/
# define OC_TRANSPOSE8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) \
 "#OC_TRANSPOSE8x4\n\t" \
 /*First block, word interleave:*/ \
 /*mm0={A0,B0,A1,B1}, mm2={A2,B2,A3,B3}*/ \
 /*mm3={C0,D0,C1,D1}, mm5={C2,D2,C3,D3}*/ \
 "movq %%mm0,%%mm2\n\t" \
 "punpcklwd %%mm5,%%mm0\n\t" \
 "punpckhwd %%mm5,%%mm2\n\t" \
 "movq %%mm3,%%mm5\n\t" \
 "punpcklwd %%mm1,%%mm3\n\t" \
 "punpckhwd %%mm1,%%mm5\n\t" \
 /*First block, dword interleave:*/ \
 /*column 0 goes to slot _r4; mm1, mm2, mm3=columns 1, 2, 3.*/ \
 /*mm0 and mm5 are then reloaded with rows F and H of the second block.*/ \
 "movq %%mm0,%%mm1\n\t" \
 "punpckldq %%mm3,%%mm0\n\t" \
 "movq %%mm0,"_r4"(%[y])\n\t" \
 "punpckhdq %%mm3,%%mm1\n\t" \
 "movq "_r1"(%[y]),%%mm0\n\t" \
 "movq %%mm2,%%mm3\n\t" \
 "punpckldq %%mm5,%%mm2\n\t" \
 "punpckhdq %%mm5,%%mm3\n\t" \
 "movq "_r3"(%[y]),%%mm5\n\t" \
 /*Second block, word interleave:*/ \
 /*mm4={E0,F0,E1,F1}, mm7={E2,F2,E3,F3}*/ \
 /*mm6={G0,H0,G1,H1}, mm0={G2,H2,G3,H3}*/ \
 "movq %%mm4,%%mm7\n\t" \
 "punpcklwd %%mm0,%%mm4\n\t" \
 "punpckhwd %%mm0,%%mm7\n\t" \
 "movq %%mm6,%%mm0\n\t" \
 "punpcklwd %%mm5,%%mm6\n\t" \
 "punpckhwd %%mm5,%%mm0\n\t" \
 /*Second block, dword interleave:*/ \
 /*mm4, mm5, mm6, mm7=columns 0, 1, 2, 3.*/ \
 "movq %%mm4,%%mm5\n\t" \
 "punpckldq %%mm6,%%mm4\n\t" \
 "punpckhdq %%mm6,%%mm5\n\t" \
 "movq %%mm7,%%mm6\n\t" \
 "punpckhdq %%mm0,%%mm7\n\t" \
 "punpckldq %%mm0,%%mm6\n\t"
/*Forward 8x8 DCT of _x into _y using MMX/MMXEXT inline assembly.
  _y: 64-entry output block; it is also used as scratch space while the
       transform runs (all intermediate spills go through %[y]).
  _x: 64-entry input block, read as eight rows with a 16-byte stride.
  Outline, as visible below:
   1. Two passes over the input, one per 8x4 half (byte offsets 0x00.. and
       0x08..): scale by 4, run the 8-point transform down the rows with four
       lanes in parallel, and transpose the result into _y.
   2. Two passes over the transposed data in _y: run the 8-point transform
       again, round with (v+2)>>2, and store the results to the local buffer
       buf.
   3. OC_TRANSPOSE_ZIG_ZAG_MMXEXT (defined outside this block, presumably in
       x86zigzag.h) consumes buf through the OC_ZZ_LOAD_ROW_* helpers;
       presumably it writes the final zig-zag ordered coefficients to _y.
       TODO confirm against that header.
  NOTE(review): no emms is issued in the code visible here and the MMX
   registers are not listed as clobbers; presumably the caller is expected to
   restore the FPU state. Confirm.*/
void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
  OC_ALIGN8(ogg_int16_t buf[64]);
  ptrdiff_t a;
  __asm__ __volatile__(
    /*Load the left four columns of rows 0..3 and shift every input left by
       two bits (two extra bits of working precision).
      Small biases are folded in while the shifts are scheduled:
       word 0 of row 0 gets +(x[0]!=0), words 0 and 1 of row 0 get +1, and
       word 0 of row 1 gets -1.
      NOTE(review): these look like rounding-bias corrections; the rationale
       is not visible in this file.*/
    "movq 0x00(%[x]),%%mm0\n\t"
    "movq 0x10(%[x]),%%mm1\n\t"
    "movq 0x20(%[x]),%%mm2\n\t"
    "movq 0x30(%[x]),%%mm3\n\t"
    "pcmpeqb %%mm4,%%mm4\n\t"
    "pxor %%mm7,%%mm7\n\t"
    "movq %%mm0,%%mm5\n\t"
    "psllw $2,%%mm0\n\t"
    /*mm5=-(row0==0) per word; subtracting {-1}x4 below turns it into
       (row0!=0).*/
    "pcmpeqw %%mm7,%%mm5\n\t"
    "movq 0x70(%[x]),%%mm7\n\t"
    "psllw $2,%%mm1\n\t"
    "psubw %%mm4,%%mm5\n\t"
    "psllw $2,%%mm2\n\t"
    "mov $1,%[a]\n\t"
    /*The two shifts move word 0 of mm5 up to word 2 and clear the rest;
       the punpckhwd below brings it back to word 0 with zeros elsewhere.*/
    "pslld $16,%%mm5\n\t"
    /*mm6={1,0,0,0}*/
    "movd %[a],%%mm6\n\t"
    "psllq $16,%%mm5\n\t"
    "mov $0x10001,%[a]\n\t"
    "psllw $2,%%mm3\n\t"
    /*mm4={1,1,0,0}*/
    "movd %[a],%%mm4\n\t"
    /*mm5={(x[0]!=0),0,0,0}*/
    "punpckhwd %%mm6,%%mm5\n\t"
    "psubw %%mm6,%%mm1\n\t"
    "movq 0x60(%[x]),%%mm6\n\t"
    "paddw %%mm5,%%mm0\n\t"
    "movq 0x50(%[x]),%%mm5\n\t"
    "paddw %%mm4,%%mm0\n\t"
    "movq 0x40(%[x]),%%mm4\n\t"
    /*Stage 1 of the transform is inlined here (same operations as
       OC_FDCT_STAGE1_8x4), interleaved with the shifts of rows 4..7.*/
    "psllw $2,%%mm7\n\t"
    "psubw %%mm7,%%mm0\n\t"
    "psllw $2,%%mm6\n\t"
    "paddw %%mm7,%%mm7\n\t"
    "psllw $2,%%mm5\n\t"
    "psubw %%mm6,%%mm1\n\t"
    "psllw $2,%%mm4\n\t"
    "paddw %%mm6,%%mm6\n\t"
    "psubw %%mm5,%%mm2\n\t"
    "paddw %%mm5,%%mm5\n\t"
    "psubw %%mm4,%%mm3\n\t"
    "paddw %%mm4,%%mm4\n\t"
    "paddw %%mm0,%%mm7\n\t"
    "paddw %%mm1,%%mm6\n\t"
    "paddw %%mm2,%%mm5\n\t"
    "paddw %%mm3,%%mm4\n\t"
    OC_FDCT8x4("0x00","0x10","0x20","0x30","0x40","0x50","0x60","0x70")
    OC_TRANSPOSE8x4("0x00","0x10","0x20","0x30","0x40","0x50","0x60","0x70")
    /*Store the transposed first half to _y (offset 0x40 was already written
       by the transpose macro) while loading the right four columns.*/
    "movq 0x08(%[x]),%%mm0\n\t"
    "movq %%mm7,0x30(%[y])\n\t"
    "movq 0x78(%[x]),%%mm7\n\t"
    "movq %%mm1,0x50(%[y])\n\t"
    "movq 0x18(%[x]),%%mm1\n\t"
    "movq %%mm6,0x20(%[y])\n\t"
    "movq 0x68(%[x]),%%mm6\n\t"
    "movq %%mm2,0x60(%[y])\n\t"
    "movq 0x28(%[x]),%%mm2\n\t"
    "movq %%mm5,0x10(%[y])\n\t"
    "movq 0x58(%[x]),%%mm5\n\t"
    "movq %%mm3,0x70(%[y])\n\t"
    "movq 0x38(%[x]),%%mm3\n\t"
    /*Scale this half by 4 as well, with stage 1 inlined again; no biases
       are added for these columns.*/
    "psllw $2,%%mm0\n\t"
    "movq %%mm4,0x00(%[y])\n\t"
    "psllw $2,%%mm7\n\t"
    "movq 0x48(%[x]),%%mm4\n\t"
    "psubw %%mm7,%%mm0\n\t"
    "psllw $2,%%mm1\n\t"
    "paddw %%mm7,%%mm7\n\t"
    "psllw $2,%%mm6\n\t"
    "psubw %%mm6,%%mm1\n\t"
    "psllw $2,%%mm2\n\t"
    "paddw %%mm6,%%mm6\n\t"
    "psllw $2,%%mm5\n\t"
    "psubw %%mm5,%%mm2\n\t"
    "psllw $2,%%mm3\n\t"
    "paddw %%mm5,%%mm5\n\t"
    "psllw $2,%%mm4\n\t"
    "psubw %%mm4,%%mm3\n\t"
    "paddw %%mm4,%%mm4\n\t"
    "paddw %%mm0,%%mm7\n\t"
    "paddw %%mm1,%%mm6\n\t"
    "paddw %%mm2,%%mm5\n\t"
    "paddw %%mm3,%%mm4\n\t"
    OC_FDCT8x4("0x08","0x18","0x28","0x38","0x48","0x58","0x68","0x78")
    OC_TRANSPOSE8x4("0x08","0x18","0x28","0x38","0x48","0x58","0x68","0x78")
    /*mm4..mm7 from this transpose are already the second half of the input
       to the next transform, so they stay in registers.
      Only mm1..mm3 are stored (0x48 was written by the macro), and the
       first half is reloaded from 0x00..0x30.*/
    "movq 0x00(%[y]),%%mm0\n\t"
    "movq %%mm1,0x58(%[y])\n\t"
    "movq 0x10(%[y]),%%mm1\n\t"
    "movq %%mm2,0x68(%[y])\n\t"
    "movq 0x20(%[y]),%%mm2\n\t"
    "movq %%mm3,0x78(%[y])\n\t"
    "movq 0x30(%[y]),%%mm3\n\t"
    OC_FDCT_STAGE1_8x4
    OC_FDCT8x4("0x00","0x10","0x20","0x30","0x08","0x18","0x28","0x38")
    /*mm2={-2}x4*/
    "pcmpeqw %%mm2,%%mm2\n\t"
    "paddw %%mm2,%%mm2\n\t"
    /*Round each result as (v+2)>>2 (subtracting -2, then an arithmetic
       shift) and store it to buf without transposing.
      Results 1 and 3 are fetched from their _y slots; the rest are still in
       registers.*/
    "movq 0x10(%[y]),%%mm7\n\t"
    "psubw %%mm2,%%mm4\n\t"
    "psubw %%mm2,%%mm6\n\t"
    "psraw $2,%%mm4\n\t"
    "psubw %%mm2,%%mm0\n\t"
    "movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t"
    "movq 0x30(%[y]),%%mm4\n\t"
    "psraw $2,%%mm6\n\t"
    "psubw %%mm2,%%mm5\n\t"
    "movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t"
    "psraw $2,%%mm0\n\t"
    "psubw %%mm2,%%mm3\n\t"
    "movq %%mm0,"OC_MEM_OFFS(0x40,buf)"\n\t"
    "psraw $2,%%mm5\n\t"
    "psubw %%mm2,%%mm1\n\t"
    "movq %%mm5,"OC_MEM_OFFS(0x50,buf)"\n\t"
    "psraw $2,%%mm3\n\t"
    "psubw %%mm2,%%mm7\n\t"
    "movq %%mm3,"OC_MEM_OFFS(0x60,buf)"\n\t"
    "psraw $2,%%mm1\n\t"
    "psubw %%mm2,%%mm4\n\t"
    "movq %%mm1,"OC_MEM_OFFS(0x70,buf)"\n\t"
    "psraw $2,%%mm7\n\t"
    "movq %%mm7,"OC_MEM_OFFS(0x10,buf)"\n\t"
    "psraw $2,%%mm4\n\t"
    "movq %%mm4,"OC_MEM_OFFS(0x30,buf)"\n\t"
    /*Load the remaining block of transposed data (offsets 0x40..0x78).*/
    "movq 0x40(%[y]),%%mm0\n\t"
    "movq 0x78(%[y]),%%mm7\n\t"
    "movq 0x50(%[y]),%%mm1\n\t"
    "movq 0x68(%[y]),%%mm6\n\t"
    "movq 0x60(%[y]),%%mm2\n\t"
    "movq 0x58(%[y]),%%mm5\n\t"
    "movq 0x70(%[y]),%%mm3\n\t"
    "movq 0x48(%[y]),%%mm4\n\t"
    OC_FDCT_STAGE1_8x4
    OC_FDCT8x4("0x40","0x50","0x60","0x70","0x48","0x58","0x68","0x78")
    /*mm2={-2}x4*/
    "pcmpeqw %%mm2,%%mm2\n\t"
    "paddw %%mm2,%%mm2\n\t"
    /*Round and store as above, into the second 8-byte half of each 16-byte
       row of buf.*/
    "movq 0x50(%[y]),%%mm7\n\t"
    "psubw %%mm2,%%mm4\n\t"
    "psubw %%mm2,%%mm6\n\t"
    "psraw $2,%%mm4\n\t"
    "psubw %%mm2,%%mm0\n\t"
    "movq %%mm4,"OC_MEM_OFFS(0x08,buf)"\n\t"
    "movq 0x70(%[y]),%%mm4\n\t"
    "psraw $2,%%mm6\n\t"
    "psubw %%mm2,%%mm5\n\t"
    "movq %%mm6,"OC_MEM_OFFS(0x28,buf)"\n\t"
    "psraw $2,%%mm0\n\t"
    "psubw %%mm2,%%mm3\n\t"
    "movq %%mm0,"OC_MEM_OFFS(0x48,buf)"\n\t"
    "psraw $2,%%mm5\n\t"
    "psubw %%mm2,%%mm1\n\t"
    "movq %%mm5,"OC_MEM_OFFS(0x58,buf)"\n\t"
    "psraw $2,%%mm3\n\t"
    "psubw %%mm2,%%mm7\n\t"
    "movq %%mm3,"OC_MEM_OFFS(0x68,buf)"\n\t"
    "psraw $2,%%mm1\n\t"
    "psubw %%mm2,%%mm4\n\t"
    "movq %%mm1,"OC_MEM_OFFS(0x78,buf)"\n\t"
    "psraw $2,%%mm7\n\t"
    "movq %%mm7,"OC_MEM_OFFS(0x18,buf)"\n\t"
    "psraw $2,%%mm4\n\t"
    "movq %%mm4,"OC_MEM_OFFS(0x38,buf)"\n\t"
    /*Row loaders for OC_TRANSPOSE_ZIG_ZAG_MMXEXT: fetch the low or high four
       words of row _row of buf into register _reg.
      Neither definition ends in a line continuation; a trailing backslash
       here would splice the next directive (or the macro invocation below)
       into the macro body.*/
#define OC_ZZ_LOAD_ROW_LO(_row,_reg) \
    "movq "OC_MEM_OFFS(16*_row,buf)","_reg"\n\t"
#define OC_ZZ_LOAD_ROW_HI(_row,_reg) \
    "movq "OC_MEM_OFFS(16*_row+8,buf)","_reg"\n\t"
    OC_TRANSPOSE_ZIG_ZAG_MMXEXT
#undef OC_ZZ_LOAD_ROW_LO
#undef OC_ZZ_LOAD_ROW_HI
    /*[a] is an early-clobber scratch register; buf is a memory output.*/
    :[a]"=&r"(a),[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))
    :[y]"r"(_y),[x]"r"(_x)
    /*_y is written through a pointer held in a register, so a memory
       clobber is declared.*/
    :"memory"
  );
}
#endif