Path: blob/master/thirdparty/libtheora/x86_vc/mmxfdct.c
/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.  *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS    *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.      *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2006                *
 * by the Xiph.Org Foundation https://www.xiph.org/                 *
 *                                                                  *
 ********************************************************************/
/*MMX fDCT implementation for x86_32*/
/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
#include "x86enc.h"
#include "x86zigzag.h"

#if defined(OC_X86_ASM)

#define OC_FDCT_STAGE1_8x4 __asm{ \
  /*Stage 1:*/ \
  /*mm0=t7'=t0-t7*/ \
  __asm psubw mm0,mm7 \
  __asm paddw mm7,mm7 \
  /*mm1=t6'=t1-t6*/ \
  __asm psubw mm1,mm6 \
  __asm paddw mm6,mm6 \
  /*mm2=t5'=t2-t5*/ \
  __asm psubw mm2,mm5 \
  __asm paddw mm5,mm5 \
  /*mm3=t4'=t3-t4*/ \
  __asm psubw mm3,mm4 \
  __asm paddw mm4,mm4 \
  /*mm7=t0'=t0+t7*/ \
  __asm paddw mm7,mm0 \
  /*mm6=t1'=t1+t6*/ \
  __asm paddw mm6,mm1 \
  /*mm5=t2'=t2+t5*/ \
  __asm paddw mm5,mm2 \
  /*mm4=t3'=t3+t4*/ \
  __asm paddw mm4,mm3 \
}

#define OC_FDCT8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) __asm{ \
  /*Stage 2:*/ \
  /*mm7=t3''=t0'-t3'*/ \
  __asm psubw mm7,mm4 \
  __asm paddw mm4,mm4 \
  /*mm6=t2''=t1'-t2'*/ \
  __asm psubw mm6,mm5 \
  __asm movq [Y+_r6],mm7 \
  __asm paddw mm5,mm5 \
  /*mm1=t5''=t6'-t5'*/ \
  __asm psubw mm1,mm2 \
  __asm movq [Y+_r2],mm6 \
  /*mm4=t0''=t0'+t3'*/ \
  __asm paddw mm4,mm7 \
  __asm paddw mm2,mm2 \
  /*mm5=t1''=t1'+t2'*/ \
  __asm movq [Y+_r0],mm4 \
  __asm paddw mm5,mm6 \
  /*mm2=t6''=t6'+t5'*/ \
  __asm paddw mm2,mm1 \
  __asm movq [Y+_r4],mm5 \
  /*mm0=t7', mm1=t5'', mm2=t6'', mm3=t4'.*/ \
  /*mm4, mm5, mm6, mm7 are free.*/ \
  /*Stage 3:*/ \
  /*mm6={2}x4, mm7={27146,0xB500>>1}x2*/ \
  __asm mov A,0x5A806A0A \
  __asm pcmpeqb mm6,mm6 \
  __asm movd mm7,A \
  __asm psrlw mm6,15 \
  __asm punpckldq mm7,mm7 \
  __asm paddw mm6,mm6 \
  /*mm0=0, mm2={-1}x4 \
    mm5:mm4=t5''*27146+0xB500*/ \
  __asm movq mm4,mm1 \
  __asm movq mm5,mm1 \
  __asm punpcklwd mm4,mm6 \
  __asm movq [Y+_r3],mm2 \
  __asm pmaddwd mm4,mm7 \
  __asm movq [Y+_r7],mm0 \
  __asm punpckhwd mm5,mm6 \
  __asm pxor mm0,mm0 \
  __asm pmaddwd mm5,mm7 \
  __asm pcmpeqb mm2,mm2 \
  /*mm2=t6'', mm1=t5''+(t5''!=0) \
    mm4=(t5''*27146+0xB500>>16)*/ \
  __asm pcmpeqw mm0,mm1 \
  __asm psrad mm4,16 \
  __asm psubw mm0,mm2 \
  __asm movq mm2,[Y+_r3] \
  __asm psrad mm5,16 \
  __asm paddw mm1,mm0 \
  __asm packssdw mm4,mm5 \
  /*mm4=s=(t5''*27146+0xB500>>16)+t5''+(t5''!=0)>>1*/ \
  __asm paddw mm4,mm1 \
  __asm movq mm0,[Y+_r7] \
  __asm psraw mm4,1 \
  __asm movq mm1,mm3 \
  /*mm3=t4''=t4'+s*/ \
  __asm paddw mm3,mm4 \
  /*mm1=t5'''=t4'-s*/ \
  __asm psubw mm1,mm4 \
  /*mm1=0, mm3={-1}x4 \
    mm5:mm4=t6''*27146+0xB500*/ \
  __asm movq mm4,mm2 \
  __asm movq mm5,mm2 \
  __asm punpcklwd mm4,mm6 \
  __asm movq [Y+_r5],mm1 \
  __asm pmaddwd mm4,mm7 \
  __asm movq [Y+_r1],mm3 \
  __asm punpckhwd mm5,mm6 \
  __asm pxor mm1,mm1 \
  __asm pmaddwd mm5,mm7 \
  __asm pcmpeqb mm3,mm3 \
  /*mm2=t6''+(t6''!=0), mm4=(t6''*27146+0xB500>>16)*/ \
  __asm psrad mm4,16 \
  __asm pcmpeqw mm1,mm2 \
  __asm psrad mm5,16 \
  __asm psubw mm1,mm3 \
  __asm packssdw mm4,mm5 \
  __asm paddw mm2,mm1 \
  /*mm1=t1'' \
    mm4=s=(t6''*27146+0xB500>>16)+t6''+(t6''!=0)>>1*/ \
  __asm paddw mm4,mm2 \
  __asm movq mm1,[Y+_r4] \
  __asm psraw mm4,1 \
  __asm movq mm2,mm0 \
  /*mm7={54491-0x7FFF,0x7FFF}x2 \
    mm0=t7''=t7'+s*/ \
  __asm paddw mm0,mm4 \
  /*mm2=t6'''=t7'-s*/ \
  __asm psubw mm2,mm4 \
  /*Stage 4:*/ \
  /*mm0=0, mm2=t0'' \
    mm5:mm4=t1''*27146+0xB500*/ \
  __asm movq mm4,mm1 \
  __asm movq mm5,mm1 \
  __asm punpcklwd mm4,mm6 \
  __asm movq [Y+_r3],mm2 \
  __asm pmaddwd mm4,mm7 \
  __asm movq mm2,[Y+_r0] \
  __asm punpckhwd mm5,mm6 \
  __asm movq [Y+_r7],mm0 \
  __asm pmaddwd mm5,mm7 \
  __asm pxor mm0,mm0 \
  /*mm7={27146,0x4000>>1}x2 \
    mm0=s=(t1''*27146+0xB500>>16)+t1''+(t1''!=0)*/ \
  __asm psrad mm4,16 \
  __asm mov A,0x20006A0A \
  __asm pcmpeqw mm0,mm1 \
  __asm movd mm7,A \
  __asm psrad mm5,16 \
  __asm psubw mm0,mm3 \
  __asm packssdw mm4,mm5 \
  __asm paddw mm0,mm1 \
  __asm punpckldq mm7,mm7 \
  __asm paddw mm0,mm4 \
  /*mm6={0x00000E3D}x2 \
    mm1=-(t0''==0), mm5:mm4=t0''*27146+0x4000*/ \
  __asm movq mm4,mm2 \
  __asm movq mm5,mm2 \
  __asm punpcklwd mm4,mm6 \
  __asm mov A,0x0E3D \
  __asm pmaddwd mm4,mm7 \
  __asm punpckhwd mm5,mm6 \
  __asm movd mm6,A \
  __asm pmaddwd mm5,mm7 \
  __asm pxor mm1,mm1 \
  __asm punpckldq mm6,mm6 \
  __asm pcmpeqw mm1,mm2 \
  /*mm4=r=(t0''*27146+0x4000>>16)+t0''+(t0''!=0)*/ \
  __asm psrad mm4,16 \
  __asm psubw mm1,mm3 \
  __asm psrad mm5,16 \
  __asm paddw mm2,mm1 \
  __asm packssdw mm4,mm5 \
  __asm movq mm1,[Y+_r5] \
  __asm paddw mm4,mm2 \
  /*mm2=t6'', mm0=_y[0]=u=r+s>>1 \
    The naive implementation could cause overflow, so we use \
    u=(r&s)+((r^s)>>1).*/ \
  __asm movq mm2,[Y+_r3] \
  __asm movq mm7,mm0 \
  __asm pxor mm0,mm4 \
  __asm pand mm7,mm4 \
  __asm psraw mm0,1 \
  __asm mov A,0x7FFF54DC \
  __asm paddw mm0,mm7 \
  __asm movd mm7,A \
  /*mm7={54491-0x7FFF,0x7FFF}x2 \
    mm4=_y[4]=v=r-u*/ \
  __asm psubw mm4,mm0 \
  __asm punpckldq mm7,mm7 \
  __asm movq [Y+_r4],mm4 \
  /*mm0=0, mm7={36410}x4 \
    mm1=(t5'''!=0), mm5:mm4=54491*t5'''+0x0E3D*/ \
  __asm movq mm4,mm1 \
  __asm movq mm5,mm1 \
  __asm punpcklwd mm4,mm1 \
  __asm mov A,0x8E3A8E3A \
  __asm pmaddwd mm4,mm7 \
  __asm movq [Y+_r0],mm0 \
  __asm punpckhwd mm5,mm1 \
  __asm pxor mm0,mm0 \
  __asm pmaddwd mm5,mm7 \
  __asm pcmpeqw mm1,mm0 \
  __asm movd mm7,A \
  __asm psubw mm1,mm3 \
  __asm punpckldq mm7,mm7 \
  __asm paddd mm4,mm6 \
  __asm paddd mm5,mm6 \
  /*mm0=0 \
    mm3:mm1=36410*t6'''+((t5'''!=0)<<16)*/ \
  __asm movq mm6,mm2 \
  __asm movq mm3,mm2 \
  __asm pmulhw mm6,mm7 \
  __asm paddw mm1,mm2 \
  __asm pmullw mm3,mm7 \
  __asm pxor mm0,mm0 \
  __asm paddw mm6,mm1 \
  __asm movq mm1,mm3 \
  __asm punpckhwd mm3,mm6 \
  __asm punpcklwd mm1,mm6 \
  /*mm3={-1}x4, mm6={1}x4 \
    mm4=_y[5]=u=(54491*t5'''+36410*t6'''+0x0E3D>>16)+(t5'''!=0)*/ \
  __asm paddd mm5,mm3 \
  __asm paddd mm4,mm1 \
  __asm psrad mm5,16 \
  __asm pxor mm6,mm6 \
  __asm psrad mm4,16 \
  __asm pcmpeqb mm3,mm3 \
  __asm packssdw mm4,mm5 \
  __asm psubw mm6,mm3 \
  /*mm1=t7'', mm7={26568,0x3400}x2 \
    mm2=s=t6'''-(36410*u>>16)*/ \
  __asm movq mm1,mm4 \
  __asm mov A,0x340067C8 \
  __asm pmulhw mm4,mm7 \
  __asm movd mm7,A \
  __asm movq [Y+_r5],mm1 \
  __asm punpckldq mm7,mm7 \
  __asm paddw mm4,mm1 \
  __asm movq mm1,[Y+_r7] \
  __asm psubw mm2,mm4 \
  /*mm6={0x00007B1B}x2 \
    mm0=(s!=0), mm5:mm4=s*26568+0x3400*/ \
  __asm movq mm4,mm2 \
  __asm movq mm5,mm2 \
  __asm punpcklwd mm4,mm6 \
  __asm pcmpeqw mm0,mm2 \
  __asm pmaddwd mm4,mm7 \
  __asm mov A,0x7B1B \
  __asm punpckhwd mm5,mm6 \
  __asm movd mm6,A \
  __asm pmaddwd mm5,mm7 \
  __asm psubw mm0,mm3 \
  __asm punpckldq mm6,mm6 \
  /*mm7={64277-0x7FFF,0x7FFF}x2 \
    mm2=_y[3]=v=(s*26568+0x3400>>17)+s+(s!=0)*/ \
  __asm psrad mm4,17 \
  __asm paddw mm2,mm0 \
  __asm psrad mm5,17 \
  __asm mov A,0x7FFF7B16 \
  __asm packssdw mm4,mm5 \
  __asm movd mm7,A \
  __asm paddw mm2,mm4 \
  __asm punpckldq mm7,mm7 \
  /*mm0=0, mm7={12785}x4 \
    mm1=(t7''!=0), mm2=t4'', mm5:mm4=64277*t7''+0x7B1B*/ \
  __asm movq mm4,mm1 \
  __asm movq mm5,mm1 \
  __asm movq [Y+_r3],mm2 \
  __asm punpcklwd mm4,mm1 \
  __asm movq mm2,[Y+_r1] \
  __asm pmaddwd mm4,mm7 \
  __asm mov A,0x31F131F1 \
  __asm punpckhwd mm5,mm1 \
  __asm pxor mm0,mm0 \
  __asm pmaddwd mm5,mm7 \
  __asm pcmpeqw mm1,mm0 \
  __asm movd mm7,A \
  __asm psubw mm1,mm3 \
  __asm punpckldq mm7,mm7 \
  __asm paddd mm4,mm6 \
  __asm paddd mm5,mm6 \
  /*mm3:mm1=12785*t4'''+((t7''!=0)<<16)*/ \
  __asm movq mm6,mm2 \
  __asm movq mm3,mm2 \
  __asm pmulhw mm6,mm7 \
  __asm pmullw mm3,mm7 \
  __asm paddw mm6,mm1 \
  __asm movq mm1,mm3 \
  __asm punpckhwd mm3,mm6 \
  __asm punpcklwd mm1,mm6 \
  /*mm3={-1}x4, mm6={1}x4 \
    mm4=_y[1]=u=(12785*t4'''+64277*t7''+0x7B1B>>16)+(t7''!=0)*/ \
  __asm paddd mm5,mm3 \
  __asm paddd mm4,mm1 \
  __asm psrad mm5,16 \
  __asm pxor mm6,mm6 \
  __asm psrad mm4,16 \
  __asm pcmpeqb mm3,mm3 \
  __asm packssdw mm4,mm5 \
  __asm psubw mm6,mm3 \
  /*mm1=t3'', mm7={20539,0x3000}x2 \
    mm4=s=(12785*u>>16)-t4''*/ \
  __asm movq [Y+_r1],mm4 \
  __asm pmulhw mm4,mm7 \
  __asm mov A,0x3000503B \
  __asm movq mm1,[Y+_r6] \
  __asm movd mm7,A \
  __asm psubw mm4,mm2 \
  __asm punpckldq mm7,mm7 \
  /*mm6={0x00006CB7}x2 \
    mm0=(s!=0), mm5:mm4=s*20539+0x3000*/ \
  __asm movq mm5,mm4 \
  __asm movq mm2,mm4 \
  __asm punpcklwd mm4,mm6 \
  __asm pcmpeqw mm0,mm2 \
  __asm pmaddwd mm4,mm7 \
  __asm mov A,0x6CB7 \
  __asm punpckhwd mm5,mm6 \
  __asm movd mm6,A \
  __asm pmaddwd mm5,mm7 \
  __asm psubw mm0,mm3 \
  __asm punpckldq mm6,mm6 \
  /*mm7={60547-0x7FFF,0x7FFF}x2 \
    mm2=_y[7]=v=(s*20539+0x3000>>20)+s+(s!=0)*/ \
  __asm psrad mm4,20 \
  __asm paddw mm2,mm0 \
  __asm psrad mm5,20 \
  __asm mov A,0x7FFF6C84 \
  __asm packssdw mm4,mm5 \
  __asm movd mm7,A \
  __asm paddw mm2,mm4 \
  __asm punpckldq mm7,mm7 \
  /*mm0=0, mm7={25080}x4 \
    mm2=t2'', mm5:mm4=60547*t3''+0x6CB7*/ \
  __asm movq mm4,mm1 \
  __asm movq mm5,mm1 \
  __asm movq [Y+_r7],mm2 \
  __asm punpcklwd mm4,mm1 \
  __asm movq mm2,[Y+_r2] \
  __asm pmaddwd mm4,mm7 \
  __asm mov A,0x61F861F8 \
  __asm punpckhwd mm5,mm1 \
  __asm pxor mm0,mm0 \
  __asm pmaddwd mm5,mm7 \
  __asm movd mm7,A \
  __asm pcmpeqw mm1,mm0 \
  __asm psubw mm1,mm3 \
  __asm punpckldq mm7,mm7 \
  __asm paddd mm4,mm6 \
  __asm paddd mm5,mm6 \
  /*mm3:mm1=25080*t2''+((t3''!=0)<<16)*/ \
  __asm movq mm6,mm2 \
  __asm movq mm3,mm2 \
  __asm pmulhw mm6,mm7 \
  __asm pmullw mm3,mm7 \
  __asm paddw mm6,mm1 \
  __asm movq mm1,mm3 \
  __asm punpckhwd mm3,mm6 \
  __asm punpcklwd mm1,mm6 \
  /*mm1={-1}x4 \
    mm4=u=(25080*t2''+60547*t3''+0x6CB7>>16)+(t3''!=0)*/ \
  __asm paddd mm5,mm3 \
  __asm paddd mm4,mm1 \
  __asm psrad mm5,16 \
  __asm mov A,0x28005460 \
  __asm psrad mm4,16 \
  __asm pcmpeqb mm1,mm1 \
  __asm packssdw mm4,mm5 \
  /*mm5={1}x4, mm6=_y[2]=u, mm7={21600,0x2800}x2 \
    mm4=s=(25080*u>>16)-t2''*/ \
  __asm movq mm6,mm4 \
  __asm pmulhw mm4,mm7 \
  __asm pxor mm5,mm5 \
  __asm movd mm7,A \
  __asm psubw mm5,mm1 \
  __asm punpckldq mm7,mm7 \
  __asm psubw mm4,mm2 \
  /*mm2=s+(s!=0) \
    mm4:mm3=s*21600+0x2800*/ \
  __asm movq mm3,mm4 \
  __asm movq mm2,mm4 \
  __asm punpckhwd mm4,mm5 \
  __asm pcmpeqw mm0,mm2 \
  __asm pmaddwd mm4,mm7 \
  __asm psubw mm0,mm1 \
  __asm punpcklwd mm3,mm5 \
  __asm paddw mm2,mm0 \
  __asm pmaddwd mm3,mm7 \
  /*mm0=_y[4], mm1=_y[7], mm4=_y[0], mm5=_y[5] \
    mm3=_y[6]=v=(s*21600+0x2800>>18)+s+(s!=0)*/ \
  __asm movq mm0,[Y+_r4] \
  __asm psrad mm4,18 \
  __asm movq mm5,[Y+_r5] \
  __asm psrad mm3,18 \
  __asm movq mm1,[Y+_r7] \
  __asm packssdw mm3,mm4 \
  __asm movq mm4,[Y+_r0] \
  __asm paddw mm3,mm2 \
}

/*On input, mm4=_y[0], mm6=_y[2], mm0=_y[4], mm5=_y[5], mm3=_y[6], mm1=_y[7].
  On output, {_y[4],mm1,mm2,mm3} contains the transpose of _y[4...7] and
   {mm4,mm5,mm6,mm7} contains the transpose of _y[0...3].*/
#define OC_TRANSPOSE8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) __asm{ \
  /*First 4x4 transpose:*/ \
  /*mm0 = e3 e2 e1 e0 \
    mm5 = f3 f2 f1 f0 \
    mm3 = g3 g2 g1 g0 \
    mm1 = h3 h2 h1 h0*/ \
  __asm movq mm2,mm0 \
  __asm punpcklwd mm0,mm5 \
  __asm punpckhwd mm2,mm5 \
  __asm movq mm5,mm3 \
  __asm punpcklwd mm3,mm1 \
  __asm punpckhwd mm5,mm1 \
  /*mm0 = f1 e1 f0 e0 \
    mm2 = f3 e3 f2 e2 \
    mm3 = h1 g1 h0 g0 \
    mm5 = h3 g3 h2 g2*/ \
  __asm movq mm1,mm0 \
  __asm punpckldq mm0,mm3 \
  __asm movq [Y+_r4],mm0 \
  __asm punpckhdq mm1,mm3 \
  __asm movq mm0,[Y+_r1] \
  __asm movq mm3,mm2 \
  __asm punpckldq mm2,mm5 \
  __asm punpckhdq mm3,mm5 \
  __asm movq mm5,[Y+_r3] \
  /*_y[4] = h0 g0 f0 e0 \
     mm1 = h1 g1 f1 e1 \
     mm2 = h2 g2 f2 e2 \
     mm3 = h3 g3 f3 e3*/ \
  /*Second 4x4 transpose:*/ \
  /*mm4 = a3 a2 a1 a0 \
    mm0 = b3 b2 b1 b0 \
    mm6 = c3 c2 c1 c0 \
    mm5 = d3 d2 d1 d0*/ \
  __asm movq mm7,mm4 \
  __asm punpcklwd mm4,mm0 \
  __asm punpckhwd mm7,mm0 \
  __asm movq mm0,mm6 \
  __asm punpcklwd mm6,mm5 \
  __asm punpckhwd mm0,mm5 \
  /*mm4 = b1 a1 b0 a0 \
    mm7 = b3 a3 b2 a2 \
    mm6 = d1 c1 d0 c0 \
    mm0 = d3 c3 d2 c2*/ \
  __asm movq mm5,mm4 \
  __asm punpckldq mm4,mm6 \
  __asm punpckhdq mm5,mm6 \
  __asm movq mm6,mm7 \
  __asm punpckhdq mm7,mm0 \
  __asm punpckldq mm6,mm0 \
  /*mm4 = d0 c0 b0 a0 \
    mm5 = d1 c1 b1 a1 \
    mm6 = d2 c2 b2 a2 \
    mm7 = d3 c3 b3 a3*/ \
}

/*MMX implementation of the fDCT.*/
void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
  OC_ALIGN8(ogg_int16_t buf[64]);
  ogg_int16_t *bufp;
  bufp=buf;
  __asm{
#define X edx
#define Y eax
#define A ecx
#define BUF esi
    /*Add two extra bits of working precision to improve accuracy; any more and
       we could overflow.*/
    /*We also add biases to correct for some systematic error that remains in
       the full fDCT->iDCT round trip.*/
    mov X,_x
    mov Y,_y
    mov BUF,bufp
    movq mm0,[0x00+X]
    movq mm1,[0x10+X]
    movq mm2,[0x20+X]
    movq mm3,[0x30+X]
    pcmpeqb mm4,mm4
    pxor mm7,mm7
    movq mm5,mm0
    psllw mm0,2
    pcmpeqw mm5,mm7
    movq mm7,[0x70+X]
    psllw mm1,2
    psubw mm5,mm4
    psllw mm2,2
    mov A,1
    pslld mm5,16
    movd mm6,A
    psllq mm5,16
    mov A,0x10001
    psllw mm3,2
    movd mm4,A
    punpckhwd mm5,mm6
    psubw mm1,mm6
    movq mm6,[0x60+X]
    paddw mm0,mm5
    movq mm5,[0x50+X]
    paddw mm0,mm4
    movq mm4,[0x40+X]
    /*We inline stage1 of the transform here so we can get better instruction
       scheduling with the shifts.*/
    /*mm0=t7'=t0-t7*/
    psllw mm7,2
    psubw mm0,mm7
    psllw mm6,2
    paddw mm7,mm7
    /*mm1=t6'=t1-t6*/
    psllw mm5,2
    psubw mm1,mm6
    psllw mm4,2
    paddw mm6,mm6
    /*mm2=t5'=t2-t5*/
    psubw mm2,mm5
    paddw mm5,mm5
    /*mm3=t4'=t3-t4*/
    psubw mm3,mm4
    paddw mm4,mm4
    /*mm7=t0'=t0+t7*/
    paddw mm7,mm0
    /*mm6=t1'=t1+t6*/
    paddw mm6,mm1
    /*mm5=t2'=t2+t5*/
    paddw mm5,mm2
    /*mm4=t3'=t3+t4*/
    paddw mm4,mm3
    OC_FDCT8x4(0x00,0x10,0x20,0x30,0x40,0x50,0x60,0x70)
    OC_TRANSPOSE8x4(0x00,0x10,0x20,0x30,0x40,0x50,0x60,0x70)
    /*Swap out this 8x4 block for the next one.*/
    movq mm0,[0x08+X]
    movq [0x30+Y],mm7
    movq mm7,[0x78+X]
    movq [0x50+Y],mm1
    movq mm1,[0x18+X]
    movq [0x20+Y],mm6
    movq mm6,[0x68+X]
    movq [0x60+Y],mm2
    movq mm2,[0x28+X]
    movq [0x10+Y],mm5
    movq mm5,[0x58+X]
    movq [0x70+Y],mm3
    movq mm3,[0x38+X]
    /*And increase its working precision, too.*/
    psllw mm0,2
    movq [0x00+Y],mm4
    psllw mm7,2
    movq mm4,[0x48+X]
    /*We inline stage1 of the transform here so we can get better instruction
       scheduling with the shifts.*/
    /*mm0=t7'=t0-t7*/
    psubw mm0,mm7
    psllw mm1,2
    paddw mm7,mm7
    psllw mm6,2
    /*mm1=t6'=t1-t6*/
    psubw mm1,mm6
    psllw mm2,2
    paddw mm6,mm6
    psllw mm5,2
    /*mm2=t5'=t2-t5*/
    psubw mm2,mm5
    psllw mm3,2
    paddw mm5,mm5
    psllw mm4,2
    /*mm3=t4'=t3-t4*/
    psubw mm3,mm4
    paddw mm4,mm4
    /*mm7=t0'=t0+t7*/
    paddw mm7,mm0
    /*mm6=t1'=t1+t6*/
    paddw mm6,mm1
    /*mm5=t2'=t2+t5*/
    paddw mm5,mm2
    /*mm4=t3'=t3+t4*/
    paddw mm4,mm3
    OC_FDCT8x4(0x08,0x18,0x28,0x38,0x48,0x58,0x68,0x78)
    OC_TRANSPOSE8x4(0x08,0x18,0x28,0x38,0x48,0x58,0x68,0x78)
    /*Here the first 4x4 block of output from the last transpose is the second
       4x4 block of input for the next transform.
      We have cleverly arranged that it already be in the appropriate place,
       so we only have to do half the stores and loads.*/
    movq mm0,[0x00+Y]
    movq [0x58+Y],mm1
    movq mm1,[0x10+Y]
    movq [0x68+Y],mm2
    movq mm2,[0x20+Y]
    movq [0x78+Y],mm3
    movq mm3,[0x30+Y]
    OC_FDCT_STAGE1_8x4
    OC_FDCT8x4(0x00,0x10,0x20,0x30,0x08,0x18,0x28,0x38)
    /*mm0={-2}x4*/
    pcmpeqw mm2,mm2
    paddw mm2,mm2
    /*Round and store the results (no transpose).*/
    movq mm7,[Y+0x10]
    psubw mm4,mm2
    psubw mm6,mm2
    psraw mm4,2
    psubw mm0,mm2
    movq [BUF+0x00],mm4
    movq mm4,[Y+0x30]
    psraw mm6,2
    psubw mm5,mm2
    movq [BUF+0x20],mm6
    psraw mm0,2
    psubw mm3,mm2
    movq [BUF+0x40],mm0
    psraw mm5,2
    psubw mm1,mm2
    movq [BUF+0x50],mm5
    psraw mm3,2
    psubw mm7,mm2
    movq [BUF+0x60],mm3
    psraw mm1,2
    psubw mm4,mm2
    movq [BUF+0x70],mm1
    psraw mm7,2
    movq [BUF+0x10],mm7
    psraw mm4,2
    movq [BUF+0x30],mm4
    /*Load the next block.*/
    movq mm0,[0x40+Y]
    movq mm7,[0x78+Y]
    movq mm1,[0x50+Y]
    movq mm6,[0x68+Y]
    movq mm2,[0x60+Y]
    movq mm5,[0x58+Y]
    movq mm3,[0x70+Y]
    movq mm4,[0x48+Y]
    OC_FDCT_STAGE1_8x4
    OC_FDCT8x4(0x40,0x50,0x60,0x70,0x48,0x58,0x68,0x78)
    /*mm0={-2}x4*/
    pcmpeqw mm2,mm2
    paddw mm2,mm2
    /*Round and store the results (no transpose).*/
    movq mm7,[Y+0x50]
    psubw mm4,mm2
    psubw mm6,mm2
    psraw mm4,2
    psubw mm0,mm2
    movq [BUF+0x08],mm4
    movq mm4,[Y+0x70]
    psraw mm6,2
    psubw mm5,mm2
    movq [BUF+0x28],mm6
    psraw mm0,2
    psubw mm3,mm2
    movq [BUF+0x48],mm0
    psraw mm5,2
    psubw mm1,mm2
    movq [BUF+0x58],mm5
    psraw mm3,2
    psubw mm7,mm2
    movq [BUF+0x68],mm3
    psraw mm1,2
    psubw mm4,mm2
    movq [BUF+0x78],mm1
    psraw mm7,2
    movq [BUF+0x18],mm7
    psraw mm4,2
    movq [BUF+0x38],mm4
#define OC_ZZ_LOAD_ROW_LO(_row,_reg) \
    __asm movq _reg,[BUF+16*(_row)] \

#define OC_ZZ_LOAD_ROW_HI(_row,_reg) \
    __asm movq _reg,[BUF+16*(_row)+8] \

    OC_TRANSPOSE_ZIG_ZAG_MMXEXT
#undef OC_ZZ_LOAD_ROW_LO
#undef OC_ZZ_LOAD_ROW_HI
#undef X
#undef Y
#undef A
#undef BUF
  }
}

#endif
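
/*Editorial note, not part of the upstream libtheora sources: a minimal scalar
   sketch of the overflow-safe average used for _y[0] in OC_FDCT8x4 above.
  The MMX code computes u=r+s>>1 as u=(r&s)+((r^s)>>1), relying on the
   identity r+s==2*(r&s)+(r^s): the halved sum is formed without ever
   materializing the 17-bit intermediate r+s.
  The function name below is illustrative only, and the expression assumes an
   arithmetic right shift of negative values, which is what psraw provides.

  static ogg_int16_t oc_avg_no_overflow(ogg_int16_t _r,ogg_int16_t _s){
    return (ogg_int16_t)((_r&_s)+((_r^_s)>>1));
  }
*/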