/*Path: blob/master/thirdparty/libtheora/x86_vc/mmxidct.c
  9906 views*/
/********************************************************************1* *2* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *3* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *4* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *5* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *6* *7* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *8* by the Xiph.Org Foundation and contributors *9* https://www.xiph.org/ *10* *11********************************************************************1213function:1415********************************************************************/1617/*MMX acceleration of Theora's iDCT.18Originally written by Rudolf Marek, based on code from On2's VP3.*/19#include "x86int.h"20#include "../dct.h"2122#if defined(OC_X86_ASM)2324/*These are offsets into the table of constants below.*/25/*7 rows of cosines, in order: pi/16 * (1 ... 7).*/26#define OC_COSINE_OFFSET (8)27/*A row of 8's.*/28#define OC_EIGHT_OFFSET (0)29303132/*A table of constants used by the MMX routines.*/33static const OC_ALIGN16(ogg_uint16_t) OC_IDCT_CONSTS[(1+7)*4]={348, 8, 8, 8,35(ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,36(ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,37(ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,38(ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,39(ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,40(ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,41(ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,42(ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,43(ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,44(ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,45(ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,46(ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,47(ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,48(ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S149};5051/*38 cycles*/52#define OC_IDCT_BEGIN(_y,_x) __asm{ \53__asm movq mm2,OC_I(3,_x) \54__asm movq mm6,OC_C(3) \55__asm movq mm4,mm2 \56__asm movq mm7,OC_J(5,_x) \57__asm pmulhw mm4,mm6 \58__asm movq mm1,OC_C(5) 
\59__asm pmulhw mm6,mm7 \60__asm movq mm5,mm1 \61__asm pmulhw mm1,mm2 \62__asm movq mm3,OC_I(1,_x) \63__asm pmulhw mm5,mm7 \64__asm movq mm0,OC_C(1) \65__asm paddw mm4,mm2 \66__asm paddw mm6,mm7 \67__asm paddw mm2,mm1 \68__asm movq mm1,OC_J(7,_x) \69__asm paddw mm7,mm5 \70__asm movq mm5,mm0 \71__asm pmulhw mm0,mm3 \72__asm paddw mm4,mm7 \73__asm pmulhw mm5,mm1 \74__asm movq mm7,OC_C(7) \75__asm psubw mm6,mm2 \76__asm paddw mm0,mm3 \77__asm pmulhw mm3,mm7 \78__asm movq mm2,OC_I(2,_x) \79__asm pmulhw mm7,mm1 \80__asm paddw mm5,mm1 \81__asm movq mm1,mm2 \82__asm pmulhw mm2,OC_C(2) \83__asm psubw mm3,mm5 \84__asm movq mm5,OC_J(6,_x) \85__asm paddw mm0,mm7 \86__asm movq mm7,mm5 \87__asm psubw mm0,mm4 \88__asm pmulhw mm5,OC_C(2) \89__asm paddw mm2,mm1 \90__asm pmulhw mm1,OC_C(6) \91__asm paddw mm4,mm4 \92__asm paddw mm4,mm0 \93__asm psubw mm3,mm6 \94__asm paddw mm5,mm7 \95__asm paddw mm6,mm6 \96__asm pmulhw mm7,OC_C(6) \97__asm paddw mm6,mm3 \98__asm movq OC_I(1,_y),mm4 \99__asm psubw mm1,mm5 \100__asm movq mm4,OC_C(4) \101__asm movq mm5,mm3 \102__asm pmulhw mm3,mm4 \103__asm paddw mm7,mm2 \104__asm movq OC_I(2,_y),mm6 \105__asm movq mm2,mm0 \106__asm movq mm6,OC_I(0,_x) \107__asm pmulhw mm0,mm4 \108__asm paddw mm5,mm3 \109__asm movq mm3,OC_J(4,_x) \110__asm psubw mm5,mm1 \111__asm paddw mm2,mm0 \112__asm psubw mm6,mm3 \113__asm movq mm0,mm6 \114__asm pmulhw mm6,mm4 \115__asm paddw mm3,mm3 \116__asm paddw mm1,mm1 \117__asm paddw mm3,mm0 \118__asm paddw mm1,mm5 \119__asm pmulhw mm4,mm3 \120__asm paddw mm6,mm0 \121__asm psubw mm6,mm2 \122__asm paddw mm2,mm2 \123__asm movq mm0,OC_I(1,_y) \124__asm paddw mm2,mm6 \125__asm paddw mm4,mm3 \126__asm psubw mm2,mm1 \127}128129/*38+8=46 cycles.*/130#define OC_ROW_IDCT(_y,_x) __asm{ \131OC_IDCT_BEGIN(_y,_x) \132/*r3=D'*/ \133__asm movq mm3,OC_I(2,_y) \134/*r4=E'=E-G*/ \135__asm psubw mm4,mm7 \136/*r1=H'+H'*/ \137__asm paddw mm1,mm1 \138/*r7=G+G*/ \139__asm paddw mm7,mm7 \140/*r1=R1=A''+H'*/ \141__asm paddw mm1,mm2 \142/*r7=G'=E+G*/ 
\143__asm paddw mm7,mm4 \144/*r4=R4=E'-D'*/ \145__asm psubw mm4,mm3 \146__asm paddw mm3,mm3 \147/*r6=R6=F'-B''*/ \148__asm psubw mm6,mm5 \149__asm paddw mm5,mm5 \150/*r3=R3=E'+D'*/ \151__asm paddw mm3,mm4 \152/*r5=R5=F'+B''*/ \153__asm paddw mm5,mm6 \154/*r7=R7=G'-C'*/ \155__asm psubw mm7,mm0 \156__asm paddw mm0,mm0 \157/*Save R1.*/ \158__asm movq OC_I(1,_y),mm1 \159/*r0=R0=G.+C.*/ \160__asm paddw mm0,mm7 \161}162163/*The following macro does two 4x4 transposes in place.164At entry, we assume:165r0 = a3 a2 a1 a0166I(1) = b3 b2 b1 b0167r2 = c3 c2 c1 c0168r3 = d3 d2 d1 d0169170r4 = e3 e2 e1 e0171r5 = f3 f2 f1 f0172r6 = g3 g2 g1 g0173r7 = h3 h2 h1 h0174175At exit, we have:176I(0) = d0 c0 b0 a0177I(1) = d1 c1 b1 a1178I(2) = d2 c2 b2 a2179I(3) = d3 c3 b3 a3180181J(4) = h0 g0 f0 e0182J(5) = h1 g1 f1 e1183J(6) = h2 g2 f2 e2184J(7) = h3 g3 f3 e3185186I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.187J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.188189Since r1 is free at entry, we calculate the Js first.*/190/*19 cycles.*/191#define OC_TRANSPOSE(_y) __asm{ \192__asm movq mm1,mm4 \193__asm punpcklwd mm4,mm5 \194__asm movq OC_I(0,_y),mm0 \195__asm punpckhwd mm1,mm5 \196__asm movq mm0,mm6 \197__asm punpcklwd mm6,mm7 \198__asm movq mm5,mm4 \199__asm punpckldq mm4,mm6 \200__asm punpckhdq mm5,mm6 \201__asm movq mm6,mm1 \202__asm movq OC_J(4,_y),mm4 \203__asm punpckhwd mm0,mm7 \204__asm movq OC_J(5,_y),mm5 \205__asm punpckhdq mm6,mm0 \206__asm movq mm4,OC_I(0,_y) \207__asm punpckldq mm1,mm0 \208__asm movq mm5,OC_I(1,_y) \209__asm movq mm0,mm4 \210__asm movq OC_J(7,_y),mm6 \211__asm punpcklwd mm0,mm5 \212__asm movq OC_J(6,_y),mm1 \213__asm punpckhwd mm4,mm5 \214__asm movq mm5,mm2 \215__asm punpcklwd mm2,mm3 \216__asm movq mm1,mm0 \217__asm punpckldq mm0,mm2 \218__asm punpckhdq mm1,mm2 \219__asm movq mm2,mm4 \220__asm movq OC_I(0,_y),mm0 \221__asm punpckhwd mm5,mm3 \222__asm movq OC_I(1,_y),mm1 \223__asm punpckhdq mm4,mm5 \224__asm punpckldq mm2,mm5 \225__asm movq 
OC_I(3,_y),mm4 \226__asm movq OC_I(2,_y),mm2 \227}228229/*38+19=57 cycles.*/230#define OC_COLUMN_IDCT(_y) __asm{ \231OC_IDCT_BEGIN(_y,_y) \232__asm paddw mm2,OC_8 \233/*r1=H'+H'*/ \234__asm paddw mm1,mm1 \235/*r1=R1=A''+H'*/ \236__asm paddw mm1,mm2 \237/*r2=NR2*/ \238__asm psraw mm2,4 \239/*r4=E'=E-G*/ \240__asm psubw mm4,mm7 \241/*r1=NR1*/ \242__asm psraw mm1,4 \243/*r3=D'*/ \244__asm movq mm3,OC_I(2,_y) \245/*r7=G+G*/ \246__asm paddw mm7,mm7 \247/*Store NR2 at I(2).*/ \248__asm movq OC_I(2,_y),mm2 \249/*r7=G'=E+G*/ \250__asm paddw mm7,mm4 \251/*Store NR1 at I(1).*/ \252__asm movq OC_I(1,_y),mm1 \253/*r4=R4=E'-D'*/ \254__asm psubw mm4,mm3 \255__asm paddw mm4,OC_8 \256/*r3=D'+D'*/ \257__asm paddw mm3,mm3 \258/*r3=R3=E'+D'*/ \259__asm paddw mm3,mm4 \260/*r4=NR4*/ \261__asm psraw mm4,4 \262/*r6=R6=F'-B''*/ \263__asm psubw mm6,mm5 \264/*r3=NR3*/ \265__asm psraw mm3,4 \266__asm paddw mm6,OC_8 \267/*r5=B''+B''*/ \268__asm paddw mm5,mm5 \269/*r5=R5=F'+B''*/ \270__asm paddw mm5,mm6 \271/*r6=NR6*/ \272__asm psraw mm6,4 \273/*Store NR4 at J(4).*/ \274__asm movq OC_J(4,_y),mm4 \275/*r5=NR5*/ \276__asm psraw mm5,4 \277/*Store NR3 at I(3).*/ \278__asm movq OC_I(3,_y),mm3 \279/*r7=R7=G'-C'*/ \280__asm psubw mm7,mm0 \281__asm paddw mm7,OC_8 \282/*r0=C'+C'*/ \283__asm paddw mm0,mm0 \284/*r0=R0=G'+C'*/ \285__asm paddw mm0,mm7 \286/*r7=NR7*/ \287__asm psraw mm7,4 \288/*Store NR6 at J(6).*/ \289__asm movq OC_J(6,_y),mm6 \290/*r0=NR0*/ \291__asm psraw mm0,4 \292/*Store NR5 at J(5).*/ \293__asm movq OC_J(5,_y),mm5 \294/*Store NR7 at J(7).*/ \295__asm movq OC_J(7,_y),mm7 \296/*Store NR0 at I(0).*/ \297__asm movq OC_I(0,_y),mm0 \298}299300#define OC_MID(_m,_i) [CONSTS+_m+(_i)*8]301#define OC_C(_i) OC_MID(OC_COSINE_OFFSET,_i-1)302#define OC_8 OC_MID(OC_EIGHT_OFFSET,0)303304static void oc_idct8x8_slow(ogg_int16_t _y[64],ogg_int16_t _x[64]){305int i;306/*This routine accepts an 8x8 matrix, but in partially transposed form.307Every 4x4 block is transposed.*/308__asm{309#define CONSTS 
eax310#define Y edx311#define X ecx312mov CONSTS,offset OC_IDCT_CONSTS313mov Y,_y314mov X,_x315#define OC_I(_k,_y) [(_y)+(_k)*16]316#define OC_J(_k,_y) [(_y)+((_k)-4)*16+8]317OC_ROW_IDCT(Y,X)318OC_TRANSPOSE(Y)319#undef OC_I320#undef OC_J321#define OC_I(_k,_y) [(_y)+(_k)*16+64]322#define OC_J(_k,_y) [(_y)+((_k)-4)*16+72]323OC_ROW_IDCT(Y,X)324OC_TRANSPOSE(Y)325#undef OC_I326#undef OC_J327#define OC_I(_k,_y) [(_y)+(_k)*16]328#define OC_J(_k,_y) OC_I(_k,_y)329OC_COLUMN_IDCT(Y)330#undef OC_I331#undef OC_J332#define OC_I(_k,_y) [(_y)+(_k)*16+8]333#define OC_J(_k,_y) OC_I(_k,_y)334OC_COLUMN_IDCT(Y)335#undef OC_I336#undef OC_J337#undef CONSTS338#undef Y339#undef X340}341__asm pxor mm0,mm0;342for(i=0;i<4;i++){343ogg_int16_t *x;344x=_x+16*i;345#define X ecx346__asm{347mov X,x348movq [X+0x00],mm0349movq [X+0x08],mm0350movq [X+0x10],mm0351movq [X+0x18],mm0352}353#undef X354}355}356357/*25 cycles.*/358#define OC_IDCT_BEGIN_10(_y,_x) __asm{ \359__asm movq mm2,OC_I(3,_x) \360__asm nop \361__asm movq mm6,OC_C(3) \362__asm movq mm4,mm2 \363__asm movq mm1,OC_C(5) \364__asm pmulhw mm4,mm6 \365__asm movq mm3,OC_I(1,_x) \366__asm pmulhw mm1,mm2 \367__asm movq mm0,OC_C(1) \368__asm paddw mm4,mm2 \369__asm pxor mm6,mm6 \370__asm paddw mm2,mm1 \371__asm movq mm5,OC_I(2,_x) \372__asm pmulhw mm0,mm3 \373__asm movq mm1,mm5 \374__asm paddw mm0,mm3 \375__asm pmulhw mm3,OC_C(7) \376__asm psubw mm6,mm2 \377__asm pmulhw mm5,OC_C(2) \378__asm psubw mm0,mm4 \379__asm movq mm7,OC_I(2,_x) \380__asm paddw mm4,mm4 \381__asm paddw mm7,mm5 \382__asm paddw mm4,mm0 \383__asm pmulhw mm1,OC_C(6) \384__asm psubw mm3,mm6 \385__asm movq OC_I(1,_y),mm4 \386__asm paddw mm6,mm6 \387__asm movq mm4,OC_C(4) \388__asm paddw mm6,mm3 \389__asm movq mm5,mm3 \390__asm pmulhw mm3,mm4 \391__asm movq OC_I(2,_y),mm6 \392__asm movq mm2,mm0 \393__asm movq mm6,OC_I(0,_x) \394__asm pmulhw mm0,mm4 \395__asm paddw mm5,mm3 \396__asm paddw mm2,mm0 \397__asm psubw mm5,mm1 \398__asm pmulhw mm6,mm4 \399__asm paddw mm6,OC_I(0,_x) 
\400__asm paddw mm1,mm1 \401__asm movq mm4,mm6 \402__asm paddw mm1,mm5 \403__asm psubw mm6,mm2 \404__asm paddw mm2,mm2 \405__asm movq mm0,OC_I(1,_y) \406__asm paddw mm2,mm6 \407__asm psubw mm2,mm1 \408__asm nop \409}410411/*25+8=33 cycles.*/412#define OC_ROW_IDCT_10(_y,_x) __asm{ \413OC_IDCT_BEGIN_10(_y,_x) \414/*r3=D'*/ \415__asm movq mm3,OC_I(2,_y) \416/*r4=E'=E-G*/ \417__asm psubw mm4,mm7 \418/*r1=H'+H'*/ \419__asm paddw mm1,mm1 \420/*r7=G+G*/ \421__asm paddw mm7,mm7 \422/*r1=R1=A''+H'*/ \423__asm paddw mm1,mm2 \424/*r7=G'=E+G*/ \425__asm paddw mm7,mm4 \426/*r4=R4=E'-D'*/ \427__asm psubw mm4,mm3 \428__asm paddw mm3,mm3 \429/*r6=R6=F'-B''*/ \430__asm psubw mm6,mm5 \431__asm paddw mm5,mm5 \432/*r3=R3=E'+D'*/ \433__asm paddw mm3,mm4 \434/*r5=R5=F'+B''*/ \435__asm paddw mm5,mm6 \436/*r7=R7=G'-C'*/ \437__asm psubw mm7,mm0 \438__asm paddw mm0,mm0 \439/*Save R1.*/ \440__asm movq OC_I(1,_y),mm1 \441/*r0=R0=G'+C'*/ \442__asm paddw mm0,mm7 \443}444445/*25+19=44 cycles'*/446#define OC_COLUMN_IDCT_10(_y) __asm{ \447OC_IDCT_BEGIN_10(_y,_y) \448__asm paddw mm2,OC_8 \449/*r1=H'+H'*/ \450__asm paddw mm1,mm1 \451/*r1=R1=A''+H'*/ \452__asm paddw mm1,mm2 \453/*r2=NR2*/ \454__asm psraw mm2,4 \455/*r4=E'=E-G*/ \456__asm psubw mm4,mm7 \457/*r1=NR1*/ \458__asm psraw mm1,4 \459/*r3=D'*/ \460__asm movq mm3,OC_I(2,_y) \461/*r7=G+G*/ \462__asm paddw mm7,mm7 \463/*Store NR2 at I(2).*/ \464__asm movq OC_I(2,_y),mm2 \465/*r7=G'=E+G*/ \466__asm paddw mm7,mm4 \467/*Store NR1 at I(1).*/ \468__asm movq OC_I(1,_y),mm1 \469/*r4=R4=E'-D'*/ \470__asm psubw mm4,mm3 \471__asm paddw mm4,OC_8 \472/*r3=D'+D'*/ \473__asm paddw mm3,mm3 \474/*r3=R3=E'+D'*/ \475__asm paddw mm3,mm4 \476/*r4=NR4*/ \477__asm psraw mm4,4 \478/*r6=R6=F'-B''*/ \479__asm psubw mm6,mm5 \480/*r3=NR3*/ \481__asm psraw mm3,4 \482__asm paddw mm6,OC_8 \483/*r5=B''+B''*/ \484__asm paddw mm5,mm5 \485/*r5=R5=F'+B''*/ \486__asm paddw mm5,mm6 \487/*r6=NR6*/ \488__asm psraw mm6,4 \489/*Store NR4 at J(4).*/ \490__asm movq OC_J(4,_y),mm4 
\491/*r5=NR5*/ \492__asm psraw mm5,4 \493/*Store NR3 at I(3).*/ \494__asm movq OC_I(3,_y),mm3 \495/*r7=R7=G'-C'*/ \496__asm psubw mm7,mm0 \497__asm paddw mm7,OC_8 \498/*r0=C'+C'*/ \499__asm paddw mm0,mm0 \500/*r0=R0=G'+C'*/ \501__asm paddw mm0,mm7 \502/*r7=NR7*/ \503__asm psraw mm7,4 \504/*Store NR6 at J(6).*/ \505__asm movq OC_J(6,_y),mm6 \506/*r0=NR0*/ \507__asm psraw mm0,4 \508/*Store NR5 at J(5).*/ \509__asm movq OC_J(5,_y),mm5 \510/*Store NR7 at J(7).*/ \511__asm movq OC_J(7,_y),mm7 \512/*Store NR0 at I(0).*/ \513__asm movq OC_I(0,_y),mm0 \514}515516static void oc_idct8x8_10(ogg_int16_t _y[64],ogg_int16_t _x[64]){517__asm{518#define CONSTS eax519#define Y edx520#define X ecx521mov CONSTS,offset OC_IDCT_CONSTS522mov Y,_y523mov X,_x524#define OC_I(_k,_y) [(_y)+(_k)*16]525#define OC_J(_k,_y) [(_y)+((_k)-4)*16+8]526/*Done with dequant, descramble, and partial transpose.527Now do the iDCT itself.*/528OC_ROW_IDCT_10(Y,X)529OC_TRANSPOSE(Y)530#undef OC_I531#undef OC_J532#define OC_I(_k,_y) [(_y)+(_k)*16]533#define OC_J(_k,_y) OC_I(_k,_y)534OC_COLUMN_IDCT_10(Y)535#undef OC_I536#undef OC_J537#define OC_I(_k,_y) [(_y)+(_k)*16+8]538#define OC_J(_k,_y) OC_I(_k,_y)539OC_COLUMN_IDCT_10(Y)540#undef OC_I541#undef OC_J542#undef CONSTS543#undef Y544#undef X545}546#define X ecx547__asm{548pxor mm0,mm0;549mov X,_x550movq [X+0x00],mm0551movq [X+0x10],mm0552movq [X+0x20],mm0553movq [X+0x30],mm0554}555#undef X556}557558/*Performs an inverse 8x8 Type-II DCT transform.559The input is assumed to be scaled by a factor of 4 relative to orthonormal560version of the transform.*/561void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){562/*_last_zzi is subtly different from an actual count of the number of563coefficients we decoded for this block.564It contains the value of zzi BEFORE the final token in the block was565decoded.566In most cases this is an EOB token (the continuation of an EOB run from a567previous block counts), and so this is the same as the coefficient 
count.568However, in the case that the last token was NOT an EOB token, but filled569the block up with exactly 64 coefficients, _last_zzi will be less than 64.570Provided the last token was not a pure zero run, the minimum value it can571be is 46, and so that doesn't affect any of the cases in this routine.572However, if the last token WAS a pure zero run of length 63, then _last_zzi573will be 1 while the number of coefficients decoded is 64.574Thus, we will trigger the following special case, where the real575coefficient count would not.576Note also that a zero run of length 64 will give _last_zzi a value of 0,577but we still process the DC coefficient, which might have a non-zero value578due to DC prediction.579Although convoluted, this is arguably the correct behavior: it allows us to580use a smaller transform when the block ends with a long zero run instead581of a normal EOB token.582It could be smarter... multiple separate zero runs at the end of a block583will fool it, but an encoder that generates these really deserves what it584gets.585Needless to say we inherited this approach from VP3.*/586/*Perform the iDCT.*/587if(_last_zzi<=10)oc_idct8x8_10(_y,_x);588else oc_idct8x8_slow(_y,_x);589}590591#endif592593594