/*Path: blob/master/thirdparty/libtheora/x86_vc/x86zigzag.h (9904 views)*/
/********************************************************************1* *2* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *3* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *4* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *5* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *6* *7* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *8* by the Xiph.Org Foundation and contributors *9* https://www.xiph.org/ *10* *11********************************************************************1213function:1415********************************************************************/1617#if !defined(_x86_vc_x86zigzag_H)18# define _x86_vc_x86zigzag_H (1)19# include "x86enc.h"202122/*Converts DCT coefficients from transposed order into zig-zag scan order and23stores them in Y.24This relies on two macros to load the contents of each row:25OC_ZZ_LOAD_ROW_LO(row,reg) and OC_ZZ_LOAD_ROW_HI(row,reg), which load the26first four and second four entries of each row into the specified register,27respectively.28OC_ZZ_LOAD_ROW_LO must be called before OC_ZZ_LOAD_ROW_HI for the same row29(because when the rows are already in SSE2 registers, loading the high half30destructively modifies the register).31The index of each output element in the original 64-element array should wind32up in the following 8x8 matrix (the letters indicate the order we compute33each 4-tuple below):34A 0 8 1 2 9 16 24 17 B35C 10 3 4 11 18 25 32 40 E36F 33 26 19 12 5 6 13 20 D37G 27 34 41 48 56 49 42 35 I38L 28 21 14 7 15 22 29 36 M39H 43 50 57 58 51 44 37 30 O40N 23 31 38 45 52 59 60 53 J41P 46 39 47 54 61 62 55 63 K42The order of the coefficients within each tuple is reversed in the comments43below to reflect the usual MSB to LSB notation.*/44#define OC_TRANSPOSE_ZIG_ZAG_MMXEXT \45OC_ZZ_LOAD_ROW_LO(0,mm0) /*mm0=03 02 01 00*/ \46OC_ZZ_LOAD_ROW_LO(1,mm1) /*mm1=11 10 09 08*/ \47OC_ZZ_LOAD_ROW_LO(2,mm2) /*mm2=19 18 17 16*/ \48OC_ZZ_LOAD_ROW_LO(3,mm3) /*mm3=27 26 25 
24*/ \49OC_ZZ_LOAD_ROW_HI(0,mm4) /*mm4=07 06 05 04*/ \50OC_ZZ_LOAD_ROW_HI(1,mm5) /*mm5=15 14 13 12*/ \51OC_ZZ_LOAD_ROW_HI(2,mm6) /*mm6=23 22 21 20*/ \52__asm movq mm7,mm0 /*mm7=03 02 01 00*/ \53__asm punpckhdq mm0,mm1 /*mm0=11 10 03 02*/ \54__asm pshufw mm4,mm4,0x39 /*mm4=04 07 06 05*/ \55__asm punpcklwd mm1,mm0 /*mm1=03 09 02 08*/ \56__asm pshufw mm5,mm5,0x39 /*mm5=12 15 14 13*/ \57__asm punpcklwd mm7,mm1 /*mm7=02 01 08 00 *A*/ \58__asm movq [Y+0x00],mm7 \59__asm punpckhwd mm1,mm4 /*mm1=04 03 07 09*/ \60__asm movq mm7,mm2 /*mm7=19 18 17 16*/ \61__asm punpckhdq mm0,mm1 /*mm0=04 03 11 10*/ \62__asm punpckhwd mm7,mm5 /*mm7=12 19 15 18*/ \63__asm punpcklwd mm1,mm3 /*mm1=25 07 24 09*/ \64__asm punpcklwd mm5,mm6 /*mm5=21 14 20 13*/ \65__asm punpcklwd mm1,mm2 /*mm1=17 24 16 09 *B*/ \66OC_ZZ_LOAD_ROW_LO(4,mm2) /*mm2=35 34 33 32*/ \67__asm movq [Y+0x08],mm1 \68OC_ZZ_LOAD_ROW_LO(5,mm1) /*mm1=43 42 41 40*/ \69__asm pshufw mm0,mm0,0x78 /*mm0=11 04 03 10 *C*/ \70__asm movq [Y+0x10],mm0 \71__asm punpckhdq mm6,mm4 /*mm6=?? 07 23 22*/ \72__asm punpckldq mm4,mm5 /*mm4=20 13 06 05 *D*/ \73__asm movq [Y+0x28],mm4 \74__asm psrlq mm3,16 /*mm3=.. 27 26 25*/ \75__asm pshufw mm0,mm2,0x0E /*mm0=?? ?? 35 34*/ \76__asm movq mm4,mm7 /*mm4=12 19 15 18*/ \77__asm punpcklwd mm2,mm3 /*mm2=26 33 25 32*/ \78__asm punpcklwd mm4,mm1 /*mm4=41 15 40 18*/ \79__asm punpckhwd mm3,mm1 /*mm3=43 .. 42 27*/ \80__asm punpckldq mm4,mm2 /*mm4=25 32 40 18*/ \81__asm punpcklwd mm3,mm0 /*mm3=35 42 34 27*/ \82OC_ZZ_LOAD_ROW_LO(6,mm0) /*mm0=51 50 49 48*/ \83__asm pshufw mm4,mm4,0x6C /*mm4=40 32 25 18 *E*/ \84__asm movq [Y+0x18],mm4 \85OC_ZZ_LOAD_ROW_LO(7,mm4) /*mm4=59 58 57 56*/ \86__asm punpckhdq mm2,mm7 /*mm2=12 19 26 33 *F*/ \87__asm movq [Y+0x20],mm2 \88__asm pshufw mm1,mm1,0xD0 /*mm1=43 41 ?? 
??*/ \89__asm pshufw mm0,mm0,0x87 /*mm0=50 48 49 51*/ \90__asm movq mm2,mm3 /*mm2=35 42 34 27*/ \91__asm punpckhwd mm1,mm0 /*mm1=50 43 48 41*/ \92__asm pshufw mm4,mm4,0x93 /*mm4=58 57 56 59*/ \93__asm punpckldq mm3,mm1 /*mm3=48 41 34 27 *G*/ \94__asm movq [Y+0x30],mm3 \95__asm punpckhdq mm1,mm4 /*mm1=58 57 50 43 *H*/ \96__asm movq [Y+0x50],mm1 \97OC_ZZ_LOAD_ROW_HI(7,mm1) /*mm1=63 62 61 60*/ \98__asm punpcklwd mm4,mm0 /*mm4=49 56 51 59*/ \99OC_ZZ_LOAD_ROW_HI(6,mm0) /*mm0=55 54 53 52*/ \100__asm psllq mm6,16 /*mm6=07 23 22 ..*/ \101__asm movq mm3,mm4 /*mm3=49 56 51 59*/ \102__asm punpckhdq mm4,mm2 /*mm4=35 42 49 56 *I*/ \103OC_ZZ_LOAD_ROW_HI(3,mm2) /*mm2=31 30 29 28*/ \104__asm movq [Y+0x38],mm4 \105__asm punpcklwd mm3,mm1 /*mm3=61 51 60 59*/ \106__asm punpcklwd mm7,mm6 /*mm7=22 15 .. ??*/ \107__asm movq mm4,mm3 /*mm4=61 51 60 59*/ \108__asm punpcklwd mm3,mm0 /*mm3=53 60 52 59*/ \109__asm punpckhwd mm4,mm0 /*mm4=55 61 54 51*/ \110OC_ZZ_LOAD_ROW_HI(4,mm0) /*mm0=39 38 37 36*/ \111__asm pshufw mm3,mm3,0xE1 /*mm3=53 60 59 52 *J*/ \112__asm movq [Y+0x68],mm3 \113__asm movq mm3,mm4 /*mm3=?? ?? 
54 51*/ \114__asm pshufw mm2,mm2,0x39 /*mm2=28 31 30 29*/ \115__asm punpckhwd mm4,mm1 /*mm4=63 55 62 61 *K*/ \116OC_ZZ_LOAD_ROW_HI(5,mm1) /*mm1=47 46 45 44*/ \117__asm movq [Y+0x78],mm4 \118__asm punpckhwd mm6,mm2 /*mm6=28 07 31 23*/ \119__asm punpcklwd mm2,mm0 /*mm2=37 30 36 29*/ \120__asm punpckhdq mm5,mm6 /*mm5=28 07 21 14*/ \121__asm pshufw mm2,mm2,0x4B /*mm2=36 29 30 37*/ \122__asm pshufw mm5,mm5,0x87 /*mm5=07 14 21 28 *L*/ \123__asm movq [Y+0x40],mm5 \124__asm punpckhdq mm7,mm2 /*mm7=36 29 22 15 *M*/ \125__asm movq [Y+0x48],mm7 \126__asm pshufw mm1,mm1,0x9C /*mm1=46 45 47 44*/ \127__asm punpckhwd mm0,mm1 /*mm0=46 39 45 38*/ \128__asm punpcklwd mm3,mm1 /*mm3=47 54 44 51*/ \129__asm punpckldq mm6,mm0 /*mm6=45 38 31 23 *N*/ \130__asm movq [Y+0x60],mm6 \131__asm punpckhdq mm0,mm3 /*mm0=47 54 46 39*/ \132__asm punpckldq mm3,mm2 /*mm3=30 37 44 51 *O*/ \133__asm movq [Y+0x58],mm3 \134__asm pshufw mm0,mm0,0xB1 /*mm0=54 47 39 46 *P*/ \135__asm movq [Y+0x70],mm0 \136137/*Converts DCT coefficients in %[dct] from natural order into zig-zag scan138order and stores them in %[qdct].139The index of each output element in the original 64-element array should wind140up in the following 8x8 matrix (the letters indicate the order we compute141each 4-tuple below):142A 0 1 8 16 9 2 3 10 B143C 17 24 32 25 18 11 4 5 D144E 12 19 26 33 40 48 41 34 I145H 27 20 13 6 7 14 21 28 G146K 35 42 49 56 57 50 43 36 J147F 29 22 15 23 30 37 44 51 M148P 58 59 52 45 38 31 39 46 L149N 53 60 61 54 47 55 62 63 O150The order of the coefficients within each tuple is reversed in the comments151below to reflect the usual MSB to LSB notation.*/152#define OC_ZIG_ZAG_MMXEXT \153"movq 0x00(%[dct]),%%mm0\n\t" /*mm0=03 02 01 00*/ \154"movq 0x08(%[dct]),%%mm1\n\t" /*mm1=07 06 05 04*/ \155"movq 0x10(%[dct]),%%mm2\n\t" /*mm2=11 10 09 08*/ \156"movq 0x20(%[dct]),%%mm3\n\t" /*mm3=19 18 17 16*/ \157"movq 0x30(%[dct]),%%mm4\n\t" /*mm4=27 26 25 24*/ \158"movq 0x40(%[dct]),%%mm5\n\t" /*mm5=35 34 33 32*/ \159"movq 
%%mm2,%%mm7\n\t" /*mm7=11 10 09 08*/ \160"punpcklwd %%mm3,%%mm2\n\t" /*mm2=17 09 16 08*/ \161"movq %%mm0,%%mm6\n\t" /*mm6=03 02 01 00*/ \162"punpckldq %%mm2,%%mm0\n\t" /*mm0=16 08 01 00 *A*/ \163"movq %%mm0,0x00(%[qdct])\n\t" \164"movq 0x18(%[dct]),%%mm0\n\t" /*mm0=15 14 13 12*/ \165"punpckhdq %%mm6,%%mm6\n\t" /*mm6=03 02 03 02*/ \166"psrlq $16,%%mm7\n\t" /*mm7=.. 11 10 09*/ \167"punpckldq %%mm7,%%mm6\n\t" /*mm6=10 09 03 02*/ \168"punpckhwd %%mm7,%%mm3\n\t" /*mm3=.. 19 11 18*/ \169"pshufw $0xD2,%%mm6,%%mm6\n\t" /*mm6=10 03 02 09 *B*/ \170"movq %%mm6,0x08(%[qdct])\n\t" \171"psrlq $48,%%mm2\n\t" /*mm2=.. .. .. 17*/ \172"movq %%mm1,%%mm6\n\t" /*mm6=07 06 05 04*/ \173"punpcklwd %%mm5,%%mm2\n\t" /*mm2=33 .. 32 17*/ \174"movq %%mm3,%%mm7\n\t" /*mm7=.. 19 11 18*/ \175"punpckldq %%mm1,%%mm3\n\t" /*mm3=05 04 11 18 *C*/ \176"por %%mm2,%%mm7\n\t" /*mm7=33 19 ?? ??*/ \177"punpcklwd %%mm4,%%mm2\n\t" /*mm2=25 32 24 17 *D**/ \178"movq %%mm2,0x10(%[qdct])\n\t" \179"movq %%mm3,0x18(%[qdct])\n\t" \180"movq 0x28(%[dct]),%%mm2\n\t" /*mm2=23 22 21 20*/ \181"movq 0x38(%[dct]),%%mm1\n\t" /*mm1=31 30 29 28*/ \182"pshufw $0x9C,%%mm0,%%mm3\n\t" /*mm3=14 13 15 12*/ \183"punpckhdq %%mm7,%%mm7\n\t" /*mm7=33 19 33 19*/ \184"punpckhwd %%mm3,%%mm6\n\t" /*mm6=14 07 13 06*/ \185"punpckldq %%mm0,%%mm0\n\t" /*mm0=13 12 13 12*/ \186"punpcklwd %%mm1,%%mm3\n\t" /*mm3=29 15 28 12*/ \187"punpckhwd %%mm4,%%mm0\n\t" /*mm0=27 13 26 12*/ \188"pshufw $0xB4,%%mm3,%%mm3\n\t" /*mm3=15 29 28 12*/ \189"psrlq $48,%%mm4\n\t" /*mm4=.. .. .. 27*/ \190"punpcklwd %%mm7,%%mm0\n\t" /*mm0=33 26 19 12 *E*/ \191"punpcklwd %%mm1,%%mm4\n\t" /*mm4=29 .. 28 27*/ \192"punpckhwd %%mm2,%%mm3\n\t" /*mm3=23 15 22 29 *F*/ \193"movq %%mm0,0x20(%[qdct])\n\t" \194"movq %%mm3,0x50(%[qdct])\n\t" \195"movq 0x60(%[dct]),%%mm3\n\t" /*mm3=51 50 49 48*/ \196"movq 0x70(%[dct]),%%mm7\n\t" /*mm7=59 58 57 56*/ \197"movq 0x50(%[dct]),%%mm0\n\t" /*mm0=43 42 41 40*/ \198"punpcklwd %%mm4,%%mm2\n\t" /*mm2=28 21 27 20*/ \199"psrlq $32,%%mm5\n\t" /*mm5=.. 
.. 35 34*/ \200"movq %%mm2,%%mm4\n\t" /*mm4=28 21 27 20*/ \201"punpckldq %%mm6,%%mm2\n\t" /*mm2=13 06 27 20*/ \202"punpckhdq %%mm4,%%mm6\n\t" /*mm6=28 21 14 07 *G*/ \203"movq %%mm3,%%mm4\n\t" /*mm4=51 50 49 48*/ \204"pshufw $0xB1,%%mm2,%%mm2\n\t" /*mm2=06 13 20 27 *H*/ \205"movq %%mm2,0x30(%[qdct])\n\t" \206"movq %%mm6,0x38(%[qdct])\n\t" \207"movq 0x48(%[dct]),%%mm2\n\t" /*mm2=39 38 37 36*/ \208"punpcklwd %%mm5,%%mm4\n\t" /*mm4=35 49 34 48*/ \209"movq 0x58(%[dct]),%%mm5\n\t" /*mm5=47 46 45 44*/ \210"punpckldq %%mm7,%%mm6\n\t" /*mm6=57 56 14 07*/ \211"psrlq $32,%%mm3\n\t" /*mm3=.. .. 51 50*/ \212"punpckhwd %%mm0,%%mm6\n\t" /*mm6=43 57 42 56*/ \213"punpcklwd %%mm4,%%mm0\n\t" /*mm0=34 41 48 40 *I*/ \214"pshufw $0x4E,%%mm6,%%mm6\n\t" /*mm6=42 56 43 57*/ \215"movq %%mm0,0x28(%[qdct])\n\t" \216"punpcklwd %%mm2,%%mm3\n\t" /*mm3=37 51 36 50*/ \217"punpckhwd %%mm6,%%mm4\n\t" /*mm4=42 35 56 49*/ \218"punpcklwd %%mm3,%%mm6\n\t" /*mm6=36 43 50 57 *J*/ \219"pshufw $0x4E,%%mm4,%%mm4\n\t" /*mm4=56 49 42 35 *K*/ \220"movq %%mm4,0x40(%[qdct])\n\t" \221"movq %%mm6,0x48(%[qdct])\n\t" \222"movq 0x68(%[dct]),%%mm6\n\t" /*mm6=55 54 53 52*/ \223"movq 0x78(%[dct]),%%mm0\n\t" /*mm0=63 62 61 60*/ \224"psrlq $32,%%mm1\n\t" /*mm1=.. .. 
31 30*/ \225"pshufw $0xD8,%%mm5,%%mm5\n\t" /*mm5=47 45 46 44*/ \226"pshufw $0x0B,%%mm3,%%mm3\n\t" /*mm3=50 50 51 37*/ \227"punpcklwd %%mm5,%%mm1\n\t" /*mm1=46 31 44 30*/ \228"pshufw $0xC9,%%mm6,%%mm6\n\t" /*mm6=55 52 54 53*/ \229"punpckhwd %%mm1,%%mm2\n\t" /*mm2=46 39 31 38 *L*/ \230"punpcklwd %%mm3,%%mm1\n\t" /*mm1=51 44 37 30 *M*/ \231"movq %%mm2,0x68(%[qdct])\n\t" \232"movq %%mm1,0x58(%[qdct])\n\t" \233"punpckhwd %%mm6,%%mm5\n\t" /*mm5=55 47 52 45*/ \234"punpckldq %%mm0,%%mm6\n\t" /*mm6=61 60 54 53*/ \235"pshufw $0x10,%%mm5,%%mm4\n\t" /*mm4=45 52 45 45*/ \236"pshufw $0x78,%%mm6,%%mm6\n\t" /*mm6=53 60 61 54 *N*/ \237"punpckhdq %%mm0,%%mm5\n\t" /*mm5=63 62 55 47 *O*/ \238"punpckhdq %%mm4,%%mm7\n\t" /*mm7=45 52 59 58 *P*/ \239"movq %%mm6,0x70(%[qdct])\n\t" \240"movq %%mm5,0x78(%[qdct])\n\t" \241"movq %%mm7,0x60(%[qdct])\n\t" \242243#endif244245246