/*Path: blob/master/thirdparty/libtheora/x86/x86zigzag.h (9903 views)*/
/********************************************************************1* *2* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *3* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *4* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *5* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *6* *7* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *8* by the Xiph.Org Foundation and contributors *9* https://www.xiph.org/ *10* *11********************************************************************1213function:1415********************************************************************/1617#if !defined(_x86_x86zigzag_H)18# define _x86_x86zigzag_H (1)19# include "x86enc.h"202122/*Converts DCT coefficients from transposed order into zig-zag scan order and23stores them in %[y].24This relies on two macros to load the contents of each row:25OC_ZZ_LOAD_ROW_LO(row,"reg") and OC_ZZ_LOAD_ROW_HI(row,"reg"), which load26the first four and second four entries of each row into the specified27register, respectively.28OC_ZZ_LOAD_ROW_LO must be called before OC_ZZ_LOAD_ROW_HI for the same row29(because when the rows are already in SSE2 registers, loading the high half30destructively modifies the register).31The index of each output element in the original 64-element array should wind32up in the following 8x8 matrix (the letters indicate the order we compute33each 4-tuple below):34A 0 8 1 2 9 16 24 17 B35C 10 3 4 11 18 25 32 40 E36F 33 26 19 12 5 6 13 20 D37G 27 34 41 48 56 49 42 35 I38L 28 21 14 7 15 22 29 36 M39H 43 50 57 58 51 44 37 30 O40N 23 31 38 45 52 59 60 53 J41P 46 39 47 54 61 62 55 63 K42The order of the coefficients within each tuple is reversed in the comments43below to reflect the usual MSB to LSB notation.*/44#define OC_TRANSPOSE_ZIG_ZAG_MMXEXT \45OC_ZZ_LOAD_ROW_LO(0,"%%mm0") /*mm0=03 02 01 00*/ \46OC_ZZ_LOAD_ROW_LO(1,"%%mm1") /*mm1=11 10 09 08*/ \47OC_ZZ_LOAD_ROW_LO(2,"%%mm2") /*mm2=19 18 17 16*/ \48OC_ZZ_LOAD_ROW_LO(3,"%%mm3") 
/*mm3=27 26 25 24*/ \49OC_ZZ_LOAD_ROW_HI(0,"%%mm4") /*mm4=07 06 05 04*/ \50OC_ZZ_LOAD_ROW_HI(1,"%%mm5") /*mm5=15 14 13 12*/ \51OC_ZZ_LOAD_ROW_HI(2,"%%mm6") /*mm6=23 22 21 20*/ \52"movq %%mm0,%%mm7\n\t" /*mm7=03 02 01 00*/ \53"punpckhdq %%mm1,%%mm0\n\t" /*mm0=11 10 03 02*/ \54"pshufw $0x39,%%mm4,%%mm4\n\t" /*mm4=04 07 06 05*/ \55"punpcklwd %%mm0,%%mm1\n\t" /*mm1=03 09 02 08*/ \56"pshufw $0x39,%%mm5,%%mm5\n\t" /*mm5=12 15 14 13*/ \57"punpcklwd %%mm1,%%mm7\n\t" /*mm7=02 01 08 00 *A*/ \58"movq %%mm7,0x00(%[y])\n\t" \59"punpckhwd %%mm4,%%mm1\n\t" /*mm1=04 03 07 09*/ \60"movq %%mm2,%%mm7\n\t" /*mm7=19 18 17 16*/ \61"punpckhdq %%mm1,%%mm0\n\t" /*mm0=04 03 11 10*/ \62"punpckhwd %%mm5,%%mm7\n\t" /*mm7=12 19 15 18*/ \63"punpcklwd %%mm3,%%mm1\n\t" /*mm1=25 07 24 09*/ \64"punpcklwd %%mm6,%%mm5\n\t" /*mm5=21 14 20 13*/ \65"punpcklwd %%mm2,%%mm1\n\t" /*mm1=17 24 16 09 *B*/ \66OC_ZZ_LOAD_ROW_LO(4,"%%mm2") /*mm2=35 34 33 32*/ \67"movq %%mm1,0x08(%[y])\n\t" \68OC_ZZ_LOAD_ROW_LO(5,"%%mm1") /*mm1=43 42 41 40*/ \69"pshufw $0x78,%%mm0,%%mm0\n\t" /*mm0=11 04 03 10 *C*/ \70"movq %%mm0,0x10(%[y])\n\t" \71"punpckhdq %%mm4,%%mm6\n\t" /*mm6=?? 07 23 22*/ \72"punpckldq %%mm5,%%mm4\n\t" /*mm4=20 13 06 05 *D*/ \73"movq %%mm4,0x28(%[y])\n\t" \74"psrlq $16,%%mm3\n\t" /*mm3=.. 27 26 25*/ \75"pshufw $0x0E,%%mm2,%%mm0\n\t" /*mm0=?? ?? 35 34*/ \76"movq %%mm7,%%mm4\n\t" /*mm4=12 19 15 18*/ \77"punpcklwd %%mm3,%%mm2\n\t" /*mm2=26 33 25 32*/ \78"punpcklwd %%mm1,%%mm4\n\t" /*mm4=41 15 40 18*/ \79"punpckhwd %%mm1,%%mm3\n\t" /*mm3=43 .. 42 27*/ \80"punpckldq %%mm2,%%mm4\n\t" /*mm4=25 32 40 18*/ \81"punpcklwd %%mm0,%%mm3\n\t" /*mm3=35 42 34 27*/ \82OC_ZZ_LOAD_ROW_LO(6,"%%mm0") /*mm0=51 50 49 48*/ \83"pshufw $0x6C,%%mm4,%%mm4\n\t" /*mm4=40 32 25 18 *E*/ \84"movq %%mm4,0x18(%[y])\n\t" \85OC_ZZ_LOAD_ROW_LO(7,"%%mm4") /*mm4=59 58 57 56*/ \86"punpckhdq %%mm7,%%mm2\n\t" /*mm2=12 19 26 33 *F*/ \87"movq %%mm2,0x20(%[y])\n\t" \88"pshufw $0xD0,%%mm1,%%mm1\n\t" /*mm1=43 41 ?? 
??*/ \89"pshufw $0x87,%%mm0,%%mm0\n\t" /*mm0=50 48 49 51*/ \90"movq %%mm3,%%mm2\n\t" /*mm2=35 42 34 27*/ \91"punpckhwd %%mm0,%%mm1\n\t" /*mm1=50 43 48 41*/ \92"pshufw $0x93,%%mm4,%%mm4\n\t" /*mm4=58 57 56 59*/ \93"punpckldq %%mm1,%%mm3\n\t" /*mm3=48 41 34 27 *G*/ \94"movq %%mm3,0x30(%[y])\n\t" \95"punpckhdq %%mm4,%%mm1\n\t" /*mm1=58 57 50 43 *H*/ \96"movq %%mm1,0x50(%[y])\n\t" \97OC_ZZ_LOAD_ROW_HI(7,"%%mm1") /*mm1=63 62 61 60*/ \98"punpcklwd %%mm0,%%mm4\n\t" /*mm4=49 56 51 59*/ \99OC_ZZ_LOAD_ROW_HI(6,"%%mm0") /*mm0=55 54 53 52*/ \100"psllq $16,%%mm6\n\t" /*mm6=07 23 22 ..*/ \101"movq %%mm4,%%mm3\n\t" /*mm3=49 56 51 59*/ \102"punpckhdq %%mm2,%%mm4\n\t" /*mm4=35 42 49 56 *I*/ \103OC_ZZ_LOAD_ROW_HI(3,"%%mm2") /*mm2=31 30 29 28*/ \104"movq %%mm4,0x38(%[y])\n\t" \105"punpcklwd %%mm1,%%mm3\n\t" /*mm3=61 51 60 59*/ \106"punpcklwd %%mm6,%%mm7\n\t" /*mm7=22 15 .. ??*/ \107"movq %%mm3,%%mm4\n\t" /*mm4=61 51 60 59*/ \108"punpcklwd %%mm0,%%mm3\n\t" /*mm3=53 60 52 59*/ \109"punpckhwd %%mm0,%%mm4\n\t" /*mm4=55 61 54 51*/ \110OC_ZZ_LOAD_ROW_HI(4,"%%mm0") /*mm0=39 38 37 36*/ \111"pshufw $0xE1,%%mm3,%%mm3\n\t" /*mm3=53 60 59 52 *J*/ \112"movq %%mm3,0x68(%[y])\n\t" \113"movq %%mm4,%%mm3\n\t" /*mm3=?? ?? 
54 51*/ \114"pshufw $0x39,%%mm2,%%mm2\n\t" /*mm2=28 31 30 29*/ \115"punpckhwd %%mm1,%%mm4\n\t" /*mm4=63 55 62 61 *K*/ \116OC_ZZ_LOAD_ROW_HI(5,"%%mm1") /*mm1=47 46 45 44*/ \117"movq %%mm4,0x78(%[y])\n\t" \118"punpckhwd %%mm2,%%mm6\n\t" /*mm6=28 07 31 23*/ \119"punpcklwd %%mm0,%%mm2\n\t" /*mm2=37 30 36 29*/ \120"punpckhdq %%mm6,%%mm5\n\t" /*mm5=28 07 21 14*/ \121"pshufw $0x4B,%%mm2,%%mm2\n\t" /*mm2=36 29 30 37*/ \122"pshufw $0x87,%%mm5,%%mm5\n\t" /*mm5=07 14 21 28 *L*/ \123"movq %%mm5,0x40(%[y])\n\t" \124"punpckhdq %%mm2,%%mm7\n\t" /*mm7=36 29 22 15 *M*/ \125"movq %%mm7,0x48(%[y])\n\t" \126"pshufw $0x9C,%%mm1,%%mm1\n\t" /*mm1=46 45 47 44*/ \127"punpckhwd %%mm1,%%mm0\n\t" /*mm0=46 39 45 38*/ \128"punpcklwd %%mm1,%%mm3\n\t" /*mm3=47 54 44 51*/ \129"punpckldq %%mm0,%%mm6\n\t" /*mm6=45 38 31 23 *N*/ \130"movq %%mm6,0x60(%[y])\n\t" \131"punpckhdq %%mm3,%%mm0\n\t" /*mm0=47 54 46 39*/ \132"punpckldq %%mm2,%%mm3\n\t" /*mm3=30 37 44 51 *O*/ \133"movq %%mm3,0x58(%[y])\n\t" \134"pshufw $0xB1,%%mm0,%%mm0\n\t" /*mm0=54 47 39 46 *P*/ \135"movq %%mm0,0x70(%[y])\n\t" \136137/*Converts DCT coefficients in %[dct] from natural order into zig-zag scan138order and stores them in %[qdct].139The index of each output element in the original 64-element array should wind140up in the following 8x8 matrix (the letters indicate the order we compute141each 4-tuple below):142A 0 1 8 16 9 2 3 10 B143C 17 24 32 25 18 11 4 5 D144E 12 19 26 33 40 48 41 34 I145H 27 20 13 6 7 14 21 28 G146K 35 42 49 56 57 50 43 36 J147F 29 22 15 23 30 37 44 51 M148P 58 59 52 45 38 31 39 46 L149N 53 60 61 54 47 55 62 63 O150The order of the coefficients within each tuple is reversed in the comments151below to reflect the usual MSB to LSB notation.*/152#define OC_ZIG_ZAG_MMXEXT \153"movq 0x00(%[dct]),%%mm0\n\t" /*mm0=03 02 01 00*/ \154"movq 0x08(%[dct]),%%mm1\n\t" /*mm1=07 06 05 04*/ \155"movq 0x10(%[dct]),%%mm2\n\t" /*mm2=11 10 09 08*/ \156"movq 0x20(%[dct]),%%mm3\n\t" /*mm3=19 18 17 16*/ \157"movq 
0x30(%[dct]),%%mm4\n\t" /*mm4=27 26 25 24*/ \158"movq 0x40(%[dct]),%%mm5\n\t" /*mm5=35 34 33 32*/ \159"movq %%mm2,%%mm7\n\t" /*mm7=11 10 09 08*/ \160"punpcklwd %%mm3,%%mm2\n\t" /*mm2=17 09 16 08*/ \161"movq %%mm0,%%mm6\n\t" /*mm6=03 02 01 00*/ \162"punpckldq %%mm2,%%mm0\n\t" /*mm0=16 08 01 00 *A*/ \163"movq %%mm0,0x00(%[qdct])\n\t" \164"movq 0x18(%[dct]),%%mm0\n\t" /*mm0=15 14 13 12*/ \165"punpckhdq %%mm6,%%mm6\n\t" /*mm6=03 02 03 02*/ \166"psrlq $16,%%mm7\n\t" /*mm7=.. 11 10 09*/ \167"punpckldq %%mm7,%%mm6\n\t" /*mm6=10 09 03 02*/ \168"punpckhwd %%mm7,%%mm3\n\t" /*mm3=.. 19 11 18*/ \169"pshufw $0xD2,%%mm6,%%mm6\n\t" /*mm6=10 03 02 09 *B*/ \170"movq %%mm6,0x08(%[qdct])\n\t" \171"psrlq $48,%%mm2\n\t" /*mm2=.. .. .. 17*/ \172"movq %%mm1,%%mm6\n\t" /*mm6=07 06 05 04*/ \173"punpcklwd %%mm5,%%mm2\n\t" /*mm2=33 .. 32 17*/ \174"movq %%mm3,%%mm7\n\t" /*mm7=.. 19 11 18*/ \175"punpckldq %%mm1,%%mm3\n\t" /*mm3=05 04 11 18 *C*/ \176"por %%mm2,%%mm7\n\t" /*mm7=33 19 ?? ??*/ \177"punpcklwd %%mm4,%%mm2\n\t" /*mm2=25 32 24 17 *D**/ \178"movq %%mm2,0x10(%[qdct])\n\t" \179"movq %%mm3,0x18(%[qdct])\n\t" \180"movq 0x28(%[dct]),%%mm2\n\t" /*mm2=23 22 21 20*/ \181"movq 0x38(%[dct]),%%mm1\n\t" /*mm1=31 30 29 28*/ \182"pshufw $0x9C,%%mm0,%%mm3\n\t" /*mm3=14 13 15 12*/ \183"punpckhdq %%mm7,%%mm7\n\t" /*mm7=33 19 33 19*/ \184"punpckhwd %%mm3,%%mm6\n\t" /*mm6=14 07 13 06*/ \185"punpckldq %%mm0,%%mm0\n\t" /*mm0=13 12 13 12*/ \186"punpcklwd %%mm1,%%mm3\n\t" /*mm3=29 15 28 12*/ \187"punpckhwd %%mm4,%%mm0\n\t" /*mm0=27 13 26 12*/ \188"pshufw $0xB4,%%mm3,%%mm3\n\t" /*mm3=15 29 28 12*/ \189"psrlq $48,%%mm4\n\t" /*mm4=.. .. .. 27*/ \190"punpcklwd %%mm7,%%mm0\n\t" /*mm0=33 26 19 12 *E*/ \191"punpcklwd %%mm1,%%mm4\n\t" /*mm4=29 .. 
28 27*/ \192"punpckhwd %%mm2,%%mm3\n\t" /*mm3=23 15 22 29 *F*/ \193"movq %%mm0,0x20(%[qdct])\n\t" \194"movq %%mm3,0x50(%[qdct])\n\t" \195"movq 0x60(%[dct]),%%mm3\n\t" /*mm3=51 50 49 48*/ \196"movq 0x70(%[dct]),%%mm7\n\t" /*mm7=59 58 57 56*/ \197"movq 0x50(%[dct]),%%mm0\n\t" /*mm0=43 42 41 40*/ \198"punpcklwd %%mm4,%%mm2\n\t" /*mm2=28 21 27 20*/ \199"psrlq $32,%%mm5\n\t" /*mm5=.. .. 35 34*/ \200"movq %%mm2,%%mm4\n\t" /*mm4=28 21 27 20*/ \201"punpckldq %%mm6,%%mm2\n\t" /*mm2=13 06 27 20*/ \202"punpckhdq %%mm4,%%mm6\n\t" /*mm6=28 21 14 07 *G*/ \203"movq %%mm3,%%mm4\n\t" /*mm4=51 50 49 48*/ \204"pshufw $0xB1,%%mm2,%%mm2\n\t" /*mm2=06 13 20 27 *H*/ \205"movq %%mm2,0x30(%[qdct])\n\t" \206"movq %%mm6,0x38(%[qdct])\n\t" \207"movq 0x48(%[dct]),%%mm2\n\t" /*mm2=39 38 37 36*/ \208"punpcklwd %%mm5,%%mm4\n\t" /*mm4=35 49 34 48*/ \209"movq 0x58(%[dct]),%%mm5\n\t" /*mm5=47 46 45 44*/ \210"punpckldq %%mm7,%%mm6\n\t" /*mm6=57 56 14 07*/ \211"psrlq $32,%%mm3\n\t" /*mm3=.. .. 51 50*/ \212"punpckhwd %%mm0,%%mm6\n\t" /*mm6=43 57 42 56*/ \213"punpcklwd %%mm4,%%mm0\n\t" /*mm0=34 41 48 40 *I*/ \214"pshufw $0x4E,%%mm6,%%mm6\n\t" /*mm6=42 56 43 57*/ \215"movq %%mm0,0x28(%[qdct])\n\t" \216"punpcklwd %%mm2,%%mm3\n\t" /*mm3=37 51 36 50*/ \217"punpckhwd %%mm6,%%mm4\n\t" /*mm4=42 35 56 49*/ \218"punpcklwd %%mm3,%%mm6\n\t" /*mm6=36 43 50 57 *J*/ \219"pshufw $0x4E,%%mm4,%%mm4\n\t" /*mm4=56 49 42 35 *K*/ \220"movq %%mm4,0x40(%[qdct])\n\t" \221"movq %%mm6,0x48(%[qdct])\n\t" \222"movq 0x68(%[dct]),%%mm6\n\t" /*mm6=55 54 53 52*/ \223"movq 0x78(%[dct]),%%mm0\n\t" /*mm0=63 62 61 60*/ \224"psrlq $32,%%mm1\n\t" /*mm1=.. .. 
31 30*/ \225"pshufw $0xD8,%%mm5,%%mm5\n\t" /*mm5=47 45 46 44*/ \226"pshufw $0x0B,%%mm3,%%mm3\n\t" /*mm3=50 50 51 37*/ \227"punpcklwd %%mm5,%%mm1\n\t" /*mm1=46 31 44 30*/ \228"pshufw $0xC9,%%mm6,%%mm6\n\t" /*mm6=55 52 54 53*/ \229"punpckhwd %%mm1,%%mm2\n\t" /*mm2=46 39 31 38 *L*/ \230"punpcklwd %%mm3,%%mm1\n\t" /*mm1=51 44 37 30 *M*/ \231"movq %%mm2,0x68(%[qdct])\n\t" \232"movq %%mm1,0x58(%[qdct])\n\t" \233"punpckhwd %%mm6,%%mm5\n\t" /*mm5=55 47 52 45*/ \234"punpckldq %%mm0,%%mm6\n\t" /*mm6=61 60 54 53*/ \235"pshufw $0x10,%%mm5,%%mm4\n\t" /*mm4=45 52 45 45*/ \236"pshufw $0x78,%%mm6,%%mm6\n\t" /*mm6=53 60 61 54 *N*/ \237"punpckhdq %%mm0,%%mm5\n\t" /*mm5=63 62 55 47 *O*/ \238"punpckhdq %%mm4,%%mm7\n\t" /*mm7=45 52 59 58 *P*/ \239"movq %%mm6,0x70(%[qdct])\n\t" \240"movq %%mm5,0x78(%[qdct])\n\t" \241"movq %%mm7,0x60(%[qdct])\n\t" \242243#endif244245246