/*Path: blob/master/thirdparty/libtheora/x86/x86enquant.c
  9898 views*/
/********************************************************************1* *2* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *3* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *4* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *5* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *6* *7* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *8* by the Xiph.Org Foundation and contributors *9* https://www.xiph.org/ *10* *11********************************************************************1213function:1415********************************************************************/1617#include "x86enc.h"1819#if defined(OC_X86_ASM)20212223/*The default enquant table is not quite suitable for SIMD purposes.24First, the m and l parameters need to be separated so that an entire row full25of m's or l's can be loaded at a time.26Second, x86 SIMD has no element-wise arithmetic right-shift, so we have to27emulate one with a multiply.28Therefore we translate the shift count into a scale factor.*/29void oc_enc_enquant_table_init_x86(void *_enquant,30const ogg_uint16_t _dequant[64]){31ogg_int16_t *m;32ogg_int16_t *l;33int zzi;34m=(ogg_int16_t *)_enquant;35l=m+64;36for(zzi=0;zzi<64;zzi++){37oc_iquant q;38oc_iquant_init(&q,_dequant[zzi]);39m[zzi]=q.m;40/*q.l must be at least 2 for this to work; fortunately, once all the scale41factors are baked in, the minimum quantizer is much larger than that.*/42l[zzi]=1<<16-q.l;43}44}4546void oc_enc_enquant_table_fixup_x86(void *_enquant[3][3][2],int _nqis){47int pli;48int qii;49int qti;50for(pli=0;pli<3;pli++)for(qii=1;qii<_nqis;qii++)for(qti=0;qti<2;qti++){51((ogg_int16_t *)_enquant[pli][qii][qti])[0]=52((ogg_int16_t *)_enquant[pli][0][qti])[0];53((ogg_int16_t *)_enquant[pli][qii][qti])[64]=54((ogg_int16_t *)_enquant[pli][0][qti])[64];55}56}5758int oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],59const ogg_uint16_t _dequant[64],const void *_enquant){60ptrdiff_t r;61__asm__ 
__volatile__(62"xor %[r],%[r]\n\t"63/*Loop through two rows at a time.*/64".p2align 4\n\t"65"0:\n\t"66/*Load the first two rows of the data and the quant matrices.*/67"movdqa 0x00(%[dct],%[r]),%%xmm0\n\t"68"movdqa 0x10(%[dct],%[r]),%%xmm1\n\t"69"movdqa 0x00(%[dq],%[r]),%%xmm2\n\t"70"movdqa 0x10(%[dq],%[r]),%%xmm3\n\t"71"movdqa 0x00(%[q],%[r]),%%xmm4\n\t"72"movdqa 0x10(%[q],%[r]),%%xmm5\n\t"73/*Double the input and propagate its sign to the rounding factor.74Using SSSE3's psignw would help here, but we need the mask later anyway.*/75"movdqa %%xmm0,%%xmm6\n\t"76"psraw $15,%%xmm0\n\t"77"movdqa %%xmm1,%%xmm7\n\t"78"paddw %%xmm6,%%xmm6\n\t"79"psraw $15,%%xmm1\n\t"80"paddw %%xmm7,%%xmm7\n\t"81"paddw %%xmm0,%%xmm2\n\t"82"paddw %%xmm1,%%xmm3\n\t"83"pxor %%xmm0,%%xmm2\n\t"84"pxor %%xmm1,%%xmm3\n\t"85/*Add the rounding factor and perform the first multiply.*/86"paddw %%xmm2,%%xmm6\n\t"87"paddw %%xmm3,%%xmm7\n\t"88"pmulhw %%xmm6,%%xmm4\n\t"89"pmulhw %%xmm7,%%xmm5\n\t"90"movdqa 0x80(%[q],%[r]),%%xmm2\n\t"91"movdqa 0x90(%[q],%[r]),%%xmm3\n\t"92"paddw %%xmm4,%%xmm6\n\t"93"paddw %%xmm5,%%xmm7\n\t"94/*Emulate an element-wise right-shift via a second multiply.*/95"pmulhw %%xmm2,%%xmm6\n\t"96"pmulhw %%xmm3,%%xmm7\n\t"97"add $32,%[r]\n\t"98"cmp $96,%[r]\n\t"99/*Correct for the sign.*/100"psubw %%xmm0,%%xmm6\n\t"101"psubw %%xmm1,%%xmm7\n\t"102/*Save the result.*/103"movdqa %%xmm6,-0x20(%[qdct],%[r])\n\t"104"movdqa %%xmm7,-0x10(%[qdct],%[r])\n\t"105"jle 0b\n\t"106/*Now find the location of the last non-zero value.*/107"movdqa 0x50(%[qdct]),%%xmm5\n\t"108"movdqa 0x40(%[qdct]),%%xmm4\n\t"109"packsswb %%xmm7,%%xmm6\n\t"110"packsswb %%xmm5,%%xmm4\n\t"111"pxor %%xmm0,%%xmm0\n\t"112"mov $-1,%k[dq]\n\t"113"pcmpeqb %%xmm0,%%xmm6\n\t"114"pcmpeqb %%xmm0,%%xmm4\n\t"115"pmovmskb %%xmm6,%k[q]\n\t"116"pmovmskb %%xmm4,%k[r]\n\t"117"shl $16,%k[q]\n\t"118"or %k[r],%k[q]\n\t"119"mov $32,%[r]\n\t"120/*We have to use xor here instead of not in order to set the flags.*/121"xor %k[dq],%k[q]\n\t"122"jnz 
1f\n\t"123"movdqa 0x30(%[qdct]),%%xmm7\n\t"124"movdqa 0x20(%[qdct]),%%xmm6\n\t"125"movdqa 0x10(%[qdct]),%%xmm5\n\t"126"movdqa 0x00(%[qdct]),%%xmm4\n\t"127"packsswb %%xmm7,%%xmm6\n\t"128"packsswb %%xmm5,%%xmm4\n\t"129"pcmpeqb %%xmm0,%%xmm6\n\t"130"pcmpeqb %%xmm0,%%xmm4\n\t"131"pmovmskb %%xmm6,%k[q]\n\t"132"pmovmskb %%xmm4,%k[r]\n\t"133"shl $16,%k[q]\n\t"134"or %k[r],%k[q]\n\t"135"xor %[r],%[r]\n\t"136"not %k[q]\n\t"137"or $1,%k[q]\n\t"138"1:\n\t"139"bsr %k[q],%k[q]\n\t"140"add %k[q],%k[r]\n\t"141:[r]"=&a"(r),[q]"+r"(_enquant),[dq]"+r"(_dequant)142:[dct]"r"(_dct),[qdct]"r"(_qdct)143:"cc","memory"144);145return (int)r;146}147148#endif149150151