/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation and contributors                      *
 * https://www.xiph.org/                                            *
 *                                                                  *
 ********************************************************************

  function:

 ********************************************************************/

/*Some common macros for potential platform-specific optimization.*/
#include <math.h>
#if !defined(_ocintrin_H)
# define _ocintrin_H (1)

/*Some specific platforms may have optimized intrinsic or inline assembly
   versions of these functions which can substantially improve performance.
  We define macros for them to allow easy incorporation of these non-ANSI
   features.
  All of the macros in this file may evaluate their arguments more than once,
   so arguments must not have side effects.*/

/*Note that we do not provide a macro for abs(), because it is provided as a
   library function, which we assume is translated into an intrinsic to avoid
   the function call overhead and then implemented in the smartest way for the
   target platform.
  With modern gcc (4.x), this is true: it uses cmov instructions if the
   architecture supports it and branchless bit-twiddling if it does not (the
   speed difference between the two approaches is not measurable).
  Interestingly, the bit-twiddling method was patented in 2000 (US 6,073,150)
   by Sun Microsystems, despite prior art dating back to at least 1996:
   https://web.archive.org/web/19961201174141/www.x86.org/ftp/articles/pentopt/PENTOPT.TXT
  On gcc 3.x, however, our assumption is not true, as abs() is translated to a
   conditional jump, which is horrible on deeply pipelined architectures (e.g.,
   all consumer architectures for the past decade or more).
  Also be warned that -C*abs(x) where C is a constant is mis-optimized as
   abs(C*x) on every gcc release before 4.2.3.
  See bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=34130 */

/*Modern gcc (4.x) can compile the naive versions of min and max with cmov if
   given an appropriate architecture, but the branchless bit-twiddling versions
   are just as fast, and do not require any special target architecture.
  Earlier gcc versions (3.x) compiled both versions to the same assembly
   instructions, because of the way they represented ((_b)>(_a)) internally.
  The comparison yields 0 or 1; negating it gives an all-zeros or all-ones
   mask that selects whether the difference is applied.*/
#define OC_MAXI(_a,_b)      ((_a)-(((_a)-(_b))&-((_b)>(_a))))
#define OC_MINI(_a,_b)      ((_a)+(((_b)-(_a))&-((_b)<(_a))))
/*Clamps an integer into the given range.
  If _a>_c, then the lower bound _a is respected over the upper bound _c (this
   behavior is required to meet our documented API behavior).
  _a: The lower bound.
  _b: The value to clamp.
  _c: The upper bound.*/
#define OC_CLAMPI(_a,_b,_c) (OC_MAXI(_a,OC_MINI(_b,_c)))
/*Clamps an integer into the range 0...255 and converts it to unsigned char.
  The first factor is 0 for negative _x and all-ones otherwise; the second is
   all-ones (255 after the cast) when _x>255 and _x itself otherwise.*/
#define OC_CLAMP255(_x)     ((unsigned char)((((_x)<0)-1)&((_x)|-((_x)>255))))
/*Returns -1, 0, or 1 according to the sign of _a.
  This has a chance of compiling branchless, and is just as fast as the
   bit-twiddling method, which is slightly less portable, since it relies on a
   sign-extended rightshift, which is not guaranteed by ANSI (but present on
   every relevant platform).*/
#define OC_SIGNI(_a) (((_a)>0)-((_a)<0))
/*Returns -1 (all bits set) if _a is negative, and 0 otherwise.
  Slightly more portable than relying on a sign-extended right-shift (which is
   not guaranteed by ANSI), and just as fast, since gcc (3.x and 4.x both)
   compile it into the right-shift anyway.*/
#define OC_SIGNMASK(_a) (-((_a)<0))
/*Divides an integer by a power of two, truncating towards 0.
  Negative dividends are biased by _rmask before the shift; the shift itself
   is applied to a possibly negative value, so this relies on right shifts of
   negative values being sign-extending.
  _dividend: The integer to divide.
  _shift:    The non-negative power of two to divide by.
  _rmask:    (1<<_shift)-1*/
#define OC_DIV_POW2(_dividend,_shift,_rmask)\
  (((_dividend)+(OC_SIGNMASK(_dividend)&(_rmask)))>>(_shift))
/*Divides _x by 65536, truncating towards 0.*/
#define OC_DIV2_16(_x) OC_DIV_POW2(_x,16,0xFFFF)
/*Divides _x by 2, truncating towards 0.*/
#define OC_DIV2(_x) OC_DIV_POW2(_x,1,0x1)
/*Divides _x by 8, truncating towards 0.*/
#define OC_DIV8(_x) OC_DIV_POW2(_x,3,0x7)
/*Divides _x by 16, truncating towards 0.*/
#define OC_DIV16(_x) OC_DIV_POW2(_x,4,0xF)
/*Right shifts _dividend by _shift, adding _rval, and subtracting one for
   negative dividends first.
  When _rval is 1<<(_shift-1), this is equivalent to division with rounding
   ties away from zero.*/
#define OC_DIV_ROUND_POW2(_dividend,_shift,_rval)\
  (((_dividend)+OC_SIGNMASK(_dividend)+(_rval))>>(_shift))
/*Divides _x by 2, rounding towards even numbers.*/
#define OC_DIV2_RE(_x) (((_x)+(((_x)>>1)&1))>>1)
/*Divides _x by (1<<(_shift)), rounding towards even numbers.
  The rounding offset is one less than half the divisor, plus the low bit of
   the truncated quotient, so that ties go to the even neighbor.*/
#define OC_DIV_POW2_RE(_x,_shift) \
  (((_x)+(((_x)>>(_shift))&1)+(((1<<(_shift))-1)>>1))>>(_shift))
/*Swaps two integers _a and _b if _a>_b.
  The XOR of the two values is masked to zero when they are already in order,
   which turns both XOR assignments into no-ops.
  _a and _b must be modifiable lvalues.*/
#define OC_SORT2I(_a,_b) \
  do{ \
    int oc_sort2i_swap_; \
    oc_sort2i_swap_=((_a)^(_b))&-((_b)<(_a)); \
    (_a)^=oc_sort2i_swap_; \
    (_b)^=oc_sort2i_swap_; \
  } \
  while(0)

/*Accesses one of four (signed) bytes given an index.
  This can be used to avoid small lookup tables.
  The bytes are packed using unsigned arithmetic, since shifting a set bit
   into the sign bit of a signed int is undefined behavior.
  _a, _b, _c, _d: The table entries for indices 0 through 3; only the low 8
                   bits of each are kept.
  _i:             The index of the entry to return, in the range 0...3.*/
#define OC_BYTE_TABLE32(_a,_b,_c,_d,_i) \
  ((signed char) \
   ((((unsigned)(_a)&0xFFU)|((unsigned)(_b)&0xFFU)<<8| \
   ((unsigned)(_c)&0xFFU)<<16|((unsigned)(_d)&0xFFU)<<24)>>(_i)*8))
/*Accesses one of eight (unsigned) nibbles given an index.
  This can be used to avoid small lookup tables.
  The nibbles are packed using unsigned arithmetic, since shifting a set bit
   into the sign bit of a signed int is undefined behavior; the result is
   converted back to int.
  _a ... _h: The table entries for indices 0 through 7; only the low 4 bits of
              each are kept.
  _i:        The index of the entry to return, in the range 0...7.*/
#define OC_UNIBBLE_TABLE32(_a,_b,_c,_d,_e,_f,_g,_h,_i) \
  ((int)(((((unsigned)(_a)&0xFU)|((unsigned)(_b)&0xFU)<<4| \
   ((unsigned)(_c)&0xFU)<<8|((unsigned)(_d)&0xFU)<<12| \
   ((unsigned)(_e)&0xFU)<<16|((unsigned)(_f)&0xFU)<<20| \
   ((unsigned)(_g)&0xFU)<<24|((unsigned)(_h)&0xFU)<<28)>>(_i)*4)&0xFU))



/*All of these macros should expect floats as arguments.*/
#define OC_MAXF(_a,_b)      ((_a)<(_b)?(_b):(_a))
#define OC_MINF(_a,_b)      ((_a)>(_b)?(_b):(_a))
/*Clamps a float into the given range, using the same argument order and
   precedence as OC_CLAMPI: if _a>_c, the lower bound _a wins.
  _a: The lower bound.
  _b: The value to clamp.
  _c: The upper bound.*/
#define OC_CLAMPF(_a,_b,_c) (OC_MAXF(_a,OC_MINF(_b,_c)))
/*Single-precision wrappers around the double-precision <math.h> routines.*/
#define OC_FABSF(_f)        ((float)fabs(_f))
#define OC_SQRTF(_f)        ((float)sqrt(_f))
#define OC_POWF(_b,_e)      ((float)pow(_b,_e))
#define OC_LOGF(_f)         ((float)log(_f))
/*Floor and ceiling of a float, converted to int.*/
#define OC_IFLOORF(_f)      ((int)floor(_f))
#define OC_ICEILF(_f)       ((int)ceil(_f))

#endif