Path: blob/master/modules/core/src/mathfuncs_core.simd.hpp
16337 views
// This file is part of OpenCV project.1// It is subject to the license terms in the LICENSE file found in the top-level directory2// of this distribution and at http://opencv.org/license.html.34#include "mathfuncs.hpp"56namespace cv { namespace hal {78CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN910// forward declarations11void fastAtan32f(const float *Y, const float *X, float *angle, int len, bool angleInDegrees);12void fastAtan64f(const double *Y, const double *X, double *angle, int len, bool angleInDegrees);13void fastAtan2(const float *Y, const float *X, float *angle, int len, bool angleInDegrees);14void magnitude32f(const float* x, const float* y, float* mag, int len);15void magnitude64f(const double* x, const double* y, double* mag, int len);16void invSqrt32f(const float* src, float* dst, int len);17void invSqrt64f(const double* src, double* dst, int len);18void sqrt32f(const float* src, float* dst, int len);19void sqrt64f(const double* src, double* dst, int len);20void exp32f(const float *src, float *dst, int n);21void exp64f(const double *src, double *dst, int n);22void log32f(const float *src, float *dst, int n);23void log64f(const double *src, double *dst, int n);24float fastAtan2(float y, float x);2526#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY2728using namespace std;29using namespace cv;3031namespace {3233#ifdef __EMSCRIPTEN__34static inline float atan_f32(float y, float x)35{36float a = atan2(y, x) * 180.0f / CV_PI;37if (a < 0.0f)38a += 360.0f;39if (a >= 360.0f)40a -= 360.0f;41return a; // range [0; 360)42}43#else44static const float atan2_p1 = 0.9997878412794807f*(float)(180/CV_PI);45static const float atan2_p3 = -0.3258083974640975f*(float)(180/CV_PI);46static const float atan2_p5 = 0.1555786518463281f*(float)(180/CV_PI);47static const float atan2_p7 = -0.04432655554792128f*(float)(180/CV_PI);4849static inline float atan_f32(float y, float x)50{51float ax = std::abs(x), ay = std::abs(y);52float a, c, c2;53if( ax >= ay )54{55c = ay/(ax + (float)DBL_EPSILON);56c2 = c*c;57a = (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c;58}59else60{61c = ax/(ay + (float)DBL_EPSILON);62c2 = c*c;63a = 90.f - (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c;64}65if( x < 0 )66a = 180.f - a;67if( y < 0 )68a = 360.f - a;69return a;70}71#endif7273#if CV_SIMD7475struct v_atan_f3276{77explicit v_atan_f32(const float& scale)78{79eps = vx_setall_f32((float)DBL_EPSILON);80z = vx_setzero_f32();81p7 = vx_setall_f32(atan2_p7);82p5 = vx_setall_f32(atan2_p5);83p3 = vx_setall_f32(atan2_p3);84p1 = vx_setall_f32(atan2_p1);85val90 = vx_setall_f32(90.f);86val180 = vx_setall_f32(180.f);87val360 = vx_setall_f32(360.f);88s = vx_setall_f32(scale);89}9091v_float32 compute(const v_float32& y, const v_float32& x)92{93v_float32 ax = v_abs(x);94v_float32 ay = v_abs(y);95v_float32 c = v_min(ax, ay) / (v_max(ax, ay) + eps);96v_float32 cc = c * c;97v_float32 a = v_fma(v_fma(v_fma(cc, p7, p5), cc, p3), cc, p1)*c;98a = v_select(ax >= ay, a, val90 - a);99a = v_select(x < z, val180 - a, a);100a = v_select(y < z, val360 - a, a);101return a * s;102}103104v_float32 eps;105v_float32 z;106v_float32 p7;107v_float32 p5;108v_float32 p3;109v_float32 p1;110v_float32 val90;111v_float32 val180;112v_float32 val360;113v_float32 s;114};115116#endif117118} // anonymous::119120///////////////////////////////////// ATAN2 ////////////////////////////////////121122static void fastAtan32f_(const float *Y, const float *X, float *angle, int len, bool angleInDegrees )123{124float scale = angleInDegrees ? 1.f : (float)(CV_PI/180);125int i = 0;126#if CV_SIMD127const int VECSZ = v_float32::nlanes;128v_atan_f32 v(scale);129130for( ; i < len; i += VECSZ*2 )131{132if( i + VECSZ*2 > len )133{134// if it's inplace operation, we cannot repeatedly process135// the tail for the second time, so we have to use the136// scalar code137if( i == 0 || angle == X || angle == Y )138break;139i = len - VECSZ*2;140}141142v_float32 y0 = vx_load(Y + i);143v_float32 x0 = vx_load(X + i);144v_float32 y1 = vx_load(Y + i + VECSZ);145v_float32 x1 = vx_load(X + i + VECSZ);146147v_float32 r0 = v.compute(y0, x0);148v_float32 r1 = v.compute(y1, x1);149150v_store(angle + i, r0);151v_store(angle + i + VECSZ, r1);152}153vx_cleanup();154#endif155156for( ; i < len; i++ )157angle[i] = atan_f32(Y[i], X[i])*scale;158}159160void fastAtan32f(const float *Y, const float *X, float *angle, int len, bool angleInDegrees )161{162CV_INSTRUMENT_REGION();163fastAtan32f_(Y, X, angle, len, angleInDegrees );164}165166void fastAtan64f(const double *Y, const double *X, double *angle, int len, bool angleInDegrees)167{168CV_INSTRUMENT_REGION();169170const int BLKSZ = 128;171float ybuf[BLKSZ], xbuf[BLKSZ], abuf[BLKSZ];172for( int i = 0; i < len; i += BLKSZ )173{174int j, blksz = std::min(BLKSZ, len - i);175for( j = 0; j < blksz; j++ )176{177ybuf[j] = (float)Y[i + j];178xbuf[j] = (float)X[i + j];179}180fastAtan32f_(ybuf, xbuf, abuf, blksz, angleInDegrees);181for( j = 0; j < blksz; j++ )182angle[i + j] = abuf[j];183}184}185186// deprecated187void fastAtan2(const float *Y, const float *X, float *angle, int len, bool angleInDegrees )188{189CV_INSTRUMENT_REGION();190fastAtan32f(Y, X, angle, len, angleInDegrees);191}192193void magnitude32f(const float* x, const float* y, float* mag, int len)194{195CV_INSTRUMENT_REGION();196197int i = 0;198199#if CV_SIMD200const int VECSZ = v_float32::nlanes;201for( ; i < len; i += VECSZ*2 )202{203if( i + VECSZ*2 > len )204{205if( i == 0 || mag == x || mag == y )206break;207i = len - VECSZ*2;208}209v_float32 x0 = vx_load(x + i), x1 = vx_load(x + i + VECSZ);210v_float32 y0 = vx_load(y + i), y1 = vx_load(y + i + VECSZ);211x0 = v_sqrt(v_muladd(x0, x0, y0*y0));212x1 = v_sqrt(v_muladd(x1, x1, y1*y1));213v_store(mag + i, x0);214v_store(mag + i + VECSZ, x1);215}216vx_cleanup();217#endif218219for( ; i < len; i++ )220{221float x0 = x[i], y0 = y[i];222mag[i] = std::sqrt(x0*x0 + y0*y0);223}224}225226void magnitude64f(const double* x, const double* y, double* mag, int len)227{228CV_INSTRUMENT_REGION();229230int i = 0;231232#if CV_SIMD_64F233const int VECSZ = v_float64::nlanes;234for( ; i < len; i += VECSZ*2 )235{236if( i + VECSZ*2 > len )237{238if( i == 0 || mag == x || mag == y )239break;240i = len - VECSZ*2;241}242v_float64 x0 = vx_load(x + i), x1 = vx_load(x + i + VECSZ);243v_float64 y0 = vx_load(y + i), y1 = vx_load(y + i + VECSZ);244x0 = v_sqrt(v_muladd(x0, x0, y0*y0));245x1 = v_sqrt(v_muladd(x1, x1, y1*y1));246v_store(mag + i, x0);247v_store(mag + i + VECSZ, x1);248}249vx_cleanup();250#endif251252for( ; i < len; i++ )253{254double x0 = x[i], y0 = y[i];255mag[i] = std::sqrt(x0*x0 + y0*y0);256}257}258259260void invSqrt32f(const float* src, float* dst, int len)261{262CV_INSTRUMENT_REGION();263264int i = 0;265266#if CV_SIMD267const int VECSZ = v_float32::nlanes;268for( ; i < len; i += VECSZ*2 )269{270if( i + VECSZ*2 > len )271{272if( i == 0 || src == dst )273break;274i = len - VECSZ*2;275}276v_float32 t0 = vx_load(src + i), t1 = vx_load(src + i + VECSZ);277t0 = v_invsqrt(t0);278t1 = v_invsqrt(t1);279v_store(dst + i, t0); v_store(dst + i + VECSZ, t1);280}281vx_cleanup();282#endif283284for( ; i < len; i++ )285dst[i] = 1/std::sqrt(src[i]);286}287288289void invSqrt64f(const double* src, double* dst, int len)290{291CV_INSTRUMENT_REGION();292int i = 0;293294#if CV_SIMD_64F295const int VECSZ = v_float64::nlanes;296for ( ; i < len; i += VECSZ*2)297{298if( i + VECSZ*2 > len )299{300if( i == 0 || src == dst )301break;302i = len - VECSZ*2;303}304v_float64 t0 = vx_load(src + i), t1 = vx_load(src + i + VECSZ);305t0 = v_invsqrt(t0);306t1 = v_invsqrt(t1);307v_store(dst + i, t0); v_store(dst + i + VECSZ, t1);308}309#endif310311for( ; i < len; i++ )312dst[i] = 1/std::sqrt(src[i]);313}314315316void sqrt32f(const float* src, float* dst, int len)317{318CV_INSTRUMENT_REGION();319320int i = 0;321322#if CV_SIMD323const int VECSZ = v_float32::nlanes;324for( ; i < len; i += VECSZ*2 )325{326if( i + VECSZ*2 > len )327{328if( i == 0 || src == dst )329break;330i = len - VECSZ*2;331}332v_float32 t0 = vx_load(src + i), t1 = vx_load(src + i + VECSZ);333t0 = v_sqrt(t0);334t1 = v_sqrt(t1);335v_store(dst + i, t0); v_store(dst + i + VECSZ, t1);336}337vx_cleanup();338#endif339340for( ; i < len; i++ )341dst[i] = std::sqrt(src[i]);342}343344345void sqrt64f(const double* src, double* dst, int len)346{347CV_INSTRUMENT_REGION();348349int i = 0;350351#if CV_SIMD_64F352const int VECSZ = v_float64::nlanes;353for( ; i < len; i += VECSZ*2 )354{355if( i + VECSZ*2 > len )356{357if( i == 0 || src == dst )358break;359i = len - VECSZ*2;360}361v_float64 t0 = vx_load(src + i), t1 = vx_load(src + i + VECSZ);362t0 = v_sqrt(t0);363t1 = v_sqrt(t1);364v_store(dst + i, t0); v_store(dst + i + VECSZ, t1);365}366vx_cleanup();367#endif368369for( ; i < len; i++ )370dst[i] = std::sqrt(src[i]);371}372373// Workaround for ICE in MSVS 2015 update 3 (issue #7795)374// CV_AVX is not used here, because generated code is faster in non-AVX mode.375// (tested with disabled IPP on i5-6300U)376#if (defined _MSC_VER && _MSC_VER >= 1900) || defined(__EMSCRIPTEN__)377void exp32f(const float *src, float *dst, int n)378{379CV_INSTRUMENT_REGION();380381for (int i = 0; i < n; i++)382{383dst[i] = std::exp(src[i]);384}385}386387void exp64f(const double *src, double *dst, int n)388{389CV_INSTRUMENT_REGION();390391for (int i = 0; i < n; i++)392{393dst[i] = std::exp(src[i]);394}395}396397void log32f(const float *src, float *dst, int n)398{399CV_INSTRUMENT_REGION();400401for (int i = 0; i < n; i++)402{403dst[i] = std::log(src[i]);404}405}406void log64f(const double *src, double *dst, int n)407{408CV_INSTRUMENT_REGION();409410for (int i = 0; i < n; i++)411{412dst[i] = std::log(src[i]);413}414}415#else416417////////////////////////////////////// EXP /////////////////////////////////////418419#define EXPTAB_SCALE 6420#define EXPTAB_MASK ((1 << EXPTAB_SCALE) - 1)421422#define EXPPOLY_32F_A0 .9670371139572337719125840413672004409288e-2423424// the code below uses _mm_cast* intrinsics, which are not avialable on VS2005425#if (defined _MSC_VER && _MSC_VER < 1500) || \426(!defined __APPLE__ && defined __GNUC__ && __GNUC__*100 + __GNUC_MINOR__ < 402)427#undef CV_SSE2428#define CV_SSE2 0429#endif430431static const double exp_prescale = 1.4426950408889634073599246810019 * (1 << EXPTAB_SCALE);432static const double exp_postscale = 1./(1 << EXPTAB_SCALE);433static const double exp_max_val = 3000.*(1 << EXPTAB_SCALE); // log10(DBL_MAX) < 3000434435void exp32f( const float *_x, float *y, int n )436{437CV_INSTRUMENT_REGION();438439const float* const expTab_f = cv::details::getExpTab32f();440441const float442A4 = (float)(1.000000000000002438532970795181890933776 / EXPPOLY_32F_A0),443A3 = (float)(.6931471805521448196800669615864773144641 / EXPPOLY_32F_A0),444A2 = (float)(.2402265109513301490103372422686535526573 / EXPPOLY_32F_A0),445A1 = (float)(.5550339366753125211915322047004666939128e-1 / EXPPOLY_32F_A0);446447int i = 0;448const Cv32suf* x = (const Cv32suf*)_x;449float minval = (float)(-exp_max_val/exp_prescale);450float maxval = (float)(exp_max_val/exp_prescale);451float postscale = (float)exp_postscale;452453#if CV_SIMD454const int VECSZ = v_float32::nlanes;455const v_float32 vprescale = vx_setall_f32((float)exp_prescale);456const v_float32 vpostscale = vx_setall_f32((float)exp_postscale);457const v_float32 vminval = vx_setall_f32(minval);458const v_float32 vmaxval = vx_setall_f32(maxval);459460const v_float32 vA1 = vx_setall_f32((float)A1);461const v_float32 vA2 = vx_setall_f32((float)A2);462const v_float32 vA3 = vx_setall_f32((float)A3);463const v_float32 vA4 = vx_setall_f32((float)A4);464465const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK);466bool y_aligned = (size_t)(void*)y % 32 == 0;467468for( ; i < n; i += VECSZ*2 )469{470if( i + VECSZ*2 > n )471{472if( i == 0 || _x == y )473break;474i = n - VECSZ*2;475y_aligned = false;476}477478v_float32 xf0 = vx_load(&x[i].f), xf1 = vx_load(&x[i + VECSZ].f);479480xf0 = v_min(v_max(xf0, vminval), vmaxval);481xf1 = v_min(v_max(xf1, vminval), vmaxval);482483xf0 *= vprescale;484xf1 *= vprescale;485486v_int32 xi0 = v_round(xf0);487v_int32 xi1 = v_round(xf1);488xf0 = (xf0 - v_cvt_f32(xi0))*vpostscale;489xf1 = (xf1 - v_cvt_f32(xi1))*vpostscale;490491v_float32 yf0 = v_lut(expTab_f, xi0 & vidxmask);492v_float32 yf1 = v_lut(expTab_f, xi1 & vidxmask);493494v_int32 v0 = vx_setzero_s32(), v127 = vx_setall_s32(127), v255 = vx_setall_s32(255);495xi0 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi0) + v127, v0), v255);496xi1 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi1) + v127, v0), v255);497498yf0 *= v_reinterpret_as_f32(v_shl<23>(xi0));499yf1 *= v_reinterpret_as_f32(v_shl<23>(xi1));500501v_float32 zf0 = xf0 + vA1;502v_float32 zf1 = xf1 + vA1;503504zf0 = v_fma(zf0, xf0, vA2);505zf1 = v_fma(zf1, xf1, vA2);506507zf0 = v_fma(zf0, xf0, vA3);508zf1 = v_fma(zf1, xf1, vA3);509510zf0 = v_fma(zf0, xf0, vA4);511zf1 = v_fma(zf1, xf1, vA4);512513zf0 *= yf0;514zf1 *= yf1;515516if( y_aligned )517{518v_store_aligned(y + i, zf0);519v_store_aligned(y + i + VECSZ, zf1);520}521else522{523v_store(y + i, zf0);524v_store(y + i + VECSZ, zf1);525}526}527vx_cleanup();528#endif529530for( ; i < n; i++ )531{532float x0 = x[i].f;533x0 = std::min(std::max(x0, minval), maxval);534x0 *= (float)exp_prescale;535Cv32suf buf;536537int xi = saturate_cast<int>(x0);538x0 = (x0 - xi)*postscale;539540int t = (xi >> EXPTAB_SCALE) + 127;541t = !(t & ~255) ? t : t < 0 ? 0 : 255;542buf.i = t << 23;543544y[i] = buf.f * expTab_f[xi & EXPTAB_MASK] * ((((x0 + A1)*x0 + A2)*x0 + A3)*x0 + A4);545}546}547548void exp64f( const double *_x, double *y, int n )549{550CV_INSTRUMENT_REGION();551552const double* const expTab = cv::details::getExpTab64f();553554const double555A5 = .99999999999999999998285227504999 / EXPPOLY_32F_A0,556A4 = .69314718055994546743029643825322 / EXPPOLY_32F_A0,557A3 = .24022650695886477918181338054308 / EXPPOLY_32F_A0,558A2 = .55504108793649567998466049042729e-1 / EXPPOLY_32F_A0,559A1 = .96180973140732918010002372686186e-2 / EXPPOLY_32F_A0,560A0 = .13369713757180123244806654839424e-2 / EXPPOLY_32F_A0;561562int i = 0;563const Cv64suf* x = (const Cv64suf*)_x;564double minval = (-exp_max_val/exp_prescale);565double maxval = (exp_max_val/exp_prescale);566567#if CV_SIMD_64F568const int VECSZ = v_float64::nlanes;569const v_float64 vprescale = vx_setall_f64(exp_prescale);570const v_float64 vpostscale = vx_setall_f64(exp_postscale);571const v_float64 vminval = vx_setall_f64(minval);572const v_float64 vmaxval = vx_setall_f64(maxval);573574const v_float64 vA1 = vx_setall_f64(A1);575const v_float64 vA2 = vx_setall_f64(A2);576const v_float64 vA3 = vx_setall_f64(A3);577const v_float64 vA4 = vx_setall_f64(A4);578const v_float64 vA5 = vx_setall_f64(A5);579580const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK);581bool y_aligned = (size_t)(void*)y % 32 == 0;582583for( ; i < n; i += VECSZ*2 )584{585if( i + VECSZ*2 > n )586{587if( i == 0 || _x == y )588break;589i = n - VECSZ*2;590y_aligned = false;591}592593v_float64 xf0 = vx_load(&x[i].f), xf1 = vx_load(&x[i + VECSZ].f);594595xf0 = v_min(v_max(xf0, vminval), vmaxval);596xf1 = v_min(v_max(xf1, vminval), vmaxval);597598xf0 *= vprescale;599xf1 *= vprescale;600601v_int32 xi0 = v_round(xf0);602v_int32 xi1 = v_round(xf1);603xf0 = (xf0 - v_cvt_f64(xi0))*vpostscale;604xf1 = (xf1 - v_cvt_f64(xi1))*vpostscale;605606v_float64 yf0 = v_lut(expTab, xi0 & vidxmask);607v_float64 yf1 = v_lut(expTab, xi1 & vidxmask);608609v_int32 v0 = vx_setzero_s32(), v1023 = vx_setall_s32(1023), v2047 = vx_setall_s32(2047);610xi0 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi0) + v1023, v0), v2047);611xi1 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi1) + v1023, v0), v2047);612613v_int64 xq0, xq1, dummy;614v_expand(xi0, xq0, dummy);615v_expand(xi1, xq1, dummy);616617yf0 *= v_reinterpret_as_f64(v_shl<52>(xq0));618yf1 *= v_reinterpret_as_f64(v_shl<52>(xq1));619620v_float64 zf0 = xf0 + vA1;621v_float64 zf1 = xf1 + vA1;622623zf0 = v_fma(zf0, xf0, vA2);624zf1 = v_fma(zf1, xf1, vA2);625626zf0 = v_fma(zf0, xf0, vA3);627zf1 = v_fma(zf1, xf1, vA3);628629zf0 = v_fma(zf0, xf0, vA4);630zf1 = v_fma(zf1, xf1, vA4);631632zf0 = v_fma(zf0, xf0, vA5);633zf1 = v_fma(zf1, xf1, vA5);634635zf0 *= yf0;636zf1 *= yf1;637638if( y_aligned )639{640v_store_aligned(y + i, zf0);641v_store_aligned(y + i + VECSZ, zf1);642}643else644{645v_store(y + i, zf0);646v_store(y + i + VECSZ, zf1);647}648}649vx_cleanup();650#endif651652for( ; i < n; i++ )653{654double x0 = x[i].f;655x0 = std::min(std::max(x0, minval), maxval);656x0 *= exp_prescale;657Cv64suf buf;658659int xi = saturate_cast<int>(x0);660x0 = (x0 - xi)*exp_postscale;661662int t = (xi >> EXPTAB_SCALE) + 1023;663t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;664buf.i = (int64)t << 52;665666y[i] = buf.f * expTab[xi & EXPTAB_MASK] * (((((A0*x0 + A1)*x0 + A2)*x0 + A3)*x0 + A4)*x0 + A5);667}668}669670#undef EXPTAB_SCALE671#undef EXPTAB_MASK672#undef EXPPOLY_32F_A0673674/////////////////////////////////////////// LOG ///////////////////////////////////////675676#define LOGTAB_SCALE 8677#define LOGTAB_MASK ((1 << LOGTAB_SCALE) - 1)678679#define LOGTAB_TRANSLATE(tab, x, h) (((x) - 1.f)*tab[(h)+1])680static const double ln_2 = 0.69314718055994530941723212145818;681682void log32f( const float *_x, float *y, int n )683{684CV_INSTRUMENT_REGION();685686const float* const logTab_f = cv::details::getLogTab32f();687688const int LOGTAB_MASK2_32F = (1 << (23 - LOGTAB_SCALE)) - 1;689const float690A0 = 0.3333333333333333333333333f,691A1 = -0.5f,692A2 = 1.f;693694int i = 0;695const int* x = (const int*)_x;696697#if CV_SIMD698const int VECSZ = v_float32::nlanes;699const v_float32 vln2 = vx_setall_f32((float)ln_2);700const v_float32 v1 = vx_setall_f32(1.f);701const v_float32 vshift = vx_setall_f32(-1.f/512);702703const v_float32 vA0 = vx_setall_f32(A0);704const v_float32 vA1 = vx_setall_f32(A1);705const v_float32 vA2 = vx_setall_f32(A2);706707for( ; i < n; i += VECSZ )708{709if( i + VECSZ > n )710{711if( i == 0 || _x == y )712break;713i = n - VECSZ;714}715716v_int32 h0 = vx_load(x + i);717v_int32 yi0 = (v_shr<23>(h0) & vx_setall_s32(255)) - vx_setall_s32(127);718v_int32 xi0 = (h0 & vx_setall_s32(LOGTAB_MASK2_32F)) | vx_setall_s32(127 << 23);719720h0 = v_shr<23 - LOGTAB_SCALE - 1>(h0) & vx_setall_s32(LOGTAB_MASK*2);721v_float32 yf0, xf0;722723v_lut_deinterleave(logTab_f, h0, yf0, xf0);724725yf0 = v_fma(v_cvt_f32(yi0), vln2, yf0);726727v_float32 delta = v_reinterpret_as_f32(h0 == vx_setall_s32(510)) & vshift;728xf0 = v_fma((v_reinterpret_as_f32(xi0) - v1), xf0, delta);729730v_float32 zf0 = v_fma(xf0, vA0, vA1);731zf0 = v_fma(zf0, xf0, vA2);732zf0 = v_fma(zf0, xf0, yf0);733734v_store(y + i, zf0);735}736vx_cleanup();737#endif738739for( ; i < n; i++ )740{741Cv32suf buf;742int i0 = x[i];743744buf.i = (i0 & LOGTAB_MASK2_32F) | (127 << 23);745int idx = (i0 >> (23 - LOGTAB_SCALE - 1)) & (LOGTAB_MASK*2);746747float y0 = (((i0 >> 23) & 0xff) - 127) * (float)ln_2 + logTab_f[idx];748float x0 = (buf.f - 1.f)*logTab_f[idx + 1] + (idx == 510 ? -1.f/512 : 0.f);749y[i] = ((A0*x0 + A1)*x0 + A2)*x0 + y0;750}751}752753void log64f( const double *x, double *y, int n )754{755CV_INSTRUMENT_REGION();756757const double* const logTab = cv::details::getLogTab64f();758759const int64 LOGTAB_MASK2_64F = ((int64)1 << (52 - LOGTAB_SCALE)) - 1;760const double761A7 = 1.0,762A6 = -0.5,763A5 = 0.333333333333333314829616256247390992939472198486328125,764A4 = -0.25,765A3 = 0.2,766A2 = -0.1666666666666666574148081281236954964697360992431640625,767A1 = 0.1428571428571428769682682968777953647077083587646484375,768A0 = -0.125;769770int i = 0;771772#if CV_SIMD_64F773const int VECSZ = v_float64::nlanes;774const v_float64 vln2 = vx_setall_f64(ln_2);775776const v_float64777vA0 = vx_setall_f64(A0), vA1 = vx_setall_f64(A1),778vA2 = vx_setall_f64(A2), vA3 = vx_setall_f64(A3),779vA4 = vx_setall_f64(A4), vA5 = vx_setall_f64(A5),780vA6 = vx_setall_f64(A6), vA7 = vx_setall_f64(A7);781782for( ; i < n; i += VECSZ )783{784if( i + VECSZ > n )785{786if( i == 0 || x == y )787break;788i = n - VECSZ;789}790791v_int64 h0 = vx_load((const int64*)x + i);792v_int32 yi0 = v_pack(v_shr<52>(h0), vx_setzero_s64());793yi0 = (yi0 & vx_setall_s32(0x7ff)) - vx_setall_s32(1023);794795v_int64 xi0 = (h0 & vx_setall_s64(LOGTAB_MASK2_64F)) | vx_setall_s64((int64)1023 << 52);796h0 = v_shr<52 - LOGTAB_SCALE - 1>(h0);797v_int32 idx = v_pack(h0, h0) & vx_setall_s32(LOGTAB_MASK*2);798799v_float64 xf0, yf0;800v_lut_deinterleave(logTab, idx, yf0, xf0);801802yf0 = v_fma(v_cvt_f64(yi0), vln2, yf0);803v_float64 delta = v_cvt_f64(idx == vx_setall_s32(510))*vx_setall_f64(1./512);804xf0 = v_fma(v_reinterpret_as_f64(xi0) - vx_setall_f64(1.), xf0, delta);805806v_float64 xq = xf0*xf0;807v_float64 zf0 = v_fma(xq, vA0, vA2);808v_float64 zf1 = v_fma(xq, vA1, vA3);809zf0 = v_fma(zf0, xq, vA4);810zf1 = v_fma(zf1, xq, vA5);811zf0 = v_fma(zf0, xq, vA6);812zf1 = v_fma(zf1, xq, vA7);813zf1 = v_fma(zf1, xf0, yf0);814zf0 = v_fma(zf0, xq, zf1);815816v_store(y + i, zf0);817}818#endif819820for( ; i < n; i++ )821{822Cv64suf buf;823int64 i0 = ((const int64*)x)[i];824825buf.i = (i0 & LOGTAB_MASK2_64F) | ((int64)1023 << 52);826int idx = (int)(i0 >> (52 - LOGTAB_SCALE - 1)) & (LOGTAB_MASK*2);827828double y0 = (((int)(i0 >> 52) & 0x7ff) - 1023) * ln_2 + logTab[idx];829double x0 = (buf.f - 1.)*logTab[idx + 1] + (idx == 510 ? -1./512 : 0.);830831double xq = x0*x0;832y[i] = (((A0*xq + A2)*xq + A4)*xq + A6)*xq + (((A1*xq + A3)*xq + A5)*xq + A7)*x0 + y0;833}834}835836#endif // issue 7795837838float fastAtan2( float y, float x )839{840return atan_f32(y, x);841}842843#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY844845CV_CPU_OPTIMIZATION_NAMESPACE_END846847}} // namespace cv::hal848849850