Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/modules/core/src/mathfuncs_core.simd.hpp
16337 views
1
// This file is part of OpenCV project.
2
// It is subject to the license terms in the LICENSE file found in the top-level directory
3
// of this distribution and at http://opencv.org/license.html.
4
5
#include "mathfuncs.hpp"
6
7
namespace cv { namespace hal {
8
9
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
10
11
// forward declarations
12
void fastAtan32f(const float *Y, const float *X, float *angle, int len, bool angleInDegrees);
13
void fastAtan64f(const double *Y, const double *X, double *angle, int len, bool angleInDegrees);
14
void fastAtan2(const float *Y, const float *X, float *angle, int len, bool angleInDegrees);
15
void magnitude32f(const float* x, const float* y, float* mag, int len);
16
void magnitude64f(const double* x, const double* y, double* mag, int len);
17
void invSqrt32f(const float* src, float* dst, int len);
18
void invSqrt64f(const double* src, double* dst, int len);
19
void sqrt32f(const float* src, float* dst, int len);
20
void sqrt64f(const double* src, double* dst, int len);
21
void exp32f(const float *src, float *dst, int n);
22
void exp64f(const double *src, double *dst, int n);
23
void log32f(const float *src, float *dst, int n);
24
void log64f(const double *src, double *dst, int n);
25
float fastAtan2(float y, float x);
26
27
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
28
29
using namespace std;
30
using namespace cv;
31
32
namespace {
33
34
#ifdef __EMSCRIPTEN__
// Emscripten/WASM build: delegate to libm atan2 and normalize to degrees,
// rather than using the polynomial approximation below.
static inline float atan_f32(float y, float x)
{
float a = atan2(y, x) * 180.0f / CV_PI;
if (a < 0.0f)
a += 360.0f;
if (a >= 360.0f)
a -= 360.0f;
return a; // range [0; 360)
}
#else
// Coefficients of an odd degree-7 polynomial approximating atan(c) for
// c in [0; 1], pre-multiplied by 180/pi so the result is in degrees.
static const float atan2_p1 = 0.9997878412794807f*(float)(180/CV_PI);
static const float atan2_p3 = -0.3258083974640975f*(float)(180/CV_PI);
static const float atan2_p5 = 0.1555786518463281f*(float)(180/CV_PI);
static const float atan2_p7 = -0.04432655554792128f*(float)(180/CV_PI);

// Fast approximate atan2: returns the angle of (x, y) in degrees, [0; 360).
static inline float atan_f32(float y, float x)
{
float ax = std::abs(x), ay = std::abs(y);
float a, c, c2;
if( ax >= ay )
{
// |tan| = ay/ax <= 1; epsilon keeps 0/0 from producing NaN
c = ay/(ax + (float)DBL_EPSILON);
c2 = c*c;
a = (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c;
}
else
{
// complement identity: atan(t) = 90deg - atan(1/t) for t > 1
c = ax/(ay + (float)DBL_EPSILON);
c2 = c*c;
a = 90.f - (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c;
}
// fold the first-quadrant angle into the proper quadrant by the signs
if( x < 0 )
a = 180.f - a;
if( y < 0 )
a = 360.f - a;
return a;
}
#endif
73
74
#if CV_SIMD
75
76
// Vectorized counterpart of atan_f32: computes the same octant-reduced
// polynomial approximation lane-wise, then multiplies by 'scale'
// (1 for degrees, pi/180 for radians — chosen by the caller).
struct v_atan_f32
{
explicit v_atan_f32(const float& scale)
{
eps = vx_setall_f32((float)DBL_EPSILON);
z = vx_setzero_f32();
p7 = vx_setall_f32(atan2_p7);
p5 = vx_setall_f32(atan2_p5);
p3 = vx_setall_f32(atan2_p3);
p1 = vx_setall_f32(atan2_p1);
val90 = vx_setall_f32(90.f);
val180 = vx_setall_f32(180.f);
val360 = vx_setall_f32(360.f);
s = vx_setall_f32(scale);
}

// Per-lane angle of (x, y): degrees in [0; 360) times the stored scale.
v_float32 compute(const v_float32& y, const v_float32& x)
{
v_float32 ax = v_abs(x);
v_float32 ay = v_abs(y);
// branch-free reduction: c = min/max is always in [0; 1]
v_float32 c = v_min(ax, ay) / (v_max(ax, ay) + eps);
v_float32 cc = c * c;
// odd polynomial evaluated with fused multiply-adds
v_float32 a = v_fma(v_fma(v_fma(cc, p7, p5), cc, p3), cc, p1)*c;
a = v_select(ax >= ay, a, val90 - a);  // complement where |y| > |x|
a = v_select(x < z, val180 - a, a);    // left half-plane
a = v_select(y < z, val360 - a, a);    // lower half-plane
return a * s;
}

v_float32 eps;    // tiny denominator guard (avoids 0/0)
v_float32 z;      // zero, for sign tests
v_float32 p7;     // polynomial coefficients (degree units)
v_float32 p5;
v_float32 p3;
v_float32 p1;
v_float32 val90;
v_float32 val180;
v_float32 val360;
v_float32 s;      // output scale factor
};
116
117
#endif
118
119
} // anonymous::
120
121
///////////////////////////////////// ATAN2 ////////////////////////////////////
122
123
// Core atan2 kernel: fills angle[0..len) with the angle of (X[i], Y[i]),
// in degrees when angleInDegrees, else radians. SIMD main loop processes
// two registers per iteration; the last partial chunk is handled by
// re-processing an overlapping full-width tail when that is safe.
static void fastAtan32f_(const float *Y, const float *X, float *angle, int len, bool angleInDegrees )
{
float scale = angleInDegrees ? 1.f : (float)(CV_PI/180);
int i = 0;
#if CV_SIMD
const int VECSZ = v_float32::nlanes;
v_atan_f32 v(scale);

for( ; i < len; i += VECSZ*2 )
{
if( i + VECSZ*2 > len )
{
// if it's inplace operation, we cannot repeatedly process
// the tail for the second time, so we have to use the
// scalar code
if( i == 0 || angle == X || angle == Y )
break;
i = len - VECSZ*2;  // overlap with already-written output is safe here
}

v_float32 y0 = vx_load(Y + i);
v_float32 x0 = vx_load(X + i);
v_float32 y1 = vx_load(Y + i + VECSZ);
v_float32 x1 = vx_load(X + i + VECSZ);

v_float32 r0 = v.compute(y0, x0);
v_float32 r1 = v.compute(y1, x1);

v_store(angle + i, r0);
v_store(angle + i + VECSZ, r1);
}
vx_cleanup();
#endif

// scalar fallback / remainder
for( ; i < len; i++ )
angle[i] = atan_f32(Y[i], X[i])*scale;
}
160
161
// Public entry point: instrumented wrapper around the kernel above.
void fastAtan32f(const float *Y, const float *X, float *angle, int len, bool angleInDegrees )
{
CV_INSTRUMENT_REGION();
fastAtan32f_(Y, X, angle, len, angleInDegrees );
}
166
167
void fastAtan64f(const double *Y, const double *X, double *angle, int len, bool angleInDegrees)
168
{
169
CV_INSTRUMENT_REGION();
170
171
const int BLKSZ = 128;
172
float ybuf[BLKSZ], xbuf[BLKSZ], abuf[BLKSZ];
173
for( int i = 0; i < len; i += BLKSZ )
174
{
175
int j, blksz = std::min(BLKSZ, len - i);
176
for( j = 0; j < blksz; j++ )
177
{
178
ybuf[j] = (float)Y[i + j];
179
xbuf[j] = (float)X[i + j];
180
}
181
fastAtan32f_(ybuf, xbuf, abuf, blksz, angleInDegrees);
182
for( j = 0; j < blksz; j++ )
183
angle[i + j] = abuf[j];
184
}
185
}
186
187
// deprecated
188
// Deprecated alias kept for backward compatibility; forwards to fastAtan32f.
void fastAtan2(const float *Y, const float *X, float *angle, int len, bool angleInDegrees )
{
CV_INSTRUMENT_REGION();
fastAtan32f(Y, X, angle, len, angleInDegrees);
}
193
194
// mag[i] = sqrt(x[i]^2 + y[i]^2) for i in [0; len).
void magnitude32f(const float* x, const float* y, float* mag, int len)
{
CV_INSTRUMENT_REGION();

int i = 0;

#if CV_SIMD
const int VECSZ = v_float32::nlanes;
for( ; i < len; i += VECSZ*2 )
{
if( i + VECSZ*2 > len )
{
// in-place or very short input: fall back to the scalar tail,
// since re-processing an overlapping tail would re-read outputs
if( i == 0 || mag == x || mag == y )
break;
i = len - VECSZ*2;  // overlapping full-width tail
}
v_float32 x0 = vx_load(x + i), x1 = vx_load(x + i + VECSZ);
v_float32 y0 = vx_load(y + i), y1 = vx_load(y + i + VECSZ);
x0 = v_sqrt(v_muladd(x0, x0, y0*y0));
x1 = v_sqrt(v_muladd(x1, x1, y1*y1));
v_store(mag + i, x0);
v_store(mag + i + VECSZ, x1);
}
vx_cleanup();
#endif

// scalar fallback / remainder
for( ; i < len; i++ )
{
float x0 = x[i], y0 = y[i];
mag[i] = std::sqrt(x0*x0 + y0*y0);
}
}
226
227
// Double-precision variant: mag[i] = sqrt(x[i]^2 + y[i]^2).
void magnitude64f(const double* x, const double* y, double* mag, int len)
{
CV_INSTRUMENT_REGION();

int i = 0;

#if CV_SIMD_64F
const int VECSZ = v_float64::nlanes;
for( ; i < len; i += VECSZ*2 )
{
if( i + VECSZ*2 > len )
{
// in-place or very short input: use the scalar tail instead of
// an overlapping vector tail
if( i == 0 || mag == x || mag == y )
break;
i = len - VECSZ*2;
}
v_float64 x0 = vx_load(x + i), x1 = vx_load(x + i + VECSZ);
v_float64 y0 = vx_load(y + i), y1 = vx_load(y + i + VECSZ);
x0 = v_sqrt(v_muladd(x0, x0, y0*y0));
x1 = v_sqrt(v_muladd(x1, x1, y1*y1));
v_store(mag + i, x0);
v_store(mag + i + VECSZ, x1);
}
vx_cleanup();
#endif

// scalar fallback / remainder
for( ; i < len; i++ )
{
double x0 = x[i], y0 = y[i];
mag[i] = std::sqrt(x0*x0 + y0*y0);
}
}
259
260
261
// dst[i] = 1 / sqrt(src[i]) for i in [0; len).
void invSqrt32f(const float* src, float* dst, int len)
{
CV_INSTRUMENT_REGION();

int i = 0;

#if CV_SIMD
const int VECSZ = v_float32::nlanes;
for( ; i < len; i += VECSZ*2 )
{
if( i + VECSZ*2 > len )
{
// in-place or very short input: finish with the scalar loop
if( i == 0 || src == dst )
break;
i = len - VECSZ*2;  // overlapping full-width tail
}
v_float32 t0 = vx_load(src + i), t1 = vx_load(src + i + VECSZ);
t0 = v_invsqrt(t0);
t1 = v_invsqrt(t1);
v_store(dst + i, t0); v_store(dst + i + VECSZ, t1);
}
vx_cleanup();
#endif

// scalar fallback / remainder
for( ; i < len; i++ )
dst[i] = 1/std::sqrt(src[i]);
}
288
289
290
void invSqrt64f(const double* src, double* dst, int len)
291
{
292
CV_INSTRUMENT_REGION();
293
int i = 0;
294
295
#if CV_SIMD_64F
296
const int VECSZ = v_float64::nlanes;
297
for ( ; i < len; i += VECSZ*2)
298
{
299
if( i + VECSZ*2 > len )
300
{
301
if( i == 0 || src == dst )
302
break;
303
i = len - VECSZ*2;
304
}
305
v_float64 t0 = vx_load(src + i), t1 = vx_load(src + i + VECSZ);
306
t0 = v_invsqrt(t0);
307
t1 = v_invsqrt(t1);
308
v_store(dst + i, t0); v_store(dst + i + VECSZ, t1);
309
}
310
#endif
311
312
for( ; i < len; i++ )
313
dst[i] = 1/std::sqrt(src[i]);
314
}
315
316
317
// dst[i] = sqrt(src[i]) for i in [0; len).
void sqrt32f(const float* src, float* dst, int len)
{
CV_INSTRUMENT_REGION();

int i = 0;

#if CV_SIMD
const int VECSZ = v_float32::nlanes;
for( ; i < len; i += VECSZ*2 )
{
if( i + VECSZ*2 > len )
{
// in-place or very short input: finish with the scalar loop
if( i == 0 || src == dst )
break;
i = len - VECSZ*2;  // overlapping full-width tail
}
v_float32 t0 = vx_load(src + i), t1 = vx_load(src + i + VECSZ);
t0 = v_sqrt(t0);
t1 = v_sqrt(t1);
v_store(dst + i, t0); v_store(dst + i + VECSZ, t1);
}
vx_cleanup();
#endif

// scalar fallback / remainder
for( ; i < len; i++ )
dst[i] = std::sqrt(src[i]);
}
344
345
346
// dst[i] = sqrt(src[i]) for i in [0; len), double precision.
void sqrt64f(const double* src, double* dst, int len)
{
CV_INSTRUMENT_REGION();

int i = 0;

#if CV_SIMD_64F
const int VECSZ = v_float64::nlanes;
for( ; i < len; i += VECSZ*2 )
{
if( i + VECSZ*2 > len )
{
// in-place or very short input: finish with the scalar loop
if( i == 0 || src == dst )
break;
i = len - VECSZ*2;  // overlapping full-width tail
}
v_float64 t0 = vx_load(src + i), t1 = vx_load(src + i + VECSZ);
t0 = v_sqrt(t0);
t1 = v_sqrt(t1);
v_store(dst + i, t0); v_store(dst + i + VECSZ, t1);
}
vx_cleanup();
#endif

// scalar fallback / remainder
for( ; i < len; i++ )
dst[i] = std::sqrt(src[i]);
}
373
374
// Workaround for ICE in MSVS 2015 update 3 (issue #7795)
375
// CV_AVX is not used here, because generated code is faster in non-AVX mode.
376
// (tested with disabled IPP on i5-6300U)
377
#if (defined _MSC_VER && _MSC_VER >= 1900) || defined(__EMSCRIPTEN__)
378
void exp32f(const float *src, float *dst, int n)
379
{
380
CV_INSTRUMENT_REGION();
381
382
for (int i = 0; i < n; i++)
383
{
384
dst[i] = std::exp(src[i]);
385
}
386
}
387
388
void exp64f(const double *src, double *dst, int n)
389
{
390
CV_INSTRUMENT_REGION();
391
392
for (int i = 0; i < n; i++)
393
{
394
dst[i] = std::exp(src[i]);
395
}
396
}
397
398
void log32f(const float *src, float *dst, int n)
399
{
400
CV_INSTRUMENT_REGION();
401
402
for (int i = 0; i < n; i++)
403
{
404
dst[i] = std::log(src[i]);
405
}
406
}
407
void log64f(const double *src, double *dst, int n)
408
{
409
CV_INSTRUMENT_REGION();
410
411
for (int i = 0; i < n; i++)
412
{
413
dst[i] = std::log(src[i]);
414
}
415
}
416
#else
417
418
////////////////////////////////////// EXP /////////////////////////////////////
419
420
// exp() is computed via a 2^EXPTAB_SCALE-entry lookup table plus a short
// polynomial; EXPTAB_MASK extracts the table index from the rounded argument.
#define EXPTAB_SCALE 6
#define EXPTAB_MASK ((1 << EXPTAB_SCALE) - 1)

// common normalization factor folded into the polynomial coefficients below
#define EXPPOLY_32F_A0 .9670371139572337719125840413672004409288e-2

// the code below uses _mm_cast* intrinsics, which are not available on VS2005
#if (defined _MSC_VER && _MSC_VER < 1500) || \
(!defined __APPLE__ && defined __GNUC__ && __GNUC__*100 + __GNUC_MINOR__ < 402)
#undef CV_SSE2
#define CV_SSE2 0
#endif

// 1.4426950... is log2(e): x*exp_prescale rescales x so that integer part
// selects the binary exponent (in 1/2^EXPTAB_SCALE steps)
static const double exp_prescale = 1.4426950408889634073599246810019 * (1 << EXPTAB_SCALE);
static const double exp_postscale = 1./(1 << EXPTAB_SCALE);
static const double exp_max_val = 3000.*(1 << EXPTAB_SCALE); // log10(DBL_MAX) < 3000
435
436
// Vectorized exp for float arrays.
// Method: clamp x, scale by log2(e)*2^EXPTAB_SCALE, split into integer part
// xi and fraction; the low EXPTAB_SCALE bits of xi index a lookup table
// (see getExpTab32f), the high bits build the result's IEEE-754 exponent
// directly, and a degree-3 polynomial in the fraction supplies the rest.
void exp32f( const float *_x, float *y, int n )
{
CV_INSTRUMENT_REGION();

const float* const expTab_f = cv::details::getExpTab32f();

// polynomial coefficients, pre-divided by the common factor EXPPOLY_32F_A0
const float
A4 = (float)(1.000000000000002438532970795181890933776 / EXPPOLY_32F_A0),
A3 = (float)(.6931471805521448196800669615864773144641 / EXPPOLY_32F_A0),
A2 = (float)(.2402265109513301490103372422686535526573 / EXPPOLY_32F_A0),
A1 = (float)(.5550339366753125211915322047004666939128e-1 / EXPPOLY_32F_A0);

int i = 0;
const Cv32suf* x = (const Cv32suf*)_x;  // access bits and value of each float
float minval = (float)(-exp_max_val/exp_prescale);
float maxval = (float)(exp_max_val/exp_prescale);
float postscale = (float)exp_postscale;

#if CV_SIMD
const int VECSZ = v_float32::nlanes;
const v_float32 vprescale = vx_setall_f32((float)exp_prescale);
const v_float32 vpostscale = vx_setall_f32((float)exp_postscale);
const v_float32 vminval = vx_setall_f32(minval);
const v_float32 vmaxval = vx_setall_f32(maxval);

const v_float32 vA1 = vx_setall_f32((float)A1);
const v_float32 vA2 = vx_setall_f32((float)A2);
const v_float32 vA3 = vx_setall_f32((float)A3);
const v_float32 vA4 = vx_setall_f32((float)A4);

const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK);
bool y_aligned = (size_t)(void*)y % 32 == 0;

for( ; i < n; i += VECSZ*2 )
{
if( i + VECSZ*2 > n )
{
// in-place or tiny input: leave the remainder to the scalar loop
if( i == 0 || _x == y )
break;
i = n - VECSZ*2;       // overlapping full-width tail
y_aligned = false;     // the shifted address may be unaligned now
}

v_float32 xf0 = vx_load(&x[i].f), xf1 = vx_load(&x[i + VECSZ].f);

// clamp to the representable range
xf0 = v_min(v_max(xf0, vminval), vmaxval);
xf1 = v_min(v_max(xf1, vminval), vmaxval);

xf0 *= vprescale;
xf1 *= vprescale;

// split into integer part (exponent + table index) and fraction
v_int32 xi0 = v_round(xf0);
v_int32 xi1 = v_round(xf1);
xf0 = (xf0 - v_cvt_f32(xi0))*vpostscale;
xf1 = (xf1 - v_cvt_f32(xi1))*vpostscale;

// table lookup by the low EXPTAB_SCALE bits
v_float32 yf0 = v_lut(expTab_f, xi0 & vidxmask);
v_float32 yf1 = v_lut(expTab_f, xi1 & vidxmask);

// build 2^k by placing the biased exponent into IEEE-754 bits [30:23]
v_int32 v0 = vx_setzero_s32(), v127 = vx_setall_s32(127), v255 = vx_setall_s32(255);
xi0 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi0) + v127, v0), v255);
xi1 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi1) + v127, v0), v255);

yf0 *= v_reinterpret_as_f32(v_shl<23>(xi0));
yf1 *= v_reinterpret_as_f32(v_shl<23>(xi1));

// degree-3 polynomial in the fraction, Horner form with FMA
v_float32 zf0 = xf0 + vA1;
v_float32 zf1 = xf1 + vA1;

zf0 = v_fma(zf0, xf0, vA2);
zf1 = v_fma(zf1, xf1, vA2);

zf0 = v_fma(zf0, xf0, vA3);
zf1 = v_fma(zf1, xf1, vA3);

zf0 = v_fma(zf0, xf0, vA4);
zf1 = v_fma(zf1, xf1, vA4);

zf0 *= yf0;
zf1 *= yf1;

if( y_aligned )
{
v_store_aligned(y + i, zf0);
v_store_aligned(y + i + VECSZ, zf1);
}
else
{
v_store(y + i, zf0);
v_store(y + i + VECSZ, zf1);
}
}
vx_cleanup();
#endif

// scalar fallback / remainder (same algorithm, one element at a time)
for( ; i < n; i++ )
{
float x0 = x[i].f;
x0 = std::min(std::max(x0, minval), maxval);
x0 *= (float)exp_prescale;
Cv32suf buf;

int xi = saturate_cast<int>(x0);
x0 = (x0 - xi)*postscale;

int t = (xi >> EXPTAB_SCALE) + 127;
t = !(t & ~255) ? t : t < 0 ? 0 : 255;  // clamp biased exponent to [0; 255]
buf.i = t << 23;

y[i] = buf.f * expTab_f[xi & EXPTAB_MASK] * ((((x0 + A1)*x0 + A2)*x0 + A3)*x0 + A4);
}
}
548
549
// Vectorized exp for double arrays: same table + polynomial scheme as
// exp32f, with a degree-5 polynomial and the 11-bit/52-bit IEEE-754
// double layout (bias 1023, mantissa shift 52).
void exp64f( const double *_x, double *y, int n )
{
CV_INSTRUMENT_REGION();

const double* const expTab = cv::details::getExpTab64f();

// polynomial coefficients, pre-divided by the common factor EXPPOLY_32F_A0
const double
A5 = .99999999999999999998285227504999 / EXPPOLY_32F_A0,
A4 = .69314718055994546743029643825322 / EXPPOLY_32F_A0,
A3 = .24022650695886477918181338054308 / EXPPOLY_32F_A0,
A2 = .55504108793649567998466049042729e-1 / EXPPOLY_32F_A0,
A1 = .96180973140732918010002372686186e-2 / EXPPOLY_32F_A0,
A0 = .13369713757180123244806654839424e-2 / EXPPOLY_32F_A0;

int i = 0;
const Cv64suf* x = (const Cv64suf*)_x;  // access bits and value of each double
double minval = (-exp_max_val/exp_prescale);
double maxval = (exp_max_val/exp_prescale);

#if CV_SIMD_64F
const int VECSZ = v_float64::nlanes;
const v_float64 vprescale = vx_setall_f64(exp_prescale);
const v_float64 vpostscale = vx_setall_f64(exp_postscale);
const v_float64 vminval = vx_setall_f64(minval);
const v_float64 vmaxval = vx_setall_f64(maxval);

const v_float64 vA1 = vx_setall_f64(A1);
const v_float64 vA2 = vx_setall_f64(A2);
const v_float64 vA3 = vx_setall_f64(A3);
const v_float64 vA4 = vx_setall_f64(A4);
const v_float64 vA5 = vx_setall_f64(A5);

const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK);
bool y_aligned = (size_t)(void*)y % 32 == 0;

for( ; i < n; i += VECSZ*2 )
{
if( i + VECSZ*2 > n )
{
// in-place or tiny input: leave the remainder to the scalar loop
if( i == 0 || _x == y )
break;
i = n - VECSZ*2;       // overlapping full-width tail
y_aligned = false;     // the shifted address may be unaligned now
}

v_float64 xf0 = vx_load(&x[i].f), xf1 = vx_load(&x[i + VECSZ].f);

// clamp to the representable range
xf0 = v_min(v_max(xf0, vminval), vmaxval);
xf1 = v_min(v_max(xf1, vminval), vmaxval);

xf0 *= vprescale;
xf1 *= vprescale;

// split into integer part (exponent + table index) and fraction
v_int32 xi0 = v_round(xf0);
v_int32 xi1 = v_round(xf1);
xf0 = (xf0 - v_cvt_f64(xi0))*vpostscale;
xf1 = (xf1 - v_cvt_f64(xi1))*vpostscale;

// table lookup by the low EXPTAB_SCALE bits
v_float64 yf0 = v_lut(expTab, xi0 & vidxmask);
v_float64 yf1 = v_lut(expTab, xi1 & vidxmask);

// clamp biased exponent to [0; 2047] (11-bit double exponent field)
v_int32 v0 = vx_setzero_s32(), v1023 = vx_setall_s32(1023), v2047 = vx_setall_s32(2047);
xi0 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi0) + v1023, v0), v2047);
xi1 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi1) + v1023, v0), v2047);

// widen to 64-bit lanes before shifting into the exponent position
v_int64 xq0, xq1, dummy;
v_expand(xi0, xq0, dummy);
v_expand(xi1, xq1, dummy);

yf0 *= v_reinterpret_as_f64(v_shl<52>(xq0));
yf1 *= v_reinterpret_as_f64(v_shl<52>(xq1));

// degree-5 polynomial in the fraction, Horner form with FMA
v_float64 zf0 = xf0 + vA1;
v_float64 zf1 = xf1 + vA1;

zf0 = v_fma(zf0, xf0, vA2);
zf1 = v_fma(zf1, xf1, vA2);

zf0 = v_fma(zf0, xf0, vA3);
zf1 = v_fma(zf1, xf1, vA3);

zf0 = v_fma(zf0, xf0, vA4);
zf1 = v_fma(zf1, xf1, vA4);

zf0 = v_fma(zf0, xf0, vA5);
zf1 = v_fma(zf1, xf1, vA5);

zf0 *= yf0;
zf1 *= yf1;

if( y_aligned )
{
v_store_aligned(y + i, zf0);
v_store_aligned(y + i + VECSZ, zf1);
}
else
{
v_store(y + i, zf0);
v_store(y + i + VECSZ, zf1);
}
}
vx_cleanup();
#endif

// scalar fallback / remainder (same algorithm, one element at a time)
for( ; i < n; i++ )
{
double x0 = x[i].f;
x0 = std::min(std::max(x0, minval), maxval);
x0 *= exp_prescale;
Cv64suf buf;

int xi = saturate_cast<int>(x0);
x0 = (x0 - xi)*exp_postscale;

int t = (xi >> EXPTAB_SCALE) + 1023;
t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;  // clamp biased exponent
buf.i = (int64)t << 52;

y[i] = buf.f * expTab[xi & EXPTAB_MASK] * (((((A0*x0 + A1)*x0 + A2)*x0 + A3)*x0 + A4)*x0 + A5);
}
}
670
671
#undef EXPTAB_SCALE
672
#undef EXPTAB_MASK
673
#undef EXPPOLY_32F_A0
674
675
/////////////////////////////////////////// LOG ///////////////////////////////////////
676
677
// log() uses a 2^LOGTAB_SCALE-entry table of (log value, reciprocal) pairs;
// LOGTAB_MASK extracts the table index from the mantissa's top bits.
#define LOGTAB_SCALE 8
#define LOGTAB_MASK ((1 << LOGTAB_SCALE) - 1)

// tab stores interleaved pairs, so the reciprocal lives at tab[h+1]
#define LOGTAB_TRANSLATE(tab, x, h) (((x) - 1.f)*tab[(h)+1])
static const double ln_2 = 0.69314718055994530941723212145818;
682
683
// Vectorized natural log for float arrays.
// Method: split the IEEE-754 bits into exponent e and mantissa m; then
// log(x) = e*ln2 + logTab[idx] + poly((m/tabval - 1)), where idx is taken
// from the mantissa's top LOGTAB_SCALE bits (see getLogTab32f).
void log32f( const float *_x, float *y, int n )
{
CV_INSTRUMENT_REGION();

const float* const logTab_f = cv::details::getLogTab32f();

const int LOGTAB_MASK2_32F = (1 << (23 - LOGTAB_SCALE)) - 1;
// degree-3 polynomial coefficients for log(1+t)/t refinement
const float
A0 = 0.3333333333333333333333333f,
A1 = -0.5f,
A2 = 1.f;

int i = 0;
const int* x = (const int*)_x;  // reinterpret input floats as raw bits

#if CV_SIMD
const int VECSZ = v_float32::nlanes;
const v_float32 vln2 = vx_setall_f32((float)ln_2);
const v_float32 v1 = vx_setall_f32(1.f);
const v_float32 vshift = vx_setall_f32(-1.f/512);

const v_float32 vA0 = vx_setall_f32(A0);
const v_float32 vA1 = vx_setall_f32(A1);
const v_float32 vA2 = vx_setall_f32(A2);

for( ; i < n; i += VECSZ )
{
if( i + VECSZ > n )
{
// in-place or tiny input: leave the remainder to the scalar loop
if( i == 0 || _x == y )
break;
i = n - VECSZ;  // overlapping full-width tail
}

v_int32 h0 = vx_load(x + i);
// unbiased exponent and mantissa renormalized into [1; 2)
v_int32 yi0 = (v_shr<23>(h0) & vx_setall_s32(255)) - vx_setall_s32(127);
v_int32 xi0 = (h0 & vx_setall_s32(LOGTAB_MASK2_32F)) | vx_setall_s32(127 << 23);

// table index: top LOGTAB_SCALE mantissa bits, doubled for the pairs
h0 = v_shr<23 - LOGTAB_SCALE - 1>(h0) & vx_setall_s32(LOGTAB_MASK*2);
v_float32 yf0, xf0;

// yf0 <- log value, xf0 <- reciprocal from the interleaved table
v_lut_deinterleave(logTab_f, h0, yf0, xf0);

yf0 = v_fma(v_cvt_f32(yi0), vln2, yf0);

// last table cell (idx == 510) needs a -1/512 correction
v_float32 delta = v_reinterpret_as_f32(h0 == vx_setall_s32(510)) & vshift;
xf0 = v_fma((v_reinterpret_as_f32(xi0) - v1), xf0, delta);

// degree-3 polynomial, Horner form with FMA
v_float32 zf0 = v_fma(xf0, vA0, vA1);
zf0 = v_fma(zf0, xf0, vA2);
zf0 = v_fma(zf0, xf0, yf0);

v_store(y + i, zf0);
}
vx_cleanup();
#endif

// scalar fallback / remainder (same algorithm, one element at a time)
for( ; i < n; i++ )
{
Cv32suf buf;
int i0 = x[i];

buf.i = (i0 & LOGTAB_MASK2_32F) | (127 << 23);
int idx = (i0 >> (23 - LOGTAB_SCALE - 1)) & (LOGTAB_MASK*2);

float y0 = (((i0 >> 23) & 0xff) - 127) * (float)ln_2 + logTab_f[idx];
float x0 = (buf.f - 1.f)*logTab_f[idx + 1] + (idx == 510 ? -1.f/512 : 0.f);
y[i] = ((A0*x0 + A1)*x0 + A2)*x0 + y0;
}
}
753
754
void log64f( const double *x, double *y, int n )
755
{
756
CV_INSTRUMENT_REGION();
757
758
const double* const logTab = cv::details::getLogTab64f();
759
760
const int64 LOGTAB_MASK2_64F = ((int64)1 << (52 - LOGTAB_SCALE)) - 1;
761
const double
762
A7 = 1.0,
763
A6 = -0.5,
764
A5 = 0.333333333333333314829616256247390992939472198486328125,
765
A4 = -0.25,
766
A3 = 0.2,
767
A2 = -0.1666666666666666574148081281236954964697360992431640625,
768
A1 = 0.1428571428571428769682682968777953647077083587646484375,
769
A0 = -0.125;
770
771
int i = 0;
772
773
#if CV_SIMD_64F
774
const int VECSZ = v_float64::nlanes;
775
const v_float64 vln2 = vx_setall_f64(ln_2);
776
777
const v_float64
778
vA0 = vx_setall_f64(A0), vA1 = vx_setall_f64(A1),
779
vA2 = vx_setall_f64(A2), vA3 = vx_setall_f64(A3),
780
vA4 = vx_setall_f64(A4), vA5 = vx_setall_f64(A5),
781
vA6 = vx_setall_f64(A6), vA7 = vx_setall_f64(A7);
782
783
for( ; i < n; i += VECSZ )
784
{
785
if( i + VECSZ > n )
786
{
787
if( i == 0 || x == y )
788
break;
789
i = n - VECSZ;
790
}
791
792
v_int64 h0 = vx_load((const int64*)x + i);
793
v_int32 yi0 = v_pack(v_shr<52>(h0), vx_setzero_s64());
794
yi0 = (yi0 & vx_setall_s32(0x7ff)) - vx_setall_s32(1023);
795
796
v_int64 xi0 = (h0 & vx_setall_s64(LOGTAB_MASK2_64F)) | vx_setall_s64((int64)1023 << 52);
797
h0 = v_shr<52 - LOGTAB_SCALE - 1>(h0);
798
v_int32 idx = v_pack(h0, h0) & vx_setall_s32(LOGTAB_MASK*2);
799
800
v_float64 xf0, yf0;
801
v_lut_deinterleave(logTab, idx, yf0, xf0);
802
803
yf0 = v_fma(v_cvt_f64(yi0), vln2, yf0);
804
v_float64 delta = v_cvt_f64(idx == vx_setall_s32(510))*vx_setall_f64(1./512);
805
xf0 = v_fma(v_reinterpret_as_f64(xi0) - vx_setall_f64(1.), xf0, delta);
806
807
v_float64 xq = xf0*xf0;
808
v_float64 zf0 = v_fma(xq, vA0, vA2);
809
v_float64 zf1 = v_fma(xq, vA1, vA3);
810
zf0 = v_fma(zf0, xq, vA4);
811
zf1 = v_fma(zf1, xq, vA5);
812
zf0 = v_fma(zf0, xq, vA6);
813
zf1 = v_fma(zf1, xq, vA7);
814
zf1 = v_fma(zf1, xf0, yf0);
815
zf0 = v_fma(zf0, xq, zf1);
816
817
v_store(y + i, zf0);
818
}
819
#endif
820
821
for( ; i < n; i++ )
822
{
823
Cv64suf buf;
824
int64 i0 = ((const int64*)x)[i];
825
826
buf.i = (i0 & LOGTAB_MASK2_64F) | ((int64)1023 << 52);
827
int idx = (int)(i0 >> (52 - LOGTAB_SCALE - 1)) & (LOGTAB_MASK*2);
828
829
double y0 = (((int)(i0 >> 52) & 0x7ff) - 1023) * ln_2 + logTab[idx];
830
double x0 = (buf.f - 1.)*logTab[idx + 1] + (idx == 510 ? -1./512 : 0.);
831
832
double xq = x0*x0;
833
y[i] = (((A0*xq + A2)*xq + A4)*xq + A6)*xq + (((A1*xq + A3)*xq + A5)*xq + A7)*x0 + y0;
834
}
835
}
836
837
#endif // issue 7795
838
839
// Scalar public API: approximate angle of (x, y) in degrees, [0; 360).
float fastAtan2( float y, float x )
{
return atan_f32(y, x);
}
843
844
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
845
846
CV_CPU_OPTIMIZATION_NAMESPACE_END
847
848
}} // namespace cv::hal
849
850