Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/modules/core/src/mathfuncs_core.simd.hpp
16337 views
1
// This file is part of OpenCV project.
2
// It is subject to the license terms in the LICENSE file found in the top-level directory
3
// of this distribution and at http://opencv.org/license.html.
4
5
#include "mathfuncs.hpp"
6
7
namespace cv { namespace hal {
8
9
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
10
11
// forward declarations
12
void fastAtan32f(const float *Y, const float *X, float *angle, int len, bool angleInDegrees);
13
void fastAtan64f(const double *Y, const double *X, double *angle, int len, bool angleInDegrees);
14
void fastAtan2(const float *Y, const float *X, float *angle, int len, bool angleInDegrees);
15
void magnitude32f(const float* x, const float* y, float* mag, int len);
16
void magnitude64f(const double* x, const double* y, double* mag, int len);
17
void invSqrt32f(const float* src, float* dst, int len);
18
void invSqrt64f(const double* src, double* dst, int len);
19
void sqrt32f(const float* src, float* dst, int len);
20
void sqrt64f(const double* src, double* dst, int len);
21
void exp32f(const float *src, float *dst, int n);
22
void exp64f(const double *src, double *dst, int n);
23
void log32f(const float *src, float *dst, int n);
24
void log64f(const double *src, double *dst, int n);
25
float fastAtan2(float y, float x);
26
27
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
28
29
using namespace std;
30
using namespace cv;
31
32
namespace {
33
34
#ifdef __EMSCRIPTEN__
// Emscripten/WASM build: delegate to libm atan2 and normalize to degrees,
// rather than using the polynomial approximation below.
static inline float atan_f32(float y, float x)
{
float a = atan2(y, x) * 180.0f / CV_PI;
if (a < 0.0f)
a += 360.0f;
if (a >= 360.0f)
a -= 360.0f;
return a; // range [0; 360)
}
#else
// Coefficients of an odd degree-7 polynomial approximating atan(c) for
// c in [0; 1], pre-multiplied by 180/pi so the result is in degrees.
static const float atan2_p1 = 0.9997878412794807f*(float)(180/CV_PI);
static const float atan2_p3 = -0.3258083974640975f*(float)(180/CV_PI);
static const float atan2_p5 = 0.1555786518463281f*(float)(180/CV_PI);
static const float atan2_p7 = -0.04432655554792128f*(float)(180/CV_PI);

// Fast approximate atan2: returns the angle of (x, y) in degrees, [0; 360).
static inline float atan_f32(float y, float x)
{
float ax = std::abs(x), ay = std::abs(y);
float a, c, c2;
if( ax >= ay )
{
// |tan| = ay/ax <= 1; epsilon keeps 0/0 from producing NaN
c = ay/(ax + (float)DBL_EPSILON);
c2 = c*c;
a = (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c;
}
else
{
// complement identity: atan(t) = 90deg - atan(1/t) for t > 1
c = ax/(ay + (float)DBL_EPSILON);
c2 = c*c;
a = 90.f - (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c;
}
// fold the first-quadrant angle into the proper quadrant by the signs
if( x < 0 )
a = 180.f - a;
if( y < 0 )
a = 360.f - a;
return a;
}
#endif
73
74
#if CV_SIMD
75
76
// Vectorized counterpart of atan_f32: computes the same octant-reduced
// polynomial approximation lane-wise, then multiplies by 'scale'
// (1 for degrees, pi/180 for radians — chosen by the caller).
struct v_atan_f32
{
explicit v_atan_f32(const float& scale)
{
eps = vx_setall_f32((float)DBL_EPSILON);
z = vx_setzero_f32();
p7 = vx_setall_f32(atan2_p7);
p5 = vx_setall_f32(atan2_p5);
p3 = vx_setall_f32(atan2_p3);
p1 = vx_setall_f32(atan2_p1);
val90 = vx_setall_f32(90.f);
val180 = vx_setall_f32(180.f);
val360 = vx_setall_f32(360.f);
s = vx_setall_f32(scale);
}

// Per-lane angle of (x, y): degrees in [0; 360) times the stored scale.
v_float32 compute(const v_float32& y, const v_float32& x)
{
v_float32 ax = v_abs(x);
v_float32 ay = v_abs(y);
// branch-free reduction: c = min/max is always in [0; 1]
v_float32 c = v_min(ax, ay) / (v_max(ax, ay) + eps);
v_float32 cc = c * c;
// odd polynomial evaluated with fused multiply-adds
v_float32 a = v_fma(v_fma(v_fma(cc, p7, p5), cc, p3), cc, p1)*c;
a = v_select(ax >= ay, a, val90 - a);  // complement where |y| > |x|
a = v_select(x < z, val180 - a, a);    // left half-plane
a = v_select(y < z, val360 - a, a);    // lower half-plane
return a * s;
}

v_float32 eps;    // tiny denominator guard (avoids 0/0)
v_float32 z;      // zero, for sign tests
v_float32 p7;     // polynomial coefficients (degree units)
v_float32 p5;
v_float32 p3;
v_float32 p1;
v_float32 val90;
v_float32 val180;
v_float32 val360;
v_float32 s;      // output scale factor
};
116
117
#endif
118
119
} // anonymous::
120
121
///////////////////////////////////// ATAN2 ////////////////////////////////////
122
123
// Core atan2 kernel: fills angle[0..len) with the angle of (X[i], Y[i]),
// in degrees when angleInDegrees, else radians. SIMD main loop processes
// two registers per iteration; the last partial chunk is handled by
// re-processing an overlapping full-width tail when that is safe.
static void fastAtan32f_(const float *Y, const float *X, float *angle, int len, bool angleInDegrees )
{
float scale = angleInDegrees ? 1.f : (float)(CV_PI/180);
int i = 0;
#if CV_SIMD
const int VECSZ = v_float32::nlanes;
v_atan_f32 v(scale);

for( ; i < len; i += VECSZ*2 )
{
if( i + VECSZ*2 > len )
{
// if it's inplace operation, we cannot repeatedly process
// the tail for the second time, so we have to use the
// scalar code
if( i == 0 || angle == X || angle == Y )
break;
i = len - VECSZ*2;  // overlap with already-written output is safe here
}

v_float32 y0 = vx_load(Y + i);
v_float32 x0 = vx_load(X + i);
v_float32 y1 = vx_load(Y + i + VECSZ);
v_float32 x1 = vx_load(X + i + VECSZ);

v_float32 r0 = v.compute(y0, x0);
v_float32 r1 = v.compute(y1, x1);

v_store(angle + i, r0);
v_store(angle + i + VECSZ, r1);
}
vx_cleanup();
#endif

// scalar fallback / remainder
for( ; i < len; i++ )
angle[i] = atan_f32(Y[i], X[i])*scale;
}
160
161
// Public entry point: instrumented wrapper around the kernel above.
void fastAtan32f(const float *Y, const float *X, float *angle, int len, bool angleInDegrees )
{
CV_INSTRUMENT_REGION();
fastAtan32f_(Y, X, angle, len, angleInDegrees );
}
166
167
void fastAtan64f(const double *Y, const double *X, double *angle, int len, bool angleInDegrees)
168
{
169
CV_INSTRUMENT_REGION();
170
171
const int BLKSZ = 128;
172
float ybuf[BLKSZ], xbuf[BLKSZ], abuf[BLKSZ];
173
for( int i = 0; i < len; i += BLKSZ )
174
{
175
int j, blksz = std::min(BLKSZ, len - i);
176
for( j = 0; j < blksz; j++ )
177
{
178
ybuf[j] = (float)Y[i + j];
179
xbuf[j] = (float)X[i + j];
180
}
181
fastAtan32f_(ybuf, xbuf, abuf, blksz, angleInDegrees);
182
for( j = 0; j < blksz; j++ )
183
angle[i + j] = abuf[j];
184
}
185
}
186
187
// deprecated
188
// Deprecated alias kept for backward compatibility; forwards to fastAtan32f.
void fastAtan2(const float *Y, const float *X, float *angle, int len, bool angleInDegrees )
{
CV_INSTRUMENT_REGION();
fastAtan32f(Y, X, angle, len, angleInDegrees);
}
193
194
// mag[i] = sqrt(x[i]^2 + y[i]^2) for i in [0; len).
void magnitude32f(const float* x, const float* y, float* mag, int len)
{
CV_INSTRUMENT_REGION();

int i = 0;

#if CV_SIMD
const int VECSZ = v_float32::nlanes;
for( ; i < len; i += VECSZ*2 )
{
if( i + VECSZ*2 > len )
{
// in-place or very short input: fall back to the scalar tail,
// since re-processing an overlapping tail would re-read outputs
if( i == 0 || mag == x || mag == y )
break;
i = len - VECSZ*2;  // overlapping full-width tail
}
v_float32 x0 = vx_load(x + i), x1 = vx_load(x + i + VECSZ);
v_float32 y0 = vx_load(y + i), y1 = vx_load(y + i + VECSZ);
x0 = v_sqrt(v_muladd(x0, x0, y0*y0));
x1 = v_sqrt(v_muladd(x1, x1, y1*y1));
v_store(mag + i, x0);
v_store(mag + i + VECSZ, x1);
}
vx_cleanup();
#endif

// scalar fallback / remainder
for( ; i < len; i++ )
{
float x0 = x[i], y0 = y[i];
mag[i] = std::sqrt(x0*x0 + y0*y0);
}
}
226
227
// Double-precision variant: mag[i] = sqrt(x[i]^2 + y[i]^2).
void magnitude64f(const double* x, const double* y, double* mag, int len)
{
CV_INSTRUMENT_REGION();

int i = 0;

#if CV_SIMD_64F
const int VECSZ = v_float64::nlanes;
for( ; i < len; i += VECSZ*2 )
{
if( i + VECSZ*2 > len )
{
// in-place or very short input: use the scalar tail instead of
// an overlapping vector tail
if( i == 0 || mag == x || mag == y )
break;
i = len - VECSZ*2;
}
v_float64 x0 = vx_load(x + i), x1 = vx_load(x + i + VECSZ);
v_float64 y0 = vx_load(y + i), y1 = vx_load(y + i + VECSZ);
x0 = v_sqrt(v_muladd(x0, x0, y0*y0));
x1 = v_sqrt(v_muladd(x1, x1, y1*y1));
v_store(mag + i, x0);
v_store(mag + i + VECSZ, x1);
}
vx_cleanup();
#endif

// scalar fallback / remainder
for( ; i < len; i++ )
{
double x0 = x[i], y0 = y[i];
mag[i] = std::sqrt(x0*x0 + y0*y0);
}
}
259
260
261
// dst[i] = 1 / sqrt(src[i]) for i in [0; len).
void invSqrt32f(const float* src, float* dst, int len)
{
CV_INSTRUMENT_REGION();

int i = 0;

#if CV_SIMD
const int VECSZ = v_float32::nlanes;
for( ; i < len; i += VECSZ*2 )
{
if( i + VECSZ*2 > len )
{
// in-place or very short input: finish with the scalar loop
if( i == 0 || src == dst )
break;
i = len - VECSZ*2;  // overlapping full-width tail
}
v_float32 t0 = vx_load(src + i), t1 = vx_load(src + i + VECSZ);
t0 = v_invsqrt(t0);
t1 = v_invsqrt(t1);
v_store(dst + i, t0); v_store(dst + i + VECSZ, t1);
}
vx_cleanup();
#endif

// scalar fallback / remainder
for( ; i < len; i++ )
dst[i] = 1/std::sqrt(src[i]);
}
288
289
290
void invSqrt64f(const double* src, double* dst, int len)
291
{
292
CV_INSTRUMENT_REGION();
293
int i = 0;
294
295
#if CV_SIMD_64F
296
const int VECSZ = v_float64::nlanes;
297
for ( ; i < len; i += VECSZ*2)
298
{
299
if( i + VECSZ*2 > len )
300
{
301
if( i == 0 || src == dst )
302
break;
303
i = len - VECSZ*2;
304
}
305
v_float64 t0 = vx_load(src + i), t1 = vx_load(src + i + VECSZ);
306
t0 = v_invsqrt(t0);
307
t1 = v_invsqrt(t1);
308
v_store(dst + i, t0); v_store(dst + i + VECSZ, t1);
309
}
310
#endif
311
312
for( ; i < len; i++ )
313
dst[i] = 1/std::sqrt(src[i]);
314
}
315
316
317
// dst[i] = sqrt(src[i]) for i in [0; len).
void sqrt32f(const float* src, float* dst, int len)
{
CV_INSTRUMENT_REGION();

int i = 0;

#if CV_SIMD
const int VECSZ = v_float32::nlanes;
for( ; i < len; i += VECSZ*2 )
{
if( i + VECSZ*2 > len )
{
// in-place or very short input: finish with the scalar loop
if( i == 0 || src == dst )
break;
i = len - VECSZ*2;  // overlapping full-width tail
}
v_float32 t0 = vx_load(src + i), t1 = vx_load(src + i + VECSZ);
t0 = v_sqrt(t0);
t1 = v_sqrt(t1);
v_store(dst + i, t0); v_store(dst + i + VECSZ, t1);
}
vx_cleanup();
#endif

// scalar fallback / remainder
for( ; i < len; i++ )
dst[i] = std::sqrt(src[i]);
}
344
345
346
// dst[i] = sqrt(src[i]) for i in [0; len), double precision.
void sqrt64f(const double* src, double* dst, int len)
{
CV_INSTRUMENT_REGION();

int i = 0;

#if CV_SIMD_64F
const int VECSZ = v_float64::nlanes;
for( ; i < len; i += VECSZ*2 )
{
if( i + VECSZ*2 > len )
{
// in-place or very short input: finish with the scalar loop
if( i == 0 || src == dst )
break;
i = len - VECSZ*2;  // overlapping full-width tail
}
v_float64 t0 = vx_load(src + i), t1 = vx_load(src + i + VECSZ);
t0 = v_sqrt(t0);
t1 = v_sqrt(t1);
v_store(dst + i, t0); v_store(dst + i + VECSZ, t1);
}
vx_cleanup();
#endif

// scalar fallback / remainder
for( ; i < len; i++ )
dst[i] = std::sqrt(src[i]);
}
373
374
// Workaround for ICE in MSVS 2015 update 3 (issue #7795)
375
// CV_AVX is not used here, because generated code is faster in non-AVX mode.
376
// (tested with disabled IPP on i5-6300U)
377
#if (defined _MSC_VER && _MSC_VER >= 1900) || defined(__EMSCRIPTEN__)
378
void exp32f(const float *src, float *dst, int n)
379
{
380
CV_INSTRUMENT_REGION();
381
382
for (int i = 0; i < n; i++)
383
{
384
dst[i] = std::exp(src[i]);
385
}
386
}
387
388
void exp64f(const double *src, double *dst, int n)
389
{
390
CV_INSTRUMENT_REGION();
391
392
for (int i = 0; i < n; i++)
393
{
394
dst[i] = std::exp(src[i]);
395
}
396
}
397
398
void log32f(const float *src, float *dst, int n)
399
{
400
CV_INSTRUMENT_REGION();
401
402
for (int i = 0; i < n; i++)
403
{
404
dst[i] = std::log(src[i]);
405
}
406
}
407
void log64f(const double *src, double *dst, int n)
408
{
409
CV_INSTRUMENT_REGION();
410
411
for (int i = 0; i < n; i++)
412
{
413
dst[i] = std::log(src[i]);
414
}
415
}
416
#else
417
418
////////////////////////////////////// EXP /////////////////////////////////////
419
420
// exp() is computed via a 2^EXPTAB_SCALE-entry lookup table plus a short
// polynomial; EXPTAB_MASK extracts the table index from the rounded argument.
#define EXPTAB_SCALE 6
#define EXPTAB_MASK ((1 << EXPTAB_SCALE) - 1)

// common normalization factor folded into the polynomial coefficients below
#define EXPPOLY_32F_A0 .9670371139572337719125840413672004409288e-2

// the code below uses _mm_cast* intrinsics, which are not available on VS2005
#if (defined _MSC_VER && _MSC_VER < 1500) || \
(!defined __APPLE__ && defined __GNUC__ && __GNUC__*100 + __GNUC_MINOR__ < 402)
#undef CV_SSE2
#define CV_SSE2 0
#endif

// 1.4426950... is log2(e): x*exp_prescale rescales x so that integer part
// selects the binary exponent (in 1/2^EXPTAB_SCALE steps)
static const double exp_prescale = 1.4426950408889634073599246810019 * (1 << EXPTAB_SCALE);
static const double exp_postscale = 1./(1 << EXPTAB_SCALE);
static const double exp_max_val = 3000.*(1 << EXPTAB_SCALE); // log10(DBL_MAX) < 3000
435
436
// Vectorized exp for float arrays.
// Method: clamp x, scale by log2(e)*2^EXPTAB_SCALE, split into integer part
// xi and fraction; the low EXPTAB_SCALE bits of xi index a lookup table
// (see getExpTab32f), the high bits build the result's IEEE-754 exponent
// directly, and a degree-3 polynomial in the fraction supplies the rest.
void exp32f( const float *_x, float *y, int n )
{
CV_INSTRUMENT_REGION();

const float* const expTab_f = cv::details::getExpTab32f();

// polynomial coefficients, pre-divided by the common factor EXPPOLY_32F_A0
const float
A4 = (float)(1.000000000000002438532970795181890933776 / EXPPOLY_32F_A0),
A3 = (float)(.6931471805521448196800669615864773144641 / EXPPOLY_32F_A0),
A2 = (float)(.2402265109513301490103372422686535526573 / EXPPOLY_32F_A0),
A1 = (float)(.5550339366753125211915322047004666939128e-1 / EXPPOLY_32F_A0);

int i = 0;
const Cv32suf* x = (const Cv32suf*)_x;  // access bits and value of each float
float minval = (float)(-exp_max_val/exp_prescale);
float maxval = (float)(exp_max_val/exp_prescale);
float postscale = (float)exp_postscale;

#if CV_SIMD
const int VECSZ = v_float32::nlanes;
const v_float32 vprescale = vx_setall_f32((float)exp_prescale);
const v_float32 vpostscale = vx_setall_f32((float)exp_postscale);
const v_float32 vminval = vx_setall_f32(minval);
const v_float32 vmaxval = vx_setall_f32(maxval);

const v_float32 vA1 = vx_setall_f32((float)A1);
const v_float32 vA2 = vx_setall_f32((float)A2);
const v_float32 vA3 = vx_setall_f32((float)A3);
const v_float32 vA4 = vx_setall_f32((float)A4);

const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK);
bool y_aligned = (size_t)(void*)y % 32 == 0;

for( ; i < n; i += VECSZ*2 )
{
if( i + VECSZ*2 > n )
{
// in-place or tiny input: leave the remainder to the scalar loop
if( i == 0 || _x == y )
break;
i = n - VECSZ*2;       // overlapping full-width tail
y_aligned = false;     // the shifted address may be unaligned now
}

v_float32 xf0 = vx_load(&x[i].f), xf1 = vx_load(&x[i + VECSZ].f);

// clamp to the representable range
xf0 = v_min(v_max(xf0, vminval), vmaxval);
xf1 = v_min(v_max(xf1, vminval), vmaxval);

xf0 *= vprescale;
xf1 *= vprescale;

// split into integer part (exponent + table index) and fraction
v_int32 xi0 = v_round(xf0);
v_int32 xi1 = v_round(xf1);
xf0 = (xf0 - v_cvt_f32(xi0))*vpostscale;
xf1 = (xf1 - v_cvt_f32(xi1))*vpostscale;

// table lookup by the low EXPTAB_SCALE bits
v_float32 yf0 = v_lut(expTab_f, xi0 & vidxmask);
v_float32 yf1 = v_lut(expTab_f, xi1 & vidxmask);

// build 2^k by placing the biased exponent into IEEE-754 bits [30:23]
v_int32 v0 = vx_setzero_s32(), v127 = vx_setall_s32(127), v255 = vx_setall_s32(255);
xi0 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi0) + v127, v0), v255);
xi1 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi1) + v127, v0), v255);

yf0 *= v_reinterpret_as_f32(v_shl<23>(xi0));
yf1 *= v_reinterpret_as_f32(v_shl<23>(xi1));

// degree-3 polynomial in the fraction, Horner form with FMA
v_float32 zf0 = xf0 + vA1;
v_float32 zf1 = xf1 + vA1;

zf0 = v_fma(zf0, xf0, vA2);
zf1 = v_fma(zf1, xf1, vA2);

zf0 = v_fma(zf0, xf0, vA3);
zf1 = v_fma(zf1, xf1, vA3);

zf0 = v_fma(zf0, xf0, vA4);
zf1 = v_fma(zf1, xf1, vA4);

zf0 *= yf0;
zf1 *= yf1;

if( y_aligned )
{
v_store_aligned(y + i, zf0);
v_store_aligned(y + i + VECSZ, zf1);
}
else
{
v_store(y + i, zf0);
v_store(y + i + VECSZ, zf1);
}
}
vx_cleanup();
#endif

// scalar fallback / remainder (same algorithm, one element at a time)
for( ; i < n; i++ )
{
float x0 = x[i].f;
x0 = std::min(std::max(x0, minval), maxval);
x0 *= (float)exp_prescale;
Cv32suf buf;

int xi = saturate_cast<int>(x0);
x0 = (x0 - xi)*postscale;

int t = (xi >> EXPTAB_SCALE) + 127;
t = !(t & ~255) ? t : t < 0 ? 0 : 255;  // clamp biased exponent to [0; 255]
buf.i = t << 23;

y[i] = buf.f * expTab_f[xi & EXPTAB_MASK] * ((((x0 + A1)*x0 + A2)*x0 + A3)*x0 + A4);
}
}
548
549
// Vectorized exp for double arrays: same table + polynomial scheme as
// exp32f, with a degree-5 polynomial and the 11-bit/52-bit IEEE-754
// double layout (bias 1023, mantissa shift 52).
void exp64f( const double *_x, double *y, int n )
{
CV_INSTRUMENT_REGION();

const double* const expTab = cv::details::getExpTab64f();

// polynomial coefficients, pre-divided by the common factor EXPPOLY_32F_A0
const double
A5 = .99999999999999999998285227504999 / EXPPOLY_32F_A0,
A4 = .69314718055994546743029643825322 / EXPPOLY_32F_A0,
A3 = .24022650695886477918181338054308 / EXPPOLY_32F_A0,
A2 = .55504108793649567998466049042729e-1 / EXPPOLY_32F_A0,
A1 = .96180973140732918010002372686186e-2 / EXPPOLY_32F_A0,
A0 = .13369713757180123244806654839424e-2 / EXPPOLY_32F_A0;

int i = 0;
const Cv64suf* x = (const Cv64suf*)_x;  // access bits and value of each double
double minval = (-exp_max_val/exp_prescale);
double maxval = (exp_max_val/exp_prescale);

#if CV_SIMD_64F
const int VECSZ = v_float64::nlanes;
const v_float64 vprescale = vx_setall_f64(exp_prescale);
const v_float64 vpostscale = vx_setall_f64(exp_postscale);
const v_float64 vminval = vx_setall_f64(minval);
const v_float64 vmaxval = vx_setall_f64(maxval);

const v_float64 vA1 = vx_setall_f64(A1);
const v_float64 vA2 = vx_setall_f64(A2);
const v_float64 vA3 = vx_setall_f64(A3);
const v_float64 vA4 = vx_setall_f64(A4);
const v_float64 vA5 = vx_setall_f64(A5);

const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK);
bool y_aligned = (size_t)(void*)y % 32 == 0;

for( ; i < n; i += VECSZ*2 )
{
if( i + VECSZ*2 > n )
{
// in-place or tiny input: leave the remainder to the scalar loop
if( i == 0 || _x == y )
break;
i = n - VECSZ*2;       // overlapping full-width tail
y_aligned = false;     // the shifted address may be unaligned now
}

v_float64 xf0 = vx_load(&x[i].f), xf1 = vx_load(&x[i + VECSZ].f);

// clamp to the representable range
xf0 = v_min(v_max(xf0, vminval), vmaxval);
xf1 = v_min(v_max(xf1, vminval), vmaxval);

xf0 *= vprescale;
xf1 *= vprescale;

// split into integer part (exponent + table index) and fraction
v_int32 xi0 = v_round(xf0);
v_int32 xi1 = v_round(xf1);
xf0 = (xf0 - v_cvt_f64(xi0))*vpostscale;
xf1 = (xf1 - v_cvt_f64(xi1))*vpostscale;

// table lookup by the low EXPTAB_SCALE bits
v_float64 yf0 = v_lut(expTab, xi0 & vidxmask);
v_float64 yf1 = v_lut(expTab, xi1 & vidxmask);

// clamp biased exponent to [0; 2047] (11-bit double exponent field)
v_int32 v0 = vx_setzero_s32(), v1023 = vx_setall_s32(1023), v2047 = vx_setall_s32(2047);
xi0 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi0) + v1023, v0), v2047);
xi1 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi1) + v1023, v0), v2047);

// widen to 64-bit lanes before shifting into the exponent position
v_int64 xq0, xq1, dummy;
v_expand(xi0, xq0, dummy);
v_expand(xi1, xq1, dummy);

yf0 *= v_reinterpret_as_f64(v_shl<52>(xq0));
yf1 *= v_reinterpret_as_f64(v_shl<52>(xq1));

// degree-5 polynomial in the fraction, Horner form with FMA
v_float64 zf0 = xf0 + vA1;
v_float64 zf1 = xf1 + vA1;

zf0 = v_fma(zf0, xf0, vA2);
zf1 = v_fma(zf1, xf1, vA2);

zf0 = v_fma(zf0, xf0, vA3);
zf1 = v_fma(zf1, xf1, vA3);

zf0 = v_fma(zf0, xf0, vA4);
zf1 = v_fma(zf1, xf1, vA4);

zf0 = v_fma(zf0, xf0, vA5);
zf1 = v_fma(zf1, xf1, vA5);

zf0 *= yf0;
zf1 *= yf1;

if( y_aligned )
{
v_store_aligned(y + i, zf0);
v_store_aligned(y + i + VECSZ, zf1);
}
else
{
v_store(y + i, zf0);
v_store(y + i + VECSZ, zf1);
}
}
vx_cleanup();
#endif

// scalar fallback / remainder (same algorithm, one element at a time)
for( ; i < n; i++ )
{
double x0 = x[i].f;
x0 = std::min(std::max(x0, minval), maxval);
x0 *= exp_prescale;
Cv64suf buf;

int xi = saturate_cast<int>(x0);
x0 = (x0 - xi)*exp_postscale;

int t = (xi >> EXPTAB_SCALE) + 1023;
t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;  // clamp biased exponent
buf.i = (int64)t << 52;

y[i] = buf.f * expTab[xi & EXPTAB_MASK] * (((((A0*x0 + A1)*x0 + A2)*x0 + A3)*x0 + A4)*x0 + A5);
}
}
670
671
#undef EXPTAB_SCALE
672
#undef EXPTAB_MASK
673
#undef EXPPOLY_32F_A0
674
675
/////////////////////////////////////////// LOG ///////////////////////////////////////
676
677
// log() uses a 2^LOGTAB_SCALE-entry table of (log value, reciprocal) pairs;
// LOGTAB_MASK extracts the table index from the mantissa's top bits.
#define LOGTAB_SCALE 8
#define LOGTAB_MASK ((1 << LOGTAB_SCALE) - 1)

// tab stores interleaved pairs, so the reciprocal lives at tab[h+1]
#define LOGTAB_TRANSLATE(tab, x, h) (((x) - 1.f)*tab[(h)+1])
static const double ln_2 = 0.69314718055994530941723212145818;
682
683
// Vectorized natural log for float arrays.
// Method: split the IEEE-754 bits into exponent e and mantissa m; then
// log(x) = e*ln2 + logTab[idx] + poly((m/tabval - 1)), where idx is taken
// from the mantissa's top LOGTAB_SCALE bits (see getLogTab32f).
void log32f( const float *_x, float *y, int n )
{
CV_INSTRUMENT_REGION();

const float* const logTab_f = cv::details::getLogTab32f();

const int LOGTAB_MASK2_32F = (1 << (23 - LOGTAB_SCALE)) - 1;
// degree-3 polynomial coefficients for log(1+t)/t refinement
const float
A0 = 0.3333333333333333333333333f,
A1 = -0.5f,
A2 = 1.f;

int i = 0;
const int* x = (const int*)_x;  // reinterpret input floats as raw bits

#if CV_SIMD
const int VECSZ = v_float32::nlanes;
const v_float32 vln2 = vx_setall_f32((float)ln_2);
const v_float32 v1 = vx_setall_f32(1.f);
const v_float32 vshift = vx_setall_f32(-1.f/512);

const v_float32 vA0 = vx_setall_f32(A0);
const v_float32 vA1 = vx_setall_f32(A1);
const v_float32 vA2 = vx_setall_f32(A2);

for( ; i < n; i += VECSZ )
{
if( i + VECSZ > n )
{
// in-place or tiny input: leave the remainder to the scalar loop
if( i == 0 || _x == y )
break;
i = n - VECSZ;  // overlapping full-width tail
}

v_int32 h0 = vx_load(x + i);
// unbiased exponent and mantissa renormalized into [1; 2)
v_int32 yi0 = (v_shr<23>(h0) & vx_setall_s32(255)) - vx_setall_s32(127);
v_int32 xi0 = (h0 & vx_setall_s32(LOGTAB_MASK2_32F)) | vx_setall_s32(127 << 23);

// table index: top LOGTAB_SCALE mantissa bits, doubled for the pairs
h0 = v_shr<23 - LOGTAB_SCALE - 1>(h0) & vx_setall_s32(LOGTAB_MASK*2);
v_float32 yf0, xf0;

// yf0 <- log value, xf0 <- reciprocal from the interleaved table
v_lut_deinterleave(logTab_f, h0, yf0, xf0);

yf0 = v_fma(v_cvt_f32(yi0), vln2, yf0);

// last table cell (idx == 510) needs a -1/512 correction
v_float32 delta = v_reinterpret_as_f32(h0 == vx_setall_s32(510)) & vshift;
xf0 = v_fma((v_reinterpret_as_f32(xi0) - v1), xf0, delta);

// degree-3 polynomial, Horner form with FMA
v_float32 zf0 = v_fma(xf0, vA0, vA1);
zf0 = v_fma(zf0, xf0, vA2);
zf0 = v_fma(zf0, xf0, yf0);

v_store(y + i, zf0);
}
vx_cleanup();
#endif

// scalar fallback / remainder (same algorithm, one element at a time)
for( ; i < n; i++ )
{
Cv32suf buf;
int i0 = x[i];

buf.i = (i0 & LOGTAB_MASK2_32F) | (127 << 23);
int idx = (i0 >> (23 - LOGTAB_SCALE - 1)) & (LOGTAB_MASK*2);

float y0 = (((i0 >> 23) & 0xff) - 127) * (float)ln_2 + logTab_f[idx];
float x0 = (buf.f - 1.f)*logTab_f[idx + 1] + (idx == 510 ? -1.f/512 : 0.f);
y[i] = ((A0*x0 + A1)*x0 + A2)*x0 + y0;
}
}
753
754
void log64f( const double *x, double *y, int n )
755
{
756
CV_INSTRUMENT_REGION();
757
758
const double* const logTab = cv::details::getLogTab64f();
759
760
const int64 LOGTAB_MASK2_64F = ((int64)1 << (52 - LOGTAB_SCALE)) - 1;
761
const double
762
A7 = 1.0,
763
A6 = -0.5,
764
A5 = 0.333333333333333314829616256247390992939472198486328125,
765
A4 = -0.25,
766
A3 = 0.2,
767
A2 = -0.1666666666666666574148081281236954964697360992431640625,
768
A1 = 0.1428571428571428769682682968777953647077083587646484375,
769
A0 = -0.125;
770
771
int i = 0;
772
773
#if CV_SIMD_64F
774
const int VECSZ = v_float64::nlanes;
775
const v_float64 vln2 = vx_setall_f64(ln_2);
776
777
const v_float64
778
vA0 = vx_setall_f64(A0), vA1 = vx_setall_f64(A1),
779
vA2 = vx_setall_f64(A2), vA3 = vx_setall_f64(A3),
780
vA4 = vx_setall_f64(A4), vA5 = vx_setall_f64(A5),
781
vA6 = vx_setall_f64(A6), vA7 = vx_setall_f64(A7);
782
783
for( ; i < n; i += VECSZ )
784
{
785
if( i + VECSZ > n )
786
{
787
if( i == 0 || x == y )
788
break;
789
i = n - VECSZ;
790
}
791
792
v_int64 h0 = vx_load((const int64*)x + i);
793
v_int32 yi0 = v_pack(v_shr<52>(h0), vx_setzero_s64());
794
yi0 = (yi0 & vx_setall_s32(0x7ff)) - vx_setall_s32(1023);
795
796
v_int64 xi0 = (h0 & vx_setall_s64(LOGTAB_MASK2_64F)) | vx_setall_s64((int64)1023 << 52);
797
h0 = v_shr<52 - LOGTAB_SCALE - 1>(h0);
798
v_int32 idx = v_pack(h0, h0) & vx_setall_s32(LOGTAB_MASK*2);
799
800
v_float64 xf0, yf0;
801
v_lut_deinterleave(logTab, idx, yf0, xf0);
802
803
yf0 = v_fma(v_cvt_f64(yi0), vln2, yf0);
804
v_float64 delta = v_cvt_f64(idx == vx_setall_s32(510))*vx_setall_f64(1./512);
805
xf0 = v_fma(v_reinterpret_as_f64(xi0) - vx_setall_f64(1.), xf0, delta);
806
807
v_float64 xq = xf0*xf0;
808
v_float64 zf0 = v_fma(xq, vA0, vA2);
809
v_float64 zf1 = v_fma(xq, vA1, vA3);
810
zf0 = v_fma(zf0, xq, vA4);
811
zf1 = v_fma(zf1, xq, vA5);
812
zf0 = v_fma(zf0, xq, vA6);
813
zf1 = v_fma(zf1, xq, vA7);
814
zf1 = v_fma(zf1, xf0, yf0);
815
zf0 = v_fma(zf0, xq, zf1);
816
817
v_store(y + i, zf0);
818
}
819
#endif
820
821
for( ; i < n; i++ )
822
{
823
Cv64suf buf;
824
int64 i0 = ((const int64*)x)[i];
825
826
buf.i = (i0 & LOGTAB_MASK2_64F) | ((int64)1023 << 52);
827
int idx = (int)(i0 >> (52 - LOGTAB_SCALE - 1)) & (LOGTAB_MASK*2);
828
829
double y0 = (((int)(i0 >> 52) & 0x7ff) - 1023) * ln_2 + logTab[idx];
830
double x0 = (buf.f - 1.)*logTab[idx + 1] + (idx == 510 ? -1./512 : 0.);
831
832
double xq = x0*x0;
833
y[i] = (((A0*xq + A2)*xq + A4)*xq + A6)*xq + (((A1*xq + A3)*xq + A5)*xq + A7)*x0 + y0;
834
}
835
}
836
837
#endif // issue 7795
838
839
// Scalar public API: approximate angle of (x, y) in degrees, [0; 360).
float fastAtan2( float y, float x )
{
return atan_f32(y, x);
}
843
844
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
845
846
CV_CPU_OPTIMIZATION_NAMESPACE_END
847
848
}} // namespace cv::hal
849
850