CoCalc -- phase.cpp

GitHub Repository: Tetragramm/opencv
Path: blob/master/3rdparty/carotene/src/phase.cpp
16337 views
/*
 * By downloading, copying, installing or using the software you agree to this license.
 * If you do not agree to this license, do not download, install,
 * copy or use the software.
 *
 *
 *                           License Agreement
 *                For Open Source Computer Vision Library
 *                        (3-clause BSD License)
 *
 * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
 * Third party copyrights are property of their respective owners.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the names of the copyright holders nor the names of the contributors
 *     may be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * This software is provided by the copyright holders and contributors "as is" and
 * any express or implied warranties, including, but not limited to, the implied
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall copyright holders or contributors be liable for any direct,
 * indirect, incidental, special, exemplary, or consequential damages
 * (including, but not limited to, procurement of substitute goods or services;
 * loss of use, data, or profits; or business interruption) however caused
 * and on any theory of liability, whether in contract, strict liability,
 * or tort (including negligence or otherwise) arising in any way out of
 * the use of this software, even if advised of the possibility of such damage.
 */
39

40
#include <cfloat>
41
#include <cmath>
42

43
#include "common.hpp"
44

45
namespace CAROTENE_NS {
46

47
#ifdef CAROTENE_NEON
48

49
namespace {
50

51
/*
 * FASTATAN2CONST(scale)
 *
 * Declares, in the scope where it is expanded, every constant that
 * FASTATAN2SCALAR and FASTATAN2VECTOR refer to by name:
 *   - P1, P3, P5, P7        : coefficients of the odd polynomial evaluated by
 *                             those macros, converted from radians to degrees
 *                             (180 / M_PI) and multiplied by `scale`;
 *   - A_90, A_180, A_360    : 90, 180 and 360 multiplied by `scale`;
 *   - eps, z                : NEON splats of (float)DBL_EPSILON and 0.0f;
 *   - _90, _180, _360       : NEON splats of A_90 / A_180 / A_360;
 *   - p1, p3, p5, p7        : NEON splats of P1 / P3 / P5 / P7.
 *
 * The expansion already ends with ';', so call sites write no semicolon.
 *
 * NOTE(review): `scale` is pasted textually and is intentionally left
 * unparenthesized. For an argument such as `256.0f / 360.0f` the products are
 * evaluated left to right (`k * 256.0f / 360.0f`); wrapping `scale` in
 * parentheses would change how the generated constants are rounded.
 */
#define FASTATAN2CONST(scale) \
        f32 P1 = (f32)( 0.9997878412794807  * (180.0 / M_PI) * scale); \
        f32 P3 = (f32)(-0.3258083974640975  * (180.0 / M_PI) * scale); \
        f32 P5 = (f32)( 0.1555786518463281  * (180.0 / M_PI) * scale); \
        f32 P7 = (f32)(-0.04432655554792128 * (180.0 / M_PI) * scale); \
        f32 A_90  = (f32)(90.f * scale); \
        f32 A_180 = (f32)(180.f * scale); \
        f32 A_360 = (f32)(360.f * scale); \
        float32x4_t eps  = vdupq_n_f32((float)DBL_EPSILON); \
        float32x4_t _90  = vdupq_n_f32(A_90); \
        float32x4_t _180 = vdupq_n_f32(A_180); \
        float32x4_t _360 = vdupq_n_f32(A_360); \
        float32x4_t z    = vdupq_n_f32(0.0f); \
        float32x4_t p1   = vdupq_n_f32(P1); \
        float32x4_t p3   = vdupq_n_f32(P3); \
        float32x4_t p5   = vdupq_n_f32(P5); \
        float32x4_t p7   = vdupq_n_f32(P7);
68

69
/*
 * FASTATAN2SCALAR(y, x, a)
 *
 * Scalar polynomial approximation of the angle of the vector (x, y).
 * Requires the names declared by FASTATAN2CONST (P1, P3, P5, P7, A_90, A_180,
 * A_360) to be in scope; the result is in the units implied by those
 * constants and lies between 0 and A_360.
 *
 *   y, x : input components; each is evaluated exactly once and converted to f32.
 *   a    : assignable f32 expression that receives the result (written once).
 *
 * Steps: reduce to the first octant by dividing the smaller magnitude by the
 * larger one (DBL_EPSILON keeps the divisor non-zero for x == y == 0),
 * evaluate the odd polynomial, then unfold the result using which magnitude
 * was larger and the signs of x and y.
 *
 * All temporaries carry the fa2_ prefix so that a caller whose arguments are
 * named ax, ay, c or c2 is not silently captured by the macro's own locals.
 * The expansion is a brace block; call sites write no trailing semicolon.
 */
#define FASTATAN2SCALAR(y, x, a) \
    { \
        const f32 fa2_x = (x), fa2_y = (y); \
        const f32 fa2_ax = std::abs(fa2_x), fa2_ay = std::abs(fa2_y); \
        f32 fa2_c, fa2_c2, fa2_a; \
        if (fa2_ax >= fa2_ay) \
        { \
            /* |y| <= |x|: angle measured from the x axis */ \
            fa2_c = fa2_ay / (fa2_ax + (float)DBL_EPSILON); \
            fa2_c2 = fa2_c * fa2_c; \
            fa2_a = (((P7 * fa2_c2 + P5) * fa2_c2 + P3) * fa2_c2 + P1) * fa2_c; \
        } \
        else \
        { \
            /* |y| > |x|: angle measured from the y axis, hence A_90 - ... */ \
            fa2_c = fa2_ax / (fa2_ay + (float)DBL_EPSILON); \
            fa2_c2 = fa2_c * fa2_c; \
            fa2_a = A_90 - (((P7 * fa2_c2 + P5) * fa2_c2 + P3) * fa2_c2 + P1) * fa2_c; \
        } \
        /* mirror into the correct quadrant */ \
        if (fa2_x < 0) \
            fa2_a = A_180 - fa2_a; \
        if (fa2_y < 0) \
            fa2_a = A_360 - fa2_a; \
        (a) = fa2_a; \
    }
90

91
/*
 * FASTATAN2VECTOR(v_y, v_x, a)
 *
 * Four-lane NEON counterpart of FASTATAN2SCALAR. Requires the names declared
 * by FASTATAN2CONST (eps, z, _90, _180, _360, p1, p3, p5, p7) to be in scope.
 *
 *   v_y, v_x : float32x4_t inputs; each is evaluated exactly once.
 *   a        : assignable float32x4_t expression receiving the result (written once).
 *
 * The ratio min(|x|,|y|) / (max(|x|,|y|) + eps) is formed by multiplying with
 * internal::vrecpq_f32(...) instead of dividing.
 * NOTE(review): if that helper is an approximate reciprocal, lanes may differ
 * slightly from the scalar macro - confirm against its definition.
 *
 * The branches of the scalar version become lane-wise selects (vbslq_f32).
 *
 * All temporaries carry the fa2_ prefix so that caller arguments named ax, ay,
 * tmin, tmax, c or c2 are not captured by the macro's own locals.
 * The expansion is a brace block; call sites write no trailing semicolon.
 */
#define FASTATAN2VECTOR(v_y, v_x, a) \
    { \
        const float32x4_t fa2_vx = (v_x), fa2_vy = (v_y); \
        const float32x4_t fa2_vax = vabsq_f32(fa2_vx), fa2_vay = vabsq_f32(fa2_vy); \
        const float32x4_t fa2_vmin = vminq_f32(fa2_vax, fa2_vay), fa2_vmax = vmaxq_f32(fa2_vax, fa2_vay); \
        const float32x4_t fa2_vc = vmulq_f32(fa2_vmin, internal::vrecpq_f32(vaddq_f32(fa2_vmax, eps))); \
        const float32x4_t fa2_vc2 = vmulq_f32(fa2_vc, fa2_vc); \
 \
        /* Horner evaluation of (((p7*c2 + p5)*c2 + p3)*c2 + p1)*c */ \
        float32x4_t fa2_va = vmulq_f32(fa2_vc2, p7); \
        fa2_va = vmulq_f32(vaddq_f32(fa2_va, p5), fa2_vc2); \
        fa2_va = vmulq_f32(vaddq_f32(fa2_va, p3), fa2_vc2); \
        fa2_va = vmulq_f32(vaddq_f32(fa2_va, p1), fa2_vc); \
 \
        /* |x| >= |y| ? a : 90 - a;  x < 0 ? 180 - a : a;  y < 0 ? 360 - a : a */ \
        fa2_va = vbslq_f32(vcgeq_f32(fa2_vax, fa2_vay), fa2_va, vsubq_f32(_90, fa2_va)); \
        fa2_va = vbslq_f32(vcltq_f32(fa2_vx, z), vsubq_f32(_180, fa2_va), fa2_va); \
        fa2_va = vbslq_f32(vcltq_f32(fa2_vy, z), vsubq_f32(_360, fa2_va), fa2_va); \
 \
        (a) = fa2_va; \
    }
108

109
} // namespace
110

111
#endif
112

113
void phase(const Size2D &size,
114
           const s16 * src0Base, ptrdiff_t src0Stride,
115
           const s16 * src1Base, ptrdiff_t src1Stride,
116
           u8 * dstBase, ptrdiff_t dstStride)
117
{
118
    internal::assertSupportedConfiguration();
119
#ifdef CAROTENE_NEON
120
    FASTATAN2CONST(256.0f / 360.0f)
121
    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
122
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
123

124
    float32x4_t v_05 = vdupq_n_f32(0.5f);
125

126
    for (size_t i = 0; i < size.height; ++i)
127
    {
128
        const s16 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
129
        const s16 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
130
        u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
131
        size_t j = 0;
132

133
        for (; j < roiw16; j += 16)
134
        {
135
            internal::prefetch(src0 + j);
136
            internal::prefetch(src1 + j);
137

138
            int16x8_t v_src00 = vld1q_s16(src0 + j), v_src01 = vld1q_s16(src0 + j + 8);
139
            int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8);
140

141
            // 0
142
            float32x4_t v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00)));
143
            float32x4_t v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10)));
144
            float32x4_t v_dst32f0;
145
            FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0)
146

147
            v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00)));
148
            v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10)));
149
            float32x4_t v_dst32f1;
150
            FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)
151

152
            uint16x8_t v_dst16s0 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
153
                                                vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));
154

155
            // 1
156
            v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src01)));
157
            v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src11)));
158
            FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0)
159

160
            v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src01)));
161
            v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src11)));
162
            FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)
163

164
            uint16x8_t v_dst16s1 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
165
                                                vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));
166

167
            vst1q_u8(dst + j, vcombine_u8(vmovn_u16(v_dst16s0),
168
                                          vmovn_u16(v_dst16s1)));
169
        }
170
        for (; j < roiw8; j += 8)
171
        {
172
            int16x8_t v_src0 = vld1q_s16(src0 + j);
173
            int16x8_t v_src1 = vld1q_s16(src1 + j);
174

175
            float32x4_t v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src0)));
176
            float32x4_t v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1)));
177
            float32x4_t v_dst32f0;
178
            FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0)
179

180
            v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src0)));
181
            v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1)));
182
            float32x4_t v_dst32f1;
183
            FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)
184

185
            uint16x8_t v_dst = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
186
                                            vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));
187

188
            vst1_u8(dst + j, vmovn_u16(v_dst));
189
        }
190

191
        for (; j < size.width; j++)
192
        {
193
            f32 x = src0[j], y = src1[j];
194
            f32 a;
195
            FASTATAN2SCALAR(y, x, a)
196
            dst[j] = (u8)(s32)floor(a + 0.5f);
197
        }
198
    }
199
#else
200
    (void)size;
201
    (void)src0Base;
202
    (void)src0Stride;
203
    (void)src1Base;
204
    (void)src1Stride;
205
    (void)dstBase;
206
    (void)dstStride;
207
#endif
208
}
209

210
void phase(const Size2D &size,
211
           const f32 * src0Base, ptrdiff_t src0Stride,
212
           const f32 * src1Base, ptrdiff_t src1Stride,
213
           f32 * dstBase, ptrdiff_t dstStride,
214
           f32 scale)
215
{
216
    internal::assertSupportedConfiguration();
217
#ifdef CAROTENE_NEON
218
    FASTATAN2CONST(scale)
219
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
220

221
    for (size_t i = 0; i < size.height; ++i)
222
    {
223
        const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
224
        const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
225
        f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
226
        size_t j = 0;
227

228
        for (; j < roiw8; j += 8)
229
        {
230
            internal::prefetch(src0 + j);
231
            internal::prefetch(src1 + j);
232

233
            float32x4_t v_src00 = vld1q_f32(src0 + j), v_src01 = vld1q_f32(src0 + j + 4);
234
            float32x4_t v_src10 = vld1q_f32(src1 + j), v_src11 = vld1q_f32(src1 + j + 4);
235

236
            float32x4_t v_dst32f;
237
            // 0
238
            FASTATAN2VECTOR(v_src10, v_src00, v_dst32f)
239
            vst1q_f32(dst + j,     v_dst32f);
240
            // 1
241
            FASTATAN2VECTOR(v_src11, v_src01, v_dst32f)
242
            vst1q_f32(dst + j + 4, v_dst32f);
243
        }
244
        if(j + 4 <= size.width)
245
        {
246
            float32x4_t v_src0 = vld1q_f32(src0 + j);
247
            float32x4_t v_src1 = vld1q_f32(src1 + j);
248

249
            float32x4_t v_dst32f;
250
            FASTATAN2VECTOR(v_src1, v_src0, v_dst32f)
251
            vst1q_f32(dst + j, v_dst32f);
252
            j += 4;
253
        }
254

255
        for (; j < size.width; j++)
256
        {
257
            f32 a;
258
            FASTATAN2SCALAR(src1[j], src0[j], a)
259
            dst[j] = a;
260
        }
261
    }
262
#else
263
    (void)size;
264
    (void)src0Base;
265
    (void)src0Stride;
266
    (void)src1Base;
267
    (void)src1Stride;
268
    (void)dstBase;
269
    (void)dstStride;
270
    (void)scale;
271
#endif
272
}
273

274
} // namespace CAROTENE_NS
275

276
Product

Resources

Company