// Path: modules/calib3d/src/undistort.avx2.cpp
/*M///////////////////////////////////////////////////////////////////////////////////////1//2// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.3//4// By downloading, copying, installing or using the software you agree to this license.5// If you do not agree to this license, do not download, install,6// copy or use the software.7//8//9// License Agreement10// For Open Source Computer Vision Library11//12// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.13// Copyright (C) 2009, Willow Garage Inc., all rights reserved.14// Third party copyrights are property of their respective owners.15//16// Redistribution and use in source and binary forms, with or without modification,17// are permitted provided that the following conditions are met:18//19// * Redistribution's of source code must retain the above copyright notice,20// this list of conditions and the following disclaimer.21//22// * Redistribution's in binary form must reproduce the above copyright notice,23// this list of conditions and the following disclaimer in the documentation24// and/or other materials provided with the distribution.25//26// * The name of the copyright holders may not be used to endorse or promote products27// derived from this software without specific prior written permission.28//29// This software is provided by the copyright holders and contributors "as is" and30// any express or implied warranties, including, but not limited to, the implied31// warranties of merchantability and fitness for a particular purpose are disclaimed.32// In no event shall the Intel Corporation or contributors be liable for any direct,33// indirect, incidental, special, exemplary, or consequential damages34// (including, but not limited to, procurement of substitute goods or services;35// loss of use, data, or profits; or business interruption) however caused36// and on any theory of liability, whether in contract, strict liability,37// or tort (including negligence or 
otherwise) arising in any way out of38// the use of this software, even if advised of the possibility of such damage.39//40//M*/4142#include "precomp.hpp"43#include "undistort.hpp"4445namespace cv46{4748int initUndistortRectifyMapLine_AVX(float* m1f, float* m2f, short* m1, ushort* m2, double* matTilt, const double* ir,49double& _x, double& _y, double& _w, int width, int m1type,50double k1, double k2, double k3, double k4, double k5, double k6,51double p1, double p2, double s1, double s2, double s3, double s4,52double u0, double v0, double fx, double fy)53{54int j = 0;5556static const __m256d __one = _mm256_set1_pd(1.0);57static const __m256d __two = _mm256_set1_pd(2.0);5859const __m256d __matTilt_00 = _mm256_set1_pd(matTilt[0]);60const __m256d __matTilt_10 = _mm256_set1_pd(matTilt[3]);61const __m256d __matTilt_20 = _mm256_set1_pd(matTilt[6]);6263const __m256d __matTilt_01 = _mm256_set1_pd(matTilt[1]);64const __m256d __matTilt_11 = _mm256_set1_pd(matTilt[4]);65const __m256d __matTilt_21 = _mm256_set1_pd(matTilt[7]);6667const __m256d __matTilt_02 = _mm256_set1_pd(matTilt[2]);68const __m256d __matTilt_12 = _mm256_set1_pd(matTilt[5]);69const __m256d __matTilt_22 = _mm256_set1_pd(matTilt[8]);7071for (; j <= width - 4; j += 4, _x += 4 * ir[0], _y += 4 * ir[3], _w += 4 * ir[6])72{73// Question: Should we load the constants first?74__m256d __w = _mm256_div_pd(__one, _mm256_set_pd(_w + 3 * ir[6], _w + 2 * ir[6], _w + ir[6], _w));75__m256d __x = _mm256_mul_pd(_mm256_set_pd(_x + 3 * ir[0], _x + 2 * ir[0], _x + ir[0], _x), __w);76__m256d __y = _mm256_mul_pd(_mm256_set_pd(_y + 3 * ir[3], _y + 2 * ir[3], _y + ir[3], _y), __w);77__m256d __x2 = _mm256_mul_pd(__x, __x);78__m256d __y2 = _mm256_mul_pd(__y, __y);79__m256d __r2 = _mm256_add_pd(__x2, __y2);80__m256d __2xy = _mm256_mul_pd(__two, _mm256_mul_pd(__x, __y));81__m256d __kr = _mm256_div_pd(82#if CV_FMA383_mm256_fmadd_pd(_mm256_fmadd_pd(_mm256_fmadd_pd(_mm256_set1_pd(k3), __r2, _mm256_set1_pd(k2)), __r2, _mm256_set1_pd(k1)), 
__r2, __one),84_mm256_fmadd_pd(_mm256_fmadd_pd(_mm256_fmadd_pd(_mm256_set1_pd(k6), __r2, _mm256_set1_pd(k5)), __r2, _mm256_set1_pd(k4)), __r2, __one)85#else86_mm256_add_pd(__one, _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set1_pd(k3), __r2), _mm256_set1_pd(k2)), __r2), _mm256_set1_pd(k1)), __r2)),87_mm256_add_pd(__one, _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set1_pd(k6), __r2), _mm256_set1_pd(k5)), __r2), _mm256_set1_pd(k4)), __r2))88#endif89);90__m256d __r22 = _mm256_mul_pd(__r2, __r2);91#if CV_FMA392__m256d __xd = _mm256_fmadd_pd(__x, __kr,93_mm256_add_pd(94_mm256_fmadd_pd(_mm256_set1_pd(p1), __2xy, _mm256_mul_pd(_mm256_set1_pd(p2), _mm256_fmadd_pd(__two, __x2, __r2))),95_mm256_fmadd_pd(_mm256_set1_pd(s1), __r2, _mm256_mul_pd(_mm256_set1_pd(s2), __r22))));96__m256d __yd = _mm256_fmadd_pd(__y, __kr,97_mm256_add_pd(98_mm256_fmadd_pd(_mm256_set1_pd(p1), _mm256_fmadd_pd(__two, __y2, __r2), _mm256_mul_pd(_mm256_set1_pd(p2), __2xy)),99_mm256_fmadd_pd(_mm256_set1_pd(s3), __r2, _mm256_mul_pd(_mm256_set1_pd(s4), __r22))));100101__m256d __vecTilt2 = _mm256_fmadd_pd(__matTilt_20, __xd, _mm256_fmadd_pd(__matTilt_21, __yd, __matTilt_22));102#else103__m256d __xd = _mm256_add_pd(104_mm256_mul_pd(__x, __kr),105_mm256_add_pd(106_mm256_add_pd(107_mm256_mul_pd(_mm256_set1_pd(p1), __2xy),108_mm256_mul_pd(_mm256_set1_pd(p2), _mm256_add_pd(__r2, _mm256_mul_pd(__two, __x2)))),109_mm256_add_pd(110_mm256_mul_pd(_mm256_set1_pd(s1), __r2),111_mm256_mul_pd(_mm256_set1_pd(s2), __r22))));112__m256d __yd = _mm256_add_pd(113_mm256_mul_pd(__y, __kr),114_mm256_add_pd(115_mm256_add_pd(116_mm256_mul_pd(_mm256_set1_pd(p1), _mm256_add_pd(__r2, _mm256_mul_pd(__two, __y2))),117_mm256_mul_pd(_mm256_set1_pd(p2), __2xy)),118_mm256_add_pd(119_mm256_mul_pd(_mm256_set1_pd(s3), __r2),120_mm256_mul_pd(_mm256_set1_pd(s4), __r22))));121122__m256d __vecTilt2 = _mm256_add_pd(_mm256_add_pd(123_mm256_mul_pd(__matTilt_20, __xd), 
_mm256_mul_pd(__matTilt_21, __yd)), __matTilt_22);124#endif125__m256d __invProj = _mm256_blendv_pd(126__one, _mm256_div_pd(__one, __vecTilt2),127_mm256_cmp_pd(__vecTilt2, _mm256_setzero_pd(), _CMP_EQ_OQ));128129#if CV_FMA3130__m256d __u = _mm256_fmadd_pd(__matTilt_00, __xd, _mm256_fmadd_pd(__matTilt_01, __yd, __matTilt_02));131__u = _mm256_fmadd_pd(_mm256_mul_pd(_mm256_set1_pd(fx), __invProj), __u, _mm256_set1_pd(u0));132133__m256d __v = _mm256_fmadd_pd(__matTilt_10, __xd, _mm256_fmadd_pd(__matTilt_11, __yd, __matTilt_12));134__v = _mm256_fmadd_pd(_mm256_mul_pd(_mm256_set1_pd(fy), __invProj), __v, _mm256_set1_pd(v0));135#else136__m256d __u = _mm256_add_pd(_mm256_add_pd(137_mm256_mul_pd(__matTilt_00, __xd), _mm256_mul_pd(__matTilt_01, __yd)), __matTilt_02);138__u = _mm256_add_pd(_mm256_mul_pd(_mm256_mul_pd(_mm256_set1_pd(fx), __invProj), __u), _mm256_set1_pd(u0));139140__m256d __v = _mm256_add_pd(_mm256_add_pd(141_mm256_mul_pd(__matTilt_10, __xd), _mm256_mul_pd(__matTilt_11, __yd)), __matTilt_12);142__v = _mm256_add_pd(_mm256_mul_pd(_mm256_mul_pd(_mm256_set1_pd(fy), __invProj), __v), _mm256_set1_pd(v0));143#endif144145if (m1type == CV_32FC1)146{147_mm_storeu_ps(&m1f[j], _mm256_cvtpd_ps(__u));148_mm_storeu_ps(&m2f[j], _mm256_cvtpd_ps(__v));149}150else if (m1type == CV_32FC2)151{152__m128 __u_float = _mm256_cvtpd_ps(__u);153__m128 __v_float = _mm256_cvtpd_ps(__v);154155_mm_storeu_ps(&m1f[j * 2], _mm_unpacklo_ps(__u_float, __v_float));156_mm_storeu_ps(&m1f[j * 2 + 4], _mm_unpackhi_ps(__u_float, __v_float));157}158else // m1type == CV_16SC2159{160__u = _mm256_mul_pd(__u, _mm256_set1_pd(INTER_TAB_SIZE));161__v = _mm256_mul_pd(__v, _mm256_set1_pd(INTER_TAB_SIZE));162163__m128i __iu = _mm256_cvtpd_epi32(__u);164__m128i __iv = _mm256_cvtpd_epi32(__v);165166static const __m128i __INTER_TAB_SIZE_m1 = _mm_set1_epi32(INTER_TAB_SIZE - 1);167__m128i __m2 = _mm_add_epi32(168_mm_mullo_epi32(_mm_and_si128(__iv, __INTER_TAB_SIZE_m1), 
_mm_set1_epi32(INTER_TAB_SIZE)),169_mm_and_si128(__iu, __INTER_TAB_SIZE_m1));170__m2 = _mm_packus_epi32(__m2, __m2);171_mm_maskstore_epi64((long long int*) &m2[j], _mm_set_epi32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF), __m2);172173// gcc4.9 does not support _mm256_set_m128174// __m256i __m1 = _mm256_set_m128i(__iv, __iu);175__m256i __m1 = _mm256_setzero_si256();176__m1 = _mm256_inserti128_si256(__m1, __iu, 0);177__m1 = _mm256_inserti128_si256(__m1, __iv, 1);178__m1 = _mm256_srai_epi32(__m1, INTER_BITS); // v3 v2 v1 v0 u3 u2 u1 u0 (int32_t)179static const __m256i __permute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);180__m1 = _mm256_permutevar8x32_epi32(__m1, __permute_mask); // v3 u3 v2 u2 v1 u1 v0 u0 (int32_t)181__m1 = _mm256_packs_epi32(__m1, __m1); // x x x x v3 u3 v2 u2 x x x x v1 u1 v0 u0 (int16_t)182_mm_storeu_si128((__m128i*) &m1[j * 2], _mm256_extracti128_si256(_mm256_permute4x64_epi64(__m1, (2 << 2) + 0), 0));183}184}185186_mm256_zeroupper();187188return j;189}190191}192193/* End of file */194195196