Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/modules/calib3d/src/undistort.avx2.cpp
16354 views
1
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
44
#include "undistort.hpp"
45
46
namespace cv
47
{
48
49
int initUndistortRectifyMapLine_AVX(float* m1f, float* m2f, short* m1, ushort* m2, double* matTilt, const double* ir,
50
double& _x, double& _y, double& _w, int width, int m1type,
51
double k1, double k2, double k3, double k4, double k5, double k6,
52
double p1, double p2, double s1, double s2, double s3, double s4,
53
double u0, double v0, double fx, double fy)
54
{
55
int j = 0;
56
57
static const __m256d __one = _mm256_set1_pd(1.0);
58
static const __m256d __two = _mm256_set1_pd(2.0);
59
60
const __m256d __matTilt_00 = _mm256_set1_pd(matTilt[0]);
61
const __m256d __matTilt_10 = _mm256_set1_pd(matTilt[3]);
62
const __m256d __matTilt_20 = _mm256_set1_pd(matTilt[6]);
63
64
const __m256d __matTilt_01 = _mm256_set1_pd(matTilt[1]);
65
const __m256d __matTilt_11 = _mm256_set1_pd(matTilt[4]);
66
const __m256d __matTilt_21 = _mm256_set1_pd(matTilt[7]);
67
68
const __m256d __matTilt_02 = _mm256_set1_pd(matTilt[2]);
69
const __m256d __matTilt_12 = _mm256_set1_pd(matTilt[5]);
70
const __m256d __matTilt_22 = _mm256_set1_pd(matTilt[8]);
71
72
for (; j <= width - 4; j += 4, _x += 4 * ir[0], _y += 4 * ir[3], _w += 4 * ir[6])
73
{
74
// Question: Should we load the constants first?
75
__m256d __w = _mm256_div_pd(__one, _mm256_set_pd(_w + 3 * ir[6], _w + 2 * ir[6], _w + ir[6], _w));
76
__m256d __x = _mm256_mul_pd(_mm256_set_pd(_x + 3 * ir[0], _x + 2 * ir[0], _x + ir[0], _x), __w);
77
__m256d __y = _mm256_mul_pd(_mm256_set_pd(_y + 3 * ir[3], _y + 2 * ir[3], _y + ir[3], _y), __w);
78
__m256d __x2 = _mm256_mul_pd(__x, __x);
79
__m256d __y2 = _mm256_mul_pd(__y, __y);
80
__m256d __r2 = _mm256_add_pd(__x2, __y2);
81
__m256d __2xy = _mm256_mul_pd(__two, _mm256_mul_pd(__x, __y));
82
__m256d __kr = _mm256_div_pd(
83
#if CV_FMA3
84
_mm256_fmadd_pd(_mm256_fmadd_pd(_mm256_fmadd_pd(_mm256_set1_pd(k3), __r2, _mm256_set1_pd(k2)), __r2, _mm256_set1_pd(k1)), __r2, __one),
85
_mm256_fmadd_pd(_mm256_fmadd_pd(_mm256_fmadd_pd(_mm256_set1_pd(k6), __r2, _mm256_set1_pd(k5)), __r2, _mm256_set1_pd(k4)), __r2, __one)
86
#else
87
_mm256_add_pd(__one, _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set1_pd(k3), __r2), _mm256_set1_pd(k2)), __r2), _mm256_set1_pd(k1)), __r2)),
88
_mm256_add_pd(__one, _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set1_pd(k6), __r2), _mm256_set1_pd(k5)), __r2), _mm256_set1_pd(k4)), __r2))
89
#endif
90
);
91
__m256d __r22 = _mm256_mul_pd(__r2, __r2);
92
#if CV_FMA3
93
__m256d __xd = _mm256_fmadd_pd(__x, __kr,
94
_mm256_add_pd(
95
_mm256_fmadd_pd(_mm256_set1_pd(p1), __2xy, _mm256_mul_pd(_mm256_set1_pd(p2), _mm256_fmadd_pd(__two, __x2, __r2))),
96
_mm256_fmadd_pd(_mm256_set1_pd(s1), __r2, _mm256_mul_pd(_mm256_set1_pd(s2), __r22))));
97
__m256d __yd = _mm256_fmadd_pd(__y, __kr,
98
_mm256_add_pd(
99
_mm256_fmadd_pd(_mm256_set1_pd(p1), _mm256_fmadd_pd(__two, __y2, __r2), _mm256_mul_pd(_mm256_set1_pd(p2), __2xy)),
100
_mm256_fmadd_pd(_mm256_set1_pd(s3), __r2, _mm256_mul_pd(_mm256_set1_pd(s4), __r22))));
101
102
__m256d __vecTilt2 = _mm256_fmadd_pd(__matTilt_20, __xd, _mm256_fmadd_pd(__matTilt_21, __yd, __matTilt_22));
103
#else
104
__m256d __xd = _mm256_add_pd(
105
_mm256_mul_pd(__x, __kr),
106
_mm256_add_pd(
107
_mm256_add_pd(
108
_mm256_mul_pd(_mm256_set1_pd(p1), __2xy),
109
_mm256_mul_pd(_mm256_set1_pd(p2), _mm256_add_pd(__r2, _mm256_mul_pd(__two, __x2)))),
110
_mm256_add_pd(
111
_mm256_mul_pd(_mm256_set1_pd(s1), __r2),
112
_mm256_mul_pd(_mm256_set1_pd(s2), __r22))));
113
__m256d __yd = _mm256_add_pd(
114
_mm256_mul_pd(__y, __kr),
115
_mm256_add_pd(
116
_mm256_add_pd(
117
_mm256_mul_pd(_mm256_set1_pd(p1), _mm256_add_pd(__r2, _mm256_mul_pd(__two, __y2))),
118
_mm256_mul_pd(_mm256_set1_pd(p2), __2xy)),
119
_mm256_add_pd(
120
_mm256_mul_pd(_mm256_set1_pd(s3), __r2),
121
_mm256_mul_pd(_mm256_set1_pd(s4), __r22))));
122
123
__m256d __vecTilt2 = _mm256_add_pd(_mm256_add_pd(
124
_mm256_mul_pd(__matTilt_20, __xd), _mm256_mul_pd(__matTilt_21, __yd)), __matTilt_22);
125
#endif
126
__m256d __invProj = _mm256_blendv_pd(
127
__one, _mm256_div_pd(__one, __vecTilt2),
128
_mm256_cmp_pd(__vecTilt2, _mm256_setzero_pd(), _CMP_EQ_OQ));
129
130
#if CV_FMA3
131
__m256d __u = _mm256_fmadd_pd(__matTilt_00, __xd, _mm256_fmadd_pd(__matTilt_01, __yd, __matTilt_02));
132
__u = _mm256_fmadd_pd(_mm256_mul_pd(_mm256_set1_pd(fx), __invProj), __u, _mm256_set1_pd(u0));
133
134
__m256d __v = _mm256_fmadd_pd(__matTilt_10, __xd, _mm256_fmadd_pd(__matTilt_11, __yd, __matTilt_12));
135
__v = _mm256_fmadd_pd(_mm256_mul_pd(_mm256_set1_pd(fy), __invProj), __v, _mm256_set1_pd(v0));
136
#else
137
__m256d __u = _mm256_add_pd(_mm256_add_pd(
138
_mm256_mul_pd(__matTilt_00, __xd), _mm256_mul_pd(__matTilt_01, __yd)), __matTilt_02);
139
__u = _mm256_add_pd(_mm256_mul_pd(_mm256_mul_pd(_mm256_set1_pd(fx), __invProj), __u), _mm256_set1_pd(u0));
140
141
__m256d __v = _mm256_add_pd(_mm256_add_pd(
142
_mm256_mul_pd(__matTilt_10, __xd), _mm256_mul_pd(__matTilt_11, __yd)), __matTilt_12);
143
__v = _mm256_add_pd(_mm256_mul_pd(_mm256_mul_pd(_mm256_set1_pd(fy), __invProj), __v), _mm256_set1_pd(v0));
144
#endif
145
146
if (m1type == CV_32FC1)
147
{
148
_mm_storeu_ps(&m1f[j], _mm256_cvtpd_ps(__u));
149
_mm_storeu_ps(&m2f[j], _mm256_cvtpd_ps(__v));
150
}
151
else if (m1type == CV_32FC2)
152
{
153
__m128 __u_float = _mm256_cvtpd_ps(__u);
154
__m128 __v_float = _mm256_cvtpd_ps(__v);
155
156
_mm_storeu_ps(&m1f[j * 2], _mm_unpacklo_ps(__u_float, __v_float));
157
_mm_storeu_ps(&m1f[j * 2 + 4], _mm_unpackhi_ps(__u_float, __v_float));
158
}
159
else // m1type == CV_16SC2
160
{
161
__u = _mm256_mul_pd(__u, _mm256_set1_pd(INTER_TAB_SIZE));
162
__v = _mm256_mul_pd(__v, _mm256_set1_pd(INTER_TAB_SIZE));
163
164
__m128i __iu = _mm256_cvtpd_epi32(__u);
165
__m128i __iv = _mm256_cvtpd_epi32(__v);
166
167
static const __m128i __INTER_TAB_SIZE_m1 = _mm_set1_epi32(INTER_TAB_SIZE - 1);
168
__m128i __m2 = _mm_add_epi32(
169
_mm_mullo_epi32(_mm_and_si128(__iv, __INTER_TAB_SIZE_m1), _mm_set1_epi32(INTER_TAB_SIZE)),
170
_mm_and_si128(__iu, __INTER_TAB_SIZE_m1));
171
__m2 = _mm_packus_epi32(__m2, __m2);
172
_mm_maskstore_epi64((long long int*) &m2[j], _mm_set_epi32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF), __m2);
173
174
// gcc4.9 does not support _mm256_set_m128
175
// __m256i __m1 = _mm256_set_m128i(__iv, __iu);
176
__m256i __m1 = _mm256_setzero_si256();
177
__m1 = _mm256_inserti128_si256(__m1, __iu, 0);
178
__m1 = _mm256_inserti128_si256(__m1, __iv, 1);
179
__m1 = _mm256_srai_epi32(__m1, INTER_BITS); // v3 v2 v1 v0 u3 u2 u1 u0 (int32_t)
180
static const __m256i __permute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
181
__m1 = _mm256_permutevar8x32_epi32(__m1, __permute_mask); // v3 u3 v2 u2 v1 u1 v0 u0 (int32_t)
182
__m1 = _mm256_packs_epi32(__m1, __m1); // x x x x v3 u3 v2 u2 x x x x v1 u1 v0 u0 (int16_t)
183
_mm_storeu_si128((__m128i*) &m1[j * 2], _mm256_extracti128_si256(_mm256_permute4x64_epi64(__m1, (2 << 2) + 0), 0));
184
}
185
}
186
187
_mm256_zeroupper();
188
189
return j;
190
}
191
192
}
193
194
/* End of file */
195
196