GitHub Repository: Tetragramm/opencv
Path: blob/master/modules/imgproc/src/imgwarp.sse4_1.cpp
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistributions of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistributions in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/* ////////////////////////////////////////////////////////////////////
//
//  Geometrical transforms on images and matrices: rotation, zoom etc.
//
// */

#include "precomp.hpp"
51
#include "imgwarp.hpp"
52
53
namespace cv
54
{
55
namespace opt_SSE4_1
56
{
57
58
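// convertMaps, CV_32FC1 x/y maps -> CV_16SC2, nearest-neighbor variant:
// src1f holds x coordinates, src2f holds y coordinates, and dst1 receives
// interleaved (x, y) int16 pairs. The vector loop handles 16 pixels per
// iteration (round to int32, pack to int16 with saturation, interleave);
// the scalar tail finishes the remainder.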
void convertMaps_nninterpolate32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, int width)
{
    int x = 0;
    for (; x <= width - 16; x += 16)
    {
        __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)),
                                         _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4)));
        __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 8)),
                                         _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 12)));

        __m128i v_dst2 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x)),
                                         _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 4)));
        __m128i v_dst3 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 8)),
                                         _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 12)));

        _mm_interleave_epi16(v_dst0, v_dst1, v_dst2, v_dst3);

        _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst0);
        _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst1);
        _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst2);
        _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst3);
    }

    for (; x < width; x++)
    {
        dst1[x * 2] = saturate_cast<short>(src1f[x]);
        dst1[x * 2 + 1] = saturate_cast<short>(src2f[x]);
    }
}

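// convertMaps, CV_32FC1 x/y maps -> CV_16SC2 plus CV_16UC1: each coordinate
// is scaled by INTER_TAB_SIZE (1 << INTER_BITS, i.e. 32 in OpenCV), the high
// bits (>> INTER_BITS) give the integer pixel position written to dst1, and
// the low INTER_BITS bits of x and y are combined into dst2 as
// (iy & (INTER_TAB_SIZE-1)) * INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1)),
// the index into the interpolation table used by remap.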
void convertMaps_32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, ushort* dst2, int width)
{
    int x = 0;
    __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);
    __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE - 1);

    for (; x <= width - 16; x += 16)
    {
        __m128i v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x), v_its));
        __m128i v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 4), v_its));
        __m128i v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x), v_its));
        __m128i v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 4), v_its));

        __m128i v_dst10 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS),
                                          _mm_srai_epi32(v_ix1, INTER_BITS));
        __m128i v_dst12 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS),
                                          _mm_srai_epi32(v_iy1, INTER_BITS));
        __m128i v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS),
                                        _mm_and_si128(v_ix0, v_its1));
        __m128i v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS),
                                        _mm_and_si128(v_ix1, v_its1));
        _mm_storeu_si128((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst20, v_dst21));

        v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 8), v_its));
        v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 12), v_its));
        v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 8), v_its));
        v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 12), v_its));

        __m128i v_dst11 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS),
                                          _mm_srai_epi32(v_ix1, INTER_BITS));
        __m128i v_dst13 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS),
                                          _mm_srai_epi32(v_iy1, INTER_BITS));
        v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS),
                                _mm_and_si128(v_ix0, v_its1));
        v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS),
                                _mm_and_si128(v_ix1, v_its1));
        _mm_storeu_si128((__m128i *)(dst2 + x + 8), _mm_packus_epi32(v_dst20, v_dst21));

        _mm_interleave_epi16(v_dst10, v_dst11, v_dst12, v_dst13);

        _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst10);
        _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst11);
        _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst12);
        _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst13);
    }
    for (; x < width; x++)
    {
        int ix = saturate_cast<int>(src1f[x] * INTER_TAB_SIZE);
        int iy = saturate_cast<int>(src2f[x] * INTER_TAB_SIZE);
        dst1[x * 2] = saturate_cast<short>(ix >> INTER_BITS);
        dst1[x * 2 + 1] = saturate_cast<short>(iy >> INTER_BITS);
        dst2[x] = (ushort)((iy & (INTER_TAB_SIZE - 1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE - 1)));
    }
}

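// Same split as above, but the input is a single interleaved CV_32FC2 map
// (x0 y0 x1 y1 ...) in src1f, so only 4 pixels are processed per iteration.
// Note _mm_packus_epi32 (here and above) is an SSE4.1 instruction, which is
// what confines these functions to the opt_SSE4_1 build unit.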
void convertMaps_32f2c16s_SSE41(const float* src1f, short* dst1, ushort* dst2, int width)
{
    int x = 0;
    __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);
    __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE - 1);
    __m128i v_y_mask = _mm_set1_epi32((INTER_TAB_SIZE - 1) << 16);

    for (; x <= width - 4; x += 4)
    {
        __m128i v_src0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2), v_its));
        __m128i v_src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2 + 4), v_its));

        __m128i v_dst1 = _mm_packs_epi32(_mm_srai_epi32(v_src0, INTER_BITS),
                                         _mm_srai_epi32(v_src1, INTER_BITS));
        _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst1);

        // x0 y0 x1 y1 . . .
        v_src0 = _mm_packs_epi32(_mm_and_si128(v_src0, v_its1),
                                 _mm_and_si128(v_src1, v_its1));
        __m128i v_dst2 = _mm_or_si128(_mm_srli_epi32(_mm_and_si128(v_src0, v_y_mask), 16 - INTER_BITS), // y0 0 y1 0 . . .
                                      _mm_and_si128(v_src0, v_its1)); // 0 x0 0 x1 . . .
        _mm_storel_epi64((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst2, v_dst2));
    }
    for (; x < width; x++)
    {
        int ix = saturate_cast<int>(src1f[x * 2] * INTER_TAB_SIZE);
        int iy = saturate_cast<int>(src1f[x * 2 + 1] * INTER_TAB_SIZE);
        dst1[x * 2] = saturate_cast<short>(ix >> INTER_BITS);
        dst1[x * 2 + 1] = saturate_cast<short>(iy >> INTER_BITS);
        dst2[x] = (ushort)((iy & (INTER_TAB_SIZE - 1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE - 1)));
    }
}

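// Fills one block row of the warpAffine coordinate map: for each x1,
// X = (X0 + adelta[x1]) >> AB_BITS and Y = (Y0 + bdelta[x1]) >> AB_BITS,
// saturated to int16 and stored interleaved in xy. adelta/bdelta carry the
// per-column increments in AB_BITS fixed point (at least 10 bits).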
void WarpAffineInvoker_Blockline_SSE41(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw)
{
    const int AB_BITS = MAX(10, (int)INTER_BITS);
    int x1 = 0;

    __m128i v_X0 = _mm_set1_epi32(X0);
    __m128i v_Y0 = _mm_set1_epi32(Y0);
    for (; x1 <= bw - 16; x1 += 16)
    {
        __m128i v_x0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1))), AB_BITS),
                                       _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1 + 4))), AB_BITS));
        __m128i v_x1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1 + 8))), AB_BITS),
                                       _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1 + 12))), AB_BITS));

        __m128i v_y0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1))), AB_BITS),
                                       _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1 + 4))), AB_BITS));
        __m128i v_y1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1 + 8))), AB_BITS),
                                       _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1 + 12))), AB_BITS));

        _mm_interleave_epi16(v_x0, v_x1, v_y0, v_y1);

        _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_x0);
        _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_x1);
        _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_y0);
        _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_y1);
    }
    for (; x1 < bw; x1++)
    {
        int X = (X0 + adelta[x1]) >> AB_BITS;
        int Y = (Y0 + bdelta[x1]) >> AB_BITS;
        xy[x1 * 2] = saturate_cast<short>(X);
        xy[x1 * 2 + 1] = saturate_cast<short>(Y);
    }
}


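// Per-scanline coordinate computation for warpPerspective. The perspective
// row gives W = M[6]*x + W0, X = (M[0]*x + X0)/W, Y = (M[3]*x + Y0)/W; both
// methods vectorize this in double precision, 16 pixels per iteration in
// four blocks of four. The matrix is passed again to every call, so the
// constructor exists only for interface parity and silences the unused
// parameter.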
class WarpPerspectiveLine_SSE4_Impl CV_FINAL : public WarpPerspectiveLine_SSE4
{
public:
    WarpPerspectiveLine_SSE4_Impl(const double *M)
    {
        CV_UNUSED(M);
    }
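    // Nearest-neighbor variant: writes rounded integer (X, Y) pairs to xy.
    // The W == 0 guard is branchless: _mm_cmpeq_pd builds an all-ones mask
    // where W is zero and _mm_andnot_pd forces the reciprocal to 0 there,
    // matching the scalar tail's `W ? 1./W : 0`. Results are clamped to
    // [INT_MIN, INT_MAX] before the double->int32 conversion.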
    virtual void processNN(const double *M, short* xy, double X0, double Y0, double W0, int bw) CV_OVERRIDE
    {
        const __m128d v_M0 = _mm_set1_pd(M[0]);
        const __m128d v_M3 = _mm_set1_pd(M[3]);
        const __m128d v_M6 = _mm_set1_pd(M[6]);
        const __m128d v_intmax = _mm_set1_pd((double)INT_MAX);
        const __m128d v_intmin = _mm_set1_pd((double)INT_MIN);
        const __m128d v_2 = _mm_set1_pd(2);
        const __m128d v_zero = _mm_setzero_pd();
        const __m128d v_1 = _mm_set1_pd(1);

        int x1 = 0;
        __m128d v_X0d = _mm_set1_pd(X0);
        __m128d v_Y0d = _mm_set1_pd(Y0);
        __m128d v_W0 = _mm_set1_pd(W0);
        __m128d v_x1 = _mm_set_pd(1, 0);

        for (; x1 <= bw - 16; x1 += 16)
        {
            // 0-3
            __m128i v_X0, v_Y0;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // 4-7
            __m128i v_X1, v_Y1;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // 8-11
            __m128i v_X2, v_Y2;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // 12-15
            __m128i v_X3, v_Y3;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // convert to 16s
            v_X0 = _mm_packs_epi32(v_X0, v_X1);
            v_X1 = _mm_packs_epi32(v_X2, v_X3);
            v_Y0 = _mm_packs_epi32(v_Y0, v_Y1);
            v_Y1 = _mm_packs_epi32(v_Y2, v_Y3);

            _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1);

            _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0);
            _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1);
            _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0);
            _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1);
        }

        for (; x1 < bw; x1++)
        {
            double W = W0 + M[6] * x1;
            W = W ? 1. / W : 0;
            double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1)*W));
            double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1)*W));
            int X = saturate_cast<int>(fX);
            int Y = saturate_cast<int>(fY);

            xy[x1 * 2] = saturate_cast<short>(X);
            xy[x1 * 2 + 1] = saturate_cast<short>(Y);
        }
    }
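    // Interpolated variant: coordinates are computed in INTER_TAB_SIZE-scaled
    // fixed point (the reciprocal becomes INTER_TAB_SIZE / W). xy receives the
    // integer parts (>> INTER_BITS) and alpha receives the per-pixel
    // interpolation-table index built from the low INTER_BITS bits of Y and X,
    // as in convertMaps above.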
    virtual void process(const double *M, short* xy, short* alpha, double X0, double Y0, double W0, int bw) CV_OVERRIDE
    {
        const __m128d v_M0 = _mm_set1_pd(M[0]);
        const __m128d v_M3 = _mm_set1_pd(M[3]);
        const __m128d v_M6 = _mm_set1_pd(M[6]);
        const __m128d v_intmax = _mm_set1_pd((double)INT_MAX);
        const __m128d v_intmin = _mm_set1_pd((double)INT_MIN);
        const __m128d v_2 = _mm_set1_pd(2);
        const __m128d v_zero = _mm_setzero_pd();
        const __m128d v_its = _mm_set1_pd(INTER_TAB_SIZE);
        const __m128i v_itsi1 = _mm_set1_epi32(INTER_TAB_SIZE - 1);

        int x1 = 0;

        __m128d v_X0d = _mm_set1_pd(X0);
        __m128d v_Y0d = _mm_set1_pd(Y0);
        __m128d v_W0 = _mm_set1_pd(W0);
        __m128d v_x1 = _mm_set_pd(1, 0);

        for (; x1 <= bw - 16; x1 += 16)
        {
            // 0-3
            __m128i v_X0, v_Y0;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // 4-7
            __m128i v_X1, v_Y1;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // 8-11
            __m128i v_X2, v_Y2;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // 12-15
            __m128i v_X3, v_Y3;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // store alpha
            __m128i v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y0, v_itsi1), INTER_BITS),
                                             _mm_and_si128(v_X0, v_itsi1));
            __m128i v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y1, v_itsi1), INTER_BITS),
                                             _mm_and_si128(v_X1, v_itsi1));
            _mm_storeu_si128((__m128i *)(alpha + x1), _mm_packs_epi32(v_alpha0, v_alpha1));

            v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y2, v_itsi1), INTER_BITS),
                                     _mm_and_si128(v_X2, v_itsi1));
            v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y3, v_itsi1), INTER_BITS),
                                     _mm_and_si128(v_X3, v_itsi1));
            _mm_storeu_si128((__m128i *)(alpha + x1 + 8), _mm_packs_epi32(v_alpha0, v_alpha1));

            // convert to 16s
            v_X0 = _mm_packs_epi32(_mm_srai_epi32(v_X0, INTER_BITS), _mm_srai_epi32(v_X1, INTER_BITS));
            v_X1 = _mm_packs_epi32(_mm_srai_epi32(v_X2, INTER_BITS), _mm_srai_epi32(v_X3, INTER_BITS));
            v_Y0 = _mm_packs_epi32(_mm_srai_epi32(v_Y0, INTER_BITS), _mm_srai_epi32(v_Y1, INTER_BITS));
            v_Y1 = _mm_packs_epi32(_mm_srai_epi32(v_Y2, INTER_BITS), _mm_srai_epi32(v_Y3, INTER_BITS));

            _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1);

            _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0);
            _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1);
            _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0);
            _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1);
        }
        for (; x1 < bw; x1++)
        {
            double W = W0 + M[6] * x1;
            W = W ? INTER_TAB_SIZE / W : 0;
            double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1)*W));
            double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1)*W));
            int X = saturate_cast<int>(fX);
            int Y = saturate_cast<int>(fY);

            xy[x1 * 2] = saturate_cast<short>(X >> INTER_BITS);
            xy[x1 * 2 + 1] = saturate_cast<short>(Y >> INTER_BITS);
            alpha[x1] = (short)((Y & (INTER_TAB_SIZE - 1))*INTER_TAB_SIZE +
                                (X & (INTER_TAB_SIZE - 1)));
        }
    }
    virtual ~WarpPerspectiveLine_SSE4_Impl() CV_OVERRIDE {}
};

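// Factory returning the SSE4.1 line implementation; the caller is expected
// to select it at runtime only when the CPU reports SSE4.1 support.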
Ptr<WarpPerspectiveLine_SSE4> WarpPerspectiveLine_SSE4::getImpl(const double *M)
{
    return Ptr<WarpPerspectiveLine_SSE4>(new WarpPerspectiveLine_SSE4_Impl(M));
}

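// A minimal usage sketch (hypothetical caller, for illustration only):
//
//   Ptr<WarpPerspectiveLine_SSE4> line = WarpPerspectiveLine_SSE4::getImpl(M);
//   // per block row, with X0/Y0/W0 evaluated at the row start:
//   line->processNN(M, xy, X0, Y0, W0, bw);           // INTER_NEAREST
//   line->process(M, xy, alpha, X0, Y0, W0, bw);      // interpolated modes
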
}
}
/* End of file. */