GitHub Repository: Tetragramm/opencv
Path: blob/master/modules/imgproc/src/imgwarp.sse4_1.cpp
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistributions of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistributions in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/* ////////////////////////////////////////////////////////////////////
//
//  Geometrical transforms on images and matrices: rotation, zoom etc.
//
// */

#include "precomp.hpp"
51
#include "imgwarp.hpp"
52
53
namespace cv
54
{
55
namespace opt_SSE4_1
56
{
57
58
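// convertMaps, CV_32FC1 x/y maps -> CV_16SC2, nearest-neighbor variant:
// src1f holds x coordinates, src2f holds y coordinates, and dst1 receives
// interleaved (x, y) int16 pairs. The vector loop handles 16 pixels per
// iteration (round to int32, pack to int16 with saturation, interleave);
// the scalar tail finishes the remainder.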
void convertMaps_nninterpolate32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, int width)
{
    int x = 0;
    for (; x <= width - 16; x += 16)
    {
        __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)),
                                         _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4)));
        __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 8)),
                                         _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 12)));

        __m128i v_dst2 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x)),
                                         _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 4)));
        __m128i v_dst3 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 8)),
                                         _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 12)));

        _mm_interleave_epi16(v_dst0, v_dst1, v_dst2, v_dst3);

        _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst0);
        _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst1);
        _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst2);
        _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst3);
    }

    for (; x < width; x++)
    {
        dst1[x * 2] = saturate_cast<short>(src1f[x]);
        dst1[x * 2 + 1] = saturate_cast<short>(src2f[x]);
    }
}

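// convertMaps, CV_32FC1 x/y maps -> CV_16SC2 plus CV_16UC1: each coordinate
// is scaled by INTER_TAB_SIZE (1 << INTER_BITS, i.e. 32 in OpenCV), the high
// bits (>> INTER_BITS) give the integer pixel position written to dst1, and
// the low INTER_BITS bits of x and y are combined into dst2 as
// (iy & (INTER_TAB_SIZE-1)) * INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1)),
// the index into the interpolation table used by remap.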
void convertMaps_32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, ushort* dst2, int width)
{
    int x = 0;
    __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);
    __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE - 1);

    for (; x <= width - 16; x += 16)
    {
        __m128i v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x), v_its));
        __m128i v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 4), v_its));
        __m128i v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x), v_its));
        __m128i v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 4), v_its));

        __m128i v_dst10 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS),
                                          _mm_srai_epi32(v_ix1, INTER_BITS));
        __m128i v_dst12 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS),
                                          _mm_srai_epi32(v_iy1, INTER_BITS));
        __m128i v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS),
                                        _mm_and_si128(v_ix0, v_its1));
        __m128i v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS),
                                        _mm_and_si128(v_ix1, v_its1));
        _mm_storeu_si128((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst20, v_dst21));

        v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 8), v_its));
        v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 12), v_its));
        v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 8), v_its));
        v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 12), v_its));

        __m128i v_dst11 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS),
                                          _mm_srai_epi32(v_ix1, INTER_BITS));
        __m128i v_dst13 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS),
                                          _mm_srai_epi32(v_iy1, INTER_BITS));
        v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS),
                                _mm_and_si128(v_ix0, v_its1));
        v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS),
                                _mm_and_si128(v_ix1, v_its1));
        _mm_storeu_si128((__m128i *)(dst2 + x + 8), _mm_packus_epi32(v_dst20, v_dst21));

        _mm_interleave_epi16(v_dst10, v_dst11, v_dst12, v_dst13);

        _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst10);
        _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst11);
        _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst12);
        _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst13);
    }
    for (; x < width; x++)
    {
        int ix = saturate_cast<int>(src1f[x] * INTER_TAB_SIZE);
        int iy = saturate_cast<int>(src2f[x] * INTER_TAB_SIZE);
        dst1[x * 2] = saturate_cast<short>(ix >> INTER_BITS);
        dst1[x * 2 + 1] = saturate_cast<short>(iy >> INTER_BITS);
        dst2[x] = (ushort)((iy & (INTER_TAB_SIZE - 1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE - 1)));
    }
}

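// Same split as above, but the input is a single interleaved CV_32FC2 map
// (x0 y0 x1 y1 ...) in src1f, so only 4 pixels are processed per iteration.
// Note _mm_packus_epi32 (here and above) is an SSE4.1 instruction, which is
// what confines these functions to the opt_SSE4_1 build unit.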
void convertMaps_32f2c16s_SSE41(const float* src1f, short* dst1, ushort* dst2, int width)
{
    int x = 0;
    __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);
    __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE - 1);
    __m128i v_y_mask = _mm_set1_epi32((INTER_TAB_SIZE - 1) << 16);

    for (; x <= width - 4; x += 4)
    {
        __m128i v_src0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2), v_its));
        __m128i v_src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2 + 4), v_its));

        __m128i v_dst1 = _mm_packs_epi32(_mm_srai_epi32(v_src0, INTER_BITS),
                                         _mm_srai_epi32(v_src1, INTER_BITS));
        _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst1);

        // x0 y0 x1 y1 . . .
        v_src0 = _mm_packs_epi32(_mm_and_si128(v_src0, v_its1),
                                 _mm_and_si128(v_src1, v_its1));
        __m128i v_dst2 = _mm_or_si128(_mm_srli_epi32(_mm_and_si128(v_src0, v_y_mask), 16 - INTER_BITS), // y0 0 y1 0 . . .
                                      _mm_and_si128(v_src0, v_its1)); // 0 x0 0 x1 . . .
        _mm_storel_epi64((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst2, v_dst2));
    }
    for (; x < width; x++)
    {
        int ix = saturate_cast<int>(src1f[x * 2] * INTER_TAB_SIZE);
        int iy = saturate_cast<int>(src1f[x * 2 + 1] * INTER_TAB_SIZE);
        dst1[x * 2] = saturate_cast<short>(ix >> INTER_BITS);
        dst1[x * 2 + 1] = saturate_cast<short>(iy >> INTER_BITS);
        dst2[x] = (ushort)((iy & (INTER_TAB_SIZE - 1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE - 1)));
    }
}

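// Fills one block row of the warpAffine coordinate map: for each x1,
// X = (X0 + adelta[x1]) >> AB_BITS and Y = (Y0 + bdelta[x1]) >> AB_BITS,
// saturated to int16 and stored interleaved in xy. adelta/bdelta carry the
// per-column increments in AB_BITS fixed point (at least 10 bits).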
void WarpAffineInvoker_Blockline_SSE41(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw)
{
    const int AB_BITS = MAX(10, (int)INTER_BITS);
    int x1 = 0;

    __m128i v_X0 = _mm_set1_epi32(X0);
    __m128i v_Y0 = _mm_set1_epi32(Y0);
    for (; x1 <= bw - 16; x1 += 16)
    {
        __m128i v_x0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1))), AB_BITS),
                                       _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1 + 4))), AB_BITS));
        __m128i v_x1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1 + 8))), AB_BITS),
                                       _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1 + 12))), AB_BITS));

        __m128i v_y0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1))), AB_BITS),
                                       _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1 + 4))), AB_BITS));
        __m128i v_y1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1 + 8))), AB_BITS),
                                       _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1 + 12))), AB_BITS));

        _mm_interleave_epi16(v_x0, v_x1, v_y0, v_y1);

        _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_x0);
        _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_x1);
        _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_y0);
        _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_y1);
    }
    for (; x1 < bw; x1++)
    {
        int X = (X0 + adelta[x1]) >> AB_BITS;
        int Y = (Y0 + bdelta[x1]) >> AB_BITS;
        xy[x1 * 2] = saturate_cast<short>(X);
        xy[x1 * 2 + 1] = saturate_cast<short>(Y);
    }
}


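// Per-scanline coordinate computation for warpPerspective. The perspective
// row gives W = M[6]*x + W0, X = (M[0]*x + X0)/W, Y = (M[3]*x + Y0)/W; both
// methods vectorize this in double precision, 16 pixels per iteration in
// four blocks of four. The matrix is passed again to every call, so the
// constructor exists only for interface parity and silences the unused
// parameter.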
class WarpPerspectiveLine_SSE4_Impl CV_FINAL : public WarpPerspectiveLine_SSE4
{
public:
    WarpPerspectiveLine_SSE4_Impl(const double *M)
    {
        CV_UNUSED(M);
    }
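    // Nearest-neighbor variant: writes rounded integer (X, Y) pairs to xy.
    // The W == 0 guard is branchless: _mm_cmpeq_pd builds an all-ones mask
    // where W is zero and _mm_andnot_pd forces the reciprocal to 0 there,
    // matching the scalar tail's `W ? 1./W : 0`. Results are clamped to
    // [INT_MIN, INT_MAX] before the double->int32 conversion.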
    virtual void processNN(const double *M, short* xy, double X0, double Y0, double W0, int bw) CV_OVERRIDE
    {
        const __m128d v_M0 = _mm_set1_pd(M[0]);
        const __m128d v_M3 = _mm_set1_pd(M[3]);
        const __m128d v_M6 = _mm_set1_pd(M[6]);
        const __m128d v_intmax = _mm_set1_pd((double)INT_MAX);
        const __m128d v_intmin = _mm_set1_pd((double)INT_MIN);
        const __m128d v_2 = _mm_set1_pd(2);
        const __m128d v_zero = _mm_setzero_pd();
        const __m128d v_1 = _mm_set1_pd(1);

        int x1 = 0;
        __m128d v_X0d = _mm_set1_pd(X0);
        __m128d v_Y0d = _mm_set1_pd(Y0);
        __m128d v_W0 = _mm_set1_pd(W0);
        __m128d v_x1 = _mm_set_pd(1, 0);

        for (; x1 <= bw - 16; x1 += 16)
        {
            // 0-3
            __m128i v_X0, v_Y0;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // 4-7
            __m128i v_X1, v_Y1;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // 8-11
            __m128i v_X2, v_Y2;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // 12-15
            __m128i v_X3, v_Y3;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // convert to 16s
            v_X0 = _mm_packs_epi32(v_X0, v_X1);
            v_X1 = _mm_packs_epi32(v_X2, v_X3);
            v_Y0 = _mm_packs_epi32(v_Y0, v_Y1);
            v_Y1 = _mm_packs_epi32(v_Y2, v_Y3);

            _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1);

            _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0);
            _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1);
            _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0);
            _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1);
        }

        for (; x1 < bw; x1++)
        {
            double W = W0 + M[6] * x1;
            W = W ? 1. / W : 0;
            double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1)*W));
            double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1)*W));
            int X = saturate_cast<int>(fX);
            int Y = saturate_cast<int>(fY);

            xy[x1 * 2] = saturate_cast<short>(X);
            xy[x1 * 2 + 1] = saturate_cast<short>(Y);
        }
    }
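    // Interpolated variant: coordinates are computed in INTER_TAB_SIZE-scaled
    // fixed point (the reciprocal becomes INTER_TAB_SIZE / W). xy receives the
    // integer parts (>> INTER_BITS) and alpha receives the per-pixel
    // interpolation-table index built from the low INTER_BITS bits of Y and X,
    // as in convertMaps above.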
    virtual void process(const double *M, short* xy, short* alpha, double X0, double Y0, double W0, int bw) CV_OVERRIDE
    {
        const __m128d v_M0 = _mm_set1_pd(M[0]);
        const __m128d v_M3 = _mm_set1_pd(M[3]);
        const __m128d v_M6 = _mm_set1_pd(M[6]);
        const __m128d v_intmax = _mm_set1_pd((double)INT_MAX);
        const __m128d v_intmin = _mm_set1_pd((double)INT_MIN);
        const __m128d v_2 = _mm_set1_pd(2);
        const __m128d v_zero = _mm_setzero_pd();
        const __m128d v_its = _mm_set1_pd(INTER_TAB_SIZE);
        const __m128i v_itsi1 = _mm_set1_epi32(INTER_TAB_SIZE - 1);

        int x1 = 0;

        __m128d v_X0d = _mm_set1_pd(X0);
        __m128d v_Y0d = _mm_set1_pd(Y0);
        __m128d v_W0 = _mm_set1_pd(W0);
        __m128d v_x1 = _mm_set_pd(1, 0);

        for (; x1 <= bw - 16; x1 += 16)
        {
            // 0-3
            __m128i v_X0, v_Y0;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // 4-7
            __m128i v_X1, v_Y1;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // 8-11
            __m128i v_X2, v_Y2;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // 12-15
            __m128i v_X3, v_Y3;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                                                      _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // store alpha
            __m128i v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y0, v_itsi1), INTER_BITS),
                                             _mm_and_si128(v_X0, v_itsi1));
            __m128i v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y1, v_itsi1), INTER_BITS),
                                             _mm_and_si128(v_X1, v_itsi1));
            _mm_storeu_si128((__m128i *)(alpha + x1), _mm_packs_epi32(v_alpha0, v_alpha1));

            v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y2, v_itsi1), INTER_BITS),
                                     _mm_and_si128(v_X2, v_itsi1));
            v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y3, v_itsi1), INTER_BITS),
                                     _mm_and_si128(v_X3, v_itsi1));
            _mm_storeu_si128((__m128i *)(alpha + x1 + 8), _mm_packs_epi32(v_alpha0, v_alpha1));

            // convert to 16s
            v_X0 = _mm_packs_epi32(_mm_srai_epi32(v_X0, INTER_BITS), _mm_srai_epi32(v_X1, INTER_BITS));
            v_X1 = _mm_packs_epi32(_mm_srai_epi32(v_X2, INTER_BITS), _mm_srai_epi32(v_X3, INTER_BITS));
            v_Y0 = _mm_packs_epi32(_mm_srai_epi32(v_Y0, INTER_BITS), _mm_srai_epi32(v_Y1, INTER_BITS));
            v_Y1 = _mm_packs_epi32(_mm_srai_epi32(v_Y2, INTER_BITS), _mm_srai_epi32(v_Y3, INTER_BITS));

            _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1);

            _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0);
            _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1);
            _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0);
            _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1);
        }
        for (; x1 < bw; x1++)
        {
            double W = W0 + M[6] * x1;
            W = W ? INTER_TAB_SIZE / W : 0;
            double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1)*W));
            double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1)*W));
            int X = saturate_cast<int>(fX);
            int Y = saturate_cast<int>(fY);

            xy[x1 * 2] = saturate_cast<short>(X >> INTER_BITS);
            xy[x1 * 2 + 1] = saturate_cast<short>(Y >> INTER_BITS);
            alpha[x1] = (short)((Y & (INTER_TAB_SIZE - 1))*INTER_TAB_SIZE +
                                (X & (INTER_TAB_SIZE - 1)));
        }
    }
    virtual ~WarpPerspectiveLine_SSE4_Impl() CV_OVERRIDE {}
};

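// Factory returning the SSE4.1 line implementation; the caller is expected
// to select it at runtime only when the CPU reports SSE4.1 support.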
Ptr<WarpPerspectiveLine_SSE4> WarpPerspectiveLine_SSE4::getImpl(const double *M)
{
    return Ptr<WarpPerspectiveLine_SSE4>(new WarpPerspectiveLine_SSE4_Impl(M));
}

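// A minimal usage sketch (hypothetical caller, for illustration only):
//
//   Ptr<WarpPerspectiveLine_SSE4> line = WarpPerspectiveLine_SSE4::getImpl(M);
//   // per block row, with X0/Y0/W0 evaluated at the row start:
//   line->processNN(M, xy, X0, Y0, W0, bw);           // INTER_NEAREST
//   line->process(M, xy, alpha, X0, Y0, W0, bw);      // interpolated modes
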
}
}
/* End of file. */