// GitHub Repository: Tetragramm/opencv
// Path: blob/master/modules/core/src/arithm_simd.hpp
1
/*M///////////////////////////////////////////////////////////////////////////////////////
2
//
3
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4
//
5
// By downloading, copying, installing or using the software you agree to this license.
6
// If you do not agree to this license, do not download, install,
7
// copy or use the software.
8
//
9
//
10
// License Agreement
11
// For Open Source Computer Vision Library
12
//
13
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16
// Copyright (C) 2015, Itseez Inc., all rights reserved.
17
// Third party copyrights are property of their respective owners.
18
//
19
// Redistribution and use in source and binary forms, with or without modification,
20
// are permitted provided that the following conditions are met:
21
//
22
// * Redistribution's of source code must retain the above copyright notice,
23
// this list of conditions and the following disclaimer.
24
//
25
// * Redistribution's in binary form must reproduce the above copyright notice,
26
// this list of conditions and the following disclaimer in the documentation
27
// and/or other materials provided with the distribution.
28
//
29
// * The name of the copyright holders may not be used to endorse or promote products
30
// derived from this software without specific prior written permission.
31
//
32
// This software is provided by the copyright holders and contributors "as is" and
33
// any express or implied warranties, including, but not limited to, the implied
34
// warranties of merchantability and fitness for a particular purpose are disclaimed.
35
// In no event shall the Intel Corporation or contributors be liable for any direct,
36
// indirect, incidental, special, exemplary, or consequential damages
37
// (including, but not limited to, procurement of substitute goods or services;
38
// loss of use, data, or profits; or business interruption) however caused
39
// and on any theory of liability, whether in contract, strict liability,
40
// or tort (including negligence or otherwise) arising in any way out of
41
// the use of this software, even if advised of the possibility of such damage.
42
//
43
//M*/
44
45
#ifndef __OPENCV_ARITHM_SIMD_HPP__
46
#define __OPENCV_ARITHM_SIMD_HPP__
47
48
namespace cv {
49
50
struct NOP {};
51
52
#if CV_SSE2 || CV_NEON
53
#define IF_SIMD(op) op
54
#else
55
#define IF_SIMD(op) NOP
56
#endif
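// NOP is a placeholder vector-op type: IF_SIMD(op) expands to the real SIMD
// functor `op` when a vector ISA (SSE2 or NEON) is compiled in, and to NOP
// otherwise, so templates that take a vector-op parameter still instantiate
// in scalar-only builds (the callers that consume IF_SIMD live outside this
// header).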
57
58
59
#if CV_SSE2 || CV_NEON
60
61
#define FUNCTOR_TEMPLATE(name) \
62
template<typename T> struct name {}
63
64
FUNCTOR_TEMPLATE(VLoadStore128);
65
#if CV_SSE2
66
FUNCTOR_TEMPLATE(VLoadStore64);
67
FUNCTOR_TEMPLATE(VLoadStore128Aligned);
68
#if CV_AVX2
69
FUNCTOR_TEMPLATE(VLoadStore256);
70
FUNCTOR_TEMPLATE(VLoadStore256Aligned);
71
#endif
72
#endif
73
74
#endif
75
76
#if CV_AVX2
77
78
#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body) \
79
template <> \
80
struct name<template_arg>{ \
81
typedef register_type reg_type; \
82
static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \
83
static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); } \
84
}
85
86
#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body) \
87
template <> \
88
struct name<template_arg>{ \
89
typedef register_type reg_type; \
90
static reg_type load(const template_arg * p) { return load_body (p); } \
91
static void store(template_arg * p, reg_type v) { store_body (p, v); } \
92
}
93
94
#define FUNCTOR_CLOSURE_2arg(name, template_arg, body) \
95
template<> \
96
struct name<template_arg> \
97
{ \
98
VLoadStore256<template_arg>::reg_type operator()( \
99
const VLoadStore256<template_arg>::reg_type & a, \
100
const VLoadStore256<template_arg>::reg_type & b) const \
101
{ \
102
body; \
103
} \
104
}
105
106
#define FUNCTOR_CLOSURE_1arg(name, template_arg, body) \
107
template<> \
108
struct name<template_arg> \
109
{ \
110
VLoadStore256<template_arg>::reg_type operator()( \
111
const VLoadStore256<template_arg>::reg_type & a, \
112
const VLoadStore256<template_arg>::reg_type & ) const \
113
{ \
114
body; \
115
} \
116
}
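// The two macro families above generate matching pieces: VLoadStore256<T>
// moves 256-bit blocks between memory and a register, and the closure functors
// instantiated below (VAdd<T>, VSub<T>, ...) combine two such registers.
// A caller would typically drive them in an element-wise loop; a minimal
// sketch (hypothetical helper, not part of this header):
//
//     template<typename T, class VOp>
//     void vbinop256(const T* a, const T* b, T* dst, int n)
//     {
//         VOp op;
//         const int step = 32 / (int)sizeof(T);   // elements per 256-bit register
//         int i = 0;
//         for( ; i <= n - step; i += step )
//             VLoadStore256<T>::store(dst + i,
//                 op(VLoadStore256<T>::load(a + i), VLoadStore256<T>::load(b + i)));
//         // the remaining (n % step) elements are left to the scalar kernel
//     }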
117
118
FUNCTOR_LOADSTORE_CAST(VLoadStore256, uchar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
119
FUNCTOR_LOADSTORE_CAST(VLoadStore256, schar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
120
FUNCTOR_LOADSTORE_CAST(VLoadStore256, ushort, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
121
FUNCTOR_LOADSTORE_CAST(VLoadStore256, short, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
122
FUNCTOR_LOADSTORE_CAST(VLoadStore256, int, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
123
FUNCTOR_LOADSTORE( VLoadStore256, float, __m256 , _mm256_loadu_ps , _mm256_storeu_ps );
124
FUNCTOR_LOADSTORE( VLoadStore256, double, __m256d, _mm256_loadu_pd , _mm256_storeu_pd );
125
126
FUNCTOR_LOADSTORE_CAST(VLoadStore256Aligned, int, __m256i, _mm256_load_si256, _mm256_store_si256);
127
FUNCTOR_LOADSTORE( VLoadStore256Aligned, float, __m256 , _mm256_load_ps , _mm256_store_ps );
128
FUNCTOR_LOADSTORE( VLoadStore256Aligned, double, __m256d, _mm256_load_pd , _mm256_store_pd );
129
130
FUNCTOR_TEMPLATE(VAdd);
131
FUNCTOR_CLOSURE_2arg(VAdd, uchar, return _mm256_adds_epu8 (a, b));
132
FUNCTOR_CLOSURE_2arg(VAdd, schar, return _mm256_adds_epi8 (a, b));
133
FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm256_adds_epu16(a, b));
134
FUNCTOR_CLOSURE_2arg(VAdd, short, return _mm256_adds_epi16(a, b));
135
FUNCTOR_CLOSURE_2arg(VAdd, int, return _mm256_add_epi32 (a, b));
136
FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm256_add_ps (a, b));
137
FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm256_add_pd (a, b));
138
139
FUNCTOR_TEMPLATE(VSub);
140
FUNCTOR_CLOSURE_2arg(VSub, uchar, return _mm256_subs_epu8 (a, b));
141
FUNCTOR_CLOSURE_2arg(VSub, schar, return _mm256_subs_epi8 (a, b));
142
FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm256_subs_epu16(a, b));
143
FUNCTOR_CLOSURE_2arg(VSub, short, return _mm256_subs_epi16(a, b));
144
FUNCTOR_CLOSURE_2arg(VSub, int, return _mm256_sub_epi32 (a, b));
145
FUNCTOR_CLOSURE_2arg(VSub, float, return _mm256_sub_ps (a, b));
146
FUNCTOR_CLOSURE_2arg(VSub, double, return _mm256_sub_pd (a, b));
147
148
FUNCTOR_TEMPLATE(VMin);
149
FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm256_min_epu8 (a, b));
150
FUNCTOR_CLOSURE_2arg(VMin, schar, return _mm256_min_epi8 (a, b));
151
FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm256_min_epu16(a, b));
152
FUNCTOR_CLOSURE_2arg(VMin, short, return _mm256_min_epi16(a, b));
153
FUNCTOR_CLOSURE_2arg(VMin, int, return _mm256_min_epi32(a, b));
154
FUNCTOR_CLOSURE_2arg(VMin, float, return _mm256_min_ps (a, b));
155
FUNCTOR_CLOSURE_2arg(VMin, double, return _mm256_min_pd (a, b));
156
157
FUNCTOR_TEMPLATE(VMax);
158
FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm256_max_epu8 (a, b));
159
FUNCTOR_CLOSURE_2arg(VMax, schar, return _mm256_max_epi8 (a, b));
160
FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm256_max_epu16(a, b));
161
FUNCTOR_CLOSURE_2arg(VMax, short, return _mm256_max_epi16(a, b));
162
FUNCTOR_CLOSURE_2arg(VMax, int, return _mm256_max_epi32(a, b));
163
FUNCTOR_CLOSURE_2arg(VMax, float, return _mm256_max_ps (a, b));
164
FUNCTOR_CLOSURE_2arg(VMax, double, return _mm256_max_pd (a, b));
165
166
167
static unsigned int CV_DECL_ALIGNED(32) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff,
168
0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
169
static unsigned int CV_DECL_ALIGNED(32) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff,
170
0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff };
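// Both masks clear only the IEEE-754 sign bit, so ANDing (a - b) with them
// yields |a - b| in a single instruction.  The 64-bit constant
// 0x7fffffffffffffff is spelled as little-endian 32-bit halves (0xffffffff
// low word, 0x7fffffff high word), which is why the entries alternate.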
171
172
FUNCTOR_TEMPLATE(VAbsDiff);
173
FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar,
174
return _mm256_add_epi8(_mm256_subs_epu8(a, b), _mm256_subs_epu8(b, a));
175
);
176
FUNCTOR_CLOSURE_2arg(VAbsDiff, schar,
177
__m256i d = _mm256_subs_epi8(a, b);
178
__m256i m = _mm256_cmpgt_epi8(b, a);
179
return _mm256_subs_epi8(_mm256_xor_si256(d, m), m);
180
);
181
FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort,
182
return _mm256_add_epi16(_mm256_subs_epu16(a, b), _mm256_subs_epu16(b, a));
183
);
184
FUNCTOR_CLOSURE_2arg(VAbsDiff, short,
185
__m256i M = _mm256_max_epi16(a, b);
186
__m256i m = _mm256_min_epi16(a, b);
187
return _mm256_subs_epi16(M, m);
188
);
189
FUNCTOR_CLOSURE_2arg(VAbsDiff, int,
190
__m256i d = _mm256_sub_epi32(a, b);
191
__m256i m = _mm256_cmpgt_epi32(b, a);
192
return _mm256_sub_epi32(_mm256_xor_si256(d, m), m);
193
);
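// The signed 8- and 32-bit absolute differences above use the branchless
// conditional-negate identity: with m = (b > a) ? ~0 : 0, the expression
// (d ^ m) - m equals -d when the mask is set and d otherwise, i.e. |a - b|
// without a compare-and-select (the saturating _epi8 variant also clamps
// results that would overflow the signed 8-bit range).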
194
FUNCTOR_CLOSURE_2arg(VAbsDiff, float,
195
return _mm256_and_ps(_mm256_sub_ps(a, b), *(const __m256*)v32f_absmask);
196
);
197
FUNCTOR_CLOSURE_2arg(VAbsDiff, double,
198
return _mm256_and_pd(_mm256_sub_pd(a, b), *(const __m256d*)v64f_absmask);
199
);
200
201
FUNCTOR_TEMPLATE(VAnd);
202
FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm256_and_si256(a, b));
203
FUNCTOR_TEMPLATE(VOr);
204
FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm256_or_si256 (a, b));
205
FUNCTOR_TEMPLATE(VXor);
206
FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm256_xor_si256(a, b));
207
FUNCTOR_TEMPLATE(VNot);
208
FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm256_xor_si256(_mm256_set1_epi32(-1), a));
209
210
#elif CV_SSE2
211
212
#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body)\
213
template <> \
214
struct name<template_arg>{ \
215
typedef register_type reg_type; \
216
static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \
217
static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); } \
218
}
219
220
#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\
221
template <> \
222
struct name<template_arg>{ \
223
typedef register_type reg_type; \
224
static reg_type load(const template_arg * p) { return load_body (p); } \
225
static void store(template_arg * p, reg_type v) { store_body (p, v); } \
226
}
227
228
#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\
229
template<> \
230
struct name<template_arg> \
231
{ \
232
VLoadStore128<template_arg>::reg_type operator()( \
233
const VLoadStore128<template_arg>::reg_type & a, \
234
const VLoadStore128<template_arg>::reg_type & b) const \
235
{ \
236
body; \
237
} \
238
}
239
240
#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\
241
template<> \
242
struct name<template_arg> \
243
{ \
244
VLoadStore128<template_arg>::reg_type operator()( \
245
const VLoadStore128<template_arg>::reg_type & a, \
246
const VLoadStore128<template_arg>::reg_type & ) const \
247
{ \
248
body; \
249
} \
250
}
251
252
FUNCTOR_LOADSTORE_CAST(VLoadStore128, uchar, __m128i, _mm_loadu_si128, _mm_storeu_si128);
253
FUNCTOR_LOADSTORE_CAST(VLoadStore128, schar, __m128i, _mm_loadu_si128, _mm_storeu_si128);
254
FUNCTOR_LOADSTORE_CAST(VLoadStore128, ushort, __m128i, _mm_loadu_si128, _mm_storeu_si128);
255
FUNCTOR_LOADSTORE_CAST(VLoadStore128, short, __m128i, _mm_loadu_si128, _mm_storeu_si128);
256
FUNCTOR_LOADSTORE_CAST(VLoadStore128, int, __m128i, _mm_loadu_si128, _mm_storeu_si128);
257
FUNCTOR_LOADSTORE( VLoadStore128, float, __m128 , _mm_loadu_ps , _mm_storeu_ps );
258
FUNCTOR_LOADSTORE( VLoadStore128, double, __m128d, _mm_loadu_pd , _mm_storeu_pd );
259
260
FUNCTOR_LOADSTORE_CAST(VLoadStore64, uchar, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
261
FUNCTOR_LOADSTORE_CAST(VLoadStore64, schar, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
262
FUNCTOR_LOADSTORE_CAST(VLoadStore64, ushort, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
263
FUNCTOR_LOADSTORE_CAST(VLoadStore64, short, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
264
265
FUNCTOR_LOADSTORE_CAST(VLoadStore128Aligned, int, __m128i, _mm_load_si128, _mm_store_si128);
266
FUNCTOR_LOADSTORE( VLoadStore128Aligned, float, __m128 , _mm_load_ps , _mm_store_ps );
267
FUNCTOR_LOADSTORE( VLoadStore128Aligned, double, __m128d, _mm_load_pd , _mm_store_pd );
268
269
FUNCTOR_TEMPLATE(VAdd);
270
FUNCTOR_CLOSURE_2arg(VAdd, uchar, return _mm_adds_epu8 (a, b));
271
FUNCTOR_CLOSURE_2arg(VAdd, schar, return _mm_adds_epi8 (a, b));
272
FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm_adds_epu16(a, b));
273
FUNCTOR_CLOSURE_2arg(VAdd, short, return _mm_adds_epi16(a, b));
274
FUNCTOR_CLOSURE_2arg(VAdd, int, return _mm_add_epi32 (a, b));
275
FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm_add_ps (a, b));
276
FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm_add_pd (a, b));
277
278
FUNCTOR_TEMPLATE(VSub);
279
FUNCTOR_CLOSURE_2arg(VSub, uchar, return _mm_subs_epu8 (a, b));
280
FUNCTOR_CLOSURE_2arg(VSub, schar, return _mm_subs_epi8 (a, b));
281
FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm_subs_epu16(a, b));
282
FUNCTOR_CLOSURE_2arg(VSub, short, return _mm_subs_epi16(a, b));
283
FUNCTOR_CLOSURE_2arg(VSub, int, return _mm_sub_epi32 (a, b));
284
FUNCTOR_CLOSURE_2arg(VSub, float, return _mm_sub_ps (a, b));
285
FUNCTOR_CLOSURE_2arg(VSub, double, return _mm_sub_pd (a, b));
286
287
FUNCTOR_TEMPLATE(VMin);
288
FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm_min_epu8(a, b));
289
FUNCTOR_CLOSURE_2arg(VMin, schar,
290
__m128i m = _mm_cmpgt_epi8(a, b);
291
return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
292
);
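// SSE2 has no 8-bit signed or 32-bit packed min/max instruction, so the schar
// and int min/max specializations in this block build them from a compare mask
// m and the branchless select  a ^ ((a ^ b) & m),  which yields b wherever m
// is all-ones and a elsewhere.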
293
FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm_subs_epu16(a, _mm_subs_epu16(a, b)));
294
FUNCTOR_CLOSURE_2arg(VMin, short, return _mm_min_epi16(a, b));
295
FUNCTOR_CLOSURE_2arg(VMin, int,
296
__m128i m = _mm_cmpgt_epi32(a, b);
297
return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
298
);
299
FUNCTOR_CLOSURE_2arg(VMin, float, return _mm_min_ps(a, b));
300
FUNCTOR_CLOSURE_2arg(VMin, double, return _mm_min_pd(a, b));
301
302
FUNCTOR_TEMPLATE(VMax);
303
FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm_max_epu8(a, b));
304
FUNCTOR_CLOSURE_2arg(VMax, schar,
305
__m128i m = _mm_cmpgt_epi8(b, a);
306
return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
307
);
308
FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm_adds_epu16(_mm_subs_epu16(a, b), b));
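// Unsigned 16-bit min/max are likewise synthesized from saturating arithmetic:
// min(a,b) = a - sat(a - b) and max(a,b) = sat(a - b) + b, because the
// saturating subtraction already yields 0 whenever b >= a.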
309
FUNCTOR_CLOSURE_2arg(VMax, short, return _mm_max_epi16(a, b));
310
FUNCTOR_CLOSURE_2arg(VMax, int,
311
__m128i m = _mm_cmpgt_epi32(b, a);
312
return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
313
);
314
FUNCTOR_CLOSURE_2arg(VMax, float, return _mm_max_ps(a, b));
315
FUNCTOR_CLOSURE_2arg(VMax, double, return _mm_max_pd(a, b));
316
317
318
static unsigned int CV_DECL_ALIGNED(16) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
319
static unsigned int CV_DECL_ALIGNED(16) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff };
320
321
FUNCTOR_TEMPLATE(VAbsDiff);
322
FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar,
323
return _mm_add_epi8(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
324
);
325
FUNCTOR_CLOSURE_2arg(VAbsDiff, schar,
326
__m128i d = _mm_subs_epi8(a, b);
327
__m128i m = _mm_cmpgt_epi8(b, a);
328
return _mm_subs_epi8(_mm_xor_si128(d, m), m);
329
);
330
FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort,
331
return _mm_add_epi16(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
332
);
333
FUNCTOR_CLOSURE_2arg(VAbsDiff, short,
334
__m128i M = _mm_max_epi16(a, b);
335
__m128i m = _mm_min_epi16(a, b);
336
return _mm_subs_epi16(M, m);
337
);
338
FUNCTOR_CLOSURE_2arg(VAbsDiff, int,
339
__m128i d = _mm_sub_epi32(a, b);
340
__m128i m = _mm_cmpgt_epi32(b, a);
341
return _mm_sub_epi32(_mm_xor_si128(d, m), m);
342
);
343
FUNCTOR_CLOSURE_2arg(VAbsDiff, float,
344
return _mm_and_ps(_mm_sub_ps(a,b), *(const __m128*)v32f_absmask);
345
);
346
FUNCTOR_CLOSURE_2arg(VAbsDiff, double,
347
return _mm_and_pd(_mm_sub_pd(a,b), *(const __m128d*)v64f_absmask);
348
);
349
350
FUNCTOR_TEMPLATE(VAnd);
351
FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm_and_si128(a, b));
352
FUNCTOR_TEMPLATE(VOr);
353
FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm_or_si128 (a, b));
354
FUNCTOR_TEMPLATE(VXor);
355
FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm_xor_si128(a, b));
356
FUNCTOR_TEMPLATE(VNot);
357
FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm_xor_si128(_mm_set1_epi32(-1), a));
358
#endif
359
360
#if CV_NEON
361
362
#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\
363
template <> \
364
struct name<template_arg>{ \
365
typedef register_type reg_type; \
366
static reg_type load(const template_arg * p) { return load_body (p);}; \
367
static void store(template_arg * p, reg_type v) { store_body (p, v);}; \
368
}
369
370
#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\
371
template<> \
372
struct name<template_arg> \
373
{ \
374
VLoadStore128<template_arg>::reg_type operator()( \
375
VLoadStore128<template_arg>::reg_type a, \
376
VLoadStore128<template_arg>::reg_type b) const \
377
{ \
378
return body; \
379
}; \
380
}
381
382
#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\
383
template<> \
384
struct name<template_arg> \
385
{ \
386
VLoadStore128<template_arg>::reg_type operator()( \
387
VLoadStore128<template_arg>::reg_type a, \
388
VLoadStore128<template_arg>::reg_type ) const \
389
{ \
390
return body; \
391
}; \
392
}
393
394
FUNCTOR_LOADSTORE(VLoadStore128, uchar, uint8x16_t, vld1q_u8 , vst1q_u8 );
395
FUNCTOR_LOADSTORE(VLoadStore128, schar, int8x16_t, vld1q_s8 , vst1q_s8 );
396
FUNCTOR_LOADSTORE(VLoadStore128, ushort, uint16x8_t, vld1q_u16, vst1q_u16);
397
FUNCTOR_LOADSTORE(VLoadStore128, short, int16x8_t, vld1q_s16, vst1q_s16);
398
FUNCTOR_LOADSTORE(VLoadStore128, int, int32x4_t, vld1q_s32, vst1q_s32);
399
FUNCTOR_LOADSTORE(VLoadStore128, float, float32x4_t, vld1q_f32, vst1q_f32);
400
401
FUNCTOR_TEMPLATE(VAdd);
402
FUNCTOR_CLOSURE_2arg(VAdd, uchar, vqaddq_u8 (a, b));
403
FUNCTOR_CLOSURE_2arg(VAdd, schar, vqaddq_s8 (a, b));
404
FUNCTOR_CLOSURE_2arg(VAdd, ushort, vqaddq_u16(a, b));
405
FUNCTOR_CLOSURE_2arg(VAdd, short, vqaddq_s16(a, b));
406
FUNCTOR_CLOSURE_2arg(VAdd, int, vaddq_s32 (a, b));
407
FUNCTOR_CLOSURE_2arg(VAdd, float, vaddq_f32 (a, b));
408
409
FUNCTOR_TEMPLATE(VSub);
410
FUNCTOR_CLOSURE_2arg(VSub, uchar, vqsubq_u8 (a, b));
411
FUNCTOR_CLOSURE_2arg(VSub, schar, vqsubq_s8 (a, b));
412
FUNCTOR_CLOSURE_2arg(VSub, ushort, vqsubq_u16(a, b));
413
FUNCTOR_CLOSURE_2arg(VSub, short, vqsubq_s16(a, b));
414
FUNCTOR_CLOSURE_2arg(VSub, int, vsubq_s32 (a, b));
415
FUNCTOR_CLOSURE_2arg(VSub, float, vsubq_f32 (a, b));
416
417
FUNCTOR_TEMPLATE(VMin);
418
FUNCTOR_CLOSURE_2arg(VMin, uchar, vminq_u8 (a, b));
419
FUNCTOR_CLOSURE_2arg(VMin, schar, vminq_s8 (a, b));
420
FUNCTOR_CLOSURE_2arg(VMin, ushort, vminq_u16(a, b));
421
FUNCTOR_CLOSURE_2arg(VMin, short, vminq_s16(a, b));
422
FUNCTOR_CLOSURE_2arg(VMin, int, vminq_s32(a, b));
423
FUNCTOR_CLOSURE_2arg(VMin, float, vminq_f32(a, b));
424
425
FUNCTOR_TEMPLATE(VMax);
426
FUNCTOR_CLOSURE_2arg(VMax, uchar, vmaxq_u8 (a, b));
427
FUNCTOR_CLOSURE_2arg(VMax, schar, vmaxq_s8 (a, b));
428
FUNCTOR_CLOSURE_2arg(VMax, ushort, vmaxq_u16(a, b));
429
FUNCTOR_CLOSURE_2arg(VMax, short, vmaxq_s16(a, b));
430
FUNCTOR_CLOSURE_2arg(VMax, int, vmaxq_s32(a, b));
431
FUNCTOR_CLOSURE_2arg(VMax, float, vmaxq_f32(a, b));
432
433
FUNCTOR_TEMPLATE(VAbsDiff);
434
FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, vabdq_u8 (a, b));
435
FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, vqabsq_s8 (vqsubq_s8(a, b)));
436
FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, vabdq_u16 (a, b));
437
FUNCTOR_CLOSURE_2arg(VAbsDiff, short, vqabsq_s16(vqsubq_s16(a, b)));
438
FUNCTOR_CLOSURE_2arg(VAbsDiff, int, vabdq_s32 (a, b));
439
FUNCTOR_CLOSURE_2arg(VAbsDiff, float, vabdq_f32 (a, b));
440
441
FUNCTOR_TEMPLATE(VAnd);
442
FUNCTOR_CLOSURE_2arg(VAnd, uchar, vandq_u8(a, b));
443
FUNCTOR_TEMPLATE(VOr);
444
FUNCTOR_CLOSURE_2arg(VOr , uchar, vorrq_u8(a, b));
445
FUNCTOR_TEMPLATE(VXor);
446
FUNCTOR_CLOSURE_2arg(VXor, uchar, veorq_u8(a, b));
447
FUNCTOR_TEMPLATE(VNot);
448
FUNCTOR_CLOSURE_1arg(VNot, uchar, vmvnq_u8(a ));
449
#endif
450
451
452
template <typename T>
453
struct Cmp_SIMD
454
{
455
explicit Cmp_SIMD(int)
456
{
457
}
458
459
int operator () (const T *, const T *, uchar *, int) const
460
{
461
return 0;
462
}
463
};
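// Each Cmp_SIMD<T> specialization below compares up to `width` elements of
// src1 and src2, writes 0x00/0xFF per element into the uchar dst, and returns
// how many elements it processed; only CMP_GT, CMP_LE, CMP_EQ and CMP_NE are
// vectorized, anything else returns 0.  A minimal sketch of the assumed
// calling pattern (cmpFn is a hypothetical scalar comparison helper):
//
//     Cmp_SIMD<schar> vop(code);
//     int x = vop(src1, src2, dst, width);      // vectorized prefix
//     for( ; x < width; x++ )                   // scalar tail
//         dst[x] = (uchar)(cmpFn(src1[x], src2[x], code) ? 255 : 0);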
464
465
#if CV_NEON
466
467
template <>
468
struct Cmp_SIMD<schar>
469
{
470
explicit Cmp_SIMD(int code_) :
471
code(code_)
472
{
473
// CV_Assert(code == CMP_GT || code == CMP_LE ||
474
// code == CMP_EQ || code == CMP_NE);
475
476
v_mask = vdupq_n_u8(255);
477
}
478
479
int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const
480
{
481
int x = 0;
482
483
if (code == CMP_GT)
484
for ( ; x <= width - 16; x += 16)
485
vst1q_u8(dst + x, vcgtq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)));
486
else if (code == CMP_LE)
487
for ( ; x <= width - 16; x += 16)
488
vst1q_u8(dst + x, vcleq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)));
489
else if (code == CMP_EQ)
490
for ( ; x <= width - 16; x += 16)
491
vst1q_u8(dst + x, vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)));
492
else if (code == CMP_NE)
493
for ( ; x <= width - 16; x += 16)
494
vst1q_u8(dst + x, veorq_u8(vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)), v_mask));
495
496
return x;
497
}
498
499
int code;
500
uint8x16_t v_mask;
501
};
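// For element types wider than 8 bits the comparisons below produce 16- or
// 32-bit lane masks, which must be narrowed (vmovn_* / vcombine_*) down to
// one 0x00/0xFF byte per element before they can be stored into the uchar
// destination.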
502
503
template <>
504
struct Cmp_SIMD<ushort>
505
{
506
explicit Cmp_SIMD(int code_) :
507
code(code_)
508
{
509
// CV_Assert(code == CMP_GT || code == CMP_LE ||
510
// code == CMP_EQ || code == CMP_NE);
511
512
v_mask = vdup_n_u8(255);
513
}
514
515
int operator () (const ushort * src1, const ushort * src2, uchar * dst, int width) const
516
{
517
int x = 0;
518
519
if (code == CMP_GT)
520
for ( ; x <= width - 8; x += 8)
521
{
522
uint16x8_t v_dst = vcgtq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
523
vst1_u8(dst + x, vmovn_u16(v_dst));
524
}
525
else if (code == CMP_LE)
526
for ( ; x <= width - 8; x += 8)
527
{
528
uint16x8_t v_dst = vcleq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
529
vst1_u8(dst + x, vmovn_u16(v_dst));
530
}
531
else if (code == CMP_EQ)
532
for ( ; x <= width - 8; x += 8)
533
{
534
uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
535
vst1_u8(dst + x, vmovn_u16(v_dst));
536
}
537
else if (code == CMP_NE)
538
for ( ; x <= width - 8; x += 8)
539
{
540
uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
541
vst1_u8(dst + x, veor_u8(vmovn_u16(v_dst), v_mask));
542
}
543
544
return x;
545
}
546
547
int code;
548
uint8x8_t v_mask;
549
};
550
551
template <>
552
struct Cmp_SIMD<int>
553
{
554
explicit Cmp_SIMD(int code_) :
555
code(code_)
556
{
557
// CV_Assert(code == CMP_GT || code == CMP_LE ||
558
// code == CMP_EQ || code == CMP_NE);
559
560
v_mask = vdup_n_u8(255);
561
}
562
563
int operator () (const int * src1, const int * src2, uchar * dst, int width) const
564
{
565
int x = 0;
566
567
if (code == CMP_GT)
568
for ( ; x <= width - 8; x += 8)
569
{
570
uint32x4_t v_dst1 = vcgtq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
571
uint32x4_t v_dst2 = vcgtq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
572
vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
573
}
574
else if (code == CMP_LE)
575
for ( ; x <= width - 8; x += 8)
576
{
577
uint32x4_t v_dst1 = vcleq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
578
uint32x4_t v_dst2 = vcleq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
579
vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
580
}
581
else if (code == CMP_EQ)
582
for ( ; x <= width - 8; x += 8)
583
{
584
uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
585
uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
586
vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
587
}
588
else if (code == CMP_NE)
589
for ( ; x <= width - 8; x += 8)
590
{
591
uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
592
uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
593
uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)));
594
vst1_u8(dst + x, veor_u8(v_dst, v_mask));
595
}
596
597
return x;
598
}
599
600
int code;
601
uint8x8_t v_mask;
602
};
603
604
template <>
605
struct Cmp_SIMD<float>
606
{
607
explicit Cmp_SIMD(int code_) :
608
code(code_)
609
{
610
// CV_Assert(code == CMP_GT || code == CMP_LE ||
611
// code == CMP_EQ || code == CMP_NE);
612
613
v_mask = vdup_n_u8(255);
614
}
615
616
int operator () (const float * src1, const float * src2, uchar * dst, int width) const
617
{
618
int x = 0;
619
620
if (code == CMP_GT)
621
for ( ; x <= width - 8; x += 8)
622
{
623
uint32x4_t v_dst1 = vcgtq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
624
uint32x4_t v_dst2 = vcgtq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
625
vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
626
}
627
else if (code == CMP_LE)
628
for ( ; x <= width - 8; x += 8)
629
{
630
uint32x4_t v_dst1 = vcleq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
631
uint32x4_t v_dst2 = vcleq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
632
vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
633
}
634
else if (code == CMP_EQ)
635
for ( ; x <= width - 8; x += 8)
636
{
637
uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
638
uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
639
vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
640
}
641
else if (code == CMP_NE)
642
for ( ; x <= width - 8; x += 8)
643
{
644
uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
645
uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
646
uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)));
647
vst1_u8(dst + x, veor_u8(v_dst, v_mask));
648
}
649
650
return x;
651
}
652
653
int code;
654
uint8x8_t v_mask;
655
};
656
657
#elif CV_SSE2
658
659
template <>
660
struct Cmp_SIMD<schar>
661
{
662
explicit Cmp_SIMD(int code_) :
663
code(code_)
664
{
665
// CV_Assert(code == CMP_GT || code == CMP_LE ||
666
// code == CMP_EQ || code == CMP_NE);
667
668
haveSSE = checkHardwareSupport(CV_CPU_SSE2);
669
670
v_mask = _mm_set1_epi8(-1);
671
}
672
673
int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const
674
{
675
int x = 0;
676
677
if (!haveSSE)
678
return x;
679
680
if (code == CMP_GT)
681
for ( ; x <= width - 16; x += 16)
682
_mm_storeu_si128((__m128i *)(dst + x), _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
683
_mm_loadu_si128((const __m128i *)(src2 + x))));
684
else if (code == CMP_LE)
685
for ( ; x <= width - 16; x += 16)
686
{
687
__m128i v_gt = _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
688
_mm_loadu_si128((const __m128i *)(src2 + x)));
689
_mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_gt));
690
}
691
else if (code == CMP_EQ)
692
for ( ; x <= width - 16; x += 16)
693
_mm_storeu_si128((__m128i *)(dst + x), _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
694
_mm_loadu_si128((const __m128i *)(src2 + x))));
695
else if (code == CMP_NE)
696
for ( ; x <= width - 16; x += 16)
697
{
698
__m128i v_eq = _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
699
_mm_loadu_si128((const __m128i *)(src2 + x)));
700
_mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_eq));
701
}
702
703
return x;
704
}
705
706
int code;
707
__m128i v_mask;
708
bool haveSSE;
709
};
710
711
template <>
712
struct Cmp_SIMD<int>
713
{
714
explicit Cmp_SIMD(int code_) :
715
code(code_)
716
{
717
// CV_Assert(code == CMP_GT || code == CMP_LE ||
718
// code == CMP_EQ || code == CMP_NE);
719
720
haveSSE = checkHardwareSupport(CV_CPU_SSE2);
721
722
v_mask = _mm_set1_epi32(0xffffffff);
723
}
724
725
int operator () (const int * src1, const int * src2, uchar * dst, int width) const
726
{
727
int x = 0;
728
729
if (!haveSSE)
730
return x;
731
732
if (code == CMP_GT)
733
for ( ; x <= width - 8; x += 8)
734
{
735
__m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
736
_mm_loadu_si128((const __m128i *)(src2 + x)));
737
__m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
738
_mm_loadu_si128((const __m128i *)(src2 + x + 4)));
739
740
_mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask));
741
}
742
else if (code == CMP_LE)
743
for ( ; x <= width - 8; x += 8)
744
{
745
__m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
746
_mm_loadu_si128((const __m128i *)(src2 + x)));
747
__m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
748
_mm_loadu_si128((const __m128i *)(src2 + x + 4)));
749
750
_mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask), v_mask));
751
}
752
else if (code == CMP_EQ)
753
for ( ; x <= width - 8; x += 8)
754
{
755
__m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
756
_mm_loadu_si128((const __m128i *)(src2 + x)));
757
__m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
758
_mm_loadu_si128((const __m128i *)(src2 + x + 4)));
759
760
_mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask));
761
}
762
else if (code == CMP_NE)
763
for ( ; x <= width - 8; x += 8)
764
{
765
__m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
766
_mm_loadu_si128((const __m128i *)(src2 + x)));
767
__m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
768
_mm_loadu_si128((const __m128i *)(src2 + x + 4)));
769
770
_mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)));
771
}
772
773
return x;
774
}
775
776
int code;
777
__m128i v_mask;
778
bool haveSSE;
779
};
780
781
#endif
782
783
784
template <typename T, typename WT>
785
struct Mul_SIMD
786
{
787
int operator() (const T *, const T *, T *, int, WT) const
788
{
789
return 0;
790
}
791
};
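// Mul_SIMD<T, WT> follows the same partial-processing convention as Cmp_SIMD:
// each specialization widens the inputs to float, computes src1 * src2 * scale
// (with saturation for the integer types), stores as many elements as fit its
// vector width, and returns that count (0 when the required ISA is missing),
// leaving the tail to the scalar caller.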
792
793
#if CV_NEON
794
795
template <>
796
struct Mul_SIMD<uchar, float>
797
{
798
int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, float scale) const
799
{
800
int x = 0;
801
802
if( scale == 1.0f )
803
for ( ; x <= width - 8; x += 8)
804
{
805
uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x));
806
uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x));
807
808
float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
809
vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
810
float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
811
vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
812
813
uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
814
vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
815
vst1_u8(dst + x, vqmovn_u16(v_dst));
816
}
817
else
818
{
819
float32x4_t v_scale = vdupq_n_f32(scale);
820
for ( ; x <= width - 8; x += 8)
821
{
822
uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x));
823
uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x));
824
825
float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
826
vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
827
v_dst1 = vmulq_f32(v_dst1, v_scale);
828
float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
829
vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
830
v_dst2 = vmulq_f32(v_dst2, v_scale);
831
832
uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
833
vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
834
vst1_u8(dst + x, vqmovn_u16(v_dst));
835
}
836
}
837
838
return x;
839
}
840
};
841
842
template <>
843
struct Mul_SIMD<schar, float>
844
{
845
int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const
846
{
847
int x = 0;
848
849
if( scale == 1.0f )
850
for ( ; x <= width - 8; x += 8)
851
{
852
int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x));
853
int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x));
854
855
float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
856
vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
857
float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
858
vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
859
860
int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
861
vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
862
vst1_s8(dst + x, vqmovn_s16(v_dst));
863
}
864
else
865
{
866
float32x4_t v_scale = vdupq_n_f32(scale);
867
for ( ; x <= width - 8; x += 8)
868
{
869
int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x));
870
int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x));
871
872
float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
873
vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
874
v_dst1 = vmulq_f32(v_dst1, v_scale);
875
float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
876
vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
877
v_dst2 = vmulq_f32(v_dst2, v_scale);
878
879
int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
880
vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
881
vst1_s8(dst + x, vqmovn_s16(v_dst));
882
}
883
}
884
885
return x;
886
}
887
};
888
889
template <>
890
struct Mul_SIMD<ushort, float>
891
{
892
int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const
893
{
894
int x = 0;
895
896
if( scale == 1.0f )
897
for ( ; x <= width - 8; x += 8)
898
{
899
uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x);
900
901
float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
902
vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
903
float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
904
vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
905
906
uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
907
vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
908
vst1q_u16(dst + x, v_dst);
909
}
910
else
911
{
912
float32x4_t v_scale = vdupq_n_f32(scale);
913
for ( ; x <= width - 8; x += 8)
914
{
915
uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x);
916
917
float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
918
vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
919
v_dst1 = vmulq_f32(v_dst1, v_scale);
920
float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
921
vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
922
v_dst2 = vmulq_f32(v_dst2, v_scale);
923
924
uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
925
vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
926
vst1q_u16(dst + x, v_dst);
927
}
928
}
929
930
return x;
931
}
932
};
933
934
template <>
935
struct Mul_SIMD<short, float>
936
{
937
int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const
938
{
939
int x = 0;
940
941
if( scale == 1.0f )
942
for ( ; x <= width - 8; x += 8)
943
{
944
int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x);
945
946
float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
947
vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
948
float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
949
vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
950
951
int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
952
vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
953
vst1q_s16(dst + x, v_dst);
954
}
955
else
956
{
957
float32x4_t v_scale = vdupq_n_f32(scale);
958
for ( ; x <= width - 8; x += 8)
959
{
960
int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x);
961
962
float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
963
vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
964
v_dst1 = vmulq_f32(v_dst1, v_scale);
965
float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
966
vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
967
v_dst2 = vmulq_f32(v_dst2, v_scale);
968
969
int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
970
vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
971
vst1q_s16(dst + x, v_dst);
972
}
973
}
974
975
return x;
976
}
977
};
978
979
template <>
980
struct Mul_SIMD<float, float>
981
{
982
int operator() (const float * src1, const float * src2, float * dst, int width, float scale) const
983
{
984
int x = 0;
985
986
if( scale == 1.0f )
987
for ( ; x <= width - 8; x += 8)
988
{
989
float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
990
float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
991
vst1q_f32(dst + x, v_dst1);
992
vst1q_f32(dst + x + 4, v_dst2);
993
}
994
else
995
{
996
float32x4_t v_scale = vdupq_n_f32(scale);
997
for ( ; x <= width - 8; x += 8)
998
{
999
float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
1000
v_dst1 = vmulq_f32(v_dst1, v_scale);
1001
1002
float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
1003
v_dst2 = vmulq_f32(v_dst2, v_scale);
1004
1005
vst1q_f32(dst + x, v_dst1);
1006
vst1q_f32(dst + x + 4, v_dst2);
1007
}
1008
}
1009
1010
return x;
1011
}
1012
};
1013
1014
#elif CV_SSE2
1015
1016
#if CV_SSE4_1
1017
1018
template <>
1019
struct Mul_SIMD<ushort, float>
1020
{
1021
Mul_SIMD()
1022
{
1023
haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
1024
}
1025
1026
int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const
1027
{
1028
int x = 0;
1029
1030
if (!haveSSE)
1031
return x;
1032
1033
__m128i v_zero = _mm_setzero_si128();
1034
1035
if( scale != 1.0f )
1036
{
1037
__m128 v_scale = _mm_set1_ps(scale);
1038
for ( ; x <= width - 8; x += 8)
1039
{
1040
__m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x));
1041
__m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x));
1042
1043
__m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)),
1044
_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero)));
1045
v_dst1 = _mm_mul_ps(v_dst1, v_scale);
1046
1047
__m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)),
1048
_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero)));
1049
v_dst2 = _mm_mul_ps(v_dst2, v_scale);
1050
1051
__m128i v_dsti = _mm_packus_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
1052
_mm_storeu_si128((__m128i *)(dst + x), v_dsti);
1053
}
1054
}
1055
1056
return x;
1057
}
1058
1059
bool haveSSE;
1060
};
1061
1062
#endif
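// The ushort specialization above requires SSE4.1 because packing 32-bit
// results back to unsigned 16-bit with saturation (_mm_packus_epi32) only
// exists from SSE4.1 onwards; the remaining specializations need plain SSE2.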
1063
1064
template <>
1065
struct Mul_SIMD<schar, float>
1066
{
1067
Mul_SIMD()
1068
{
1069
haveSSE = checkHardwareSupport(CV_CPU_SSE2);
1070
}
1071
1072
int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const
1073
{
1074
int x = 0;
1075
1076
if (!haveSSE)
1077
return x;
1078
1079
__m128i v_zero = _mm_setzero_si128();
1080
1081
if( scale == 1.0f )
1082
for ( ; x <= width - 8; x += 8)
1083
{
1084
__m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x));
1085
__m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x));
1086
1087
v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8);
1088
v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);
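// The two lines above sign-extend the eight schar values to 16 bits without
// SSE4.1: the bytes are unpacked into the HIGH half of each 16-bit lane (low
// byte zero) and then arithmetic-shifted right by 8.  The unpacklo/unpackhi_epi16
// + _mm_srai_epi32(..., 16) pairs below repeat the trick for 16 -> 32 bits.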
1089
1090
__m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
1091
_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));
1092
1093
__m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
1094
_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));
1095
1096
__m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
1097
_mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero));
1098
}
1099
else
1100
{
1101
__m128 v_scale = _mm_set1_ps(scale);
1102
for ( ; x <= width - 8; x += 8)
1103
{
1104
__m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x));
1105
__m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x));
1106
1107
v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8);
1108
v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);
1109
1110
__m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
1111
_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));
1112
v_dst1 = _mm_mul_ps(v_dst1, v_scale);
1113
1114
__m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
1115
_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));
1116
v_dst2 = _mm_mul_ps(v_dst2, v_scale);
1117
1118
__m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
1119
_mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero));
1120
}
1121
}
1122
1123
return x;
1124
}
1125
1126
bool haveSSE;
1127
};
1128
1129
template <>
1130
struct Mul_SIMD<short, float>
1131
{
1132
Mul_SIMD()
1133
{
1134
haveSSE = checkHardwareSupport(CV_CPU_SSE2);
1135
}
1136
1137
int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const
1138
{
1139
int x = 0;
1140
1141
if (!haveSSE)
1142
return x;
1143
1144
__m128i v_zero = _mm_setzero_si128();
1145
1146
if( scale != 1.0f )
1147
{
1148
__m128 v_scale = _mm_set1_ps(scale);
1149
for ( ; x <= width - 8; x += 8)
1150
{
1151
__m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x));
1152
__m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x));
1153
1154
__m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
1155
_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));
1156
v_dst1 = _mm_mul_ps(v_dst1, v_scale);
1157
1158
__m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
1159
_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));
1160
v_dst2 = _mm_mul_ps(v_dst2, v_scale);
1161
1162
__m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
1163
_mm_storeu_si128((__m128i *)(dst + x), v_dsti);
1164
}
1165
}
1166
1167
return x;
1168
}
1169
1170
bool haveSSE;
1171
};
1172
1173
#endif
1174
1175
template <typename T>
1176
struct Div_SIMD
1177
{
1178
int operator() (const T *, const T *, T *, int, double) const
1179
{
1180
return 0;
1181
}
1182
};
1183
1184
template <typename T>
1185
struct Recip_SIMD
1186
{
1187
int operator() (const T *, T *, int, double) const
1188
{
1189
return 0;
1190
}
1191
};
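// The Div_SIMD / Recip_SIMD specializations below are written with OpenCV's
// universal intrinsics (v_float32x4, v_load, v_round, ...) instead of raw
// SSE/NEON calls, so one body serves every CV_SIMD128 backend.  For the
// integer types a zero divisor lane is masked out with v_select so that it
// produces 0 (OpenCV's x/0 == 0 convention); the float/double paths simply
// follow IEEE-754.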
1192
1193
1194
#if CV_SIMD128
1195
1196
template <>
1197
struct Div_SIMD<uchar>
1198
{
1199
bool haveSIMD;
1200
Div_SIMD() { haveSIMD = hasSIMD128(); }
1201
1202
int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, double scale) const
1203
{
1204
int x = 0;
1205
1206
if (!haveSIMD)
1207
return x;
1208
1209
v_float32x4 v_scale = v_setall_f32((float)scale);
1210
v_uint16x8 v_zero = v_setzero_u16();
1211
1212
for ( ; x <= width - 8; x += 8)
1213
{
1214
v_uint16x8 v_src1 = v_load_expand(src1 + x);
1215
v_uint16x8 v_src2 = v_load_expand(src2 + x);
1216
1217
v_uint32x4 t0, t1, t2, t3;
1218
v_expand(v_src1, t0, t1);
1219
v_expand(v_src2, t2, t3);
1220
1221
v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
1222
v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
1223
1224
v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2));
1225
v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3));
1226
1227
f0 = f0 * v_scale / f2;
1228
f1 = f1 * v_scale / f3;
1229
1230
v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
1231
v_uint16x8 res = v_pack_u(i0, i1);
1232
1233
res = v_select(v_src2 == v_zero, v_zero, res);
1234
v_pack_store(dst + x, res);
1235
}
1236
1237
return x;
1238
}
1239
};
1240
1241
1242
template <>
1243
struct Div_SIMD<schar>
1244
{
1245
bool haveSIMD;
1246
Div_SIMD() { haveSIMD = hasSIMD128(); }
1247
1248
int operator() (const schar * src1, const schar * src2, schar * dst, int width, double scale) const
1249
{
1250
int x = 0;
1251
1252
if (!haveSIMD)
1253
return x;
1254
1255
v_float32x4 v_scale = v_setall_f32((float)scale);
1256
v_int16x8 v_zero = v_setzero_s16();
1257
1258
for ( ; x <= width - 8; x += 8)
1259
{
1260
v_int16x8 v_src1 = v_load_expand(src1 + x);
1261
v_int16x8 v_src2 = v_load_expand(src2 + x);
1262
1263
v_int32x4 t0, t1, t2, t3;
1264
v_expand(v_src1, t0, t1);
1265
v_expand(v_src2, t2, t3);
1266
1267
v_float32x4 f0 = v_cvt_f32(t0);
1268
v_float32x4 f1 = v_cvt_f32(t1);
1269
1270
v_float32x4 f2 = v_cvt_f32(t2);
1271
v_float32x4 f3 = v_cvt_f32(t3);
1272
1273
f0 = f0 * v_scale / f2;
1274
f1 = f1 * v_scale / f3;
1275
1276
v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
1277
v_int16x8 res = v_pack(i0, i1);
1278
1279
res = v_select(v_src2 == v_zero, v_zero, res);
1280
v_pack_store(dst + x, res);
1281
}
1282
1283
return x;
1284
}
1285
};
1286
1287
1288
template <>
1289
struct Div_SIMD<ushort>
1290
{
1291
bool haveSIMD;
1292
Div_SIMD() { haveSIMD = hasSIMD128(); }
1293
1294
int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, double scale) const
1295
{
1296
int x = 0;
1297
1298
if (!haveSIMD)
1299
return x;
1300
1301
v_float32x4 v_scale = v_setall_f32((float)scale);
1302
v_uint16x8 v_zero = v_setzero_u16();
1303
1304
for ( ; x <= width - 8; x += 8)
1305
{
1306
v_uint16x8 v_src1 = v_load(src1 + x);
1307
v_uint16x8 v_src2 = v_load(src2 + x);
1308
1309
v_uint32x4 t0, t1, t2, t3;
1310
v_expand(v_src1, t0, t1);
1311
v_expand(v_src2, t2, t3);
1312
1313
v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
1314
v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
1315
1316
v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2));
1317
v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3));
1318
1319
f0 = f0 * v_scale / f2;
1320
f1 = f1 * v_scale / f3;
1321
1322
v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
1323
v_uint16x8 res = v_pack_u(i0, i1);
1324
1325
res = v_select(v_src2 == v_zero, v_zero, res);
1326
v_store(dst + x, res);
1327
}
1328
1329
return x;
1330
}
1331
};
1332
1333
template <>
1334
struct Div_SIMD<short>
1335
{
1336
bool haveSIMD;
1337
Div_SIMD() { haveSIMD = hasSIMD128(); }
1338
1339
int operator() (const short * src1, const short * src2, short * dst, int width, double scale) const
1340
{
1341
int x = 0;
1342
1343
if (!haveSIMD)
1344
return x;
1345
1346
v_float32x4 v_scale = v_setall_f32((float)scale);
1347
v_int16x8 v_zero = v_setzero_s16();
1348
1349
for ( ; x <= width - 8; x += 8)
1350
{
1351
v_int16x8 v_src1 = v_load(src1 + x);
1352
v_int16x8 v_src2 = v_load(src2 + x);
1353
1354
v_int32x4 t0, t1, t2, t3;
1355
v_expand(v_src1, t0, t1);
1356
v_expand(v_src2, t2, t3);
1357
1358
v_float32x4 f0 = v_cvt_f32(t0);
1359
v_float32x4 f1 = v_cvt_f32(t1);
1360
1361
v_float32x4 f2 = v_cvt_f32(t2);
1362
v_float32x4 f3 = v_cvt_f32(t3);
1363
1364
f0 = f0 * v_scale / f2;
1365
f1 = f1 * v_scale / f3;
1366
1367
v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
1368
v_int16x8 res = v_pack(i0, i1);
1369
1370
res = v_select(v_src2 == v_zero, v_zero, res);
1371
v_store(dst + x, res);
1372
}
1373
1374
return x;
1375
}
1376
};
1377
1378
template <>
1379
struct Div_SIMD<int>
1380
{
1381
bool haveSIMD;
1382
Div_SIMD() { haveSIMD = hasSIMD128(); }
1383
1384
int operator() (const int * src1, const int * src2, int * dst, int width, double scale) const
1385
{
1386
int x = 0;
1387
1388
if (!haveSIMD)
1389
return x;
1390
1391
v_float32x4 v_scale = v_setall_f32((float)scale);
1392
v_int32x4 v_zero = v_setzero_s32();
1393
1394
for ( ; x <= width - 8; x += 8)
1395
{
1396
v_int32x4 t0 = v_load(src1 + x);
1397
v_int32x4 t1 = v_load(src1 + x + 4);
1398
v_int32x4 t2 = v_load(src2 + x);
1399
v_int32x4 t3 = v_load(src2 + x + 4);
1400
1401
v_float32x4 f0 = v_cvt_f32(t0);
1402
v_float32x4 f1 = v_cvt_f32(t1);
1403
v_float32x4 f2 = v_cvt_f32(t2);
1404
v_float32x4 f3 = v_cvt_f32(t3);
1405
1406
f0 = f0 * v_scale / f2;
1407
f1 = f1 * v_scale / f3;
1408
1409
v_int32x4 res0 = v_round(f0), res1 = v_round(f1);
1410
1411
res0 = v_select(t2 == v_zero, v_zero, res0);
1412
res1 = v_select(t3 == v_zero, v_zero, res1);
1413
v_store(dst + x, res0);
1414
v_store(dst + x + 4, res1);
1415
}
1416
1417
return x;
1418
}
1419
};
1420
1421
1422
template <>
1423
struct Div_SIMD<float>
1424
{
1425
bool haveSIMD;
1426
Div_SIMD() { haveSIMD = hasSIMD128(); }
1427
1428
int operator() (const float * src1, const float * src2, float * dst, int width, double scale) const
1429
{
1430
int x = 0;
1431
1432
if (!haveSIMD)
1433
return x;
1434
1435
v_float32x4 v_scale = v_setall_f32((float)scale);
1436
1437
for ( ; x <= width - 8; x += 8)
1438
{
1439
v_float32x4 f0 = v_load(src1 + x);
1440
v_float32x4 f1 = v_load(src1 + x + 4);
1441
v_float32x4 f2 = v_load(src2 + x);
1442
v_float32x4 f3 = v_load(src2 + x + 4);
1443
1444
v_float32x4 res0 = f0 * v_scale / f2;
1445
v_float32x4 res1 = f1 * v_scale / f3;
1446
1447
v_store(dst + x, res0);
1448
v_store(dst + x + 4, res1);
1449
}
1450
1451
return x;
1452
}
1453
};
1454
1455
1456
///////////////////////// RECIPROCAL //////////////////////
1457
1458
template <>
1459
struct Recip_SIMD<uchar>
1460
{
1461
bool haveSIMD;
1462
Recip_SIMD() { haveSIMD = hasSIMD128(); }
1463
1464
int operator() (const uchar * src2, uchar * dst, int width, double scale) const
1465
{
1466
int x = 0;
1467
1468
if (!haveSIMD)
1469
return x;
1470
1471
v_float32x4 v_scale = v_setall_f32((float)scale);
1472
v_uint16x8 v_zero = v_setzero_u16();
1473
1474
for ( ; x <= width - 8; x += 8)
1475
{
1476
v_uint16x8 v_src2 = v_load_expand(src2 + x);
1477
1478
v_uint32x4 t0, t1;
1479
v_expand(v_src2, t0, t1);
1480
1481
v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
1482
v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
1483
1484
f0 = v_scale / f0;
1485
f1 = v_scale / f1;
1486
1487
v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
1488
v_uint16x8 res = v_pack_u(i0, i1);
1489
1490
res = v_select(v_src2 == v_zero, v_zero, res);
1491
v_pack_store(dst + x, res);
1492
}
1493
1494
return x;
1495
}
1496
};
1497
1498
1499
template <>
1500
struct Recip_SIMD<schar>
1501
{
1502
bool haveSIMD;
1503
Recip_SIMD() { haveSIMD = hasSIMD128(); }
1504
1505
int operator() (const schar * src2, schar * dst, int width, double scale) const
1506
{
1507
int x = 0;
1508
1509
if (!haveSIMD)
1510
return x;
1511
1512
v_float32x4 v_scale = v_setall_f32((float)scale);
1513
v_int16x8 v_zero = v_setzero_s16();
1514
1515
for ( ; x <= width - 8; x += 8)
1516
{
1517
v_int16x8 v_src2 = v_load_expand(src2 + x);
1518
1519
v_int32x4 t0, t1;
1520
v_expand(v_src2, t0, t1);
1521
1522
v_float32x4 f0 = v_cvt_f32(t0);
1523
v_float32x4 f1 = v_cvt_f32(t1);
1524
1525
f0 = v_scale / f0;
1526
f1 = v_scale / f1;
1527
1528
v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
1529
v_int16x8 res = v_pack(i0, i1);
1530
1531
res = v_select(v_src2 == v_zero, v_zero, res);
1532
v_pack_store(dst + x, res);
1533
}
1534
1535
return x;
1536
}
1537
};
1538
1539
1540
template <>
1541
struct Recip_SIMD<ushort>
1542
{
1543
bool haveSIMD;
1544
Recip_SIMD() { haveSIMD = hasSIMD128(); }
1545
1546
int operator() (const ushort * src2, ushort * dst, int width, double scale) const
1547
{
1548
int x = 0;
1549
1550
if (!haveSIMD)
1551
return x;
1552
1553
v_float32x4 v_scale = v_setall_f32((float)scale);
1554
v_uint16x8 v_zero = v_setzero_u16();
1555
1556
for ( ; x <= width - 8; x += 8)
1557
{
1558
v_uint16x8 v_src2 = v_load(src2 + x);
1559
1560
v_uint32x4 t0, t1;
1561
v_expand(v_src2, t0, t1);
1562
1563
v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
1564
v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
1565
1566
f0 = v_scale / f0;
1567
f1 = v_scale / f1;
1568
1569
v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
1570
v_uint16x8 res = v_pack_u(i0, i1);
1571
1572
res = v_select(v_src2 == v_zero, v_zero, res);
1573
v_store(dst + x, res);
1574
}
1575
1576
return x;
1577
}
1578
};
1579
1580
template <>
1581
struct Recip_SIMD<short>
1582
{
1583
bool haveSIMD;
1584
Recip_SIMD() { haveSIMD = hasSIMD128(); }
1585
1586
int operator() (const short * src2, short * dst, int width, double scale) const
1587
{
1588
int x = 0;
1589
1590
if (!haveSIMD)
1591
return x;
1592
1593
v_float32x4 v_scale = v_setall_f32((float)scale);
1594
v_int16x8 v_zero = v_setzero_s16();
1595
1596
for ( ; x <= width - 8; x += 8)
1597
{
1598
v_int16x8 v_src2 = v_load(src2 + x);
1599
1600
v_int32x4 t0, t1;
1601
v_expand(v_src2, t0, t1);
1602
1603
v_float32x4 f0 = v_cvt_f32(t0);
1604
v_float32x4 f1 = v_cvt_f32(t1);
1605
1606
f0 = v_scale / f0;
1607
f1 = v_scale / f1;
1608
1609
v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
1610
v_int16x8 res = v_pack(i0, i1);
1611
1612
res = v_select(v_src2 == v_zero, v_zero, res);
1613
v_store(dst + x, res);
1614
}
1615
1616
return x;
1617
}
1618
};
1619
1620
template <>
1621
struct Recip_SIMD<int>
1622
{
1623
bool haveSIMD;
1624
Recip_SIMD() { haveSIMD = hasSIMD128(); }
1625
1626
int operator() (const int * src2, int * dst, int width, double scale) const
1627
{
1628
int x = 0;
1629
1630
if (!haveSIMD)
1631
return x;
1632
1633
v_float32x4 v_scale = v_setall_f32((float)scale);
1634
v_int32x4 v_zero = v_setzero_s32();
1635
1636
for ( ; x <= width - 8; x += 8)
1637
{
1638
v_int32x4 t0 = v_load(src2 + x);
1639
v_int32x4 t1 = v_load(src2 + x + 4);
1640
1641
v_float32x4 f0 = v_cvt_f32(t0);
1642
v_float32x4 f1 = v_cvt_f32(t1);
1643
1644
f0 = v_scale / f0;
1645
f1 = v_scale / f1;
1646
1647
v_int32x4 res0 = v_round(f0), res1 = v_round(f1);
1648
1649
res0 = v_select(t0 == v_zero, v_zero, res0);
1650
res1 = v_select(t1 == v_zero, v_zero, res1);
1651
v_store(dst + x, res0);
1652
v_store(dst + x + 4, res1);
1653
}
1654
1655
return x;
1656
}
1657
};
1658
1659
1660
template <>
1661
struct Recip_SIMD<float>
1662
{
1663
bool haveSIMD;
1664
Recip_SIMD() { haveSIMD = hasSIMD128(); }
1665
1666
int operator() (const float * src2, float * dst, int width, double scale) const
1667
{
1668
int x = 0;
1669
1670
if (!haveSIMD)
1671
return x;
1672
1673
v_float32x4 v_scale = v_setall_f32((float)scale);
1674
1675
for ( ; x <= width - 8; x += 8)
1676
{
1677
v_float32x4 f0 = v_load(src2 + x);
1678
v_float32x4 f1 = v_load(src2 + x + 4);
1679
1680
v_float32x4 res0 = v_scale / f0;
1681
v_float32x4 res1 = v_scale / f1;
1682
1683
v_store(dst + x, res0);
1684
v_store(dst + x + 4, res1);
1685
}
1686
1687
return x;
1688
}
1689
};
1690
1691
#if CV_SIMD128_64F
1692
1693
template <>
1694
struct Div_SIMD<double>
1695
{
1696
bool haveSIMD;
1697
Div_SIMD() { haveSIMD = hasSIMD128(); }
1698
1699
int operator() (const double * src1, const double * src2, double * dst, int width, double scale) const
1700
{
1701
int x = 0;
1702
1703
if (!haveSIMD)
1704
return x;
1705
1706
v_float64x2 v_scale = v_setall_f64(scale);
1707
1708
for ( ; x <= width - 4; x += 4)
1709
{
1710
v_float64x2 f0 = v_load(src1 + x);
1711
v_float64x2 f1 = v_load(src1 + x + 2);
1712
v_float64x2 f2 = v_load(src2 + x);
1713
v_float64x2 f3 = v_load(src2 + x + 2);
1714
1715
v_float64x2 res0 = f0 * v_scale / f2;
1716
v_float64x2 res1 = f1 * v_scale / f3;
1717
1718
v_store(dst + x, res0);
1719
v_store(dst + x + 2, res1);
1720
}
1721
1722
return x;
1723
}
1724
};
1725
1726
template <>
1727
struct Recip_SIMD<double>
1728
{
1729
bool haveSIMD;
1730
Recip_SIMD() { haveSIMD = hasSIMD128(); }
1731
1732
int operator() (const double * src2, double * dst, int width, double scale) const
1733
{
1734
int x = 0;
1735
1736
if (!haveSIMD)
1737
return x;
1738
1739
v_float64x2 v_scale = v_setall_f64(scale);
1740
1741
for ( ; x <= width - 4; x += 4)
1742
{
1743
v_float64x2 f0 = v_load(src2 + x);
1744
v_float64x2 f1 = v_load(src2 + x + 2);
1745
1746
v_float64x2 res0 = v_scale / f0;
1747
v_float64x2 res1 = v_scale / f1;
1748
1749
v_store(dst + x, res0);
1750
v_store(dst + x + 2, res1);
1751
}
1752
1753
return x;
1754
}
1755
};
1756
1757
#endif
1758
1759
#endif
1760
1761
1762
template <typename T, typename WT>
1763
struct AddWeighted_SIMD
1764
{
1765
int operator() (const T *, const T *, T *, int, WT, WT, WT) const
1766
{
1767
return 0;
1768
}
1769
};
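// AddWeighted_SIMD<T, WT> vectorizes dst[i] = saturate_cast<T>(src1[i]*alpha +
// src2[i]*beta + gamma) and, like the other *_SIMD functors, returns the number
// of elements handled.  A sketch of the assumed calling pattern:
//
//     AddWeighted_SIMD<short, float> vop;
//     int x = vop(src1, src2, dst, width, alpha, beta, gamma);
//     for( ; x < width; x++ )                   // scalar tail
//         dst[x] = saturate_cast<short>(src1[x]*alpha + src2[x]*beta + gamma);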
1770
1771
#if CV_SSE2
1772
1773
template <>
1774
struct AddWeighted_SIMD<schar, float>
1775
{
1776
AddWeighted_SIMD()
1777
{
1778
haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
1779
}
1780
1781
int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const
1782
{
1783
int x = 0;
1784
1785
if (!haveSSE2)
1786
return x;
1787
1788
__m128i v_zero = _mm_setzero_si128();
1789
__m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
1790
v_gamma = _mm_set1_ps(gamma);
1791
1792
for( ; x <= width - 8; x += 8 )
1793
{
1794
__m128i v_src1 = _mm_loadl_epi64((const __m128i *)(src1 + x));
1795
__m128i v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x));
1796
1797
__m128i v_src1_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8);
1798
__m128i v_src2_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);
1799
1800
__m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1_p), 16)), v_alpha);
1801
v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
1802
_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2_p), 16)), v_beta));
1803
1804
__m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1_p), 16)), v_alpha);
1805
v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
1806
_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2_p), 16)), v_beta));
1807
1808
__m128i v_dst16 = _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0),
1809
_mm_cvtps_epi32(v_dstf1));
1810
1811
_mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst16, v_zero));
1812
}
1813
1814
return x;
1815
}
1816
1817
bool haveSSE2;
1818
};
1819
1820
template <>
1821
struct AddWeighted_SIMD<short, float>
1822
{
1823
AddWeighted_SIMD()
1824
{
1825
haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
1826
}
1827
1828
int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const
1829
{
1830
int x = 0;
1831
1832
if (!haveSSE2)
1833
return x;
1834
1835
__m128i v_zero = _mm_setzero_si128();
1836
__m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
1837
v_gamma = _mm_set1_ps(gamma);
1838
1839
for( ; x <= width - 8; x += 8 )
1840
{
1841
__m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x));
1842
__m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
1843
1844
__m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), v_alpha);
1845
v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
1846
_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)), v_beta));
1847
1848
__m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), v_alpha);
1849
v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
1850
_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)), v_beta));
1851
1852
_mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0),
1853
_mm_cvtps_epi32(v_dstf1)));
1854
}
1855
1856
return x;
1857
}
1858
1859
bool haveSSE2;
1860
};
1861
1862
#if CV_SSE4_1
1863
1864
template <>
1865
struct AddWeighted_SIMD<ushort, float>
1866
{
1867
AddWeighted_SIMD()
1868
{
1869
haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
1870
}
1871
1872
int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const
1873
{
1874
int x = 0;
1875
1876
if (!haveSSE4_1)
1877
return x;
1878
1879
__m128i v_zero = _mm_setzero_si128();
1880
__m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
1881
v_gamma = _mm_set1_ps(gamma);
1882
1883
for( ; x <= width - 8; x += 8 )
1884
{
1885
__m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x));
1886
__m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
1887
1888
__m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)), v_alpha);
1889
v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
1890
_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero)), v_beta));
1891
1892
__m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)), v_alpha);
1893
v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
1894
_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero)), v_beta));
1895
1896
_mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(_mm_cvtps_epi32(v_dstf0),
1897
_mm_cvtps_epi32(v_dstf1)));
1898
}
1899
1900
return x;
1901
}
1902
1903
bool haveSSE4_1;
1904
};
1905
1906
#endif
1907
1908
#elif CV_NEON
1909
1910
template <>
1911
struct AddWeighted_SIMD<schar, float>
1912
{
1913
int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const
1914
{
1915
int x = 0;
1916
1917
float32x4_t g = vdupq_n_f32 (gamma);
1918
1919
for( ; x <= width - 8; x += 8 )
1920
{
1921
int8x8_t in1 = vld1_s8(src1 + x);
1922
int16x8_t in1_16 = vmovl_s8(in1);
1923
float32x4_t in1_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in1_16)));
1924
float32x4_t in1_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in1_16)));
1925
1926
int8x8_t in2 = vld1_s8(src2+x);
1927
int16x8_t in2_16 = vmovl_s8(in2);
1928
float32x4_t in2_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in2_16)));
1929
float32x4_t in2_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in2_16)));
1930
1931
float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta));
1932
float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta));
1933
out_f_l = vaddq_f32(out_f_l, g);
1934
out_f_h = vaddq_f32(out_f_h, g);
1935
1936
int16x4_t out_16_l = vqmovn_s32(cv_vrndq_s32_f32(out_f_l));
1937
int16x4_t out_16_h = vqmovn_s32(cv_vrndq_s32_f32(out_f_h));
1938
1939
int16x8_t out_16 = vcombine_s16(out_16_l, out_16_h);
1940
int8x8_t out = vqmovn_s16(out_16);
1941
1942
vst1_s8(dst + x, out);
1943
}
1944
1945
return x;
1946
}
1947
};
1948
1949
template <>
1950
struct AddWeighted_SIMD<ushort, float>
1951
{
1952
int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const
1953
{
1954
int x = 0;
1955
1956
float32x4_t g = vdupq_n_f32(gamma);
1957
1958
for( ; x <= width - 8; x += 8 )
1959
{
1960
uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x);
1961
1962
float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), alpha);
1963
float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))), beta);
1964
uint16x4_t v_dst1 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));
1965
1966
v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), alpha);
1967
v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))), beta);
1968
uint16x4_t v_dst2 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));
1969
1970
vst1q_u16(dst + x, vcombine_u16(v_dst1, v_dst2));
1971
}
1972
1973
return x;
1974
}
1975
};
1976
1977
template <>
1978
struct AddWeighted_SIMD<short, float>
1979
{
1980
int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const
1981
{
1982
int x = 0;
1983
1984
float32x4_t g = vdupq_n_f32(gamma);
1985
1986
for( ; x <= width - 8; x += 8 )
1987
{
1988
int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x);
1989
1990
float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), alpha);
1991
float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))), beta);
1992
int16x4_t v_dst1 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));
1993
1994
v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), alpha);
1995
v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))), beta);
1996
int16x4_t v_dst2 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));
1997
1998
vst1q_s16(dst + x, vcombine_s16(v_dst1, v_dst2));
1999
}
2000
2001
return x;
2002
}
2003
};
2004
2005
#endif
2006
2007
}
2008
2009
#endif // __OPENCV_ARITHM_SIMD_HPP__
2010
2011