Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/modules/imgproc/src/accum.simd.hpp
16354 views
1
// This file is part of OpenCV project.
2
// It is subject to the license terms in the LICENSE file found in the top-level directory
3
// of this distribution and at http://opencv.org/license.html.
4
5
#include "opencv2/core/hal/intrin.hpp"
6
7
// Defines the four public accumulate entry points (acc, accSqr, accProd,
// accW) for an integer source type `type` with accumulator type `acctype`.
// Each function only forwards to the CPU-feature-specific overload selected
// at runtime by CV_CPU_DISPATCH; no comments can live inside the macro body
// because '//' would swallow the line-continuation backslashes.
#define DEF_ACC_INT_FUNCS(suffix, type, acctype) \
void acc_##suffix(const type* src, acctype* dst, \
                  const uchar* mask, int len, int cn) \
{ \
    CV_CPU_DISPATCH(acc_simd_, (src, dst, mask, len, cn), CV_CPU_DISPATCH_MODES_ALL); \
} \
void accSqr_##suffix(const type* src, acctype* dst, \
                     const uchar* mask, int len, int cn) \
{ \
    CV_CPU_DISPATCH(accSqr_simd_, (src, dst, mask, len, cn), CV_CPU_DISPATCH_MODES_ALL); \
} \
void accProd_##suffix(const type* src1, const type* src2, \
                      acctype* dst, const uchar* mask, int len, int cn) \
{ \
    CV_CPU_DISPATCH(accProd_simd_, (src1, src2, dst, mask, len, cn), CV_CPU_DISPATCH_MODES_ALL); \
} \
void accW_##suffix(const type* src, acctype* dst, \
                   const uchar* mask, int len, int cn, double alpha) \
{ \
    CV_CPU_DISPATCH(accW_simd_, (src, dst, mask, len, cn, alpha), CV_CPU_DISPATCH_MODES_ALL); \
}
28
// Same dispatcher generator for floating-point source types.
// NOTE(review): currently token-for-token identical to DEF_ACC_INT_FUNCS —
// presumably kept separate so integer and float variants can diverge later;
// confirm before merging the two macros.
#define DEF_ACC_FLT_FUNCS(suffix, type, acctype) \
void acc_##suffix(const type* src, acctype* dst, \
                  const uchar* mask, int len, int cn) \
{ \
    CV_CPU_DISPATCH(acc_simd_, (src, dst, mask, len, cn), CV_CPU_DISPATCH_MODES_ALL); \
} \
void accSqr_##suffix(const type* src, acctype* dst, \
                     const uchar* mask, int len, int cn) \
{ \
    CV_CPU_DISPATCH(accSqr_simd_, (src, dst, mask, len, cn), CV_CPU_DISPATCH_MODES_ALL); \
} \
void accProd_##suffix(const type* src1, const type* src2, \
                      acctype* dst, const uchar* mask, int len, int cn) \
{ \
    CV_CPU_DISPATCH(accProd_simd_, (src1, src2, dst, mask, len, cn), CV_CPU_DISPATCH_MODES_ALL); \
} \
void accW_##suffix(const type* src, acctype* dst, \
                   const uchar* mask, int len, int cn, double alpha) \
{ \
    CV_CPU_DISPATCH(accW_simd_, (src, dst, mask, len, cn, alpha), CV_CPU_DISPATCH_MODES_ALL); \
}
49
// Emits the forward declarations matching DEF_ACC_*_FUNCS for one
// (source type, accumulator type) pair.
// NOTE(review): "DECLARATE" is a long-standing typo ("DECLARE"); the name is
// referenced below and possibly elsewhere, so it must not be renamed here.
#define DECLARATE_ACC_FUNCS(suffix, type, acctype) \
void acc_##suffix(const type* src, acctype* dst, const uchar* mask, int len, int cn); \
void accSqr_##suffix(const type* src, acctype* dst, const uchar* mask, int len, int cn); \
void accProd_##suffix(const type* src1, const type* src2, acctype* dst, const uchar* mask, int len, int cn); \
void accW_##suffix(const type* src, acctype* dst, const uchar* mask, int len, int cn, double alpha);
54
55
56
namespace cv {
57
58
// Public declarations for every supported (source depth, accumulator depth)
// combination; the suffix encodes the pair (e.g. 8u32f = uchar -> float).
DECLARATE_ACC_FUNCS(8u32f, uchar, float)
DECLARATE_ACC_FUNCS(8u64f, uchar, double)
DECLARATE_ACC_FUNCS(16u32f, ushort, float)
DECLARATE_ACC_FUNCS(16u64f, ushort, double)
DECLARATE_ACC_FUNCS(32f, float, float)
DECLARATE_ACC_FUNCS(32f64f, float, double)
DECLARATE_ACC_FUNCS(64f, double, double)

CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN

// Per-ISA overload sets resolved via CV_CPU_DISPATCH above; one overload per
// (source, accumulator) type combination, selected by ordinary C++ overload
// resolution on the pointer argument types.
void acc_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn);
void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn);
void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn);
void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn);
void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn);
void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn);
void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int cn);
void accSqr_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn);
void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn);
void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn);
void accSqr_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn);
void accSqr_simd_(const float* src, float* dst, const uchar* mask, int len, int cn);
void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int cn);
void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, int cn);
void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar* mask, int len, int cn);
void accProd_simd_(const ushort* src1, const ushort* src2, float* dst, const uchar* mask, int len, int cn);
void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const uchar* mask, int len, int cn);
void accProd_simd_(const ushort* src1, const ushort* src2, double* dst, const uchar* mask, int len, int cn);
void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar* mask, int len, int cn);
void accProd_simd_(const float* src1, const float* src2, double* dst, const uchar* mask, int len, int cn);
void accProd_simd_(const double* src1, const double* src2, double* dst, const uchar* mask, int len, int cn);
void accW_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn, double alpha);
void accW_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn, double alpha);
void accW_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn, double alpha);
void accW_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn, double alpha);
void accW_simd_(const float* src, float* dst, const uchar* mask, int len, int cn, double alpha);
void accW_simd_(const float* src, double* dst, const uchar* mask, int len, int cn, double alpha);
void accW_simd_(const double* src, double* dst, const uchar* mask, int len, int cn, double alpha);
96
97
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
98
// todo: remove AVX branch after support it by universal intrinsics
99
// Scalar fallback for accumulate: dst[i] += src[i].
// With no mask the buffer is processed as a flat array of len*cn values.
// With a mask, only pixels whose mask byte is non-zero are accumulated,
// all cn channels at once. `start` is the number of pixels/elements that a
// SIMD path has already handled.
template <typename T, typename AT>
void acc_general_(const T* src, AT* dst, const uchar* mask, int len, int cn, int start = 0 )
{
    int i = start;

    if( !mask )
    {
        len *= cn;
#if CV_ENABLE_UNROLLED
        // Manual 4x unroll for compilers that benefit from it.
        for( ; i <= len - 4; i += 4 )
        {
            AT s0 = src[i] + dst[i];
            AT s1 = src[i+1] + dst[i+1];
            dst[i] = s0;
            dst[i+1] = s1;
            s0 = src[i+2] + dst[i+2];
            s1 = src[i+3] + dst[i+3];
            dst[i+2] = s0;
            dst[i+3] = s1;
        }
#endif
        while( i < len )
        {
            dst[i] += src[i];
            ++i;
        }
    }
    else
    {
        // Skip the pixels already consumed by the vectorized caller.
        src += (i * cn);
        dst += (i * cn);
        for( ; i < len; i++, src += cn, dst += cn )
        {
            if( !mask[i] )
                continue;
            for( int k = 0; k < cn; k++ )
                dst[k] += src[k];
        }
    }
#if CV_AVX && !CV_AVX2
    _mm256_zeroupper();
#elif CV_SIMD
    vx_cleanup();
#endif
}
146
147
// Scalar fallback for accumulate-square: dst[i] += src[i]^2, with the
// product computed in the accumulator type AT to avoid overflow of small
// integer sources. Mask semantics and `start` are as in acc_general_.
template<typename T, typename AT> void
accSqr_general_( const T* src, AT* dst, const uchar* mask, int len, int cn, int start = 0 )
{
    int i = start;

    if( !mask )
    {
        len *= cn;
#if CV_ENABLE_UNROLLED
        // Manual 4x unroll for compilers that benefit from it.
        for( ; i <= len - 4; i += 4 )
        {
            AT s0 = (AT)src[i]*src[i] + dst[i];
            AT s1 = (AT)src[i+1]*src[i+1] + dst[i+1];
            dst[i] = s0;
            dst[i+1] = s1;
            s0 = (AT)src[i+2]*src[i+2] + dst[i+2];
            s1 = (AT)src[i+3]*src[i+3] + dst[i+3];
            dst[i+2] = s0;
            dst[i+3] = s1;
        }
#endif
        while( i < len )
        {
            dst[i] += (AT)src[i]*src[i];
            ++i;
        }
    }
    else
    {
        // Skip the pixels already consumed by the vectorized caller.
        src += (i * cn);
        dst += (i * cn);
        for( ; i < len; i++, src += cn, dst += cn )
        {
            if( !mask[i] )
                continue;
            for( int k = 0; k < cn; k++ )
                dst[k] += (AT)src[k]*src[k];
        }
    }
#if CV_AVX && !CV_AVX2
    _mm256_zeroupper();
#elif CV_SIMD
    vx_cleanup();
#endif
}
194
195
// Scalar fallback for accumulate-product: dst[i] += src1[i]*src2[i], the
// product computed in the accumulator type AT. Mask semantics and `start`
// are as in acc_general_.
template<typename T, typename AT> void
accProd_general_( const T* src1, const T* src2, AT* dst, const uchar* mask, int len, int cn, int start = 0 )
{
    int i = start;

    if( !mask )
    {
        len *= cn;
#if CV_ENABLE_UNROLLED
        // Manual 4x unroll for compilers that benefit from it.
        for( ; i <= len - 4; i += 4 )
        {
            AT s0 = (AT)src1[i]*src2[i] + dst[i];
            AT s1 = (AT)src1[i+1]*src2[i+1] + dst[i+1];
            dst[i] = s0;
            dst[i+1] = s1;
            s0 = (AT)src1[i+2]*src2[i+2] + dst[i+2];
            s1 = (AT)src1[i+3]*src2[i+3] + dst[i+3];
            dst[i+2] = s0;
            dst[i+3] = s1;
        }
#endif
        while( i < len )
        {
            dst[i] += (AT)src1[i]*src2[i];
            ++i;
        }
    }
    else
    {
        // Skip the pixels already consumed by the vectorized caller.
        src1 += (i * cn);
        src2 += (i * cn);
        dst += (i * cn);
        for( ; i < len; i++, src1 += cn, src2 += cn, dst += cn )
        {
            if( !mask[i] )
                continue;
            for( int k = 0; k < cn; k++ )
                dst[k] += (AT)src1[k]*src2[k];
        }
    }
#if CV_AVX && !CV_AVX2
    _mm256_zeroupper();
#elif CV_SIMD
    vx_cleanup();
#endif
}
243
244
// Scalar fallback for the running weighted average:
//     dst[i] = src[i]*alpha + dst[i]*(1 - alpha)
// with both weights converted once to the accumulator type AT.
// Mask semantics and `start` are as in acc_general_.
template<typename T, typename AT> void
accW_general_( const T* src, AT* dst, const uchar* mask, int len, int cn, double alpha, int start = 0 )
{
    AT a = (AT)alpha, b = 1 - a;
    int i = start;

    if( !mask )
    {
        len *= cn;
#if CV_ENABLE_UNROLLED
        // Manual 4x unroll for compilers that benefit from it.
        for( ; i <= len - 4; i += 4 )
        {
            AT s0 = src[i]*a + dst[i]*b;
            AT s1 = src[i+1]*a + dst[i+1]*b;
            dst[i] = s0;
            dst[i+1] = s1;
            s0 = src[i+2]*a + dst[i+2]*b;
            s1 = src[i+3]*a + dst[i+3]*b;
            dst[i+2] = s0;
            dst[i+3] = s1;
        }
#endif
        while( i < len )
        {
            dst[i] = src[i]*a + dst[i]*b;
            ++i;
        }
    }
    else
    {
        // Skip the pixels already consumed by the vectorized caller.
        src += (i * cn);
        dst += (i * cn);
        for( ; i < len; i++, src += cn, dst += cn )
        {
            if( !mask[i] )
                continue;
            for( int k = 0; k < cn; k++ )
                dst[k] = src[k]*a + dst[k]*b;
        }
    }
#if CV_AVX && !CV_AVX2
    _mm256_zeroupper();
#elif CV_SIMD
    vx_cleanup();
#endif
}
292
// Vectorized accumulate: dst(f32) += src(u8).
// Unmasked path treats the buffers as flat arrays of len*cn values; the
// masked path is specialized for cn==1 and cn==3 only. Whatever remains
// (tail elements, other channel counts) is finished by the scalar
// acc_general_ with the already-processed count passed in `x`.
void acc_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn)
{
    int x = 0;
#if CV_SIMD
    const int cVectorWidth = v_uint8::nlanes;   // u8 source lanes per iteration
    const int step = v_float32::nlanes;         // f32 lanes per destination store

    if (!mask)
    {
        int size = len * cn;
        for (; x <= size - cVectorWidth; x += cVectorWidth)
        {
            // Widen u8 -> u16 -> u32, then convert to f32 for the additions.
            v_uint8 v_src = vx_load(src + x);
            v_uint16 v_src0, v_src1;
            v_expand(v_src, v_src0, v_src1);

            v_uint32 v_src00, v_src01, v_src10, v_src11;
            v_expand(v_src0, v_src00, v_src01);
            v_expand(v_src1, v_src10, v_src11);

            v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
            v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
            v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
            v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
        }
    }
    else
    {
        v_uint8 v_0 = vx_setall_u8(0);
        if (cn == 1)
        {
            for ( ; x <= len - cVectorWidth; x += cVectorWidth)
            {
                // Build a 0x00/0xFF byte mask and zero out masked-off source
                // bytes, so the additions below run unconditionally.
                v_uint8 v_mask = vx_load(mask + x);
                v_mask = ~(v_0 == v_mask);
                v_uint8 v_src = vx_load(src + x);
                v_src = v_src & v_mask;
                v_uint16 v_src0, v_src1;
                v_expand(v_src, v_src0, v_src1);

                v_uint32 v_src00, v_src01, v_src10, v_src11;
                v_expand(v_src0, v_src00, v_src01);
                v_expand(v_src1, v_src10, v_src11);

                v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
                v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
                v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
                v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
            }
        }
        else if (cn == 3)
        {
            for ( ; x <= len - cVectorWidth; x += cVectorWidth)
            {
                v_uint8 v_mask = vx_load(mask + x);
                v_mask = ~(v_0 == v_mask);
                // De-interleave the 3 channels and apply the same pixel mask
                // to each channel plane.
                v_uint8 v_src0, v_src1, v_src2;
                v_load_deinterleave(src + (x * cn), v_src0, v_src1, v_src2);
                v_src0 = v_src0 & v_mask;
                v_src1 = v_src1 & v_mask;
                v_src2 = v_src2 & v_mask;
                v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
                v_expand(v_src0, v_src00, v_src01);
                v_expand(v_src1, v_src10, v_src11);
                v_expand(v_src2, v_src20, v_src21);

                // Second widening stage: u16 -> u32 for every channel half.
                v_uint32 v_src000, v_src001, v_src010, v_src011;
                v_uint32 v_src100, v_src101, v_src110, v_src111;
                v_uint32 v_src200, v_src201, v_src210, v_src211;
                v_expand(v_src00, v_src000, v_src001);
                v_expand(v_src01, v_src010, v_src011);
                v_expand(v_src10, v_src100, v_src101);
                v_expand(v_src11, v_src110, v_src111);
                v_expand(v_src20, v_src200, v_src201);
                v_expand(v_src21, v_src210, v_src211);

                v_float32 v_dst000, v_dst001, v_dst010, v_dst011;
                v_float32 v_dst100, v_dst101, v_dst110, v_dst111;
                v_float32 v_dst200, v_dst201, v_dst210, v_dst211;
                v_load_deinterleave(dst + (x * cn), v_dst000, v_dst100, v_dst200);
                v_load_deinterleave(dst + ((x + step) * cn), v_dst001, v_dst101, v_dst201);
                v_load_deinterleave(dst + ((x + step * 2) * cn), v_dst010, v_dst110, v_dst210);
                v_load_deinterleave(dst + ((x + step * 3) * cn), v_dst011, v_dst111, v_dst211);

                v_dst000 += v_cvt_f32(v_reinterpret_as_s32(v_src000));
                v_dst100 += v_cvt_f32(v_reinterpret_as_s32(v_src100));
                v_dst200 += v_cvt_f32(v_reinterpret_as_s32(v_src200));
                v_dst001 += v_cvt_f32(v_reinterpret_as_s32(v_src001));
                v_dst101 += v_cvt_f32(v_reinterpret_as_s32(v_src101));
                v_dst201 += v_cvt_f32(v_reinterpret_as_s32(v_src201));
                v_dst010 += v_cvt_f32(v_reinterpret_as_s32(v_src010));
                v_dst110 += v_cvt_f32(v_reinterpret_as_s32(v_src110));
                v_dst210 += v_cvt_f32(v_reinterpret_as_s32(v_src210));
                v_dst011 += v_cvt_f32(v_reinterpret_as_s32(v_src011));
                v_dst111 += v_cvt_f32(v_reinterpret_as_s32(v_src111));
                v_dst211 += v_cvt_f32(v_reinterpret_as_s32(v_src211));

                v_store_interleave(dst + (x * cn), v_dst000, v_dst100, v_dst200);
                v_store_interleave(dst + ((x + step) * cn), v_dst001, v_dst101, v_dst201);
                v_store_interleave(dst + ((x + step * 2) * cn), v_dst010, v_dst110, v_dst210);
                v_store_interleave(dst + ((x + step * 3) * cn), v_dst011, v_dst111, v_dst211);
            }
        }
    }
#endif // CV_SIMD
    // Scalar tail / unsupported channel counts.
    acc_general_(src, dst, mask, len, cn, x);
}
399
400
// Vectorized accumulate: dst(f32) += src(u16).
// Same structure as the u8 overload, with one fewer widening stage
// (u16 -> u32 -> f32). Masked path handles cn==1 and cn==3; everything
// else falls through to the scalar acc_general_.
void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn)
{
    int x = 0;
#if CV_SIMD
    const int cVectorWidth = v_uint16::nlanes;  // u16 source lanes per iteration
    const int step = v_float32::nlanes;         // f32 lanes per destination store

    if (!mask)
    {
        int size = len * cn;
        for (; x <= size - cVectorWidth; x += cVectorWidth)
        {
            v_uint16 v_src = vx_load(src + x);
            v_uint32 v_src0, v_src1;
            v_expand(v_src, v_src0, v_src1);

            v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src0)));
            v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src1)));
        }
    }
    else
    {
        if (cn == 1)
        {
            v_uint16 v_0 = vx_setall_u16(0);
            for ( ; x <= len - cVectorWidth; x += cVectorWidth)
            {
                // u8 mask widened to u16, turned into all-ones/all-zeros,
                // then used to zero out masked-off source values.
                v_uint16 v_mask = vx_load_expand(mask + x);
                v_mask = ~(v_mask == v_0);
                v_uint16 v_src = vx_load(src + x);
                v_src = v_src & v_mask;
                v_uint32 v_src0, v_src1;
                v_expand(v_src, v_src0, v_src1);

                v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src0)));
                v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src1)));
            }
        }
        else if (cn == 3)
        {
            v_uint16 v_0 = vx_setall_u16(0);
            for ( ; x <= len - cVectorWidth; x += cVectorWidth)
            {
                v_uint16 v_mask = vx_load_expand(mask + x);
                v_mask = ~(v_mask == v_0);
                // De-interleave channels, mask each plane, then widen.
                v_uint16 v_src0, v_src1, v_src2;
                v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
                v_src0 = v_src0 & v_mask;
                v_src1 = v_src1 & v_mask;
                v_src2 = v_src2 & v_mask;
                v_uint32 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
                v_expand(v_src0, v_src00, v_src01);
                v_expand(v_src1, v_src10, v_src11);
                v_expand(v_src2, v_src20, v_src21);

                v_float32 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
                v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
                v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);

                v_dst00 += v_cvt_f32(v_reinterpret_as_s32(v_src00));
                v_dst01 += v_cvt_f32(v_reinterpret_as_s32(v_src01));
                v_dst10 += v_cvt_f32(v_reinterpret_as_s32(v_src10));
                v_dst11 += v_cvt_f32(v_reinterpret_as_s32(v_src11));
                v_dst20 += v_cvt_f32(v_reinterpret_as_s32(v_src20));
                v_dst21 += v_cvt_f32(v_reinterpret_as_s32(v_src21));

                v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
                v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
            }
        }
    }
#endif // CV_SIMD
    // Scalar tail / unsupported channel counts.
    acc_general_(src, dst, mask, len, cn, x);
}
474
// todo: remove AVX branch after support it by universal intrinsics
// Vectorized accumulate: dst(f32) += src(f32).
// Processes two f32 vectors per iteration (cVectorWidth is deliberately
// v_uint16::nlanes == 2 * v_float32::nlanes, matching the number of f32
// lanes produced by widening one u8-mask load).
void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn)
{
    int x = 0;
#if CV_SIMD
    const int cVectorWidth = v_uint16::nlanes;  // f32 elements per iteration (2 vectors)
    const int step = v_float32::nlanes;         // f32 lanes per vector

    if (!mask)
    {
        int size = len * cn;
#if CV_AVX && !CV_AVX2
        // Plain 256-bit float adds; kept until universal intrinsics cover AVX1.
        for (; x <= size - 8 ; x += 8)
        {
            __m256 v_src = _mm256_loadu_ps(src + x);
            __m256 v_dst = _mm256_loadu_ps(dst + x);
            v_dst = _mm256_add_ps(v_src, v_dst);
            _mm256_storeu_ps(dst + x, v_dst);
        }
#else
        for (; x <= size - cVectorWidth; x += cVectorWidth)
        {
            v_store(dst + x, vx_load(dst + x) + vx_load(src + x));
            v_store(dst + x + step, vx_load(dst + x + step) + vx_load(src + x + step));
        }
#endif // CV_AVX && !CV_AVX2
    }
    else
    {
        v_float32 v_0 = vx_setzero_f32();
        if (cn == 1)
        {
            for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
            {
                // Widen the u8 mask to two u32 vectors, compare against zero
                // and reinterpret as f32 bit-masks for the source values.
                v_uint16 v_masku16 = vx_load_expand(mask + x);
                v_uint32 v_masku320, v_masku321;
                v_expand(v_masku16, v_masku320, v_masku321);
                v_float32 v_mask0 = v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0)));
                v_float32 v_mask1 = v_reinterpret_as_f32(~(v_masku321 == v_reinterpret_as_u32(v_0)));

                v_store(dst + x, vx_load(dst + x) + (vx_load(src + x) & v_mask0));
                v_store(dst + x + step, vx_load(dst + x + step) + (vx_load(src + x + step) & v_mask1));
            }
        }
        else if (cn == 3)
        {
            for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
            {
                v_uint16 v_masku16 = vx_load_expand(mask + x);
                v_uint32 v_masku320, v_masku321;
                v_expand(v_masku16, v_masku320, v_masku321);
                v_float32 v_mask0 = v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0)));
                v_float32 v_mask1 = v_reinterpret_as_f32(~(v_masku321 == v_reinterpret_as_u32(v_0)));

                // De-interleave channels and zero masked-off pixels per plane.
                v_float32 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
                v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20);
                v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21);
                v_src00 = v_src00 & v_mask0;
                v_src01 = v_src01 & v_mask1;
                v_src10 = v_src10 & v_mask0;
                v_src11 = v_src11 & v_mask1;
                v_src20 = v_src20 & v_mask0;
                v_src21 = v_src21 & v_mask1;

                v_float32 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
                v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
                v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);

                v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
                v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
            }
        }
    }
#endif // CV_SIMD
    // Scalar tail / unsupported channel counts.
    acc_general_(src, dst, mask, len, cn, x);
}
550
551
// Vectorized accumulate: dst(f64) += src(u8). Requires 64-bit float SIMD
// support (CV_SIMD_64F). One u8 load fans out into eight f64 vectors.
void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn)
{
    int x = 0;
#if CV_SIMD_64F
    const int cVectorWidth = v_uint8::nlanes;   // u8 source lanes per iteration
    const int step = v_float64::nlanes;         // f64 lanes per destination store

    if (!mask)
    {
        int size = len * cn;
        for (; x <= size - cVectorWidth; x += cVectorWidth)
        {
            // Widen u8 -> u16 -> u32, then convert each u32 vector's low and
            // high halves to f64.
            v_uint8 v_src = vx_load(src + x);
            v_uint16 v_int0, v_int1;
            v_expand(v_src, v_int0, v_int1);

            v_uint32 v_int00, v_int01, v_int10, v_int11;
            v_expand(v_int0, v_int00, v_int01);
            v_expand(v_int1, v_int10, v_int11);

            v_float64 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int00));
            v_float64 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00));
            v_float64 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int01));
            v_float64 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01));
            v_float64 v_src4 = v_cvt_f64(v_reinterpret_as_s32(v_int10));
            v_float64 v_src5 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10));
            v_float64 v_src6 = v_cvt_f64(v_reinterpret_as_s32(v_int11));
            v_float64 v_src7 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11));

            v_float64 v_dst0 = vx_load(dst + x);
            v_float64 v_dst1 = vx_load(dst + x + step);
            v_float64 v_dst2 = vx_load(dst + x + step * 2);
            v_float64 v_dst3 = vx_load(dst + x + step * 3);
            v_float64 v_dst4 = vx_load(dst + x + step * 4);
            v_float64 v_dst5 = vx_load(dst + x + step * 5);
            v_float64 v_dst6 = vx_load(dst + x + step * 6);
            v_float64 v_dst7 = vx_load(dst + x + step * 7);

            v_dst0 = v_dst0 + v_src0;
            v_dst1 = v_dst1 + v_src1;
            v_dst2 = v_dst2 + v_src2;
            v_dst3 = v_dst3 + v_src3;
            v_dst4 = v_dst4 + v_src4;
            v_dst5 = v_dst5 + v_src5;
            v_dst6 = v_dst6 + v_src6;
            v_dst7 = v_dst7 + v_src7;

            v_store(dst + x, v_dst0);
            v_store(dst + x + step, v_dst1);
            v_store(dst + x + step * 2, v_dst2);
            v_store(dst + x + step * 3, v_dst3);
            v_store(dst + x + step * 4, v_dst4);
            v_store(dst + x + step * 5, v_dst5);
            v_store(dst + x + step * 6, v_dst6);
            v_store(dst + x + step * 7, v_dst7);
        }
    }
    else
    {
        v_uint8 v_0 = vx_setall_u8(0);
        if (cn == 1)
        {
            for ( ; x <= len - cVectorWidth; x += cVectorWidth)
            {
                // Zero out masked-off source bytes before widening, as in the
                // f32 overload, so the adds below are unconditional.
                v_uint8 v_mask = vx_load(mask + x);
                v_mask = ~(v_mask == v_0);
                v_uint8 v_src = vx_load(src + x);
                v_src = v_src & v_mask;
                v_uint16 v_int0, v_int1;
                v_expand(v_src, v_int0, v_int1);

                v_uint32 v_int00, v_int01, v_int10, v_int11;
                v_expand(v_int0, v_int00, v_int01);
                v_expand(v_int1, v_int10, v_int11);

                v_float64 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int00));
                v_float64 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00));
                v_float64 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int01));
                v_float64 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01));
                v_float64 v_src4 = v_cvt_f64(v_reinterpret_as_s32(v_int10));
                v_float64 v_src5 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10));
                v_float64 v_src6 = v_cvt_f64(v_reinterpret_as_s32(v_int11));
                v_float64 v_src7 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11));

                v_float64 v_dst0 = vx_load(dst + x);
                v_float64 v_dst1 = vx_load(dst + x + step);
                v_float64 v_dst2 = vx_load(dst + x + step * 2);
                v_float64 v_dst3 = vx_load(dst + x + step * 3);
                v_float64 v_dst4 = vx_load(dst + x + step * 4);
                v_float64 v_dst5 = vx_load(dst + x + step * 5);
                v_float64 v_dst6 = vx_load(dst + x + step * 6);
                v_float64 v_dst7 = vx_load(dst + x + step * 7);

                v_dst0 = v_dst0 + v_src0;
                v_dst1 = v_dst1 + v_src1;
                v_dst2 = v_dst2 + v_src2;
                v_dst3 = v_dst3 + v_src3;
                v_dst4 = v_dst4 + v_src4;
                v_dst5 = v_dst5 + v_src5;
                v_dst6 = v_dst6 + v_src6;
                v_dst7 = v_dst7 + v_src7;

                v_store(dst + x, v_dst0);
                v_store(dst + x + step, v_dst1);
                v_store(dst + x + step * 2, v_dst2);
                v_store(dst + x + step * 3, v_dst3);
                v_store(dst + x + step * 4, v_dst4);
                v_store(dst + x + step * 5, v_dst5);
                v_store(dst + x + step * 6, v_dst6);
                v_store(dst + x + step * 7, v_dst7);
            }
        }
        else if (cn == 3)
        {
            for ( ; x <= len - cVectorWidth; x += cVectorWidth)
            {
                v_uint8 v_mask = vx_load(mask + x);
                v_mask = ~(v_0 == v_mask);
                v_uint8 v_src0, v_src1, v_src2;
                v_load_deinterleave(src + (x * cn), v_src0, v_src1, v_src2);
                v_src0 = v_src0 & v_mask;
                v_src1 = v_src1 & v_mask;
                v_src2 = v_src2 & v_mask;
                v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
                v_expand(v_src0, v_src00, v_src01);
                v_expand(v_src1, v_src10, v_src11);
                v_expand(v_src2, v_src20, v_src21);

                v_uint32 v_src000, v_src001, v_src010, v_src011;
                v_uint32 v_src100, v_src101, v_src110, v_src111;
                v_uint32 v_src200, v_src201, v_src210, v_src211;
                v_expand(v_src00, v_src000, v_src001);
                v_expand(v_src01, v_src010, v_src011);
                v_expand(v_src10, v_src100, v_src101);
                v_expand(v_src11, v_src110, v_src111);
                v_expand(v_src20, v_src200, v_src201);
                v_expand(v_src21, v_src210, v_src211);

                // Convert via f32 (exact for 8-bit values), then split each
                // f32 vector into low/high f64 halves.
                v_float64 v_src0000, v_src0001, v_src0010, v_src0011, v_src0100, v_src0101, v_src0110, v_src0111;
                v_float64 v_src1000, v_src1001, v_src1010, v_src1011, v_src1100, v_src1101, v_src1110, v_src1111;
                v_float64 v_src2000, v_src2001, v_src2010, v_src2011, v_src2100, v_src2101, v_src2110, v_src2111;
                v_src0000 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src000)));
                v_src0001 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src000)));
                v_src0010 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src001)));
                v_src0011 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src001)));
                v_src0100 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src010)));
                v_src0101 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src010)));
                v_src0110 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src011)));
                v_src0111 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src011)));
                v_src1000 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src100)));
                v_src1001 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src100)));
                v_src1010 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src101)));
                v_src1011 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src101)));
                v_src1100 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src110)));
                v_src1101 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src110)));
                v_src1110 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src111)));
                v_src1111 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src111)));
                v_src2000 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src200)));
                v_src2001 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src200)));
                v_src2010 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src201)));
                v_src2011 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src201)));
                v_src2100 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src210)));
                v_src2101 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src210)));
                v_src2110 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src211)));
                v_src2111 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src211)));

                v_float64 v_dst0000, v_dst0001, v_dst0010, v_dst0011, v_dst0100, v_dst0101, v_dst0110, v_dst0111;
                v_float64 v_dst1000, v_dst1001, v_dst1010, v_dst1011, v_dst1100, v_dst1101, v_dst1110, v_dst1111;
                v_float64 v_dst2000, v_dst2001, v_dst2010, v_dst2011, v_dst2100, v_dst2101, v_dst2110, v_dst2111;
                v_load_deinterleave(dst + (x * cn), v_dst0000, v_dst1000, v_dst2000);
                v_load_deinterleave(dst + ((x + step) * cn), v_dst0001, v_dst1001, v_dst2001);
                v_load_deinterleave(dst + ((x + step * 2) * cn), v_dst0010, v_dst1010, v_dst2010);
                v_load_deinterleave(dst + ((x + step * 3) * cn), v_dst0011, v_dst1011, v_dst2011);
                v_load_deinterleave(dst + ((x + step * 4) * cn), v_dst0100, v_dst1100, v_dst2100);
                v_load_deinterleave(dst + ((x + step * 5) * cn), v_dst0101, v_dst1101, v_dst2101);
                v_load_deinterleave(dst + ((x + step * 6) * cn), v_dst0110, v_dst1110, v_dst2110);
                v_load_deinterleave(dst + ((x + step * 7) * cn), v_dst0111, v_dst1111, v_dst2111);

                v_store_interleave(dst + (x * cn), v_dst0000 + v_src0000, v_dst1000 + v_src1000, v_dst2000 + v_src2000);
                v_store_interleave(dst + ((x + step) * cn), v_dst0001 + v_src0001, v_dst1001 + v_src1001, v_dst2001 + v_src2001);
                v_store_interleave(dst + ((x + step * 2) * cn), v_dst0010 + v_src0010, v_dst1010 + v_src1010, v_dst2010 + v_src2010);
                v_store_interleave(dst + ((x + step * 3) * cn), v_dst0011 + v_src0011, v_dst1011 + v_src1011, v_dst2011 + v_src2011);
                v_store_interleave(dst + ((x + step * 4) * cn), v_dst0100 + v_src0100, v_dst1100 + v_src1100, v_dst2100 + v_src2100);
                v_store_interleave(dst + ((x + step * 5) * cn), v_dst0101 + v_src0101, v_dst1101 + v_src1101, v_dst2101 + v_src2101);
                v_store_interleave(dst + ((x + step * 6) * cn), v_dst0110 + v_src0110, v_dst1110 + v_src1110, v_dst2110 + v_src2110);
                v_store_interleave(dst + ((x + step * 7) * cn), v_dst0111 + v_src0111, v_dst1111 + v_src1111, v_dst2111 + v_src2111);
            }
        }
    }
#endif // CV_SIMD_64F
    // Scalar tail / unsupported channel counts.
    acc_general_(src, dst, mask, len, cn, x);
}
743
744
void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn)
745
{
746
int x = 0;
747
#if CV_SIMD_64F
748
const int cVectorWidth = v_uint16::nlanes;
749
const int step = v_float64::nlanes;
750
751
if (!mask)
752
{
753
int size = len * cn;
754
for (; x <= size - cVectorWidth; x += cVectorWidth)
755
{
756
v_uint16 v_src = vx_load(src + x);
757
v_uint32 v_int0, v_int1;
758
v_expand(v_src, v_int0, v_int1);
759
760
v_float64 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0));
761
v_float64 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0));
762
v_float64 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1));
763
v_float64 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1));
764
765
v_float64 v_dst0 = vx_load(dst + x);
766
v_float64 v_dst1 = vx_load(dst + x + step);
767
v_float64 v_dst2 = vx_load(dst + x + step * 2);
768
v_float64 v_dst3 = vx_load(dst + x + step * 3);
769
770
v_dst0 = v_dst0 + v_src0;
771
v_dst1 = v_dst1 + v_src1;
772
v_dst2 = v_dst2 + v_src2;
773
v_dst3 = v_dst3 + v_src3;
774
775
v_store(dst + x, v_dst0);
776
v_store(dst + x + step, v_dst1);
777
v_store(dst + x + step * 2, v_dst2);
778
v_store(dst + x + step * 3, v_dst3);
779
}
780
}
781
else
782
{
783
v_uint16 v_0 = vx_setzero_u16();
784
if (cn == 1)
785
{
786
for ( ; x <= len - cVectorWidth; x += cVectorWidth)
787
{
788
v_uint16 v_mask = vx_load_expand(mask + x);
789
v_mask = ~(v_mask == v_0);
790
v_uint16 v_src = vx_load(src + x);
791
v_src = v_src & v_mask;
792
v_uint32 v_int0, v_int1;
793
v_expand(v_src, v_int0, v_int1);
794
795
v_float64 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0));
796
v_float64 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0));
797
v_float64 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1));
798
v_float64 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1));
799
800
v_float64 v_dst0 = vx_load(dst + x);
801
v_float64 v_dst1 = vx_load(dst + x + step);
802
v_float64 v_dst2 = vx_load(dst + x + step * 2);
803
v_float64 v_dst3 = vx_load(dst + x + step * 3);
804
805
v_dst0 = v_dst0 + v_src0;
806
v_dst1 = v_dst1 + v_src1;
807
v_dst2 = v_dst2 + v_src2;
808
v_dst3 = v_dst3 + v_src3;
809
810
v_store(dst + x, v_dst0);
811
v_store(dst + x + step, v_dst1);
812
v_store(dst + x + step * 2, v_dst2);
813
v_store(dst + x + step * 3, v_dst3);
814
}
815
}
816
if (cn == 3)
817
{
818
for ( ; x <= len - cVectorWidth; x += cVectorWidth)
819
{
820
v_uint16 v_mask = vx_load_expand(mask + x);
821
v_mask = ~(v_mask == v_0);
822
v_uint16 v_src0, v_src1, v_src2;
823
v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
824
v_src0 = v_src0 & v_mask;
825
v_src1 = v_src1 & v_mask;
826
v_src2 = v_src2 & v_mask;
827
v_uint32 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21;
828
v_expand(v_src0, v_int00, v_int01);
829
v_expand(v_src1, v_int10, v_int11);
830
v_expand(v_src2, v_int20, v_int21);
831
832
v_float64 v_src00 = v_cvt_f64(v_reinterpret_as_s32(v_int00));
833
v_float64 v_src01 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00));
834
v_float64 v_src02 = v_cvt_f64(v_reinterpret_as_s32(v_int01));
835
v_float64 v_src03 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01));
836
v_float64 v_src10 = v_cvt_f64(v_reinterpret_as_s32(v_int10));
837
v_float64 v_src11 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10));
838
v_float64 v_src12 = v_cvt_f64(v_reinterpret_as_s32(v_int11));
839
v_float64 v_src13 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11));
840
v_float64 v_src20 = v_cvt_f64(v_reinterpret_as_s32(v_int20));
841
v_float64 v_src21 = v_cvt_f64_high(v_reinterpret_as_s32(v_int20));
842
v_float64 v_src22 = v_cvt_f64(v_reinterpret_as_s32(v_int21));
843
v_float64 v_src23 = v_cvt_f64_high(v_reinterpret_as_s32(v_int21));
844
845
v_float64 v_dst00, v_dst01, v_dst02, v_dst03, v_dst10, v_dst11, v_dst12, v_dst13, v_dst20, v_dst21, v_dst22, v_dst23;
846
v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
847
v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
848
v_load_deinterleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22);
849
v_load_deinterleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23);
850
851
v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
852
v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
853
v_store_interleave(dst + (x + step * 2) * cn, v_dst02 + v_src02, v_dst12 + v_src12, v_dst22 + v_src22);
854
v_store_interleave(dst + (x + step * 3) * cn, v_dst03 + v_src03, v_dst13 + v_src13, v_dst23 + v_src23);
855
}
856
}
857
}
858
#endif // CV_SIMD_64F
859
acc_general_(src, dst, mask, len, cn, x);
860
}
861
862
void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn)
863
{
864
int x = 0;
865
#if CV_SIMD_64F
866
const int cVectorWidth = v_float32::nlanes;
867
const int step = v_float64::nlanes;
868
869
if (!mask)
870
{
871
int size = len * cn;
872
#if CV_AVX && !CV_AVX2
873
for (; x <= size - 8 ; x += 8)
874
{
875
__m256 v_src = _mm256_loadu_ps(src + x);
876
__m256d v_src0 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src, 0));
877
__m256d v_src1 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src, 1));
878
__m256d v_dst0 = _mm256_loadu_pd(dst + x);
879
__m256d v_dst1 = _mm256_loadu_pd(dst + x + 4);
880
v_dst0 = _mm256_add_pd(v_src0, v_dst0);
881
v_dst1 = _mm256_add_pd(v_src1, v_dst1);
882
_mm256_storeu_pd(dst + x, v_dst0);
883
_mm256_storeu_pd(dst + x + 4, v_dst1);
884
}
885
#else
886
for (; x <= size - cVectorWidth; x += cVectorWidth)
887
{
888
v_float32 v_src = vx_load(src + x);
889
v_float64 v_src0 = v_cvt_f64(v_src);
890
v_float64 v_src1 = v_cvt_f64_high(v_src);
891
892
v_store(dst + x, vx_load(dst + x) + v_src0);
893
v_store(dst + x + step, vx_load(dst + x + step) + v_src1);
894
}
895
#endif // CV_AVX && !CV_AVX2
896
}
897
else
898
{
899
v_uint64 v_0 = vx_setzero_u64();
900
if (cn == 1)
901
{
902
for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
903
{
904
v_uint32 v_masku32 = vx_load_expand_q(mask + x);
905
v_uint64 v_masku640, v_masku641;
906
v_expand(v_masku32, v_masku640, v_masku641);
907
v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
908
v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
909
910
v_float32 v_src = vx_load(src + x);
911
v_float64 v_src0 = v_cvt_f64(v_src) & v_mask0;
912
v_float64 v_src1 = v_cvt_f64_high(v_src) & v_mask1;
913
914
v_store(dst + x, vx_load(dst + x) + v_src0);
915
v_store(dst + x + step, vx_load(dst + x + step) + v_src1);
916
}
917
}
918
else if (cn == 3)
919
{
920
for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
921
{
922
v_uint32 v_masku32 = vx_load_expand_q(mask + x);
923
v_uint64 v_masku640, v_masku641;
924
v_expand(v_masku32, v_masku640, v_masku641);
925
v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
926
v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
927
928
v_float32 v_src0, v_src1, v_src2;
929
v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
930
v_float64 v_src00 = v_cvt_f64(v_src0) & v_mask0;
931
v_float64 v_src01 = v_cvt_f64_high(v_src0) & v_mask1;
932
v_float64 v_src10 = v_cvt_f64(v_src1) & v_mask0;
933
v_float64 v_src11 = v_cvt_f64_high(v_src1) & v_mask1;
934
v_float64 v_src20 = v_cvt_f64(v_src2) & v_mask0;
935
v_float64 v_src21 = v_cvt_f64_high(v_src2) & v_mask1;
936
937
v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
938
v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
939
v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
940
941
v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
942
v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
943
}
944
}
945
}
946
#endif // CV_SIMD_64F
947
acc_general_(src, dst, mask, len, cn, x);
948
}
949
950
void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int cn)
951
{
952
int x = 0;
953
#if CV_SIMD_64F
954
const int cVectorWidth = v_float64::nlanes * 2;
955
const int step = v_float64::nlanes;
956
957
if (!mask)
958
{
959
int size = len * cn;
960
#if CV_AVX && !CV_AVX2
961
for ( ; x <= size - 4 ; x += 4)
962
{
963
__m256d v_src = _mm256_loadu_pd(src + x);
964
__m256d v_dst = _mm256_loadu_pd(dst + x);
965
v_dst = _mm256_add_pd(v_dst, v_src);
966
_mm256_storeu_pd(dst + x, v_dst);
967
}
968
#else
969
for (; x <= size - cVectorWidth; x += cVectorWidth)
970
{
971
v_float64 v_src0 = vx_load(src + x);
972
v_float64 v_src1 = vx_load(src + x + step);
973
974
v_store(dst + x, vx_load(dst + x) + v_src0);
975
v_store(dst + x + step, vx_load(dst + x + step) + v_src1);
976
}
977
#endif // CV_AVX && !CV_AVX2
978
}
979
else
980
{
981
v_uint64 v_0 = vx_setzero_u64();
982
if (cn == 1)
983
{
984
for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
985
{
986
v_uint32 v_masku32 = vx_load_expand_q(mask + x);
987
v_uint64 v_masku640, v_masku641;
988
v_expand(v_masku32, v_masku640, v_masku641);
989
v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
990
v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
991
992
v_float64 v_src0 = vx_load(src + x);
993
v_float64 v_src1 = vx_load(src + x + step);
994
995
v_store(dst + x, vx_load(dst + x) + (v_src0 & v_mask0));
996
v_store(dst + x + step, vx_load(dst + x + step) + (v_src1 & v_mask1));
997
}
998
}
999
else if (cn == 3)
1000
{
1001
for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
1002
{
1003
v_uint32 v_masku32 = vx_load_expand_q(mask + x);
1004
v_uint64 v_masku640, v_masku641;
1005
v_expand(v_masku32, v_masku640, v_masku641);
1006
v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
1007
v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
1008
1009
v_float64 v_src00, v_src10, v_src20, v_src01, v_src11, v_src21;
1010
v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20);
1011
v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21);
1012
v_src00 = v_src00 & v_mask0;
1013
v_src01 = v_src01 & v_mask1;
1014
v_src10 = v_src10 & v_mask0;
1015
v_src11 = v_src11 & v_mask1;
1016
v_src20 = v_src20 & v_mask0;
1017
v_src21 = v_src21 & v_mask1;
1018
1019
v_float64 v_dst00, v_dst10, v_dst20, v_dst01, v_dst11, v_dst21;
1020
v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
1021
v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
1022
1023
v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
1024
v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
1025
}
1026
}
1027
}
1028
#endif // CV_SIMD_64F
1029
acc_general_(src, dst, mask, len, cn, x);
1030
}
1031
1032
// square accumulate optimized by universal intrinsic
1033
void accSqr_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn)
1034
{
1035
int x = 0;
1036
#if CV_SIMD
1037
const int cVectorWidth = v_uint8::nlanes;
1038
const int step = v_float32::nlanes;
1039
1040
if (!mask)
1041
{
1042
int size = len * cn;
1043
for (; x <= size - cVectorWidth; x += cVectorWidth)
1044
{
1045
v_uint8 v_src = vx_load(src + x);
1046
v_uint16 v_src0, v_src1;
1047
v_expand(v_src, v_src0, v_src1);
1048
v_src0 = v_mul_wrap(v_src0, v_src0);
1049
v_src1 = v_mul_wrap(v_src1, v_src1);
1050
1051
v_uint32 v_src00, v_src01, v_src10, v_src11;
1052
v_expand(v_src0, v_src00, v_src01);
1053
v_expand(v_src1, v_src10, v_src11);
1054
1055
v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
1056
v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
1057
v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
1058
v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
1059
}
1060
}
1061
else
1062
{
1063
v_uint8 v_0 = vx_setall_u8(0);
1064
if (cn == 1)
1065
{
1066
for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
1067
{
1068
v_uint8 v_mask = vx_load(mask + x);
1069
v_mask = ~(v_0 == v_mask);
1070
v_uint8 v_src = vx_load(src + x);
1071
v_src = v_src & v_mask;
1072
v_uint16 v_src0, v_src1;
1073
v_expand(v_src, v_src0, v_src1);
1074
v_src0 = v_mul_wrap(v_src0, v_src0);
1075
v_src1 = v_mul_wrap(v_src1, v_src1);
1076
1077
v_uint32 v_src00, v_src01, v_src10, v_src11;
1078
v_expand(v_src0, v_src00, v_src01);
1079
v_expand(v_src1, v_src10, v_src11);
1080
1081
v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
1082
v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
1083
v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
1084
v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
1085
}
1086
}
1087
else if (cn == 3)
1088
{
1089
for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
1090
{
1091
v_uint8 v_mask = vx_load(mask + x);
1092
v_mask = ~(v_0 == v_mask);
1093
1094
v_uint8 v_src0, v_src1, v_src2;
1095
v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
1096
v_src0 = v_src0 & v_mask;
1097
v_src1 = v_src1 & v_mask;
1098
v_src2 = v_src2 & v_mask;
1099
1100
v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
1101
v_expand(v_src0, v_src00, v_src01);
1102
v_expand(v_src1, v_src10, v_src11);
1103
v_expand(v_src2, v_src20, v_src21);
1104
v_src00 = v_mul_wrap(v_src00, v_src00);
1105
v_src01 = v_mul_wrap(v_src01, v_src01);
1106
v_src10 = v_mul_wrap(v_src10, v_src10);
1107
v_src11 = v_mul_wrap(v_src11, v_src11);
1108
v_src20 = v_mul_wrap(v_src20, v_src20);
1109
v_src21 = v_mul_wrap(v_src21, v_src21);
1110
1111
v_uint32 v_src000, v_src001, v_src010, v_src011;
1112
v_uint32 v_src100, v_src101, v_src110, v_src111;
1113
v_uint32 v_src200, v_src201, v_src210, v_src211;
1114
v_expand(v_src00, v_src000, v_src001);
1115
v_expand(v_src01, v_src010, v_src011);
1116
v_expand(v_src10, v_src100, v_src101);
1117
v_expand(v_src11, v_src110, v_src111);
1118
v_expand(v_src20, v_src200, v_src201);
1119
v_expand(v_src21, v_src210, v_src211);
1120
1121
v_float32 v_dst000, v_dst001, v_dst010, v_dst011;
1122
v_float32 v_dst100, v_dst101, v_dst110, v_dst111;
1123
v_float32 v_dst200, v_dst201, v_dst210, v_dst211;
1124
v_load_deinterleave(dst + x * cn, v_dst000, v_dst100, v_dst200);
1125
v_load_deinterleave(dst + (x + step) * cn, v_dst001, v_dst101, v_dst201);
1126
v_load_deinterleave(dst + (x + step * 2) * cn, v_dst010, v_dst110, v_dst210);
1127
v_load_deinterleave(dst + (x + step * 3) * cn, v_dst011, v_dst111, v_dst211);
1128
1129
v_dst000 += v_cvt_f32(v_reinterpret_as_s32(v_src000));
1130
v_dst001 += v_cvt_f32(v_reinterpret_as_s32(v_src001));
1131
v_dst010 += v_cvt_f32(v_reinterpret_as_s32(v_src010));
1132
v_dst011 += v_cvt_f32(v_reinterpret_as_s32(v_src011));
1133
1134
v_dst100 += v_cvt_f32(v_reinterpret_as_s32(v_src100));
1135
v_dst101 += v_cvt_f32(v_reinterpret_as_s32(v_src101));
1136
v_dst110 += v_cvt_f32(v_reinterpret_as_s32(v_src110));
1137
v_dst111 += v_cvt_f32(v_reinterpret_as_s32(v_src111));
1138
1139
v_dst200 += v_cvt_f32(v_reinterpret_as_s32(v_src200));
1140
v_dst201 += v_cvt_f32(v_reinterpret_as_s32(v_src201));
1141
v_dst210 += v_cvt_f32(v_reinterpret_as_s32(v_src210));
1142
v_dst211 += v_cvt_f32(v_reinterpret_as_s32(v_src211));
1143
1144
v_store_interleave(dst + x * cn, v_dst000, v_dst100, v_dst200);
1145
v_store_interleave(dst + (x + step) * cn, v_dst001, v_dst101, v_dst201);
1146
v_store_interleave(dst + (x + step * 2) * cn, v_dst010, v_dst110, v_dst210);
1147
v_store_interleave(dst + (x + step * 3) * cn, v_dst011, v_dst111, v_dst211);
1148
}
1149
}
1150
}
1151
#endif // CV_SIMD
1152
accSqr_general_(src, dst, mask, len, cn, x);
1153
}
1154
1155
void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn)
1156
{
1157
int x = 0;
1158
#if CV_SIMD
1159
const int cVectorWidth = v_uint16::nlanes;
1160
const int step = v_float32::nlanes;
1161
1162
if (!mask)
1163
{
1164
int size = len * cn;
1165
for (; x <= size - cVectorWidth; x += cVectorWidth)
1166
{
1167
v_uint16 v_src = vx_load(src + x);
1168
v_uint32 v_src0, v_src1;
1169
v_expand(v_src, v_src0, v_src1);
1170
1171
v_float32 v_float0, v_float1;
1172
v_float0 = v_cvt_f32(v_reinterpret_as_s32(v_src0));
1173
v_float1 = v_cvt_f32(v_reinterpret_as_s32(v_src1));
1174
1175
v_store(dst + x, v_fma(v_float0, v_float0, vx_load(dst + x)));
1176
v_store(dst + x + step, v_fma(v_float1, v_float1, vx_load(dst + x + step)));
1177
}
1178
}
1179
else
1180
{
1181
v_uint32 v_0 = vx_setzero_u32();
1182
if (cn == 1)
1183
{
1184
for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
1185
{
1186
v_uint16 v_mask16 = vx_load_expand(mask + x);
1187
v_uint32 v_mask0, v_mask1;
1188
v_expand(v_mask16, v_mask0, v_mask1);
1189
v_mask0 = ~(v_mask0 == v_0);
1190
v_mask1 = ~(v_mask1 == v_0);
1191
v_uint16 v_src = vx_load(src + x);
1192
v_uint32 v_src0, v_src1;
1193
v_expand(v_src, v_src0, v_src1);
1194
v_src0 = v_src0 & v_mask0;
1195
v_src1 = v_src1 & v_mask1;
1196
1197
v_float32 v_float0, v_float1;
1198
v_float0 = v_cvt_f32(v_reinterpret_as_s32(v_src0));
1199
v_float1 = v_cvt_f32(v_reinterpret_as_s32(v_src1));
1200
1201
v_store(dst + x, v_fma(v_float0, v_float0, vx_load(dst + x)));
1202
v_store(dst + x + step, v_fma(v_float1, v_float1, vx_load(dst + x + step)));
1203
}
1204
}
1205
else if (cn == 3)
1206
{
1207
for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
1208
{
1209
v_uint16 v_mask16 = vx_load_expand(mask + x);
1210
v_uint32 v_mask0, v_mask1;
1211
v_expand(v_mask16, v_mask0, v_mask1);
1212
v_mask0 = ~(v_mask0 == v_0);
1213
v_mask1 = ~(v_mask1 == v_0);
1214
1215
v_uint16 v_src0, v_src1, v_src2;
1216
v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
1217
v_uint32 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21;
1218
v_expand(v_src0, v_int00, v_int01);
1219
v_expand(v_src1, v_int10, v_int11);
1220
v_expand(v_src2, v_int20, v_int21);
1221
v_int00 = v_int00 & v_mask0;
1222
v_int01 = v_int01 & v_mask1;
1223
v_int10 = v_int10 & v_mask0;
1224
v_int11 = v_int11 & v_mask1;
1225
v_int20 = v_int20 & v_mask0;
1226
v_int21 = v_int21 & v_mask1;
1227
1228
v_float32 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
1229
v_src00 = v_cvt_f32(v_reinterpret_as_s32(v_int00));
1230
v_src01 = v_cvt_f32(v_reinterpret_as_s32(v_int01));
1231
v_src10 = v_cvt_f32(v_reinterpret_as_s32(v_int10));
1232
v_src11 = v_cvt_f32(v_reinterpret_as_s32(v_int11));
1233
v_src20 = v_cvt_f32(v_reinterpret_as_s32(v_int20));
1234
v_src21 = v_cvt_f32(v_reinterpret_as_s32(v_int21));
1235
1236
v_float32 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
1237
v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
1238
v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
1239
1240
v_dst00 = v_fma(v_src00, v_src00, v_dst00);
1241
v_dst01 = v_fma(v_src01, v_src01, v_dst01);
1242
v_dst10 = v_fma(v_src10, v_src10, v_dst10);
1243
v_dst11 = v_fma(v_src11, v_src11, v_dst11);
1244
v_dst20 = v_fma(v_src20, v_src20, v_dst20);
1245
v_dst21 = v_fma(v_src21, v_src21, v_dst21);
1246
1247
v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
1248
v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
1249
}
1250
}
1251
}
1252
#endif // CV_SIMD
1253
accSqr_general_(src, dst, mask, len, cn, x);
1254
}
1255
1256
void accSqr_simd_(const float* src, float* dst, const uchar* mask, int len, int cn)
1257
{
1258
int x = 0;
1259
#if CV_SIMD
1260
const int cVectorWidth = v_uint16::nlanes;
1261
const int step = v_float32::nlanes;
1262
1263
if (!mask)
1264
{
1265
int size = len * cn;
1266
#if CV_AVX && !CV_AVX2
1267
for ( ; x <= size - 8 ; x += 8)
1268
{
1269
__m256 v_src = _mm256_loadu_ps(src + x);
1270
__m256 v_dst = _mm256_loadu_ps(dst + x);
1271
v_src = _mm256_mul_ps(v_src, v_src);
1272
v_dst = _mm256_add_ps(v_src, v_dst);
1273
_mm256_storeu_ps(dst + x, v_dst);
1274
}
1275
#else
1276
for (; x <= size - cVectorWidth; x += cVectorWidth)
1277
{
1278
v_float32 v_src0 = vx_load(src + x);
1279
v_float32 v_src1 = vx_load(src + x + step);
1280
1281
v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x)));
1282
v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step)));
1283
}
1284
#endif // CV_AVX && !CV_AVX2
1285
}
1286
else
1287
{
1288
v_uint32 v_0 = vx_setzero_u32();
1289
if (cn == 1)
1290
{
1291
for (; x <= len - cVectorWidth; x += cVectorWidth)
1292
{
1293
v_uint16 v_mask16 = vx_load_expand(mask + x);
1294
v_uint32 v_mask_0, v_mask_1;
1295
v_expand(v_mask16, v_mask_0, v_mask_1);
1296
v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask_0 == v_0));
1297
v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask_1 == v_0));
1298
v_float32 v_src0 = vx_load(src + x);
1299
v_float32 v_src1 = vx_load(src + x + step);
1300
v_src0 = v_src0 & v_mask0;
1301
v_src1 = v_src1 & v_mask1;
1302
1303
v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x)));
1304
v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step)));
1305
}
1306
}
1307
else if (cn == 3)
1308
{
1309
for (; x <= len - cVectorWidth; x += cVectorWidth)
1310
{
1311
v_uint16 v_mask16 = vx_load_expand(mask + x);
1312
v_uint32 v_mask_0, v_mask_1;
1313
v_expand(v_mask16, v_mask_0, v_mask_1);
1314
v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask_0 == v_0));
1315
v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask_1 == v_0));
1316
1317
v_float32 v_src00, v_src10, v_src20, v_src01, v_src11, v_src21;
1318
v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20);
1319
v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21);
1320
v_src00 = v_src00 & v_mask0;
1321
v_src01 = v_src01 & v_mask1;
1322
v_src10 = v_src10 & v_mask0;
1323
v_src11 = v_src11 & v_mask1;
1324
v_src20 = v_src20 & v_mask0;
1325
v_src21 = v_src21 & v_mask1;
1326
1327
v_float32 v_dst00, v_dst10, v_dst20, v_dst01, v_dst11, v_dst21;
1328
v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
1329
v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
1330
1331
v_dst00 = v_fma(v_src00, v_src00, v_dst00);
1332
v_dst01 = v_fma(v_src01, v_src01, v_dst01);
1333
v_dst10 = v_fma(v_src10, v_src10, v_dst10);
1334
v_dst11 = v_fma(v_src11, v_src11, v_dst11);
1335
v_dst20 = v_fma(v_src20, v_src20, v_dst20);
1336
v_dst21 = v_fma(v_src21, v_src21, v_dst21);
1337
1338
v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
1339
v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
1340
}
1341
}
1342
}
1343
#endif // CV_SIMD
1344
accSqr_general_(src, dst, mask, len, cn, x);
1345
}
1346
1347
void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn)
1348
{
1349
int x = 0;
1350
#if CV_SIMD_64F
1351
const int cVectorWidth = v_uint16::nlanes;
1352
const int step = v_float64::nlanes;
1353
1354
if (!mask)
1355
{
1356
int size = len * cn;
1357
for (; x <= size - cVectorWidth; x += cVectorWidth)
1358
{
1359
v_uint16 v_int = vx_load_expand(src + x);
1360
1361
v_uint32 v_int0, v_int1;
1362
v_expand(v_int, v_int0, v_int1);
1363
1364
v_float64 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0));
1365
v_float64 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0));
1366
v_float64 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1));
1367
v_float64 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1));
1368
1369
v_float64 v_dst0 = vx_load(dst + x);
1370
v_float64 v_dst1 = vx_load(dst + x + step);
1371
v_float64 v_dst2 = vx_load(dst + x + step * 2);
1372
v_float64 v_dst3 = vx_load(dst + x + step * 3);
1373
1374
v_dst0 = v_fma(v_src0, v_src0, v_dst0);
1375
v_dst1 = v_fma(v_src1, v_src1, v_dst1);
1376
v_dst2 = v_fma(v_src2, v_src2, v_dst2);
1377
v_dst3 = v_fma(v_src3, v_src3, v_dst3);
1378
1379
v_store(dst + x, v_dst0);
1380
v_store(dst + x + step, v_dst1);
1381
v_store(dst + x + step * 2, v_dst2);
1382
v_store(dst + x + step * 3, v_dst3);
1383
}
1384
}
1385
else
1386
{
1387
v_uint16 v_0 = vx_setzero_u16();
1388
if (cn == 1)
1389
{
1390
for (; x <= len - cVectorWidth; x += cVectorWidth)
1391
{
1392
v_uint16 v_mask = vx_load_expand(mask + x);
1393
v_mask = ~(v_mask == v_0);
1394
v_uint16 v_src = vx_load_expand(src + x);
1395
v_uint16 v_int = v_src & v_mask;
1396
1397
v_uint32 v_int0, v_int1;
1398
v_expand(v_int, v_int0, v_int1);
1399
1400
v_float64 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0));
1401
v_float64 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0));
1402
v_float64 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1));
1403
v_float64 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1));
1404
1405
v_float64 v_dst0 = vx_load(dst + x);
1406
v_float64 v_dst1 = vx_load(dst + x + step);
1407
v_float64 v_dst2 = vx_load(dst + x + step * 2);
1408
v_float64 v_dst3 = vx_load(dst + x + step * 3);
1409
1410
v_dst0 = v_fma(v_src0, v_src0, v_dst0);
1411
v_dst1 = v_fma(v_src1, v_src1, v_dst1);
1412
v_dst2 = v_fma(v_src2, v_src2, v_dst2);
1413
v_dst3 = v_fma(v_src3, v_src3, v_dst3);
1414
1415
v_store(dst + x, v_dst0);
1416
v_store(dst + x + step, v_dst1);
1417
v_store(dst + x + step * 2, v_dst2);
1418
v_store(dst + x + step * 3, v_dst3);
1419
}
1420
}
1421
else if (cn == 3)
1422
{
1423
for (; x <= len - cVectorWidth * 2; x += cVectorWidth)
1424
{
1425
v_uint8 v_src0, v_src1, v_src2;
1426
v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
1427
1428
v_uint16 v_int0 = v_expand_low(v_src0);
1429
v_uint16 v_int1 = v_expand_low(v_src1);
1430
v_uint16 v_int2 = v_expand_low(v_src2);
1431
1432
v_uint16 v_mask = vx_load_expand(mask + x);
1433
v_mask = ~(v_mask == v_0);
1434
v_int0 = v_int0 & v_mask;
1435
v_int1 = v_int1 & v_mask;
1436
v_int2 = v_int2 & v_mask;
1437
1438
v_uint32 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21;
1439
v_expand(v_int0, v_int00, v_int01);
1440
v_expand(v_int1, v_int10, v_int11);
1441
v_expand(v_int2, v_int20, v_int21);
1442
1443
v_float64 v_src00 = v_cvt_f64(v_reinterpret_as_s32(v_int00));
1444
v_float64 v_src01 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00));
1445
v_float64 v_src02 = v_cvt_f64(v_reinterpret_as_s32(v_int01));
1446
v_float64 v_src03 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01));
1447
v_float64 v_src10 = v_cvt_f64(v_reinterpret_as_s32(v_int10));
1448
v_float64 v_src11 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10));
1449
v_float64 v_src12 = v_cvt_f64(v_reinterpret_as_s32(v_int11));
1450
v_float64 v_src13 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11));
1451
v_float64 v_src20 = v_cvt_f64(v_reinterpret_as_s32(v_int20));
1452
v_float64 v_src21 = v_cvt_f64_high(v_reinterpret_as_s32(v_int20));
1453
v_float64 v_src22 = v_cvt_f64(v_reinterpret_as_s32(v_int21));
1454
v_float64 v_src23 = v_cvt_f64_high(v_reinterpret_as_s32(v_int21));
1455
1456
v_float64 v_dst00, v_dst01, v_dst02, v_dst03, v_dst10, v_dst11, v_dst12, v_dst13, v_dst20, v_dst21, v_dst22, v_dst23;
1457
v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
1458
v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
1459
v_load_deinterleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22);
1460
v_load_deinterleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23);
1461
1462
v_dst00 = v_fma(v_src00, v_src00, v_dst00);
1463
v_dst01 = v_fma(v_src01, v_src01, v_dst01);
1464
v_dst02 = v_fma(v_src02, v_src02, v_dst02);
1465
v_dst03 = v_fma(v_src03, v_src03, v_dst03);
1466
v_dst10 = v_fma(v_src10, v_src10, v_dst10);
1467
v_dst11 = v_fma(v_src11, v_src11, v_dst11);
1468
v_dst12 = v_fma(v_src12, v_src12, v_dst12);
1469
v_dst13 = v_fma(v_src13, v_src13, v_dst13);
1470
v_dst20 = v_fma(v_src20, v_src20, v_dst20);
1471
v_dst21 = v_fma(v_src21, v_src21, v_dst21);
1472
v_dst22 = v_fma(v_src22, v_src22, v_dst22);
1473
v_dst23 = v_fma(v_src23, v_src23, v_dst23);
1474
1475
v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
1476
v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
1477
v_store_interleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22);
1478
v_store_interleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23);
1479
}
1480
}
1481
}
1482
#endif // CV_SIMD_64F
1483
accSqr_general_(src, dst, mask, len, cn, x);
1484
}
1485
1486
void accSqr_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn)
1487
{
1488
int x = 0;
1489
#if CV_SIMD_64F
1490
const int cVectorWidth = v_uint16::nlanes;
1491
const int step = v_float64::nlanes;
1492
1493
if (!mask)
1494
{
1495
int size = len * cn;
1496
for (; x <= size - cVectorWidth; x += cVectorWidth)
1497
{
1498
v_uint16 v_src = vx_load(src + x);
1499
v_uint32 v_int_0, v_int_1;
1500
v_expand(v_src, v_int_0, v_int_1);
1501
1502
v_int32 v_int0 = v_reinterpret_as_s32(v_int_0);
1503
v_int32 v_int1 = v_reinterpret_as_s32(v_int_1);
1504
1505
v_float64 v_src0 = v_cvt_f64(v_int0);
1506
v_float64 v_src1 = v_cvt_f64_high(v_int0);
1507
v_float64 v_src2 = v_cvt_f64(v_int1);
1508
v_float64 v_src3 = v_cvt_f64_high(v_int1);
1509
1510
v_float64 v_dst0 = vx_load(dst + x);
1511
v_float64 v_dst1 = vx_load(dst + x + step);
1512
v_float64 v_dst2 = vx_load(dst + x + step * 2);
1513
v_float64 v_dst3 = vx_load(dst + x + step * 3);
1514
1515
v_dst0 = v_fma(v_src0, v_src0, v_dst0);
1516
v_dst1 = v_fma(v_src1, v_src1, v_dst1);
1517
v_dst2 = v_fma(v_src2, v_src2, v_dst2);
1518
v_dst3 = v_fma(v_src3, v_src3, v_dst3);
1519
1520
v_store(dst + x, v_dst0);
1521
v_store(dst + x + step, v_dst1);
1522
v_store(dst + x + step * 2, v_dst2);
1523
v_store(dst + x + step * 3, v_dst3);
1524
}
1525
}
1526
else
1527
{
1528
v_uint16 v_0 = vx_setzero_u16();
1529
if (cn == 1)
1530
{
1531
for (; x <= len - cVectorWidth; x += cVectorWidth)
1532
{
1533
v_uint16 v_mask = vx_load_expand(mask + x);
1534
v_mask = ~(v_mask == v_0);
1535
v_uint16 v_src = vx_load(src + x);
1536
v_src = v_src & v_mask;
1537
v_uint32 v_int_0, v_int_1;
1538
v_expand(v_src, v_int_0, v_int_1);
1539
1540
v_int32 v_int0 = v_reinterpret_as_s32(v_int_0);
1541
v_int32 v_int1 = v_reinterpret_as_s32(v_int_1);
1542
1543
v_float64 v_src0 = v_cvt_f64(v_int0);
1544
v_float64 v_src1 = v_cvt_f64_high(v_int0);
1545
v_float64 v_src2 = v_cvt_f64(v_int1);
1546
v_float64 v_src3 = v_cvt_f64_high(v_int1);
1547
1548
v_float64 v_dst0 = vx_load(dst + x);
1549
v_float64 v_dst1 = vx_load(dst + x + step);
1550
v_float64 v_dst2 = vx_load(dst + x + step * 2);
1551
v_float64 v_dst3 = vx_load(dst + x + step * 3);
1552
1553
v_dst0 = v_fma(v_src0, v_src0, v_dst0);
1554
v_dst1 = v_fma(v_src1, v_src1, v_dst1);
1555
v_dst2 = v_fma(v_src2, v_src2, v_dst2);
1556
v_dst3 = v_fma(v_src3, v_src3, v_dst3);
1557
1558
v_store(dst + x, v_dst0);
1559
v_store(dst + x + step, v_dst1);
1560
v_store(dst + x + step * 2, v_dst2);
1561
v_store(dst + x + step * 3, v_dst3);
1562
}
1563
}
1564
else if (cn == 3)
1565
{
1566
for (; x <= len - cVectorWidth; x += cVectorWidth)
1567
{
1568
v_uint16 v_mask = vx_load_expand(mask + x);
1569
v_mask = ~(v_mask == v_0);
1570
v_uint16 v_src0, v_src1, v_src2;
1571
v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
1572
v_src0 = v_src0 & v_mask;
1573
v_src1 = v_src1 & v_mask;
1574
v_src2 = v_src2 & v_mask;
1575
v_uint32 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21;
1576
v_expand(v_src0, v_int00, v_int01);
1577
v_expand(v_src1, v_int10, v_int11);
1578
v_expand(v_src2, v_int20, v_int21);
1579
1580
v_float64 v_src00 = v_cvt_f64(v_reinterpret_as_s32(v_int00));
1581
v_float64 v_src01 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00));
1582
v_float64 v_src02 = v_cvt_f64(v_reinterpret_as_s32(v_int01));
1583
v_float64 v_src03 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01));
1584
v_float64 v_src10 = v_cvt_f64(v_reinterpret_as_s32(v_int10));
1585
v_float64 v_src11 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10));
1586
v_float64 v_src12 = v_cvt_f64(v_reinterpret_as_s32(v_int11));
1587
v_float64 v_src13 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11));
1588
v_float64 v_src20 = v_cvt_f64(v_reinterpret_as_s32(v_int20));
1589
v_float64 v_src21 = v_cvt_f64_high(v_reinterpret_as_s32(v_int20));
1590
v_float64 v_src22 = v_cvt_f64(v_reinterpret_as_s32(v_int21));
1591
v_float64 v_src23 = v_cvt_f64_high(v_reinterpret_as_s32(v_int21));
1592
1593
v_float64 v_dst00, v_dst01, v_dst02, v_dst03;
1594
v_float64 v_dst10, v_dst11, v_dst12, v_dst13;
1595
v_float64 v_dst20, v_dst21, v_dst22, v_dst23;
1596
v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
1597
v_load_deinterleave(dst + (x + step)* cn, v_dst01, v_dst11, v_dst21);
1598
v_load_deinterleave(dst + (x + step * 2)* cn, v_dst02, v_dst12, v_dst22);
1599
v_load_deinterleave(dst + (x + step * 3)* cn, v_dst03, v_dst13, v_dst23);
1600
1601
v_dst00 = v_fma(v_src00, v_src00, v_dst00);
1602
v_dst01 = v_fma(v_src01, v_src01, v_dst01);
1603
v_dst02 = v_fma(v_src02, v_src02, v_dst02);
1604
v_dst03 = v_fma(v_src03, v_src03, v_dst03);
1605
v_dst10 = v_fma(v_src10, v_src10, v_dst10);
1606
v_dst11 = v_fma(v_src11, v_src11, v_dst11);
1607
v_dst12 = v_fma(v_src12, v_src12, v_dst12);
1608
v_dst13 = v_fma(v_src13, v_src13, v_dst13);
1609
v_dst20 = v_fma(v_src20, v_src20, v_dst20);
1610
v_dst21 = v_fma(v_src21, v_src21, v_dst21);
1611
v_dst22 = v_fma(v_src22, v_src22, v_dst22);
1612
v_dst23 = v_fma(v_src23, v_src23, v_dst23);
1613
1614
v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
1615
v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
1616
v_store_interleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22);
1617
v_store_interleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23);
1618
}
1619
}
1620
}
1621
#endif // CV_SIMD_64F
1622
accSqr_general_(src, dst, mask, len, cn, x);
1623
}
1624
1625
void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int cn)
1626
{
1627
int x = 0;
1628
#if CV_SIMD_64F
1629
const int cVectorWidth = v_float32::nlanes;
1630
const int step = v_float64::nlanes;
1631
1632
if (!mask)
1633
{
1634
int size = len * cn;
1635
#if CV_AVX && !CV_AVX2
1636
for (; x <= size - 8 ; x += 8)
1637
{
1638
__m256 v_src = _mm256_loadu_ps(src + x);
1639
__m256d v_src0 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src,0));
1640
__m256d v_src1 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src,1));
1641
__m256d v_dst0 = _mm256_loadu_pd(dst + x);
1642
__m256d v_dst1 = _mm256_loadu_pd(dst + x + 4);
1643
v_src0 = _mm256_mul_pd(v_src0, v_src0);
1644
v_src1 = _mm256_mul_pd(v_src1, v_src1);
1645
v_dst0 = _mm256_add_pd(v_src0, v_dst0);
1646
v_dst1 = _mm256_add_pd(v_src1, v_dst1);
1647
_mm256_storeu_pd(dst + x, v_dst0);
1648
_mm256_storeu_pd(dst + x + 4, v_dst1);
1649
}
1650
#else
1651
for (; x <= size - cVectorWidth; x += cVectorWidth)
1652
{
1653
v_float32 v_src = vx_load(src + x);
1654
v_float64 v_src0 = v_cvt_f64(v_src);
1655
v_float64 v_src1 = v_cvt_f64_high(v_src);
1656
1657
v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x)));
1658
v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step)));
1659
}
1660
#endif // CV_AVX && !CV_AVX2
1661
}
1662
else
1663
{
1664
v_uint32 v_0 = vx_setzero_u32();
1665
if (cn == 1)
1666
{
1667
for (; x <= len - cVectorWidth; x += cVectorWidth)
1668
{
1669
v_uint32 v_mask = vx_load_expand_q(mask + x);;
1670
v_mask = ~(v_mask == v_0);
1671
v_float32 v_src = vx_load(src + x);
1672
v_src = v_src & v_reinterpret_as_f32(v_mask);
1673
v_float64 v_src0 = v_cvt_f64(v_src);
1674
v_float64 v_src1 = v_cvt_f64_high(v_src);
1675
1676
v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x)));
1677
v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step)));
1678
}
1679
}
1680
else if (cn == 3)
1681
{
1682
for (; x <= len - cVectorWidth; x += cVectorWidth)
1683
{
1684
v_uint32 v_mask = vx_load_expand_q(mask + x);
1685
v_mask = ~(v_mask == v_0);
1686
1687
v_float32 v_src0, v_src1, v_src2;
1688
v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
1689
v_src0 = v_src0 & v_reinterpret_as_f32(v_mask);
1690
v_src1 = v_src1 & v_reinterpret_as_f32(v_mask);
1691
v_src2 = v_src2 & v_reinterpret_as_f32(v_mask);
1692
1693
v_float64 v_src00 = v_cvt_f64(v_src0);
1694
v_float64 v_src01 = v_cvt_f64_high(v_src0);
1695
v_float64 v_src10 = v_cvt_f64(v_src1);
1696
v_float64 v_src11 = v_cvt_f64_high(v_src1);
1697
v_float64 v_src20 = v_cvt_f64(v_src2);
1698
v_float64 v_src21 = v_cvt_f64_high(v_src2);
1699
1700
v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
1701
v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
1702
v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
1703
1704
v_dst00 = v_fma(v_src00, v_src00, v_dst00);
1705
v_dst01 = v_fma(v_src01, v_src01, v_dst01);
1706
v_dst10 = v_fma(v_src10, v_src10, v_dst10);
1707
v_dst11 = v_fma(v_src11, v_src11, v_dst11);
1708
v_dst20 = v_fma(v_src20, v_src20, v_dst20);
1709
v_dst21 = v_fma(v_src21, v_src21, v_dst21);
1710
1711
v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
1712
v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
1713
}
1714
}
1715
}
1716
#endif // CV_SIMD_64F
1717
accSqr_general_(src, dst, mask, len, cn, x);
1718
}
1719
1720
void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, int cn)
1721
{
1722
int x = 0;
1723
#if CV_SIMD_64F
1724
const int cVectorWidth = v_float64::nlanes * 2;
1725
const int step = v_float64::nlanes;
1726
1727
if (!mask)
1728
{
1729
int size = len * cn;
1730
#if CV_AVX && !CV_AVX2
1731
for (; x <= size - 4 ; x += 4)
1732
{
1733
__m256d v_src = _mm256_loadu_pd(src + x);
1734
__m256d v_dst = _mm256_loadu_pd(dst + x);
1735
v_src = _mm256_mul_pd(v_src, v_src);
1736
v_dst = _mm256_add_pd(v_dst, v_src);
1737
_mm256_storeu_pd(dst + x, v_dst);
1738
}
1739
#else
1740
for (; x <= size - cVectorWidth; x += cVectorWidth)
1741
{
1742
v_float64 v_src0 = vx_load(src + x);
1743
v_float64 v_src1 = vx_load(src + x + step);
1744
v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x)));
1745
v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step)));
1746
}
1747
#endif // CV_AVX && !CV_AVX2
1748
}
1749
else
1750
{
1751
v_uint64 v_0 = vx_setzero_u64();
1752
if (cn == 1)
1753
{
1754
for (; x <= len - cVectorWidth; x += cVectorWidth)
1755
{
1756
v_uint32 v_mask32 = vx_load_expand_q(mask + x);
1757
v_uint64 v_masku640, v_masku641;
1758
v_expand(v_mask32, v_masku640, v_masku641);
1759
v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
1760
v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
1761
v_float64 v_src0 = vx_load(src + x);
1762
v_float64 v_src1 = vx_load(src + x + step);
1763
v_src0 = v_src0 & v_mask0;
1764
v_src1 = v_src1 & v_mask1;
1765
v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x)));
1766
v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step)));
1767
}
1768
}
1769
else if (cn == 3)
1770
{
1771
for (; x <= len - cVectorWidth; x += cVectorWidth)
1772
{
1773
v_uint32 v_mask32 = vx_load_expand_q(mask + x);
1774
v_uint64 v_masku640, v_masku641;
1775
v_expand(v_mask32, v_masku640, v_masku641);
1776
v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
1777
v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
1778
1779
v_float64 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
1780
v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20);
1781
v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21);
1782
v_src00 = v_src00 & v_mask0;
1783
v_src01 = v_src01 & v_mask1;
1784
v_src10 = v_src10 & v_mask0;
1785
v_src11 = v_src11 & v_mask1;
1786
v_src20 = v_src20 & v_mask0;
1787
v_src21 = v_src21 & v_mask1;
1788
1789
v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
1790
v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
1791
v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
1792
1793
v_dst00 = v_fma(v_src00, v_src00, v_dst00);
1794
v_dst01 = v_fma(v_src01, v_src01, v_dst01);
1795
v_dst10 = v_fma(v_src10, v_src10, v_dst10);
1796
v_dst11 = v_fma(v_src11, v_src11, v_dst11);
1797
v_dst20 = v_fma(v_src20, v_src20, v_dst20);
1798
v_dst21 = v_fma(v_src21, v_src21, v_dst21);
1799
1800
v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
1801
v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
1802
}
1803
}
1804
}
1805
#endif // CV_SIMD_64F
1806
accSqr_general_(src, dst, mask, len, cn, x);
1807
}
1808
1809
// product accumulate optimized by universal intrinsic
1810
void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar* mask, int len, int cn)
1811
{
1812
int x = 0;
1813
#if CV_SIMD
1814
const int cVectorWidth = v_uint8::nlanes;
1815
const int step = v_uint32::nlanes;
1816
1817
if (!mask)
1818
{
1819
int size = len * cn;
1820
for (; x <= size - cVectorWidth; x += cVectorWidth)
1821
{
1822
v_uint8 v_1src = vx_load(src1 + x);
1823
v_uint8 v_2src = vx_load(src2 + x);
1824
1825
v_uint16 v_src0, v_src1;
1826
v_mul_expand(v_1src, v_2src, v_src0, v_src1);
1827
1828
v_uint32 v_src00, v_src01, v_src10, v_src11;
1829
v_expand(v_src0, v_src00, v_src01);
1830
v_expand(v_src1, v_src10, v_src11);
1831
1832
v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
1833
v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
1834
v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
1835
v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
1836
}
1837
}
1838
else
1839
{
1840
v_uint8 v_0 = vx_setzero_u8();
1841
if (cn == 1)
1842
{
1843
for (; x <= len - cVectorWidth; x += cVectorWidth)
1844
{
1845
v_uint8 v_mask = vx_load(mask + x);
1846
v_mask = ~(v_mask == v_0);
1847
v_uint8 v_1src = vx_load(src1 + x);
1848
v_uint8 v_2src = vx_load(src2 + x);
1849
v_1src = v_1src & v_mask;
1850
v_2src = v_2src & v_mask;
1851
1852
v_uint16 v_src0, v_src1;
1853
v_mul_expand(v_1src, v_2src, v_src0, v_src1);
1854
1855
v_uint32 v_src00, v_src01, v_src10, v_src11;
1856
v_expand(v_src0, v_src00, v_src01);
1857
v_expand(v_src1, v_src10, v_src11);
1858
1859
v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
1860
v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
1861
v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
1862
v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
1863
}
1864
}
1865
else if (cn == 3)
1866
{
1867
for (; x <= len - cVectorWidth; x += cVectorWidth)
1868
{
1869
v_uint8 v_mask = vx_load(mask + x);
1870
v_mask = ~(v_mask == v_0);
1871
v_uint8 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2;
1872
v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2);
1873
v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2);
1874
v_1src0 = v_1src0 & v_mask;
1875
v_1src1 = v_1src1 & v_mask;
1876
v_1src2 = v_1src2 & v_mask;
1877
v_2src0 = v_2src0 & v_mask;
1878
v_2src1 = v_2src1 & v_mask;
1879
v_2src2 = v_2src2 & v_mask;
1880
1881
v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
1882
v_mul_expand(v_1src0, v_2src0, v_src00, v_src01);
1883
v_mul_expand(v_1src1, v_2src1, v_src10, v_src11);
1884
v_mul_expand(v_1src2, v_2src2, v_src20, v_src21);
1885
1886
v_uint32 v_src000, v_src001, v_src002, v_src003, v_src100, v_src101, v_src102, v_src103, v_src200, v_src201, v_src202, v_src203;
1887
v_expand(v_src00, v_src000, v_src001);
1888
v_expand(v_src01, v_src002, v_src003);
1889
v_expand(v_src10, v_src100, v_src101);
1890
v_expand(v_src11, v_src102, v_src103);
1891
v_expand(v_src20, v_src200, v_src201);
1892
v_expand(v_src21, v_src202, v_src203);
1893
1894
v_float32 v_dst000, v_dst001, v_dst002, v_dst003, v_dst100, v_dst101, v_dst102, v_dst103, v_dst200, v_dst201, v_dst202, v_dst203;
1895
v_load_deinterleave(dst + x * cn, v_dst000, v_dst100, v_dst200);
1896
v_load_deinterleave(dst + (x + step) * cn, v_dst001, v_dst101, v_dst201);
1897
v_load_deinterleave(dst + (x + step * 2) * cn, v_dst002, v_dst102, v_dst202);
1898
v_load_deinterleave(dst + (x + step * 3) * cn, v_dst003, v_dst103, v_dst203);
1899
v_dst000 = v_dst000 + v_cvt_f32(v_reinterpret_as_s32(v_src000));
1900
v_dst001 = v_dst001 + v_cvt_f32(v_reinterpret_as_s32(v_src001));
1901
v_dst002 = v_dst002 + v_cvt_f32(v_reinterpret_as_s32(v_src002));
1902
v_dst003 = v_dst003 + v_cvt_f32(v_reinterpret_as_s32(v_src003));
1903
v_dst100 = v_dst100 + v_cvt_f32(v_reinterpret_as_s32(v_src100));
1904
v_dst101 = v_dst101 + v_cvt_f32(v_reinterpret_as_s32(v_src101));
1905
v_dst102 = v_dst102 + v_cvt_f32(v_reinterpret_as_s32(v_src102));
1906
v_dst103 = v_dst103 + v_cvt_f32(v_reinterpret_as_s32(v_src103));
1907
v_dst200 = v_dst200 + v_cvt_f32(v_reinterpret_as_s32(v_src200));
1908
v_dst201 = v_dst201 + v_cvt_f32(v_reinterpret_as_s32(v_src201));
1909
v_dst202 = v_dst202 + v_cvt_f32(v_reinterpret_as_s32(v_src202));
1910
v_dst203 = v_dst203 + v_cvt_f32(v_reinterpret_as_s32(v_src203));
1911
1912
v_store_interleave(dst + x * cn, v_dst000, v_dst100, v_dst200);
1913
v_store_interleave(dst + (x + step) * cn, v_dst001, v_dst101, v_dst201);
1914
v_store_interleave(dst + (x + step * 2) * cn, v_dst002, v_dst102, v_dst202);
1915
v_store_interleave(dst + (x + step * 3) * cn, v_dst003, v_dst103, v_dst203);
1916
}
1917
}
1918
}
1919
#endif // CV_SIMD
1920
accProd_general_(src1, src2, dst, mask, len, cn, x);
1921
}
1922
1923
void accProd_simd_(const ushort* src1, const ushort* src2, float* dst, const uchar* mask, int len, int cn)
1924
{
1925
int x = 0;
1926
#if CV_SIMD
1927
const int cVectorWidth = v_uint16::nlanes;
1928
const int step = v_float32::nlanes;
1929
1930
if (!mask)
1931
{
1932
int size = len * cn;
1933
for (; x <= size - cVectorWidth; x += cVectorWidth)
1934
{
1935
v_uint16 v_1src = vx_load(src1 + x);
1936
v_uint16 v_2src = vx_load(src2 + x);
1937
1938
v_uint32 v_1src0, v_1src1, v_2src0, v_2src1;
1939
v_expand(v_1src, v_1src0, v_1src1);
1940
v_expand(v_2src, v_2src0, v_2src1);
1941
1942
v_float32 v_1float0 = v_cvt_f32(v_reinterpret_as_s32(v_1src0));
1943
v_float32 v_1float1 = v_cvt_f32(v_reinterpret_as_s32(v_1src1));
1944
v_float32 v_2float0 = v_cvt_f32(v_reinterpret_as_s32(v_2src0));
1945
v_float32 v_2float1 = v_cvt_f32(v_reinterpret_as_s32(v_2src1));
1946
1947
v_store(dst + x, v_fma(v_1float0, v_2float0, vx_load(dst + x)));
1948
v_store(dst + x + step, v_fma(v_1float1, v_2float1, vx_load(dst + x + step)));
1949
}
1950
}
1951
else
1952
{
1953
v_uint16 v_0 = vx_setzero_u16();
1954
if (cn == 1)
1955
{
1956
for (; x <= len - cVectorWidth; x += cVectorWidth)
1957
{
1958
v_uint16 v_mask = vx_load_expand(mask + x);
1959
v_mask = ~(v_0 == v_mask);
1960
1961
v_uint16 v_1src = vx_load(src1 + x) & v_mask;
1962
v_uint16 v_2src = vx_load(src2 + x) & v_mask;
1963
1964
v_uint32 v_1src0, v_1src1, v_2src0, v_2src1;
1965
v_expand(v_1src, v_1src0, v_1src1);
1966
v_expand(v_2src, v_2src0, v_2src1);
1967
1968
v_float32 v_1float0 = v_cvt_f32(v_reinterpret_as_s32(v_1src0));
1969
v_float32 v_1float1 = v_cvt_f32(v_reinterpret_as_s32(v_1src1));
1970
v_float32 v_2float0 = v_cvt_f32(v_reinterpret_as_s32(v_2src0));
1971
v_float32 v_2float1 = v_cvt_f32(v_reinterpret_as_s32(v_2src1));
1972
1973
v_store(dst + x, v_fma(v_1float0, v_2float0, vx_load(dst + x)));
1974
v_store(dst + x + step, v_fma(v_1float1, v_2float1, vx_load(dst + x + step)));
1975
}
1976
}
1977
else if (cn == 3)
1978
{
1979
for (; x <= len - cVectorWidth; x += cVectorWidth)
1980
{
1981
v_uint16 v_mask = vx_load_expand(mask + x);
1982
v_mask = ~(v_0 == v_mask);
1983
1984
v_uint16 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2;
1985
v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2);
1986
v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2);
1987
v_1src0 = v_1src0 & v_mask;
1988
v_1src1 = v_1src1 & v_mask;
1989
v_1src2 = v_1src2 & v_mask;
1990
v_2src0 = v_2src0 & v_mask;
1991
v_2src1 = v_2src1 & v_mask;
1992
v_2src2 = v_2src2 & v_mask;
1993
1994
v_uint32 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21, v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21;
1995
v_expand(v_1src0, v_1src00, v_1src01);
1996
v_expand(v_1src1, v_1src10, v_1src11);
1997
v_expand(v_1src2, v_1src20, v_1src21);
1998
v_expand(v_2src0, v_2src00, v_2src01);
1999
v_expand(v_2src1, v_2src10, v_2src11);
2000
v_expand(v_2src2, v_2src20, v_2src21);
2001
2002
v_float32 v_1float00 = v_cvt_f32(v_reinterpret_as_s32(v_1src00));
2003
v_float32 v_1float01 = v_cvt_f32(v_reinterpret_as_s32(v_1src01));
2004
v_float32 v_1float10 = v_cvt_f32(v_reinterpret_as_s32(v_1src10));
2005
v_float32 v_1float11 = v_cvt_f32(v_reinterpret_as_s32(v_1src11));
2006
v_float32 v_1float20 = v_cvt_f32(v_reinterpret_as_s32(v_1src20));
2007
v_float32 v_1float21 = v_cvt_f32(v_reinterpret_as_s32(v_1src21));
2008
v_float32 v_2float00 = v_cvt_f32(v_reinterpret_as_s32(v_2src00));
2009
v_float32 v_2float01 = v_cvt_f32(v_reinterpret_as_s32(v_2src01));
2010
v_float32 v_2float10 = v_cvt_f32(v_reinterpret_as_s32(v_2src10));
2011
v_float32 v_2float11 = v_cvt_f32(v_reinterpret_as_s32(v_2src11));
2012
v_float32 v_2float20 = v_cvt_f32(v_reinterpret_as_s32(v_2src20));
2013
v_float32 v_2float21 = v_cvt_f32(v_reinterpret_as_s32(v_2src21));
2014
2015
v_float32 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
2016
v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
2017
v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
2018
2019
v_dst00 = v_fma(v_1float00, v_2float00, v_dst00);
2020
v_dst01 = v_fma(v_1float01, v_2float01, v_dst01);
2021
v_dst10 = v_fma(v_1float10, v_2float10, v_dst10);
2022
v_dst11 = v_fma(v_1float11, v_2float11, v_dst11);
2023
v_dst20 = v_fma(v_1float20, v_2float20, v_dst20);
2024
v_dst21 = v_fma(v_1float21, v_2float21, v_dst21);
2025
2026
v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
2027
v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
2028
}
2029
}
2030
}
2031
#endif // CV_SIMD
2032
accProd_general_(src1, src2, dst, mask, len, cn, x);
2033
}
2034
2035
void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar* mask, int len, int cn)
2036
{
2037
int x = 0;
2038
#if CV_SIMD
2039
const int cVectorWidth = v_uint16::nlanes;
2040
const int step = v_float32::nlanes;
2041
2042
if (!mask)
2043
{
2044
int size = len * cn;
2045
#if CV_AVX && !CV_AVX2
2046
for (; x <= size - 8 ; x += 8)
2047
{
2048
__m256 v_src0 = _mm256_loadu_ps(src1 + x);
2049
__m256 v_src1 = _mm256_loadu_ps(src2 + x);
2050
__m256 v_dst = _mm256_loadu_ps(dst + x);
2051
__m256 v_src = _mm256_mul_ps(v_src0, v_src1);
2052
v_dst = _mm256_add_ps(v_src, v_dst);
2053
_mm256_storeu_ps(dst + x, v_dst);
2054
}
2055
#else
2056
for (; x <= size - cVectorWidth; x += cVectorWidth)
2057
{
2058
v_store(dst + x, v_fma(vx_load(src1 + x), vx_load(src2 + x), vx_load(dst + x)));
2059
v_store(dst + x + step, v_fma(vx_load(src1 + x + step), vx_load(src2 + x + step), vx_load(dst + x + step)));
2060
}
2061
#endif // CV_AVX && !CV_AVX2
2062
}
2063
else
2064
{
2065
v_uint32 v_0 = vx_setzero_u32();
2066
if (cn == 1)
2067
{
2068
for (; x <= len - cVectorWidth; x += cVectorWidth)
2069
{
2070
v_uint32 v_mask32_0 = vx_load_expand_q(mask + x);
2071
v_uint32 v_mask32_1 = vx_load_expand_q(mask + x + step);
2072
v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask32_0 == v_0));
2073
v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask32_1 == v_0));
2074
2075
v_store(dst + x, vx_load(dst + x) + ((vx_load(src1 + x) * vx_load(src2 + x)) & v_mask0));
2076
v_store(dst + x + step, vx_load(dst + x + step) + ((vx_load(src1 + x + step) * vx_load(src2 + x + step)) & v_mask1));
2077
}
2078
}
2079
else if (cn == 3)
2080
{
2081
for (; x <= len - cVectorWidth; x += cVectorWidth)
2082
{
2083
v_uint32 v_mask32_0 = vx_load_expand_q(mask + x);
2084
v_uint32 v_mask32_1 = vx_load_expand_q(mask + x + step);
2085
v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask32_0 == v_0));
2086
v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask32_1 == v_0));
2087
2088
v_float32 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21;
2089
v_float32 v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21;
2090
v_load_deinterleave(src1 + x * cn, v_1src00, v_1src10, v_1src20);
2091
v_load_deinterleave(src2 + x * cn, v_2src00, v_2src10, v_2src20);
2092
v_load_deinterleave(src1 + (x + step) * cn, v_1src01, v_1src11, v_1src21);
2093
v_load_deinterleave(src2 + (x + step) * cn, v_2src01, v_2src11, v_2src21);
2094
2095
v_float32 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
2096
v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
2097
v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
2098
2099
v_store_interleave(dst + x * cn, v_dst00 + ((v_1src00 * v_2src00) & v_mask0), v_dst10 + ((v_1src10 * v_2src10) & v_mask0), v_dst20 + ((v_1src20 * v_2src20) & v_mask0));
2100
v_store_interleave(dst + (x + step) * cn, v_dst01 + ((v_1src01 * v_2src01) & v_mask1), v_dst11 + ((v_1src11 * v_2src11) & v_mask1), v_dst21 + ((v_1src21 * v_2src21) & v_mask1));
2101
}
2102
}
2103
}
2104
#endif // CV_SIMD
2105
accProd_general_(src1, src2, dst, mask, len, cn, x);
2106
}
2107
2108
void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const uchar* mask, int len, int cn)
2109
{
2110
int x = 0;
2111
#if CV_SIMD_64F
2112
const int cVectorWidth = v_uint16::nlanes;
2113
const int step = v_float64::nlanes;
2114
2115
if (!mask)
2116
{
2117
int size = len * cn;
2118
for (; x <= size - cVectorWidth; x += cVectorWidth)
2119
{
2120
v_uint16 v_1int = vx_load_expand(src1 + x);
2121
v_uint16 v_2int = vx_load_expand(src2 + x);
2122
2123
v_uint32 v_1int_0, v_1int_1, v_2int_0, v_2int_1;
2124
v_expand(v_1int, v_1int_0, v_1int_1);
2125
v_expand(v_2int, v_2int_0, v_2int_1);
2126
2127
v_int32 v_1int0 = v_reinterpret_as_s32(v_1int_0);
2128
v_int32 v_1int1 = v_reinterpret_as_s32(v_1int_1);
2129
v_int32 v_2int0 = v_reinterpret_as_s32(v_2int_0);
2130
v_int32 v_2int1 = v_reinterpret_as_s32(v_2int_1);
2131
2132
v_float64 v_dst0 = vx_load(dst + x);
2133
v_float64 v_dst1 = vx_load(dst + x + step);
2134
v_float64 v_dst2 = vx_load(dst + x + step * 2);
2135
v_float64 v_dst3 = vx_load(dst + x + step * 3);
2136
2137
v_dst0 = v_fma(v_cvt_f64(v_1int0), v_cvt_f64(v_2int0), v_dst0);
2138
v_dst1 = v_fma(v_cvt_f64_high(v_1int0), v_cvt_f64_high(v_2int0), v_dst1);
2139
v_dst2 = v_fma(v_cvt_f64(v_1int1), v_cvt_f64(v_2int1), v_dst2);
2140
v_dst3 = v_fma(v_cvt_f64_high(v_1int1), v_cvt_f64_high(v_2int1), v_dst3);
2141
2142
v_store(dst + x, v_dst0);
2143
v_store(dst + x + step, v_dst1);
2144
v_store(dst + x + step * 2, v_dst2);
2145
v_store(dst + x + step * 3, v_dst3);
2146
}
2147
}
2148
else
2149
{
2150
v_uint16 v_0 = vx_setzero_u16();
2151
if (cn == 1)
2152
{
2153
for (; x <= len - cVectorWidth; x += cVectorWidth)
2154
{
2155
v_uint16 v_mask = vx_load_expand(mask + x);
2156
v_mask = ~(v_mask == v_0);
2157
v_uint16 v_1int = vx_load_expand(src1 + x) & v_mask;
2158
v_uint16 v_2int = vx_load_expand(src2 + x) & v_mask;
2159
2160
v_uint32 v_1int_0, v_1int_1, v_2int_0, v_2int_1;
2161
v_expand(v_1int, v_1int_0, v_1int_1);
2162
v_expand(v_2int, v_2int_0, v_2int_1);
2163
2164
v_int32 v_1int0 = v_reinterpret_as_s32(v_1int_0);
2165
v_int32 v_1int1 = v_reinterpret_as_s32(v_1int_1);
2166
v_int32 v_2int0 = v_reinterpret_as_s32(v_2int_0);
2167
v_int32 v_2int1 = v_reinterpret_as_s32(v_2int_1);
2168
2169
v_float64 v_dst0 = vx_load(dst + x);
2170
v_float64 v_dst1 = vx_load(dst + x + step);
2171
v_float64 v_dst2 = vx_load(dst + x + step * 2);
2172
v_float64 v_dst3 = vx_load(dst + x + step * 3);
2173
2174
v_dst0 = v_fma(v_cvt_f64(v_1int0), v_cvt_f64(v_2int0), v_dst0);
2175
v_dst1 = v_fma(v_cvt_f64_high(v_1int0), v_cvt_f64_high(v_2int0), v_dst1);
2176
v_dst2 = v_fma(v_cvt_f64(v_1int1), v_cvt_f64(v_2int1), v_dst2);
2177
v_dst3 = v_fma(v_cvt_f64_high(v_1int1), v_cvt_f64_high(v_2int1), v_dst3);
2178
2179
v_store(dst + x, v_dst0);
2180
v_store(dst + x + step, v_dst1);
2181
v_store(dst + x + step * 2, v_dst2);
2182
v_store(dst + x + step * 3, v_dst3);
2183
}
2184
}
2185
else if (cn == 3)
2186
{
2187
for (; x <= len - cVectorWidth * 2; x += cVectorWidth)
2188
{
2189
v_uint8 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2;
2190
v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2);
2191
v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2);
2192
2193
v_uint16 v_1int0 = v_expand_low(v_1src0);
2194
v_uint16 v_1int1 = v_expand_low(v_1src1);
2195
v_uint16 v_1int2 = v_expand_low(v_1src2);
2196
v_uint16 v_2int0 = v_expand_low(v_2src0);
2197
v_uint16 v_2int1 = v_expand_low(v_2src1);
2198
v_uint16 v_2int2 = v_expand_low(v_2src2);
2199
2200
v_uint16 v_mask = vx_load_expand(mask + x);
2201
v_mask = ~(v_mask == v_0);
2202
v_1int0 = v_1int0 & v_mask;
2203
v_1int1 = v_1int1 & v_mask;
2204
v_1int2 = v_1int2 & v_mask;
2205
v_2int0 = v_2int0 & v_mask;
2206
v_2int1 = v_2int1 & v_mask;
2207
v_2int2 = v_2int2 & v_mask;
2208
2209
v_uint32 v_1int00, v_1int01, v_1int10, v_1int11, v_1int20, v_1int21;
2210
v_uint32 v_2int00, v_2int01, v_2int10, v_2int11, v_2int20, v_2int21;
2211
v_expand(v_1int0, v_1int00, v_1int01);
2212
v_expand(v_1int1, v_1int10, v_1int11);
2213
v_expand(v_1int2, v_1int20, v_1int21);
2214
v_expand(v_2int0, v_2int00, v_2int01);
2215
v_expand(v_2int1, v_2int10, v_2int11);
2216
v_expand(v_2int2, v_2int20, v_2int21);
2217
2218
v_float64 v_dst00, v_dst01, v_dst02, v_dst03, v_dst10, v_dst11, v_dst12, v_dst13, v_dst20, v_dst21, v_dst22, v_dst23;
2219
v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
2220
v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
2221
v_load_deinterleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22);
2222
v_load_deinterleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23);
2223
2224
v_dst00 = v_fma(v_cvt_f64(v_reinterpret_as_s32(v_1int00)), v_cvt_f64(v_reinterpret_as_s32(v_2int00)), v_dst00);
2225
v_dst01 = v_fma(v_cvt_f64_high(v_reinterpret_as_s32(v_1int00)), v_cvt_f64_high(v_reinterpret_as_s32(v_2int00)), v_dst01);
2226
v_dst02 = v_fma(v_cvt_f64(v_reinterpret_as_s32(v_1int01)), v_cvt_f64(v_reinterpret_as_s32(v_2int01)), v_dst02);
2227
v_dst03 = v_fma(v_cvt_f64_high(v_reinterpret_as_s32(v_1int01)), v_cvt_f64_high(v_reinterpret_as_s32(v_2int01)), v_dst03);
2228
v_dst10 = v_fma(v_cvt_f64(v_reinterpret_as_s32(v_1int10)), v_cvt_f64(v_reinterpret_as_s32(v_2int10)), v_dst10);
2229
v_dst11 = v_fma(v_cvt_f64_high(v_reinterpret_as_s32(v_1int10)), v_cvt_f64_high(v_reinterpret_as_s32(v_2int10)), v_dst11);
2230
v_dst12 = v_fma(v_cvt_f64(v_reinterpret_as_s32(v_1int11)), v_cvt_f64(v_reinterpret_as_s32(v_2int11)), v_dst12);
2231
v_dst13 = v_fma(v_cvt_f64_high(v_reinterpret_as_s32(v_1int11)), v_cvt_f64_high(v_reinterpret_as_s32(v_2int11)), v_dst13);
2232
v_dst20 = v_fma(v_cvt_f64(v_reinterpret_as_s32(v_1int20)), v_cvt_f64(v_reinterpret_as_s32(v_2int20)), v_dst20);
2233
v_dst21 = v_fma(v_cvt_f64_high(v_reinterpret_as_s32(v_1int20)), v_cvt_f64_high(v_reinterpret_as_s32(v_2int20)), v_dst21);
2234
v_dst22 = v_fma(v_cvt_f64(v_reinterpret_as_s32(v_1int21)), v_cvt_f64(v_reinterpret_as_s32(v_2int21)), v_dst22);
2235
v_dst23 = v_fma(v_cvt_f64_high(v_reinterpret_as_s32(v_1int21)), v_cvt_f64_high(v_reinterpret_as_s32(v_2int21)), v_dst23);
2236
2237
v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
2238
v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
2239
v_store_interleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22);
2240
v_store_interleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23);
2241
}
2242
}
2243
}
2244
#endif // CV_SIMD_64F
2245
accProd_general_(src1, src2, dst, mask, len, cn, x);
2246
}
2247
2248
void accProd_simd_(const ushort* src1, const ushort* src2, double* dst, const uchar* mask, int len, int cn)
2249
{
2250
int x = 0;
2251
#if CV_SIMD_64F
2252
const int cVectorWidth = v_uint16::nlanes;
2253
const int step = v_float64::nlanes;
2254
2255
if (!mask)
2256
{
2257
int size = len * cn;
2258
for (; x <= size - cVectorWidth; x += cVectorWidth)
2259
{
2260
v_uint16 v_1src = vx_load(src1 + x);
2261
v_uint16 v_2src = vx_load(src2 + x);
2262
2263
v_uint32 v_1int_0, v_1int_1, v_2int_0, v_2int_1;
2264
v_expand(v_1src, v_1int_0, v_1int_1);
2265
v_expand(v_2src, v_2int_0, v_2int_1);
2266
2267
v_int32 v_1int0 = v_reinterpret_as_s32(v_1int_0);
2268
v_int32 v_1int1 = v_reinterpret_as_s32(v_1int_1);
2269
v_int32 v_2int0 = v_reinterpret_as_s32(v_2int_0);
2270
v_int32 v_2int1 = v_reinterpret_as_s32(v_2int_1);
2271
2272
v_float64 v_dst0 = vx_load(dst + x);
2273
v_float64 v_dst1 = vx_load(dst + x + step);
2274
v_float64 v_dst2 = vx_load(dst + x + step * 2);
2275
v_float64 v_dst3 = vx_load(dst + x + step * 3);
2276
2277
v_dst0 = v_fma(v_cvt_f64(v_1int0), v_cvt_f64(v_2int0), v_dst0);
2278
v_dst1 = v_fma(v_cvt_f64_high(v_1int0), v_cvt_f64_high(v_2int0), v_dst1);
2279
v_dst2 = v_fma(v_cvt_f64(v_1int1), v_cvt_f64(v_2int1), v_dst2);
2280
v_dst3 = v_fma(v_cvt_f64_high(v_1int1), v_cvt_f64_high(v_2int1), v_dst3);
2281
2282
v_store(dst + x, v_dst0);
2283
v_store(dst + x + step, v_dst1);
2284
v_store(dst + x + step * 2, v_dst2);
2285
v_store(dst + x + step * 3, v_dst3);
2286
}
2287
}
2288
else
2289
{
2290
v_uint16 v_0 = vx_setzero_u16();
2291
if (cn == 1)
2292
{
2293
for (; x <= len - cVectorWidth; x += cVectorWidth)
2294
{
2295
v_uint16 v_mask = vx_load_expand(mask + x);
2296
v_mask = ~(v_mask == v_0);
2297
v_uint16 v_1src = vx_load(src1 + x);
2298
v_uint16 v_2src = vx_load(src2 + x);
2299
v_1src = v_1src & v_mask;
2300
v_2src = v_2src & v_mask;
2301
2302
v_uint32 v_1int_0, v_1int_1, v_2int_0, v_2int_1;
2303
v_expand(v_1src, v_1int_0, v_1int_1);
2304
v_expand(v_2src, v_2int_0, v_2int_1);
2305
2306
v_int32 v_1int0 = v_reinterpret_as_s32(v_1int_0);
2307
v_int32 v_1int1 = v_reinterpret_as_s32(v_1int_1);
2308
v_int32 v_2int0 = v_reinterpret_as_s32(v_2int_0);
2309
v_int32 v_2int1 = v_reinterpret_as_s32(v_2int_1);
2310
2311
v_float64 v_dst0 = vx_load(dst + x);
2312
v_float64 v_dst1 = vx_load(dst + x + step);
2313
v_float64 v_dst2 = vx_load(dst + x + step * 2);
2314
v_float64 v_dst3 = vx_load(dst + x + step * 3);
2315
2316
v_dst0 = v_fma(v_cvt_f64(v_1int0), v_cvt_f64(v_2int0), v_dst0);
2317
v_dst1 = v_fma(v_cvt_f64_high(v_1int0), v_cvt_f64_high(v_2int0), v_dst1);
2318
v_dst2 = v_fma(v_cvt_f64(v_1int1), v_cvt_f64(v_2int1), v_dst2);
2319
v_dst3 = v_fma(v_cvt_f64_high(v_1int1), v_cvt_f64_high(v_2int1), v_dst3);
2320
2321
v_store(dst + x, v_dst0);
2322
v_store(dst + x + step, v_dst1);
2323
v_store(dst + x + step * 2, v_dst2);
2324
v_store(dst + x + step * 3, v_dst3);
2325
}
2326
}
2327
else if (cn == 3)
2328
{
2329
for (; x <= len - cVectorWidth; x += cVectorWidth)
2330
{
2331
v_uint16 v_mask = vx_load_expand(mask + x);
2332
v_mask = ~(v_mask == v_0);
2333
v_uint16 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2;
2334
v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2);
2335
v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2);
2336
v_1src0 = v_1src0 & v_mask;
2337
v_1src1 = v_1src1 & v_mask;
2338
v_1src2 = v_1src2 & v_mask;
2339
v_2src0 = v_2src0 & v_mask;
2340
v_2src1 = v_2src1 & v_mask;
2341
v_2src2 = v_2src2 & v_mask;
2342
2343
v_uint32 v_1int_00, v_1int_01, v_2int_00, v_2int_01;
2344
v_uint32 v_1int_10, v_1int_11, v_2int_10, v_2int_11;
2345
v_uint32 v_1int_20, v_1int_21, v_2int_20, v_2int_21;
2346
v_expand(v_1src0, v_1int_00, v_1int_01);
2347
v_expand(v_1src1, v_1int_10, v_1int_11);
2348
v_expand(v_1src2, v_1int_20, v_1int_21);
2349
v_expand(v_2src0, v_2int_00, v_2int_01);
2350
v_expand(v_2src1, v_2int_10, v_2int_11);
2351
v_expand(v_2src2, v_2int_20, v_2int_21);
2352
2353
v_int32 v_1int00 = v_reinterpret_as_s32(v_1int_00);
2354
v_int32 v_1int01 = v_reinterpret_as_s32(v_1int_01);
2355
v_int32 v_1int10 = v_reinterpret_as_s32(v_1int_10);
2356
v_int32 v_1int11 = v_reinterpret_as_s32(v_1int_11);
2357
v_int32 v_1int20 = v_reinterpret_as_s32(v_1int_20);
2358
v_int32 v_1int21 = v_reinterpret_as_s32(v_1int_21);
2359
v_int32 v_2int00 = v_reinterpret_as_s32(v_2int_00);
2360
v_int32 v_2int01 = v_reinterpret_as_s32(v_2int_01);
2361
v_int32 v_2int10 = v_reinterpret_as_s32(v_2int_10);
2362
v_int32 v_2int11 = v_reinterpret_as_s32(v_2int_11);
2363
v_int32 v_2int20 = v_reinterpret_as_s32(v_2int_20);
2364
v_int32 v_2int21 = v_reinterpret_as_s32(v_2int_21);
2365
2366
v_float64 v_dst00, v_dst01, v_dst02, v_dst03;
2367
v_float64 v_dst10, v_dst11, v_dst12, v_dst13;
2368
v_float64 v_dst20, v_dst21, v_dst22, v_dst23;
2369
v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
2370
v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
2371
v_load_deinterleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22);
2372
v_load_deinterleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23);
2373
2374
v_dst00 = v_fma(v_cvt_f64(v_1int00), v_cvt_f64(v_2int00), v_dst00);
2375
v_dst01 = v_fma(v_cvt_f64_high(v_1int00), v_cvt_f64_high(v_2int00), v_dst01);
2376
v_dst02 = v_fma(v_cvt_f64(v_1int01), v_cvt_f64(v_2int01), v_dst02);
2377
v_dst03 = v_fma(v_cvt_f64_high(v_1int01), v_cvt_f64_high(v_2int01), v_dst03);
2378
v_dst10 = v_fma(v_cvt_f64(v_1int10), v_cvt_f64(v_2int10), v_dst10);
2379
v_dst11 = v_fma(v_cvt_f64_high(v_1int10), v_cvt_f64_high(v_2int10), v_dst11);
2380
v_dst12 = v_fma(v_cvt_f64(v_1int11), v_cvt_f64(v_2int11), v_dst12);
2381
v_dst13 = v_fma(v_cvt_f64_high(v_1int11), v_cvt_f64_high(v_2int11), v_dst13);
2382
v_dst20 = v_fma(v_cvt_f64(v_1int20), v_cvt_f64(v_2int20), v_dst20);
2383
v_dst21 = v_fma(v_cvt_f64_high(v_1int20), v_cvt_f64_high(v_2int20), v_dst21);
2384
v_dst22 = v_fma(v_cvt_f64(v_1int21), v_cvt_f64(v_2int21), v_dst22);
2385
v_dst23 = v_fma(v_cvt_f64_high(v_1int21), v_cvt_f64_high(v_2int21), v_dst23);
2386
2387
v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
2388
v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
2389
v_store_interleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22);
2390
v_store_interleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23);
2391
}
2392
}
2393
}
2394
#endif // CV_SIMD_64F
2395
accProd_general_(src1, src2, dst, mask, len, cn, x);
2396
}
2397
2398
void accProd_simd_(const float* src1, const float* src2, double* dst, const uchar* mask, int len, int cn)
2399
{
2400
int x = 0;
2401
#if CV_SIMD_64F
2402
const int cVectorWidth = v_float32::nlanes;
2403
const int step = v_float64::nlanes;
2404
2405
if (!mask)
2406
{
2407
int size = len * cn;
2408
#if CV_AVX && !CV_AVX2
2409
for ( ; x <= size - 8 ; x += 8)
2410
{
2411
__m256 v_1src = _mm256_loadu_ps(src1 + x);
2412
__m256 v_2src = _mm256_loadu_ps(src2 + x);
2413
__m256d v_src00 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_1src,0));
2414
__m256d v_src01 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_1src,1));
2415
__m256d v_src10 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_2src,0));
2416
__m256d v_src11 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_2src,1));
2417
__m256d v_dst0 = _mm256_loadu_pd(dst + x);
2418
__m256d v_dst1 = _mm256_loadu_pd(dst + x + 4);
2419
__m256d v_src0 = _mm256_mul_pd(v_src00, v_src10);
2420
__m256d v_src1 = _mm256_mul_pd(v_src01, v_src11);
2421
v_dst0 = _mm256_add_pd(v_src0, v_dst0);
2422
v_dst1 = _mm256_add_pd(v_src1, v_dst1);
2423
_mm256_storeu_pd(dst + x, v_dst0);
2424
_mm256_storeu_pd(dst + x + 4, v_dst1);
2425
}
2426
#else
2427
for (; x <= size - cVectorWidth; x += cVectorWidth)
2428
{
2429
v_float32 v_1src = vx_load(src1 + x);
2430
v_float32 v_2src = vx_load(src2 + x);
2431
2432
v_float64 v_1src0 = v_cvt_f64(v_1src);
2433
v_float64 v_1src1 = v_cvt_f64_high(v_1src);
2434
v_float64 v_2src0 = v_cvt_f64(v_2src);
2435
v_float64 v_2src1 = v_cvt_f64_high(v_2src);
2436
2437
v_store(dst + x, v_fma(v_1src0, v_2src0, vx_load(dst + x)));
2438
v_store(dst + x + step, v_fma(v_1src1, v_2src1, vx_load(dst + x + step)));
2439
}
2440
#endif // CV_AVX && !CV_AVX2
2441
}
2442
else
2443
{
2444
v_uint32 v_0 = vx_setzero_u32();
2445
if (cn == 1)
2446
{
2447
for (; x <= len - cVectorWidth; x += cVectorWidth)
2448
{
2449
v_uint32 v_mask = vx_load_expand_q(mask + x);
2450
v_mask = ~(v_mask == v_0);
2451
v_float32 v_1src = vx_load(src1 + x);
2452
v_float32 v_2src = vx_load(src2 + x);
2453
v_1src = v_1src & v_reinterpret_as_f32(v_mask);
2454
v_2src = v_2src & v_reinterpret_as_f32(v_mask);
2455
2456
v_float64 v_1src0 = v_cvt_f64(v_1src);
2457
v_float64 v_1src1 = v_cvt_f64_high(v_1src);
2458
v_float64 v_2src0 = v_cvt_f64(v_2src);
2459
v_float64 v_2src1 = v_cvt_f64_high(v_2src);
2460
2461
v_store(dst + x, v_fma(v_1src0, v_2src0, vx_load(dst + x)));
2462
v_store(dst + x + step, v_fma(v_1src1, v_2src1, vx_load(dst + x + step)));
2463
}
2464
}
2465
else if (cn == 3)
2466
{
2467
for (; x <= len - cVectorWidth; x += cVectorWidth)
2468
{
2469
v_uint32 v_mask = vx_load_expand_q(mask + x);
2470
v_mask = ~(v_mask == v_0);
2471
v_float32 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2;
2472
v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2);
2473
v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2);
2474
v_1src0 = v_1src0 & v_reinterpret_as_f32(v_mask);
2475
v_1src1 = v_1src1 & v_reinterpret_as_f32(v_mask);
2476
v_1src2 = v_1src2 & v_reinterpret_as_f32(v_mask);
2477
v_2src0 = v_2src0 & v_reinterpret_as_f32(v_mask);
2478
v_2src1 = v_2src1 & v_reinterpret_as_f32(v_mask);
2479
v_2src2 = v_2src2 & v_reinterpret_as_f32(v_mask);
2480
2481
v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
2482
v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
2483
v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
2484
2485
v_dst00 = v_fma(v_cvt_f64(v_1src0), v_cvt_f64(v_2src0), v_dst00);
2486
v_dst01 = v_fma(v_cvt_f64_high(v_1src0), v_cvt_f64_high(v_2src0), v_dst01);
2487
v_dst10 = v_fma(v_cvt_f64(v_1src1), v_cvt_f64(v_2src1), v_dst10);
2488
v_dst11 = v_fma(v_cvt_f64_high(v_1src1), v_cvt_f64_high(v_2src1), v_dst11);
2489
v_dst20 = v_fma(v_cvt_f64(v_1src2), v_cvt_f64(v_2src2), v_dst20);
2490
v_dst21 = v_fma(v_cvt_f64_high(v_1src2), v_cvt_f64_high(v_2src2), v_dst21);
2491
2492
v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
2493
v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
2494
}
2495
}
2496
}
2497
#endif // CV_SIMD_64F
2498
accProd_general_(src1, src2, dst, mask, len, cn, x);
2499
}
2500
2501
// Product accumulation for double input into a double accumulator:
//   dst[j] += src1[j] * src2[j]
// optionally gated per pixel by an 8-bit mask (mask[pixel] != 0 enables the pixel,
// covering all cn channels of that pixel). The SIMD paths below handle the bulk of
// the data; accProd_general_ finishes the scalar tail from index x onward and any
// masked channel count other than 1 or 3.
void accProd_simd_(const double* src1, const double* src2, double* dst, const uchar* mask, int len, int cn)
{
    int x = 0;
#if CV_SIMD_64F
    const int cVectorWidth = v_float64::nlanes * 2;  // two f64 vectors processed per iteration
    const int step = v_float64::nlanes;

    if (!mask)
    {
        // Unmasked: channels are irrelevant, process len*cn doubles linearly.
        int size = len * cn;
#if CV_AVX && !CV_AVX2
        // AVX-only build (no AVX2): universal intrinsics stay 128-bit, so use raw
        // 256-bit AVX mul/add for better throughput.
        for ( ; x <= size - 4 ; x += 4)
        {
            __m256d v_src0 = _mm256_loadu_pd(src1 + x);
            __m256d v_src1 = _mm256_loadu_pd(src2 + x);
            __m256d v_dst = _mm256_loadu_pd(dst + x);
            v_src0 = _mm256_mul_pd(v_src0, v_src1);
            v_dst = _mm256_add_pd(v_dst, v_src0);
            _mm256_storeu_pd(dst + x, v_dst);
        }
#else
        for (; x <= size - cVectorWidth; x += cVectorWidth)
        {
            v_float64 v_src00 = vx_load(src1 + x);
            v_float64 v_src01 = vx_load(src1 + x + step);
            v_float64 v_src10 = vx_load(src2 + x);
            v_float64 v_src11 = vx_load(src2 + x + step);

            // dst += src1 * src2 (fused where the target supports it)
            v_store(dst + x, v_fma(v_src00, v_src10, vx_load(dst + x)));
            v_store(dst + x + step, v_fma(v_src01, v_src11, vx_load(dst + x + step)));
        }
#endif
    }
    else
    {
        // Masked: x counts PIXELS here (not doubles). Mask bytes are widened to
        // 64-bit lanes and turned into all-ones/all-zeros with ~(m == 0), so a
        // bitwise AND zeroes out the contribution of masked-off pixels.
        // todo: try fma
        v_uint64 v_0 = vx_setzero_u64();
        if (cn == 1)
        {
            for (; x <= len - cVectorWidth; x += cVectorWidth)
            {
                v_uint32 v_mask32 = vx_load_expand_q(mask + x);
                v_uint64 v_masku640, v_masku641;
                v_expand(v_mask32, v_masku640, v_masku641);
                v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
                v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));

                v_float64 v_src00 = vx_load(src1 + x);
                v_float64 v_src01 = vx_load(src1 + x + step);
                v_float64 v_src10 = vx_load(src2 + x);
                v_float64 v_src11 = vx_load(src2 + x + step);

                // Zero the masked products, then accumulate.
                v_store(dst + x, vx_load(dst + x) + ((v_src00 * v_src10) & v_mask0));
                v_store(dst + x + step, vx_load(dst + x + step) + ((v_src01 * v_src11) & v_mask1));
            }
        }
        else if (cn == 3)
        {
            for (; x <= len - cVectorWidth; x += cVectorWidth)
            {
                v_uint32 v_mask32 = vx_load_expand_q(mask + x);
                v_uint64 v_masku640, v_masku641;
                v_expand(v_mask32, v_masku640, v_masku641);
                v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
                v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));

                // Deinterleave BGR-style triples so the same pixel mask applies
                // to each channel vector independently.
                v_float64 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21;
                v_float64 v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21;
                v_load_deinterleave(src1 + x * cn, v_1src00, v_1src10, v_1src20);
                v_load_deinterleave(src1 + (x + step) * cn, v_1src01, v_1src11, v_1src21);
                v_load_deinterleave(src2 + x * cn, v_2src00, v_2src10, v_2src20);
                v_load_deinterleave(src2 + (x + step) * cn, v_2src01, v_2src11, v_2src21);
                // Masking one factor is enough to zero the whole product.
                v_float64 v_src00 = (v_1src00 & v_mask0) * v_2src00;
                v_float64 v_src01 = (v_1src01 & v_mask1) * v_2src01;
                v_float64 v_src10 = (v_1src10 & v_mask0) * v_2src10;
                v_float64 v_src11 = (v_1src11 & v_mask1) * v_2src11;
                v_float64 v_src20 = (v_1src20 & v_mask0) * v_2src20;
                v_float64 v_src21 = (v_1src21 & v_mask1) * v_2src21;

                v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
                v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
                v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);

                v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
                v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
            }
        }
    }
#endif // CV_SIMD_64F
    // Scalar fallback for the remaining elements/pixels (and unsupported cn).
    accProd_general_(src1, src2, dst, mask, len, cn, x);
}
// running weight accumulate optimized by universal intrinsic
//
// accW computes the running weighted average dst = dst*(1-alpha) + src*alpha.
// This overload: uchar source, float accumulator. Only the unmasked case is
// vectorized; the masked case and the tail past the last full vector are
// handled by accW_general_ starting at index x.
void accW_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn, double alpha)
{
    int x = 0;
#if CV_SIMD
    const v_float32 v_alpha = vx_setall_f32((float)alpha);
    const v_float32 v_beta = vx_setall_f32((float)(1.0f - alpha));
    const int cVectorWidth = v_uint8::nlanes;  // one full u8 vector -> four f32 vectors
    const int step = v_float32::nlanes;

    if (!mask)
    {
        int size = len * cn;
        for (; x <= size - cVectorWidth; x += cVectorWidth)
        {
            v_uint8 v_src = vx_load(src + x);

            // Widen u8 -> u16 -> u32 so each lane can be converted to float.
            v_uint16 v_src0, v_src1;
            v_expand(v_src, v_src0, v_src1);

            v_uint32 v_src00, v_src01, v_src10, v_src11;
            v_expand(v_src0, v_src00, v_src01);
            v_expand(v_src1, v_src10, v_src11);

            v_float32 v_dst00 = vx_load(dst + x);
            v_float32 v_dst01 = vx_load(dst + x + step);
            v_float32 v_dst10 = vx_load(dst + x + step * 2);
            v_float32 v_dst11 = vx_load(dst + x + step * 3);

            // Reinterpret u32 as s32 is safe: values fit in [0, 255].
            v_dst00 = v_fma(v_dst00, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src00)) * v_alpha);
            v_dst01 = v_fma(v_dst01, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src01)) * v_alpha);
            v_dst10 = v_fma(v_dst10, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src10)) * v_alpha);
            v_dst11 = v_fma(v_dst11, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src11)) * v_alpha);

            v_store(dst + x, v_dst00);
            v_store(dst + x + step, v_dst01);
            v_store(dst + x + step * 2, v_dst10);
            v_store(dst + x + step * 3, v_dst11);
        }
    }
#endif // CV_SIMD
    accW_general_(src, dst, mask, len, cn, alpha, x);
}
void accW_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn, double alpha)
2638
{
2639
int x = 0;
2640
#if CV_SIMD
2641
const v_float32 v_alpha = vx_setall_f32((float)alpha);
2642
const v_float32 v_beta = vx_setall_f32((float)(1.0f - alpha));
2643
const int cVectorWidth = v_uint16::nlanes;
2644
const int step = v_float32::nlanes;
2645
2646
if (!mask)
2647
{
2648
int size = len * cn;
2649
for (; x <= size - cVectorWidth; x += cVectorWidth)
2650
{
2651
v_uint16 v_src = vx_load(src + x);
2652
v_uint32 v_int0, v_int1;
2653
v_expand(v_src, v_int0, v_int1);
2654
2655
v_float32 v_dst0 = vx_load(dst + x);
2656
v_float32 v_dst1 = vx_load(dst + x + step);
2657
v_dst0 = v_fma(v_dst0, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_int0)) * v_alpha);
2658
v_dst1 = v_fma(v_dst1, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_int1)) * v_alpha);
2659
2660
v_store(dst + x, v_dst0);
2661
v_store(dst + x + step, v_dst1);
2662
}
2663
}
2664
#endif // CV_SIMD
2665
accW_general_(src, dst, mask, len, cn, alpha, x);
2666
}
2667
2668
// accW overload: float source, float accumulator.
//   dst = dst*(1-alpha) + src*alpha over len*cn contiguous floats.
// AVX-only builds (AVX without AVX2) take a hand-written 256-bit path since the
// universal intrinsics are 128-bit there; otherwise universal intrinsics are used.
// Masked input and the tail fall through to accW_general_ starting at x.
void accW_simd_(const float* src, float* dst, const uchar* mask, int len, int cn, double alpha)
{
    int x = 0;
#if CV_AVX && !CV_AVX2
    const __m256 v_alpha = _mm256_set1_ps((float)alpha);
    const __m256 v_beta = _mm256_set1_ps((float)(1.0f - alpha));
    const int cVectorWidth = 16;  // 2 x 8 floats per iteration

    if (!mask)
    {
        int size = len * cn;
        for ( ; x <= size - cVectorWidth ; x += cVectorWidth)
        {
            _mm256_storeu_ps(dst + x, _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(dst + x), v_beta), _mm256_mul_ps(_mm256_loadu_ps(src + x), v_alpha)));
            _mm256_storeu_ps(dst + x + 8, _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(dst + x + 8), v_beta), _mm256_mul_ps(_mm256_loadu_ps(src + x + 8), v_alpha)));
        }
    }
#elif CV_SIMD
    const v_float32 v_alpha = vx_setall_f32((float)alpha);
    const v_float32 v_beta = vx_setall_f32((float)(1.0f - alpha));
    const int cVectorWidth = v_uint16::nlanes;  // == 2 * v_float32::nlanes: two f32 vectors per iteration
    const int step = v_float32::nlanes;

    if (!mask)
    {
        int size = len * cn;
        for (; x <= size - cVectorWidth; x += cVectorWidth)
        {
            v_float32 v_dst0 = vx_load(dst + x);
            v_float32 v_dst1 = vx_load(dst + x + step);

            v_dst0 = v_fma(v_dst0, v_beta, vx_load(src + x) * v_alpha);
            v_dst1 = v_fma(v_dst1, v_beta, vx_load(src + x + step) * v_alpha);

            v_store(dst + x, v_dst0);
            v_store(dst + x + step, v_dst1);
        }
    }
#endif // CV_SIMD
    accW_general_(src, dst, mask, len, cn, alpha, x);
}
// accW overload: uchar source, double accumulator.
//   dst = dst*(1-alpha) + src*alpha.
// Requires native f64 SIMD support (CV_SIMD_64F); unmasked case only, with
// accW_general_ finishing the masked case and the tail from index x.
void accW_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn, double alpha)
{
    int x = 0;
#if CV_SIMD_64F
    const v_float64 v_alpha = vx_setall_f64(alpha);
    // 1.0f promotes to double before the subtraction, so this is exact.
    const v_float64 v_beta = vx_setall_f64(1.0f - alpha);
    const int cVectorWidth = v_uint16::nlanes;  // u16 lanes expanded into four f64 vectors
    const int step = v_float64::nlanes;

    if (!mask)
    {
        int size = len * cn;
        for (; x <= size - cVectorWidth; x += cVectorWidth)
        {
            // Widen u8 -> u16 -> u32 -> f64 (reinterpret to s32 is safe: values <= 255).
            v_uint16 v_src16 = vx_load_expand(src + x);

            v_uint32 v_int_0, v_int_1;
            v_expand(v_src16, v_int_0, v_int_1);

            v_int32 v_int0 = v_reinterpret_as_s32(v_int_0);
            v_int32 v_int1 = v_reinterpret_as_s32(v_int_1);

            v_float64 v_src0 = v_cvt_f64(v_int0);
            v_float64 v_src1 = v_cvt_f64_high(v_int0);
            v_float64 v_src2 = v_cvt_f64(v_int1);
            v_float64 v_src3 = v_cvt_f64_high(v_int1);

            v_float64 v_dst0 = vx_load(dst + x);
            v_float64 v_dst1 = vx_load(dst + x + step);
            v_float64 v_dst2 = vx_load(dst + x + step * 2);
            v_float64 v_dst3 = vx_load(dst + x + step * 3);

            v_dst0 = v_fma(v_dst0, v_beta, v_src0 * v_alpha);
            v_dst1 = v_fma(v_dst1, v_beta, v_src1 * v_alpha);
            v_dst2 = v_fma(v_dst2, v_beta, v_src2 * v_alpha);
            v_dst3 = v_fma(v_dst3, v_beta, v_src3 * v_alpha);

            v_store(dst + x, v_dst0);
            v_store(dst + x + step, v_dst1);
            v_store(dst + x + step * 2, v_dst2);
            v_store(dst + x + step * 3, v_dst3);
        }
    }
#endif // CV_SIMD_64F
    accW_general_(src, dst, mask, len, cn, alpha, x);
}
// accW overload: ushort source, double accumulator.
//   dst = dst*(1-alpha) + src*alpha.
// Requires native f64 SIMD support (CV_SIMD_64F); unmasked case only, with
// accW_general_ finishing the masked case and the tail from index x.
void accW_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn, double alpha)
{
    int x = 0;
#if CV_SIMD_64F
    const v_float64 v_alpha = vx_setall_f64(alpha);
    // 1.0f promotes to double before the subtraction, so this is exact.
    const v_float64 v_beta = vx_setall_f64(1.0f - alpha);
    const int cVectorWidth = v_uint16::nlanes;  // one u16 vector -> four f64 vectors
    const int step = v_float64::nlanes;

    if (!mask)
    {
        int size = len * cn;
        for (; x <= size - cVectorWidth; x += cVectorWidth)
        {
            // Widen u16 -> u32 -> f64 (reinterpret to s32 is safe: values <= 65535).
            v_uint16 v_src = vx_load(src + x);
            v_uint32 v_int_0, v_int_1;
            v_expand(v_src, v_int_0, v_int_1);

            v_int32 v_int0 = v_reinterpret_as_s32(v_int_0);
            v_int32 v_int1 = v_reinterpret_as_s32(v_int_1);

            v_float64 v_src00 = v_cvt_f64(v_int0);
            v_float64 v_src01 = v_cvt_f64_high(v_int0);
            v_float64 v_src10 = v_cvt_f64(v_int1);
            v_float64 v_src11 = v_cvt_f64_high(v_int1);

            v_float64 v_dst00 = vx_load(dst + x);
            v_float64 v_dst01 = vx_load(dst + x + step);
            v_float64 v_dst10 = vx_load(dst + x + step * 2);
            v_float64 v_dst11 = vx_load(dst + x + step * 3);

            v_dst00 = v_fma(v_dst00, v_beta, v_src00 * v_alpha);
            v_dst01 = v_fma(v_dst01, v_beta, v_src01 * v_alpha);
            v_dst10 = v_fma(v_dst10, v_beta, v_src10 * v_alpha);
            v_dst11 = v_fma(v_dst11, v_beta, v_src11 * v_alpha);

            v_store(dst + x, v_dst00);
            v_store(dst + x + step, v_dst01);
            v_store(dst + x + step * 2, v_dst10);
            v_store(dst + x + step * 3, v_dst11);
        }
    }
#endif // CV_SIMD_64F
    accW_general_(src, dst, mask, len, cn, alpha, x);
}
// accW overload: float source, double accumulator.
//   dst = dst*(1-alpha) + src*alpha.
// AVX-only builds widen each 128-bit half of a 256-bit float vector to doubles
// with _mm256_cvtps_pd; otherwise universal intrinsics (requires CV_SIMD_64F).
// Unmasked case only; accW_general_ finishes the masked case and tail from x.
void accW_simd_(const float* src, double* dst, const uchar* mask, int len, int cn, double alpha)
{
    int x = 0;
#if CV_AVX && !CV_AVX2
    const __m256d v_alpha = _mm256_set1_pd(alpha);
    // 1.0f promotes to double before the subtraction, so this is exact.
    const __m256d v_beta = _mm256_set1_pd(1.0f - alpha);
    const int cVectorWidth = 16;  // 16 floats -> 4 x 4 doubles per iteration

    if (!mask)
    {
        int size = len * cn;
        for ( ; x <= size - cVectorWidth ; x += cVectorWidth)
        {
            __m256 v_src0 = _mm256_loadu_ps(src + x);
            __m256 v_src1 = _mm256_loadu_ps(src + x + 8);
            __m256d v_src00 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src0,0));
            __m256d v_src01 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src0,1));
            __m256d v_src10 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src1,0));
            __m256d v_src11 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src1,1));

            _mm256_storeu_pd(dst + x, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x), v_beta), _mm256_mul_pd(v_src00, v_alpha)));
            _mm256_storeu_pd(dst + x + 4, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 4), v_beta), _mm256_mul_pd(v_src01, v_alpha)));
            _mm256_storeu_pd(dst + x + 8, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 8), v_beta), _mm256_mul_pd(v_src10, v_alpha)));
            _mm256_storeu_pd(dst + x + 12, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 12), v_beta), _mm256_mul_pd(v_src11, v_alpha)));
        }
    }
#elif CV_SIMD_64F
    const v_float64 v_alpha = vx_setall_f64(alpha);
    const v_float64 v_beta = vx_setall_f64(1.0f - alpha);
    const int cVectorWidth = v_float32::nlanes * 2;  // two f32 vectors -> four f64 vectors
    const int step = v_float64::nlanes;

    if (!mask)
    {
        int size = len * cn;
        for (; x <= size - cVectorWidth; x += cVectorWidth)
        {
            v_float32 v_src0 = vx_load(src + x);
            v_float32 v_src1 = vx_load(src + x + v_float32::nlanes);
            v_float64 v_src00 = v_cvt_f64(v_src0);
            v_float64 v_src01 = v_cvt_f64_high(v_src0);
            v_float64 v_src10 = v_cvt_f64(v_src1);
            v_float64 v_src11 = v_cvt_f64_high(v_src1);

            v_float64 v_dst00 = vx_load(dst + x);
            v_float64 v_dst01 = vx_load(dst + x + step);
            v_float64 v_dst10 = vx_load(dst + x + step * 2);
            v_float64 v_dst11 = vx_load(dst + x + step * 3);

            v_dst00 = v_fma(v_dst00, v_beta, v_src00 * v_alpha);
            v_dst01 = v_fma(v_dst01, v_beta, v_src01 * v_alpha);
            v_dst10 = v_fma(v_dst10, v_beta, v_src10 * v_alpha);
            v_dst11 = v_fma(v_dst11, v_beta, v_src11 * v_alpha);

            v_store(dst + x, v_dst00);
            v_store(dst + x + step, v_dst01);
            v_store(dst + x + step * 2, v_dst10);
            v_store(dst + x + step * 3, v_dst11);
        }
    }
#endif // CV_SIMD_64F
    accW_general_(src, dst, mask, len, cn, alpha, x);
}
void accW_simd_(const double* src, double* dst, const uchar* mask, int len, int cn, double alpha)
2868
{
2869
int x = 0;
2870
#if CV_AVX && !CV_AVX2
2871
const __m256d v_alpha = _mm256_set1_pd(alpha);
2872
const __m256d v_beta = _mm256_set1_pd(1.0f - alpha);
2873
const int cVectorWidth = 8;
2874
2875
if (!mask)
2876
{
2877
int size = len * cn;
2878
for ( ; x <= size - cVectorWidth ; x += cVectorWidth)
2879
{
2880
__m256d v_src0 = _mm256_loadu_pd(src + x);
2881
__m256d v_src1 = _mm256_loadu_pd(src + x + 4);
2882
2883
_mm256_storeu_pd(dst + x, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x), v_beta), _mm256_mul_pd(v_src0, v_alpha)));
2884
_mm256_storeu_pd(dst + x + 4, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 4), v_beta), _mm256_mul_pd(v_src1, v_alpha)));
2885
}
2886
}
2887
#elif CV_SIMD_64F
2888
const v_float64 v_alpha = vx_setall_f64(alpha);
2889
const v_float64 v_beta = vx_setall_f64(1.0f - alpha);
2890
const int cVectorWidth = v_float64::nlanes * 2;
2891
const int step = v_float64::nlanes;
2892
2893
if (!mask)
2894
{
2895
int size = len * cn;
2896
for (; x <= size - cVectorWidth; x += cVectorWidth)
2897
{
2898
v_float64 v_src0 = vx_load(src + x);
2899
v_float64 v_src1 = vx_load(src + x + step);
2900
2901
v_float64 v_dst0 = vx_load(dst + x);
2902
v_float64 v_dst1 = vx_load(dst + x + step);
2903
2904
v_dst0 = v_fma(v_dst0, v_beta, v_src0 * v_alpha);
2905
v_dst1 = v_fma(v_dst1, v_beta, v_src1 * v_alpha);
2906
2907
v_store(dst + x, v_dst0);
2908
v_store(dst + x + step, v_dst1);
2909
}
2910
}
2911
#endif // CV_SIMD_64F
2912
accW_general_(src, dst, mask, len, cn, alpha, x);
2913
}
2914
2915
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
2916
2917
CV_CPU_OPTIMIZATION_NAMESPACE_END
2918
2919
} // namespace cv
2920
2921
///* End of file. */
2922
2923