Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/modules/imgproc/src/accum.simd.hpp
16354 views
1
// This file is part of OpenCV project.
2
// It is subject to the license terms in the LICENSE file found in the top-level directory
3
// of this distribution and at http://opencv.org/license.html.
4
5
#include "opencv2/core/hal/intrin.hpp"
6
7
// Defines the four public accumulate entry points (acc, accSqr, accProd,
// accW) for an integer source type `type` with accumulator type `acctype`.
// Each function only forwards to the CPU-feature-specific overload selected
// at runtime by CV_CPU_DISPATCH; no comments can live inside the macro body
// because '//' would swallow the line-continuation backslashes.
#define DEF_ACC_INT_FUNCS(suffix, type, acctype) \
void acc_##suffix(const type* src, acctype* dst, \
                  const uchar* mask, int len, int cn) \
{ \
    CV_CPU_DISPATCH(acc_simd_, (src, dst, mask, len, cn), CV_CPU_DISPATCH_MODES_ALL); \
} \
void accSqr_##suffix(const type* src, acctype* dst, \
                     const uchar* mask, int len, int cn) \
{ \
    CV_CPU_DISPATCH(accSqr_simd_, (src, dst, mask, len, cn), CV_CPU_DISPATCH_MODES_ALL); \
} \
void accProd_##suffix(const type* src1, const type* src2, \
                      acctype* dst, const uchar* mask, int len, int cn) \
{ \
    CV_CPU_DISPATCH(accProd_simd_, (src1, src2, dst, mask, len, cn), CV_CPU_DISPATCH_MODES_ALL); \
} \
void accW_##suffix(const type* src, acctype* dst, \
                   const uchar* mask, int len, int cn, double alpha) \
{ \
    CV_CPU_DISPATCH(accW_simd_, (src, dst, mask, len, cn, alpha), CV_CPU_DISPATCH_MODES_ALL); \
}
28
// Same dispatcher generator for floating-point source types.
// NOTE(review): currently token-for-token identical to DEF_ACC_INT_FUNCS —
// presumably kept separate so integer and float variants can diverge later;
// confirm before merging the two macros.
#define DEF_ACC_FLT_FUNCS(suffix, type, acctype) \
void acc_##suffix(const type* src, acctype* dst, \
                  const uchar* mask, int len, int cn) \
{ \
    CV_CPU_DISPATCH(acc_simd_, (src, dst, mask, len, cn), CV_CPU_DISPATCH_MODES_ALL); \
} \
void accSqr_##suffix(const type* src, acctype* dst, \
                     const uchar* mask, int len, int cn) \
{ \
    CV_CPU_DISPATCH(accSqr_simd_, (src, dst, mask, len, cn), CV_CPU_DISPATCH_MODES_ALL); \
} \
void accProd_##suffix(const type* src1, const type* src2, \
                      acctype* dst, const uchar* mask, int len, int cn) \
{ \
    CV_CPU_DISPATCH(accProd_simd_, (src1, src2, dst, mask, len, cn), CV_CPU_DISPATCH_MODES_ALL); \
} \
void accW_##suffix(const type* src, acctype* dst, \
                   const uchar* mask, int len, int cn, double alpha) \
{ \
    CV_CPU_DISPATCH(accW_simd_, (src, dst, mask, len, cn, alpha), CV_CPU_DISPATCH_MODES_ALL); \
}
49
// Emits the forward declarations matching DEF_ACC_*_FUNCS for one
// (source type, accumulator type) pair.
// NOTE(review): "DECLARATE" is a long-standing typo ("DECLARE"); the name is
// referenced below and possibly elsewhere, so it must not be renamed here.
#define DECLARATE_ACC_FUNCS(suffix, type, acctype) \
void acc_##suffix(const type* src, acctype* dst, const uchar* mask, int len, int cn); \
void accSqr_##suffix(const type* src, acctype* dst, const uchar* mask, int len, int cn); \
void accProd_##suffix(const type* src1, const type* src2, acctype* dst, const uchar* mask, int len, int cn); \
void accW_##suffix(const type* src, acctype* dst, const uchar* mask, int len, int cn, double alpha);
54
55
56
namespace cv {
57
58
// Public declarations for every supported (source depth, accumulator depth)
// combination; the suffix encodes the pair (e.g. 8u32f = uchar -> float).
DECLARATE_ACC_FUNCS(8u32f, uchar, float)
DECLARATE_ACC_FUNCS(8u64f, uchar, double)
DECLARATE_ACC_FUNCS(16u32f, ushort, float)
DECLARATE_ACC_FUNCS(16u64f, ushort, double)
DECLARATE_ACC_FUNCS(32f, float, float)
DECLARATE_ACC_FUNCS(32f64f, float, double)
DECLARATE_ACC_FUNCS(64f, double, double)

CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN

// Per-ISA overload sets resolved via CV_CPU_DISPATCH above; one overload per
// (source, accumulator) type combination, selected by ordinary C++ overload
// resolution on the pointer argument types.
void acc_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn);
void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn);
void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn);
void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn);
void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn);
void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn);
void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int cn);
void accSqr_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn);
void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn);
void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn);
void accSqr_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn);
void accSqr_simd_(const float* src, float* dst, const uchar* mask, int len, int cn);
void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int cn);
void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, int cn);
void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar* mask, int len, int cn);
void accProd_simd_(const ushort* src1, const ushort* src2, float* dst, const uchar* mask, int len, int cn);
void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const uchar* mask, int len, int cn);
void accProd_simd_(const ushort* src1, const ushort* src2, double* dst, const uchar* mask, int len, int cn);
void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar* mask, int len, int cn);
void accProd_simd_(const float* src1, const float* src2, double* dst, const uchar* mask, int len, int cn);
void accProd_simd_(const double* src1, const double* src2, double* dst, const uchar* mask, int len, int cn);
void accW_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn, double alpha);
void accW_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn, double alpha);
void accW_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn, double alpha);
void accW_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn, double alpha);
void accW_simd_(const float* src, float* dst, const uchar* mask, int len, int cn, double alpha);
void accW_simd_(const float* src, double* dst, const uchar* mask, int len, int cn, double alpha);
void accW_simd_(const double* src, double* dst, const uchar* mask, int len, int cn, double alpha);
96
97
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
98
// todo: remove AVX branch after support it by universal intrinsics
99
// Scalar fallback for accumulate: dst[i] += src[i].
// With no mask the buffer is processed as a flat array of len*cn values.
// With a mask, only pixels whose mask byte is non-zero are accumulated,
// all cn channels at once. `start` is the number of pixels/elements that a
// SIMD path has already handled.
template <typename T, typename AT>
void acc_general_(const T* src, AT* dst, const uchar* mask, int len, int cn, int start = 0 )
{
    int i = start;

    if( !mask )
    {
        len *= cn;
#if CV_ENABLE_UNROLLED
        // Manual 4x unroll for compilers that benefit from it.
        for( ; i <= len - 4; i += 4 )
        {
            AT s0 = src[i] + dst[i];
            AT s1 = src[i+1] + dst[i+1];
            dst[i] = s0;
            dst[i+1] = s1;
            s0 = src[i+2] + dst[i+2];
            s1 = src[i+3] + dst[i+3];
            dst[i+2] = s0;
            dst[i+3] = s1;
        }
#endif
        while( i < len )
        {
            dst[i] += src[i];
            ++i;
        }
    }
    else
    {
        // Skip the pixels already consumed by the vectorized caller.
        src += (i * cn);
        dst += (i * cn);
        for( ; i < len; i++, src += cn, dst += cn )
        {
            if( !mask[i] )
                continue;
            for( int k = 0; k < cn; k++ )
                dst[k] += src[k];
        }
    }
#if CV_AVX && !CV_AVX2
    _mm256_zeroupper();
#elif CV_SIMD
    vx_cleanup();
#endif
}
146
147
// Scalar fallback for accumulate-square: dst[i] += src[i]^2, with the
// product computed in the accumulator type AT to avoid overflow of small
// integer sources. Mask semantics and `start` are as in acc_general_.
template<typename T, typename AT> void
accSqr_general_( const T* src, AT* dst, const uchar* mask, int len, int cn, int start = 0 )
{
    int i = start;

    if( !mask )
    {
        len *= cn;
#if CV_ENABLE_UNROLLED
        // Manual 4x unroll for compilers that benefit from it.
        for( ; i <= len - 4; i += 4 )
        {
            AT s0 = (AT)src[i]*src[i] + dst[i];
            AT s1 = (AT)src[i+1]*src[i+1] + dst[i+1];
            dst[i] = s0;
            dst[i+1] = s1;
            s0 = (AT)src[i+2]*src[i+2] + dst[i+2];
            s1 = (AT)src[i+3]*src[i+3] + dst[i+3];
            dst[i+2] = s0;
            dst[i+3] = s1;
        }
#endif
        while( i < len )
        {
            dst[i] += (AT)src[i]*src[i];
            ++i;
        }
    }
    else
    {
        // Skip the pixels already consumed by the vectorized caller.
        src += (i * cn);
        dst += (i * cn);
        for( ; i < len; i++, src += cn, dst += cn )
        {
            if( !mask[i] )
                continue;
            for( int k = 0; k < cn; k++ )
                dst[k] += (AT)src[k]*src[k];
        }
    }
#if CV_AVX && !CV_AVX2
    _mm256_zeroupper();
#elif CV_SIMD
    vx_cleanup();
#endif
}
194
195
// Scalar fallback for accumulate-product: dst[i] += src1[i]*src2[i], the
// product computed in the accumulator type AT. Mask semantics and `start`
// are as in acc_general_.
template<typename T, typename AT> void
accProd_general_( const T* src1, const T* src2, AT* dst, const uchar* mask, int len, int cn, int start = 0 )
{
    int i = start;

    if( !mask )
    {
        len *= cn;
#if CV_ENABLE_UNROLLED
        // Manual 4x unroll for compilers that benefit from it.
        for( ; i <= len - 4; i += 4 )
        {
            AT s0 = (AT)src1[i]*src2[i] + dst[i];
            AT s1 = (AT)src1[i+1]*src2[i+1] + dst[i+1];
            dst[i] = s0;
            dst[i+1] = s1;
            s0 = (AT)src1[i+2]*src2[i+2] + dst[i+2];
            s1 = (AT)src1[i+3]*src2[i+3] + dst[i+3];
            dst[i+2] = s0;
            dst[i+3] = s1;
        }
#endif
        while( i < len )
        {
            dst[i] += (AT)src1[i]*src2[i];
            ++i;
        }
    }
    else
    {
        // Skip the pixels already consumed by the vectorized caller.
        src1 += (i * cn);
        src2 += (i * cn);
        dst += (i * cn);
        for( ; i < len; i++, src1 += cn, src2 += cn, dst += cn )
        {
            if( !mask[i] )
                continue;
            for( int k = 0; k < cn; k++ )
                dst[k] += (AT)src1[k]*src2[k];
        }
    }
#if CV_AVX && !CV_AVX2
    _mm256_zeroupper();
#elif CV_SIMD
    vx_cleanup();
#endif
}
243
244
// Scalar fallback for the running weighted average:
//     dst[i] = src[i]*alpha + dst[i]*(1 - alpha)
// with both weights converted once to the accumulator type AT.
// Mask semantics and `start` are as in acc_general_.
template<typename T, typename AT> void
accW_general_( const T* src, AT* dst, const uchar* mask, int len, int cn, double alpha, int start = 0 )
{
    AT a = (AT)alpha, b = 1 - a;
    int i = start;

    if( !mask )
    {
        len *= cn;
#if CV_ENABLE_UNROLLED
        // Manual 4x unroll for compilers that benefit from it.
        for( ; i <= len - 4; i += 4 )
        {
            AT s0 = src[i]*a + dst[i]*b;
            AT s1 = src[i+1]*a + dst[i+1]*b;
            dst[i] = s0;
            dst[i+1] = s1;
            s0 = src[i+2]*a + dst[i+2]*b;
            s1 = src[i+3]*a + dst[i+3]*b;
            dst[i+2] = s0;
            dst[i+3] = s1;
        }
#endif
        while( i < len )
        {
            dst[i] = src[i]*a + dst[i]*b;
            ++i;
        }
    }
    else
    {
        // Skip the pixels already consumed by the vectorized caller.
        src += (i * cn);
        dst += (i * cn);
        for( ; i < len; i++, src += cn, dst += cn )
        {
            if( !mask[i] )
                continue;
            for( int k = 0; k < cn; k++ )
                dst[k] = src[k]*a + dst[k]*b;
        }
    }
#if CV_AVX && !CV_AVX2
    _mm256_zeroupper();
#elif CV_SIMD
    vx_cleanup();
#endif
}
292
// Vectorized accumulate: dst(f32) += src(u8).
// Unmasked path treats the buffers as flat arrays of len*cn values; the
// masked path is specialized for cn==1 and cn==3 only. Whatever remains
// (tail elements, other channel counts) is finished by the scalar
// acc_general_ with the already-processed count passed in `x`.
void acc_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn)
{
    int x = 0;
#if CV_SIMD
    const int cVectorWidth = v_uint8::nlanes;   // u8 source lanes per iteration
    const int step = v_float32::nlanes;         // f32 lanes per destination store

    if (!mask)
    {
        int size = len * cn;
        for (; x <= size - cVectorWidth; x += cVectorWidth)
        {
            // Widen u8 -> u16 -> u32, then convert to f32 for the additions.
            v_uint8 v_src = vx_load(src + x);
            v_uint16 v_src0, v_src1;
            v_expand(v_src, v_src0, v_src1);

            v_uint32 v_src00, v_src01, v_src10, v_src11;
            v_expand(v_src0, v_src00, v_src01);
            v_expand(v_src1, v_src10, v_src11);

            v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
            v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
            v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
            v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
        }
    }
    else
    {
        v_uint8 v_0 = vx_setall_u8(0);
        if (cn == 1)
        {
            for ( ; x <= len - cVectorWidth; x += cVectorWidth)
            {
                // Build a 0x00/0xFF byte mask and zero out masked-off source
                // bytes, so the additions below run unconditionally.
                v_uint8 v_mask = vx_load(mask + x);
                v_mask = ~(v_0 == v_mask);
                v_uint8 v_src = vx_load(src + x);
                v_src = v_src & v_mask;
                v_uint16 v_src0, v_src1;
                v_expand(v_src, v_src0, v_src1);

                v_uint32 v_src00, v_src01, v_src10, v_src11;
                v_expand(v_src0, v_src00, v_src01);
                v_expand(v_src1, v_src10, v_src11);

                v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
                v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
                v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
                v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
            }
        }
        else if (cn == 3)
        {
            for ( ; x <= len - cVectorWidth; x += cVectorWidth)
            {
                v_uint8 v_mask = vx_load(mask + x);
                v_mask = ~(v_0 == v_mask);
                // De-interleave the 3 channels and apply the same pixel mask
                // to each channel plane.
                v_uint8 v_src0, v_src1, v_src2;
                v_load_deinterleave(src + (x * cn), v_src0, v_src1, v_src2);
                v_src0 = v_src0 & v_mask;
                v_src1 = v_src1 & v_mask;
                v_src2 = v_src2 & v_mask;
                v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
                v_expand(v_src0, v_src00, v_src01);
                v_expand(v_src1, v_src10, v_src11);
                v_expand(v_src2, v_src20, v_src21);

                // Second widening stage: u16 -> u32 for every channel half.
                v_uint32 v_src000, v_src001, v_src010, v_src011;
                v_uint32 v_src100, v_src101, v_src110, v_src111;
                v_uint32 v_src200, v_src201, v_src210, v_src211;
                v_expand(v_src00, v_src000, v_src001);
                v_expand(v_src01, v_src010, v_src011);
                v_expand(v_src10, v_src100, v_src101);
                v_expand(v_src11, v_src110, v_src111);
                v_expand(v_src20, v_src200, v_src201);
                v_expand(v_src21, v_src210, v_src211);

                v_float32 v_dst000, v_dst001, v_dst010, v_dst011;
                v_float32 v_dst100, v_dst101, v_dst110, v_dst111;
                v_float32 v_dst200, v_dst201, v_dst210, v_dst211;
                v_load_deinterleave(dst + (x * cn), v_dst000, v_dst100, v_dst200);
                v_load_deinterleave(dst + ((x + step) * cn), v_dst001, v_dst101, v_dst201);
                v_load_deinterleave(dst + ((x + step * 2) * cn), v_dst010, v_dst110, v_dst210);
                v_load_deinterleave(dst + ((x + step * 3) * cn), v_dst011, v_dst111, v_dst211);

                v_dst000 += v_cvt_f32(v_reinterpret_as_s32(v_src000));
                v_dst100 += v_cvt_f32(v_reinterpret_as_s32(v_src100));
                v_dst200 += v_cvt_f32(v_reinterpret_as_s32(v_src200));
                v_dst001 += v_cvt_f32(v_reinterpret_as_s32(v_src001));
                v_dst101 += v_cvt_f32(v_reinterpret_as_s32(v_src101));
                v_dst201 += v_cvt_f32(v_reinterpret_as_s32(v_src201));
                v_dst010 += v_cvt_f32(v_reinterpret_as_s32(v_src010));
                v_dst110 += v_cvt_f32(v_reinterpret_as_s32(v_src110));
                v_dst210 += v_cvt_f32(v_reinterpret_as_s32(v_src210));
                v_dst011 += v_cvt_f32(v_reinterpret_as_s32(v_src011));
                v_dst111 += v_cvt_f32(v_reinterpret_as_s32(v_src111));
                v_dst211 += v_cvt_f32(v_reinterpret_as_s32(v_src211));

                v_store_interleave(dst + (x * cn), v_dst000, v_dst100, v_dst200);
                v_store_interleave(dst + ((x + step) * cn), v_dst001, v_dst101, v_dst201);
                v_store_interleave(dst + ((x + step * 2) * cn), v_dst010, v_dst110, v_dst210);
                v_store_interleave(dst + ((x + step * 3) * cn), v_dst011, v_dst111, v_dst211);
            }
        }
    }
#endif // CV_SIMD
    // Scalar tail / unsupported channel counts.
    acc_general_(src, dst, mask, len, cn, x);
}
399
400
// Vectorized accumulate: dst(f32) += src(u16).
// Same structure as the u8 overload, with one fewer widening stage
// (u16 -> u32 -> f32). Masked path handles cn==1 and cn==3; everything
// else falls through to the scalar acc_general_.
void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn)
{
    int x = 0;
#if CV_SIMD
    const int cVectorWidth = v_uint16::nlanes;  // u16 source lanes per iteration
    const int step = v_float32::nlanes;         // f32 lanes per destination store

    if (!mask)
    {
        int size = len * cn;
        for (; x <= size - cVectorWidth; x += cVectorWidth)
        {
            v_uint16 v_src = vx_load(src + x);
            v_uint32 v_src0, v_src1;
            v_expand(v_src, v_src0, v_src1);

            v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src0)));
            v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src1)));
        }
    }
    else
    {
        if (cn == 1)
        {
            v_uint16 v_0 = vx_setall_u16(0);
            for ( ; x <= len - cVectorWidth; x += cVectorWidth)
            {
                // u8 mask widened to u16, turned into all-ones/all-zeros,
                // then used to zero out masked-off source values.
                v_uint16 v_mask = vx_load_expand(mask + x);
                v_mask = ~(v_mask == v_0);
                v_uint16 v_src = vx_load(src + x);
                v_src = v_src & v_mask;
                v_uint32 v_src0, v_src1;
                v_expand(v_src, v_src0, v_src1);

                v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src0)));
                v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src1)));
            }
        }
        else if (cn == 3)
        {
            v_uint16 v_0 = vx_setall_u16(0);
            for ( ; x <= len - cVectorWidth; x += cVectorWidth)
            {
                v_uint16 v_mask = vx_load_expand(mask + x);
                v_mask = ~(v_mask == v_0);
                // De-interleave channels, mask each plane, then widen.
                v_uint16 v_src0, v_src1, v_src2;
                v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
                v_src0 = v_src0 & v_mask;
                v_src1 = v_src1 & v_mask;
                v_src2 = v_src2 & v_mask;
                v_uint32 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
                v_expand(v_src0, v_src00, v_src01);
                v_expand(v_src1, v_src10, v_src11);
                v_expand(v_src2, v_src20, v_src21);

                v_float32 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
                v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
                v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);

                v_dst00 += v_cvt_f32(v_reinterpret_as_s32(v_src00));
                v_dst01 += v_cvt_f32(v_reinterpret_as_s32(v_src01));
                v_dst10 += v_cvt_f32(v_reinterpret_as_s32(v_src10));
                v_dst11 += v_cvt_f32(v_reinterpret_as_s32(v_src11));
                v_dst20 += v_cvt_f32(v_reinterpret_as_s32(v_src20));
                v_dst21 += v_cvt_f32(v_reinterpret_as_s32(v_src21));

                v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
                v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
            }
        }
    }
#endif // CV_SIMD
    // Scalar tail / unsupported channel counts.
    acc_general_(src, dst, mask, len, cn, x);
}
474
// todo: remove AVX branch after support it by universal intrinsics
// Vectorized accumulate: dst(f32) += src(f32).
// Processes two f32 vectors per iteration (cVectorWidth is deliberately
// v_uint16::nlanes == 2 * v_float32::nlanes, matching the number of f32
// lanes produced by widening one u8-mask load).
void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn)
{
    int x = 0;
#if CV_SIMD
    const int cVectorWidth = v_uint16::nlanes;  // f32 elements per iteration (2 vectors)
    const int step = v_float32::nlanes;         // f32 lanes per vector

    if (!mask)
    {
        int size = len * cn;
#if CV_AVX && !CV_AVX2
        // Plain 256-bit float adds; kept until universal intrinsics cover AVX1.
        for (; x <= size - 8 ; x += 8)
        {
            __m256 v_src = _mm256_loadu_ps(src + x);
            __m256 v_dst = _mm256_loadu_ps(dst + x);
            v_dst = _mm256_add_ps(v_src, v_dst);
            _mm256_storeu_ps(dst + x, v_dst);
        }
#else
        for (; x <= size - cVectorWidth; x += cVectorWidth)
        {
            v_store(dst + x, vx_load(dst + x) + vx_load(src + x));
            v_store(dst + x + step, vx_load(dst + x + step) + vx_load(src + x + step));
        }
#endif // CV_AVX && !CV_AVX2
    }
    else
    {
        v_float32 v_0 = vx_setzero_f32();
        if (cn == 1)
        {
            for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
            {
                // Widen the u8 mask to two u32 vectors, compare against zero
                // and reinterpret as f32 bit-masks for the source values.
                v_uint16 v_masku16 = vx_load_expand(mask + x);
                v_uint32 v_masku320, v_masku321;
                v_expand(v_masku16, v_masku320, v_masku321);
                v_float32 v_mask0 = v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0)));
                v_float32 v_mask1 = v_reinterpret_as_f32(~(v_masku321 == v_reinterpret_as_u32(v_0)));

                v_store(dst + x, vx_load(dst + x) + (vx_load(src + x) & v_mask0));
                v_store(dst + x + step, vx_load(dst + x + step) + (vx_load(src + x + step) & v_mask1));
            }
        }
        else if (cn == 3)
        {
            for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
            {
                v_uint16 v_masku16 = vx_load_expand(mask + x);
                v_uint32 v_masku320, v_masku321;
                v_expand(v_masku16, v_masku320, v_masku321);
                v_float32 v_mask0 = v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0)));
                v_float32 v_mask1 = v_reinterpret_as_f32(~(v_masku321 == v_reinterpret_as_u32(v_0)));

                // De-interleave channels and zero masked-off pixels per plane.
                v_float32 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
                v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20);
                v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21);
                v_src00 = v_src00 & v_mask0;
                v_src01 = v_src01 & v_mask1;
                v_src10 = v_src10 & v_mask0;
                v_src11 = v_src11 & v_mask1;
                v_src20 = v_src20 & v_mask0;
                v_src21 = v_src21 & v_mask1;

                v_float32 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
                v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
                v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);

                v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
                v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
            }
        }
    }
#endif // CV_SIMD
    // Scalar tail / unsupported channel counts.
    acc_general_(src, dst, mask, len, cn, x);
}
550
551
// Vectorized accumulate: dst(f64) += src(u8). Requires 64-bit float SIMD
// support (CV_SIMD_64F). One u8 load fans out into eight f64 vectors.
void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn)
{
    int x = 0;
#if CV_SIMD_64F
    const int cVectorWidth = v_uint8::nlanes;   // u8 source lanes per iteration
    const int step = v_float64::nlanes;         // f64 lanes per destination store

    if (!mask)
    {
        int size = len * cn;
        for (; x <= size - cVectorWidth; x += cVectorWidth)
        {
            // Widen u8 -> u16 -> u32, then convert each u32 vector's low and
            // high halves to f64.
            v_uint8 v_src = vx_load(src + x);
            v_uint16 v_int0, v_int1;
            v_expand(v_src, v_int0, v_int1);

            v_uint32 v_int00, v_int01, v_int10, v_int11;
            v_expand(v_int0, v_int00, v_int01);
            v_expand(v_int1, v_int10, v_int11);

            v_float64 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int00));
            v_float64 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00));
            v_float64 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int01));
            v_float64 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01));
            v_float64 v_src4 = v_cvt_f64(v_reinterpret_as_s32(v_int10));
            v_float64 v_src5 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10));
            v_float64 v_src6 = v_cvt_f64(v_reinterpret_as_s32(v_int11));
            v_float64 v_src7 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11));

            v_float64 v_dst0 = vx_load(dst + x);
            v_float64 v_dst1 = vx_load(dst + x + step);
            v_float64 v_dst2 = vx_load(dst + x + step * 2);
            v_float64 v_dst3 = vx_load(dst + x + step * 3);
            v_float64 v_dst4 = vx_load(dst + x + step * 4);
            v_float64 v_dst5 = vx_load(dst + x + step * 5);
            v_float64 v_dst6 = vx_load(dst + x + step * 6);
            v_float64 v_dst7 = vx_load(dst + x + step * 7);

            v_dst0 = v_dst0 + v_src0;
            v_dst1 = v_dst1 + v_src1;
            v_dst2 = v_dst2 + v_src2;
            v_dst3 = v_dst3 + v_src3;
            v_dst4 = v_dst4 + v_src4;
            v_dst5 = v_dst5 + v_src5;
            v_dst6 = v_dst6 + v_src6;
            v_dst7 = v_dst7 + v_src7;

            v_store(dst + x, v_dst0);
            v_store(dst + x + step, v_dst1);
            v_store(dst + x + step * 2, v_dst2);
            v_store(dst + x + step * 3, v_dst3);
            v_store(dst + x + step * 4, v_dst4);
            v_store(dst + x + step * 5, v_dst5);
            v_store(dst + x + step * 6, v_dst6);
            v_store(dst + x + step * 7, v_dst7);
        }
    }
    else
    {
        v_uint8 v_0 = vx_setall_u8(0);
        if (cn == 1)
        {
            for ( ; x <= len - cVectorWidth; x += cVectorWidth)
            {
                // Zero out masked-off source bytes before widening, as in the
                // f32 overload, so the adds below are unconditional.
                v_uint8 v_mask = vx_load(mask + x);
                v_mask = ~(v_mask == v_0);
                v_uint8 v_src = vx_load(src + x);
                v_src = v_src & v_mask;
                v_uint16 v_int0, v_int1;
                v_expand(v_src, v_int0, v_int1);

                v_uint32 v_int00, v_int01, v_int10, v_int11;
                v_expand(v_int0, v_int00, v_int01);
                v_expand(v_int1, v_int10, v_int11);

                v_float64 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int00));
                v_float64 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00));
                v_float64 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int01));
                v_float64 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01));
                v_float64 v_src4 = v_cvt_f64(v_reinterpret_as_s32(v_int10));
                v_float64 v_src5 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10));
                v_float64 v_src6 = v_cvt_f64(v_reinterpret_as_s32(v_int11));
                v_float64 v_src7 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11));

                v_float64 v_dst0 = vx_load(dst + x);
                v_float64 v_dst1 = vx_load(dst + x + step);
                v_float64 v_dst2 = vx_load(dst + x + step * 2);
                v_float64 v_dst3 = vx_load(dst + x + step * 3);
                v_float64 v_dst4 = vx_load(dst + x + step * 4);
                v_float64 v_dst5 = vx_load(dst + x + step * 5);
                v_float64 v_dst6 = vx_load(dst + x + step * 6);
                v_float64 v_dst7 = vx_load(dst + x + step * 7);

                v_dst0 = v_dst0 + v_src0;
                v_dst1 = v_dst1 + v_src1;
                v_dst2 = v_dst2 + v_src2;
                v_dst3 = v_dst3 + v_src3;
                v_dst4 = v_dst4 + v_src4;
                v_dst5 = v_dst5 + v_src5;
                v_dst6 = v_dst6 + v_src6;
                v_dst7 = v_dst7 + v_src7;

                v_store(dst + x, v_dst0);
                v_store(dst + x + step, v_dst1);
                v_store(dst + x + step * 2, v_dst2);
                v_store(dst + x + step * 3, v_dst3);
                v_store(dst + x + step * 4, v_dst4);
                v_store(dst + x + step * 5, v_dst5);
                v_store(dst + x + step * 6, v_dst6);
                v_store(dst + x + step * 7, v_dst7);
            }
        }
        else if (cn == 3)
        {
            for ( ; x <= len - cVectorWidth; x += cVectorWidth)
            {
                v_uint8 v_mask = vx_load(mask + x);
                v_mask = ~(v_0 == v_mask);
                v_uint8 v_src0, v_src1, v_src2;
                v_load_deinterleave(src + (x * cn), v_src0, v_src1, v_src2);
                v_src0 = v_src0 & v_mask;
                v_src1 = v_src1 & v_mask;
                v_src2 = v_src2 & v_mask;
                v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
                v_expand(v_src0, v_src00, v_src01);
                v_expand(v_src1, v_src10, v_src11);
                v_expand(v_src2, v_src20, v_src21);

                v_uint32 v_src000, v_src001, v_src010, v_src011;
                v_uint32 v_src100, v_src101, v_src110, v_src111;
                v_uint32 v_src200, v_src201, v_src210, v_src211;
                v_expand(v_src00, v_src000, v_src001);
                v_expand(v_src01, v_src010, v_src011);
                v_expand(v_src10, v_src100, v_src101);
                v_expand(v_src11, v_src110, v_src111);
                v_expand(v_src20, v_src200, v_src201);
                v_expand(v_src21, v_src210, v_src211);

                // Convert via f32 (exact for 8-bit values), then split each
                // f32 vector into low/high f64 halves.
                v_float64 v_src0000, v_src0001, v_src0010, v_src0011, v_src0100, v_src0101, v_src0110, v_src0111;
                v_float64 v_src1000, v_src1001, v_src1010, v_src1011, v_src1100, v_src1101, v_src1110, v_src1111;
                v_float64 v_src2000, v_src2001, v_src2010, v_src2011, v_src2100, v_src2101, v_src2110, v_src2111;
                v_src0000 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src000)));
                v_src0001 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src000)));
                v_src0010 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src001)));
                v_src0011 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src001)));
                v_src0100 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src010)));
                v_src0101 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src010)));
                v_src0110 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src011)));
                v_src0111 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src011)));
                v_src1000 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src100)));
                v_src1001 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src100)));
                v_src1010 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src101)));
                v_src1011 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src101)));
                v_src1100 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src110)));
                v_src1101 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src110)));
                v_src1110 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src111)));
                v_src1111 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src111)));
                v_src2000 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src200)));
                v_src2001 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src200)));
                v_src2010 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src201)));
                v_src2011 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src201)));
                v_src2100 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src210)));
                v_src2101 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src210)));
                v_src2110 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src211)));
                v_src2111 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src211)));

                v_float64 v_dst0000, v_dst0001, v_dst0010, v_dst0011, v_dst0100, v_dst0101, v_dst0110, v_dst0111;
                v_float64 v_dst1000, v_dst1001, v_dst1010, v_dst1011, v_dst1100, v_dst1101, v_dst1110, v_dst1111;
                v_float64 v_dst2000, v_dst2001, v_dst2010, v_dst2011, v_dst2100, v_dst2101, v_dst2110, v_dst2111;
                v_load_deinterleave(dst + (x * cn), v_dst0000, v_dst1000, v_dst2000);
                v_load_deinterleave(dst + ((x + step) * cn), v_dst0001, v_dst1001, v_dst2001);
                v_load_deinterleave(dst + ((x + step * 2) * cn), v_dst0010, v_dst1010, v_dst2010);
                v_load_deinterleave(dst + ((x + step * 3) * cn), v_dst0011, v_dst1011, v_dst2011);
                v_load_deinterleave(dst + ((x + step * 4) * cn), v_dst0100, v_dst1100, v_dst2100);
                v_load_deinterleave(dst + ((x + step * 5) * cn), v_dst0101, v_dst1101, v_dst2101);
                v_load_deinterleave(dst + ((x + step * 6) * cn), v_dst0110, v_dst1110, v_dst2110);
                v_load_deinterleave(dst + ((x + step * 7) * cn), v_dst0111, v_dst1111, v_dst2111);

                v_store_interleave(dst + (x * cn), v_dst0000 + v_src0000, v_dst1000 + v_src1000, v_dst2000 + v_src2000);
                v_store_interleave(dst + ((x + step) * cn), v_dst0001 + v_src0001, v_dst1001 + v_src1001, v_dst2001 + v_src2001);
                v_store_interleave(dst + ((x + step * 2) * cn), v_dst0010 + v_src0010, v_dst1010 + v_src1010, v_dst2010 + v_src2010);
                v_store_interleave(dst + ((x + step * 3) * cn), v_dst0011 + v_src0011, v_dst1011 + v_src1011, v_dst2011 + v_src2011);
                v_store_interleave(dst + ((x + step * 4) * cn), v_dst0100 + v_src0100, v_dst1100 + v_src1100, v_dst2100 + v_src2100);
                v_store_interleave(dst + ((x + step * 5) * cn), v_dst0101 + v_src0101, v_dst1101 + v_src1101, v_dst2101 + v_src2101);
                v_store_interleave(dst + ((x + step * 6) * cn), v_dst0110 + v_src0110, v_dst1110 + v_src1110, v_dst2110 + v_src2110);
                v_store_interleave(dst + ((x + step * 7) * cn), v_dst0111 + v_src0111, v_dst1111 + v_src1111, v_dst2111 + v_src2111);
            }
        }
    }
#endif // CV_SIMD_64F
    // Scalar tail / unsupported channel counts.
    acc_general_(src, dst, mask, len, cn, x);
}
743
744
void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn)
745
{
746
int x = 0;
747
#if CV_SIMD_64F
748
const int cVectorWidth = v_uint16::nlanes;
749
const int step = v_float64::nlanes;
750
751
if (!mask)
752
{
753
int size = len * cn;
754
for (; x <= size - cVectorWidth; x += cVectorWidth)
755
{
756
v_uint16 v_src = vx_load(src + x);
757
v_uint32 v_int0, v_int1;
758
v_expand(v_src, v_int0, v_int1);
759
760
v_float64 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0));
761
v_float64 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0));
762
v_float64 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1));
763
v_float64 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1));
764
765
v_float64 v_dst0 = vx_load(dst + x);
766
v_float64 v_dst1 = vx_load(dst + x + step);
767
v_float64 v_dst2 = vx_load(dst + x + step * 2);
768
v_float64 v_dst3 = vx_load(dst + x + step * 3);
769
770
v_dst0 = v_dst0 + v_src0;
771
v_dst1 = v_dst1 + v_src1;
772
v_dst2 = v_dst2 + v_src2;
773
v_dst3 = v_dst3 + v_src3;
774
775
v_store(dst + x, v_dst0);
776
v_store(dst + x + step, v_dst1);
777
v_store(dst + x + step * 2, v_dst2);
778
v_store(dst + x + step * 3, v_dst3);
779
}
780
}
781
else
782
{
783
v_uint16 v_0 = vx_setzero_u16();
784
if (cn == 1)
785
{
786
for ( ; x <= len - cVectorWidth; x += cVectorWidth)
787
{
788
v_uint16 v_mask = vx_load_expand(mask + x);
789
v_mask = ~(v_mask == v_0);
790
v_uint16 v_src = vx_load(src + x);
791
v_src = v_src & v_mask;
792
v_uint32 v_int0, v_int1;
793
v_expand(v_src, v_int0, v_int1);
794
795
v_float64 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0));
796
v_float64 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0));
797
v_float64 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1));
798
v_float64 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1));
799
800
v_float64 v_dst0 = vx_load(dst + x);
801
v_float64 v_dst1 = vx_load(dst + x + step);
802
v_float64 v_dst2 = vx_load(dst + x + step * 2);
803
v_float64 v_dst3 = vx_load(dst + x + step * 3);
804
805
v_dst0 = v_dst0 + v_src0;
806
v_dst1 = v_dst1 + v_src1;
807
v_dst2 = v_dst2 + v_src2;
808
v_dst3 = v_dst3 + v_src3;
809
810
v_store(dst + x, v_dst0);
811
v_store(dst + x + step, v_dst1);
812
v_store(dst + x + step * 2, v_dst2);
813
v_store(dst + x + step * 3, v_dst3);
814
}
815
}
816
if (cn == 3)
817
{
818
for ( ; x <= len - cVectorWidth; x += cVectorWidth)
819
{
820
v_uint16 v_mask = vx_load_expand(mask + x);
821
v_mask = ~(v_mask == v_0);
822
v_uint16 v_src0, v_src1, v_src2;
823
v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
824
v_src0 = v_src0 & v_mask;
825
v_src1 = v_src1 & v_mask;
826
v_src2 = v_src2 & v_mask;
827
v_uint32 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21;
828
v_expand(v_src0, v_int00, v_int01);
829
v_expand(v_src1, v_int10, v_int11);
830
v_expand(v_src2, v_int20, v_int21);
831
832
v_float64 v_src00 = v_cvt_f64(v_reinterpret_as_s32(v_int00));
833
v_float64 v_src01 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00));
834
v_float64 v_src02 = v_cvt_f64(v_reinterpret_as_s32(v_int01));
835
v_float64 v_src03 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01));
836
v_float64 v_src10 = v_cvt_f64(v_reinterpret_as_s32(v_int10));
837
v_float64 v_src11 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10));
838
v_float64 v_src12 = v_cvt_f64(v_reinterpret_as_s32(v_int11));
839
v_float64 v_src13 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11));
840
v_float64 v_src20 = v_cvt_f64(v_reinterpret_as_s32(v_int20));
841
v_float64 v_src21 = v_cvt_f64_high(v_reinterpret_as_s32(v_int20));
842
v_float64 v_src22 = v_cvt_f64(v_reinterpret_as_s32(v_int21));
843
v_float64 v_src23 = v_cvt_f64_high(v_reinterpret_as_s32(v_int21));
844
845
v_float64 v_dst00, v_dst01, v_dst02, v_dst03, v_dst10, v_dst11, v_dst12, v_dst13, v_dst20, v_dst21, v_dst22, v_dst23;
846
v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
847
v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
848
v_load_deinterleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22);
849
v_load_deinterleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23);
850
851
v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
852
v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
853
v_store_interleave(dst + (x + step * 2) * cn, v_dst02 + v_src02, v_dst12 + v_src12, v_dst22 + v_src22);
854
v_store_interleave(dst + (x + step * 3) * cn, v_dst03 + v_src03, v_dst13 + v_src13, v_dst23 + v_src23);
855
}
856
}
857
}
858
#endif // CV_SIMD_64F
859
acc_general_(src, dst, mask, len, cn, x);
860
}
861
862
void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn)
863
{
864
int x = 0;
865
#if CV_SIMD_64F
866
const int cVectorWidth = v_float32::nlanes;
867
const int step = v_float64::nlanes;
868
869
if (!mask)
870
{
871
int size = len * cn;
872
#if CV_AVX && !CV_AVX2
873
for (; x <= size - 8 ; x += 8)
874
{
875
__m256 v_src = _mm256_loadu_ps(src + x);
876
__m256d v_src0 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src, 0));
877
__m256d v_src1 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src, 1));
878
__m256d v_dst0 = _mm256_loadu_pd(dst + x);
879
__m256d v_dst1 = _mm256_loadu_pd(dst + x + 4);
880
v_dst0 = _mm256_add_pd(v_src0, v_dst0);
881
v_dst1 = _mm256_add_pd(v_src1, v_dst1);
882
_mm256_storeu_pd(dst + x, v_dst0);
883
_mm256_storeu_pd(dst + x + 4, v_dst1);
884
}
885
#else
886
for (; x <= size - cVectorWidth; x += cVectorWidth)
887
{
888
v_float32 v_src = vx_load(src + x);
889
v_float64 v_src0 = v_cvt_f64(v_src);
890
v_float64 v_src1 = v_cvt_f64_high(v_src);
891
892
v_store(dst + x, vx_load(dst + x) + v_src0);
893
v_store(dst + x + step, vx_load(dst + x + step) + v_src1);
894
}
895
#endif // CV_AVX && !CV_AVX2
896
}
897
else
898
{
899
v_uint64 v_0 = vx_setzero_u64();
900
if (cn == 1)
901
{
902
for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
903
{
904
v_uint32 v_masku32 = vx_load_expand_q(mask + x);
905
v_uint64 v_masku640, v_masku641;
906
v_expand(v_masku32, v_masku640, v_masku641);
907
v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
908
v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
909
910
v_float32 v_src = vx_load(src + x);
911
v_float64 v_src0 = v_cvt_f64(v_src) & v_mask0;
912
v_float64 v_src1 = v_cvt_f64_high(v_src) & v_mask1;
913
914
v_store(dst + x, vx_load(dst + x) + v_src0);
915
v_store(dst + x + step, vx_load(dst + x + step) + v_src1);
916
}
917
}
918
else if (cn == 3)
919
{
920
for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
921
{
922
v_uint32 v_masku32 = vx_load_expand_q(mask + x);
923
v_uint64 v_masku640, v_masku641;
924
v_expand(v_masku32, v_masku640, v_masku641);
925
v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
926
v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
927
928
v_float32 v_src0, v_src1, v_src2;
929
v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
930
v_float64 v_src00 = v_cvt_f64(v_src0) & v_mask0;
931
v_float64 v_src01 = v_cvt_f64_high(v_src0) & v_mask1;
932
v_float64 v_src10 = v_cvt_f64(v_src1) & v_mask0;
933
v_float64 v_src11 = v_cvt_f64_high(v_src1) & v_mask1;
934
v_float64 v_src20 = v_cvt_f64(v_src2) & v_mask0;
935
v_float64 v_src21 = v_cvt_f64_high(v_src2) & v_mask1;
936
937
v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
938
v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
939
v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
940
941
v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
942
v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
943
}
944
}
945
}
946
#endif // CV_SIMD_64F
947
acc_general_(src, dst, mask, len, cn, x);
948
}
949
950
void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int cn)
951
{
952
int x = 0;
953
#if CV_SIMD_64F
954
const int cVectorWidth = v_float64::nlanes * 2;
955
const int step = v_float64::nlanes;
956
957
if (!mask)
958
{
959
int size = len * cn;
960
#if CV_AVX && !CV_AVX2
961
for ( ; x <= size - 4 ; x += 4)
962
{
963
__m256d v_src = _mm256_loadu_pd(src + x);
964
__m256d v_dst = _mm256_loadu_pd(dst + x);
965
v_dst = _mm256_add_pd(v_dst, v_src);
966
_mm256_storeu_pd(dst + x, v_dst);
967
}
968
#else
969
for (; x <= size - cVectorWidth; x += cVectorWidth)
970
{
971
v_float64 v_src0 = vx_load(src + x);
972
v_float64 v_src1 = vx_load(src + x + step);
973
974
v_store(dst + x, vx_load(dst + x) + v_src0);
975
v_store(dst + x + step, vx_load(dst + x + step) + v_src1);
976
}
977
#endif // CV_AVX && !CV_AVX2
978
}
979
else
980
{
981
v_uint64 v_0 = vx_setzero_u64();
982
if (cn == 1)
983
{
984
for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
985
{
986
v_uint32 v_masku32 = vx_load_expand_q(mask + x);
987
v_uint64 v_masku640, v_masku641;
988
v_expand(v_masku32, v_masku640, v_masku641);
989
v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
990
v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
991
992
v_float64 v_src0 = vx_load(src + x);
993
v_float64 v_src1 = vx_load(src + x + step);
994
995
v_store(dst + x, vx_load(dst + x) + (v_src0 & v_mask0));
996
v_store(dst + x + step, vx_load(dst + x + step) + (v_src1 & v_mask1));
997
}
998
}
999
else if (cn == 3)
1000
{
1001
for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
1002
{
1003
v_uint32 v_masku32 = vx_load_expand_q(mask + x);
1004
v_uint64 v_masku640, v_masku641;
1005
v_expand(v_masku32, v_masku640, v_masku641);
1006
v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
1007
v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
1008
1009
v_float64 v_src00, v_src10, v_src20, v_src01, v_src11, v_src21;
1010
v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20);
1011
v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21);
1012
v_src00 = v_src00 & v_mask0;
1013
v_src01 = v_src01 & v_mask1;
1014
v_src10 = v_src10 & v_mask0;
1015
v_src11 = v_src11 & v_mask1;
1016
v_src20 = v_src20 & v_mask0;
1017
v_src21 = v_src21 & v_mask1;
1018
1019
v_float64 v_dst00, v_dst10, v_dst20, v_dst01, v_dst11, v_dst21;
1020
v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
1021
v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
1022
1023
v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
1024
v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
1025
}
1026
}
1027
}
1028
#endif // CV_SIMD_64F
1029
acc_general_(src, dst, mask, len, cn, x);
1030
}
1031
1032
// square accumulate optimized by universal intrinsic
1033
void accSqr_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn)
1034
{
1035
int x = 0;
1036
#if CV_SIMD
1037
const int cVectorWidth = v_uint8::nlanes;
1038
const int step = v_float32::nlanes;
1039
1040
if (!mask)
1041
{
1042
int size = len * cn;
1043
for (; x <= size - cVectorWidth; x += cVectorWidth)
1044
{
1045
v_uint8 v_src = vx_load(src + x);
1046
v_uint16 v_src0, v_src1;
1047
v_expand(v_src, v_src0, v_src1);
1048
v_src0 = v_mul_wrap(v_src0, v_src0);
1049
v_src1 = v_mul_wrap(v_src1, v_src1);
1050
1051
v_uint32 v_src00, v_src01, v_src10, v_src11;
1052
v_expand(v_src0, v_src00, v_src01);
1053
v_expand(v_src1, v_src10, v_src11);
1054
1055
v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
1056
v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
1057
v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
1058
v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
1059
}
1060
}
1061
else
1062
{
1063
v_uint8 v_0 = vx_setall_u8(0);
1064
if (cn == 1)
1065
{
1066
for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
1067
{
1068
v_uint8 v_mask = vx_load(mask + x);
1069
v_mask = ~(v_0 == v_mask);
1070
v_uint8 v_src = vx_load(src + x);
1071
v_src = v_src & v_mask;
1072
v_uint16 v_src0, v_src1;
1073
v_expand(v_src, v_src0, v_src1);
1074
v_src0 = v_mul_wrap(v_src0, v_src0);
1075
v_src1 = v_mul_wrap(v_src1, v_src1);
1076
1077
v_uint32 v_src00, v_src01, v_src10, v_src11;
1078
v_expand(v_src0, v_src00, v_src01);
1079
v_expand(v_src1, v_src10, v_src11);
1080
1081
v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
1082
v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
1083
v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
1084
v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
1085
}
1086
}
1087
else if (cn == 3)
1088
{
1089
for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
1090
{
1091
v_uint8 v_mask = vx_load(mask + x);
1092
v_mask = ~(v_0 == v_mask);
1093
1094
v_uint8 v_src0, v_src1, v_src2;
1095
v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
1096
v_src0 = v_src0 & v_mask;
1097
v_src1 = v_src1 & v_mask;
1098
v_src2 = v_src2 & v_mask;
1099
1100
v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
1101
v_expand(v_src0, v_src00, v_src01);
1102
v_expand(v_src1, v_src10, v_src11);
1103
v_expand(v_src2, v_src20, v_src21);
1104
v_src00 = v_mul_wrap(v_src00, v_src00);
1105
v_src01 = v_mul_wrap(v_src01, v_src01);
1106
v_src10 = v_mul_wrap(v_src10, v_src10);
1107
v_src11 = v_mul_wrap(v_src11, v_src11);
1108
v_src20 = v_mul_wrap(v_src20, v_src20);
1109
v_src21 = v_mul_wrap(v_src21, v_src21);
1110
1111
v_uint32 v_src000, v_src001, v_src010, v_src011;
1112
v_uint32 v_src100, v_src101, v_src110, v_src111;
1113
v_uint32 v_src200, v_src201, v_src210, v_src211;
1114
v_expand(v_src00, v_src000, v_src001);
1115
v_expand(v_src01, v_src010, v_src011);
1116
v_expand(v_src10, v_src100, v_src101);
1117
v_expand(v_src11, v_src110, v_src111);
1118
v_expand(v_src20, v_src200, v_src201);
1119
v_expand(v_src21, v_src210, v_src211);
1120
1121
v_float32 v_dst000, v_dst001, v_dst010, v_dst011;
1122
v_float32 v_dst100, v_dst101, v_dst110, v_dst111;
1123
v_float32 v_dst200, v_dst201, v_dst210, v_dst211;
1124
v_load_deinterleave(dst + x * cn, v_dst000, v_dst100, v_dst200);
1125
v_load_deinterleave(dst + (x + step) * cn, v_dst001, v_dst101, v_dst201);
1126
v_load_deinterleave(dst + (x + step * 2) * cn, v_dst010, v_dst110, v_dst210);
1127
v_load_deinterleave(dst + (x + step * 3) * cn, v_dst011, v_dst111, v_dst211);
1128
1129
v_dst000 += v_cvt_f32(v_reinterpret_as_s32(v_src000));
1130
v_dst001 += v_cvt_f32(v_reinterpret_as_s32(v_src001));
1131
v_dst010 += v_cvt_f32(v_reinterpret_as_s32(v_src010));
1132
v_dst011 += v_cvt_f32(v_reinterpret_as_s32(v_src011));
1133
1134
v_dst100 += v_cvt_f32(v_reinterpret_as_s32(v_src100));
1135
v_dst101 += v_cvt_f32(v_reinterpret_as_s32(v_src101));
1136
v_dst110 += v_cvt_f32(v_reinterpret_as_s32(v_src110));
1137
v_dst111 += v_cvt_f32(v_reinterpret_as_s32(v_src111));
1138
1139
v_dst200 += v_cvt_f32(v_reinterpret_as_s32(v_src200));
1140
v_dst201 += v_cvt_f32(v_reinterpret_as_s32(v_src201));
1141
v_dst210 += v_cvt_f32(v_reinterpret_as_s32(v_src210));
1142
v_dst211 += v_cvt_f32(v_reinterpret_as_s32(v_src211));
1143
1144
v_store_interleave(dst + x * cn, v_dst000, v_dst100, v_dst200);
1145
v_store_interleave(dst + (x + step) * cn, v_dst001, v_dst101, v_dst201);
1146
v_store_interleave(dst + (x + step * 2) * cn, v_dst010, v_dst110, v_dst210);
1147
v_store_interleave(dst + (x + step * 3) * cn, v_dst011, v_dst111, v_dst211);
1148
}
1149
}
1150
}
1151
#endif // CV_SIMD
1152
accSqr_general_(src, dst, mask, len, cn, x);
1153
}
1154
1155
void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn)
1156
{
1157
int x = 0;
1158
#if CV_SIMD
1159
const int cVectorWidth = v_uint16::nlanes;
1160
const int step = v_float32::nlanes;
1161
1162
if (!mask)
1163
{
1164
int size = len * cn;
1165
for (; x <= size - cVectorWidth; x += cVectorWidth)
1166
{
1167
v_uint16 v_src = vx_load(src + x);
1168
v_uint32 v_src0, v_src1;
1169
v_expand(v_src, v_src0, v_src1);
1170
1171
v_float32 v_float0, v_float1;
1172
v_float0 = v_cvt_f32(v_reinterpret_as_s32(v_src0));
1173
v_float1 = v_cvt_f32(v_reinterpret_as_s32(v_src1));
1174
1175
v_store(dst + x, v_fma(v_float0, v_float0, vx_load(dst + x)));
1176
v_store(dst + x + step, v_fma(v_float1, v_float1, vx_load(dst + x + step)));
1177
}
1178
}
1179
else
1180
{
1181
v_uint32 v_0 = vx_setzero_u32();
1182
if (cn == 1)
1183
{
1184
for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
1185
{
1186
v_uint16 v_mask16 = vx_load_expand(mask + x);
1187
v_uint32 v_mask0, v_mask1;
1188
v_expand(v_mask16, v_mask0, v_mask1);
1189
v_mask0 = ~(v_mask0 == v_0);
1190
v_mask1 = ~(v_mask1 == v_0);
1191
v_uint16 v_src = vx_load(src + x);
1192
v_uint32 v_src0, v_src1;
1193
v_expand(v_src, v_src0, v_src1);
1194
v_src0 = v_src0 & v_mask0;
1195
v_src1 = v_src1 & v_mask1;
1196
1197
v_float32 v_float0, v_float1;
1198
v_float0 = v_cvt_f32(v_reinterpret_as_s32(v_src0));
1199
v_float1 = v_cvt_f32(v_reinterpret_as_s32(v_src1));
1200
1201
v_store(dst + x, v_fma(v_float0, v_float0, vx_load(dst + x)));
1202
v_store(dst + x + step, v_fma(v_float1, v_float1, vx_load(dst + x + step)));
1203
}
1204
}
1205
else if (cn == 3)
1206
{
1207
for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
1208
{
1209
v_uint16 v_mask16 = vx_load_expand(mask + x);
1210
v_uint32 v_mask0, v_mask1;
1211
v_expand(v_mask16, v_mask0, v_mask1);
1212
v_mask0 = ~(v_mask0 == v_0);
1213
v_mask1 = ~(v_mask1 == v_0);
1214
1215
v_uint16 v_src0, v_src1, v_src2;
1216
v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
1217
v_uint32 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21;
1218
v_expand(v_src0, v_int00, v_int01);
1219
v_expand(v_src1, v_int10, v_int11);
1220
v_expand(v_src2, v_int20, v_int21);
1221
v_int00 = v_int00 & v_mask0;
1222
v_int01 = v_int01 & v_mask1;
1223
v_int10 = v_int10 & v_mask0;
1224
v_int11 = v_int11 & v_mask1;
1225
v_int20 = v_int20 & v_mask0;
1226
v_int21 = v_int21 & v_mask1;
1227
1228
v_float32 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
1229
v_src00 = v_cvt_f32(v_reinterpret_as_s32(v_int00));
1230
v_src01 = v_cvt_f32(v_reinterpret_as_s32(v_int01));
1231
v_src10 = v_cvt_f32(v_reinterpret_as_s32(v_int10));
1232
v_src11 = v_cvt_f32(v_reinterpret_as_s32(v_int11));
1233
v_src20 = v_cvt_f32(v_reinterpret_as_s32(v_int20));
1234
v_src21 = v_cvt_f32(v_reinterpret_as_s32(v_int21));
1235
1236
v_float32 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
1237
v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
1238
v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
1239
1240
v_dst00 = v_fma(v_src00, v_src00, v_dst00);
1241
v_dst01 = v_fma(v_src01, v_src01, v_dst01);
1242
v_dst10 = v_fma(v_src10, v_src10, v_dst10);
1243
v_dst11 = v_fma(v_src11, v_src11, v_dst11);
1244
v_dst20 = v_fma(v_src20, v_src20, v_dst20);
1245
v_dst21 = v_fma(v_src21, v_src21, v_dst21);
1246
1247
v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
1248
v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
1249
}
1250
}
1251
}
1252
#endif // CV_SIMD
1253
accSqr_general_(src, dst, mask, len, cn, x);
1254
}
1255
1256
void accSqr_simd_(const float* src, float* dst, const uchar* mask, int len, int cn)
1257
{
1258
int x = 0;
1259
#if CV_SIMD
1260
const int cVectorWidth = v_uint16::nlanes;
1261
const int step = v_float32::nlanes;
1262
1263
if (!mask)
1264
{
1265
int size = len * cn;
1266
#if CV_AVX && !CV_AVX2
1267
for ( ; x <= size - 8 ; x += 8)
1268
{
1269
__m256 v_src = _mm256_loadu_ps(src + x);
1270
__m256 v_dst = _mm256_loadu_ps(dst + x);
1271
v_src = _mm256_mul_ps(v_src, v_src);
1272
v_dst = _mm256_add_ps(v_src, v_dst);
1273
_mm256_storeu_ps(dst + x, v_dst);
1274
}
1275
#else
1276
for (; x <= size - cVectorWidth; x += cVectorWidth)
1277
{
1278
v_float32 v_src0 = vx_load(src + x);
1279
v_float32 v_src1 = vx_load(src + x + step);
1280
1281
v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x)));
1282
v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step)));
1283
}
1284
#endif // CV_AVX && !CV_AVX2
1285
}
1286
else
1287
{
1288
v_uint32 v_0 = vx_setzero_u32();
1289
if (cn == 1)
1290
{
1291
for (; x <= len - cVectorWidth; x += cVectorWidth)
1292
{
1293
v_uint16 v_mask16 = vx_load_expand(mask + x);
1294
v_uint32 v_mask_0, v_mask_1;
1295
v_expand(v_mask16, v_mask_0, v_mask_1);
1296
v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask_0 == v_0));
1297
v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask_1 == v_0));
1298
v_float32 v_src0 = vx_load(src + x);
1299
v_float32 v_src1 = vx_load(src + x + step);
1300
v_src0 = v_src0 & v_mask0;
1301
v_src1 = v_src1 & v_mask1;
1302
1303
v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x)));
1304
v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step)));
1305
}
1306
}
1307
else if (cn == 3)
1308
{
1309
for (; x <= len - cVectorWidth; x += cVectorWidth)
1310
{
1311
v_uint16 v_mask16 = vx_load_expand(mask + x);
1312
v_uint32 v_mask_0, v_mask_1;
1313
v_expand(v_mask16, v_mask_0, v_mask_1);
1314
v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask_0 == v_0));
1315
v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask_1 == v_0));
1316
1317
v_float32 v_src00, v_src10, v_src20, v_src01, v_src11, v_src21;
1318
v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20);
1319
v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21);
1320
v_src00 = v_src00 & v_mask0;
1321
v_src01 = v_src01 & v_mask1;
1322
v_src10 = v_src10 & v_mask0;
1323
v_src11 = v_src11 & v_mask1;
1324
v_src20 = v_src20 & v_mask0;
1325
v_src21 = v_src21 & v_mask1;
1326
1327
v_float32 v_dst00, v_dst10, v_dst20, v_dst01, v_dst11, v_dst21;
1328
v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
1329
v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
1330
1331
v_dst00 = v_fma(v_src00, v_src00, v_dst00);
1332
v_dst01 = v_fma(v_src01, v_src01, v_dst01);
1333
v_dst10 = v_fma(v_src10, v_src10, v_dst10);
1334
v_dst11 = v_fma(v_src11, v_src11, v_dst11);
1335
v_dst20 = v_fma(v_src20, v_src20, v_dst20);
1336
v_dst21 = v_fma(v_src21, v_src21, v_dst21);
1337
1338
v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
1339
v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
1340
}
1341
}
1342
}
1343
#endif // CV_SIMD
1344
accSqr_general_(src, dst, mask, len, cn, x);
1345
}
1346
1347
void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn)
1348
{
1349
int x = 0;
1350
#if CV_SIMD_64F
1351
const int cVectorWidth = v_uint16::nlanes;
1352
const int step = v_float64::nlanes;
1353
1354
if (!mask)
1355
{
1356
int size = len * cn;
1357
for (; x <= size - cVectorWidth; x += cVectorWidth)
1358
{
1359
v_uint16 v_int = vx_load_expand(src + x);
1360
1361
v_uint32 v_int0, v_int1;
1362
v_expand(v_int, v_int0, v_int1);
1363
1364
v_float64 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0));
1365
v_float64 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0));
1366
v_float64 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1));
1367
v_float64 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1));
1368
1369
v_float64 v_dst0 = vx_load(dst + x);
1370
v_float64 v_dst1 = vx_load(dst + x + step);
1371
v_float64 v_dst2 = vx_load(dst + x + step * 2);
1372
v_float64 v_dst3 = vx_load(dst + x + step * 3);
1373
1374
v_dst0 = v_fma(v_src0, v_src0, v_dst0);
1375
v_dst1 = v_fma(v_src1, v_src1, v_dst1);
1376
v_dst2 = v_fma(v_src2, v_src2, v_dst2);
1377
v_dst3 = v_fma(v_src3, v_src3, v_dst3);
1378
1379
v_store(dst + x, v_dst0);
1380
v_store(dst + x + step, v_dst1);
1381
v_store(dst + x + step * 2, v_dst2);
1382
v_store(dst + x + step * 3, v_dst3);
1383
}
1384
}
1385
else
1386
{
1387
v_uint16 v_0 = vx_setzero_u16();
1388
if (cn == 1)
1389
{
1390
for (; x <= len - cVectorWidth; x += cVectorWidth)
1391
{
1392
v_uint16 v_mask = vx_load_expand(mask + x);
1393
v_mask = ~(v_mask == v_0);
1394
v_uint16 v_src = vx_load_expand(src + x);
1395
v_uint16 v_int = v_src & v_mask;
1396
1397
v_uint32 v_int0, v_int1;
1398
v_expand(v_int, v_int0, v_int1);
1399
1400
v_float64 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0));
1401
v_float64 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0));
1402
v_float64 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1));
1403
v_float64 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1));
1404
1405
v_float64 v_dst0 = vx_load(dst + x);
1406
v_float64 v_dst1 = vx_load(dst + x + step);
1407
v_float64 v_dst2 = vx_load(dst + x + step * 2);
1408
v_float64 v_dst3 = vx_load(dst + x + step * 3);
1409
1410
v_dst0 = v_fma(v_src0, v_src0, v_dst0);
1411
v_dst1 = v_fma(v_src1, v_src1, v_dst1);
1412
v_dst2 = v_fma(v_src2, v_src2, v_dst2);
1413
v_dst3 = v_fma(v_src3, v_src3, v_dst3);
1414
1415
v_store(dst + x, v_dst0);
1416
v_store(dst + x + step, v_dst1);
1417
v_store(dst + x + step * 2, v_dst2);
1418
v_store(dst + x + step * 3, v_dst3);
1419
}
1420
}
1421
else if (cn == 3)
1422
{
1423
for (; x <= len - cVectorWidth * 2; x += cVectorWidth)
1424
{
1425
v_uint8 v_src0, v_src1, v_src2;
1426
v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
1427
1428
v_uint16 v_int0 = v_expand_low(v_src0);
1429
v_uint16 v_int1 = v_expand_low(v_src1);
1430
v_uint16 v_int2 = v_expand_low(v_src2);
1431
1432
v_uint16 v_mask = vx_load_expand(mask + x);
1433
v_mask = ~(v_mask == v_0);
1434
v_int0 = v_int0 & v_mask;
1435
v_int1 = v_int1 & v_mask;
1436
v_int2 = v_int2 & v_mask;
1437
1438
v_uint32 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21;
1439
v_expand(v_int0, v_int00, v_int01);
1440
v_expand(v_int1, v_int10, v_int11);
1441
v_expand(v_int2, v_int20, v_int21);
1442
1443
v_float64 v_src00 = v_cvt_f64(v_reinterpret_as_s32(v_int00));
1444
v_float64 v_src01 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00));
1445
v_float64 v_src02 = v_cvt_f64(v_reinterpret_as_s32(v_int01));
1446
v_float64 v_src03 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01));
1447
v_float64 v_src10 = v_cvt_f64(v_reinterpret_as_s32(v_int10));
1448
v_float64 v_src11 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10));
1449
v_float64 v_src12 = v_cvt_f64(v_reinterpret_as_s32(v_int11));
1450
v_float64 v_src13 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11));
1451
v_float64 v_src20 = v_cvt_f64(v_reinterpret_as_s32(v_int20));
1452
v_float64 v_src21 = v_cvt_f64_high(v_reinterpret_as_s32(v_int20));
1453
v_float64 v_src22 = v_cvt_f64(v_reinterpret_as_s32(v_int21));
1454
v_float64 v_src23 = v_cvt_f64_high(v_reinterpret_as_s32(v_int21));
1455
1456
v_float64 v_dst00, v_dst01, v_dst02, v_dst03, v_dst10, v_dst11, v_dst12, v_dst13, v_dst20, v_dst21, v_dst22, v_dst23;
1457
v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
1458
v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
1459
v_load_deinterleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22);
1460
v_load_deinterleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23);
1461
1462
v_dst00 = v_fma(v_src00, v_src00, v_dst00);
1463
v_dst01 = v_fma(v_src01, v_src01, v_dst01);
1464
v_dst02 = v_fma(v_src02, v_src02, v_dst02);
1465
v_dst03 = v_fma(v_src03, v_src03, v_dst03);
1466
v_dst10 = v_fma(v_src10, v_src10, v_dst10);
1467
v_dst11 = v_fma(v_src11, v_src11, v_dst11);
1468
v_dst12 = v_fma(v_src12, v_src12, v_dst12);
1469
v_dst13 = v_fma(v_src13, v_src13, v_dst13);
1470
v_dst20 = v_fma(v_src20, v_src20, v_dst20);
1471
v_dst21 = v_fma(v_src21, v_src21, v_dst21);
1472
v_dst22 = v_fma(v_src22, v_src22, v_dst22);
1473
v_dst23 = v_fma(v_src23, v_src23, v_dst23);
1474
1475
v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
1476
v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
1477
v_store_interleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22);
1478
v_store_interleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23);
1479
}
1480
}
1481
}
1482
#endif // CV_SIMD_64F
1483
accSqr_general_(src, dst, mask, len, cn, x);
1484
}
1485
1486
void accSqr_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn)
1487
{
1488
int x = 0;
1489
#if CV_SIMD_64F
1490
const int cVectorWidth = v_uint16::nlanes;
1491
const int step = v_float64::nlanes;
1492
1493
if (!mask)
1494
{
1495
int size = len * cn;
1496
for (; x <= size - cVectorWidth; x += cVectorWidth)
1497
{
1498
v_uint16 v_src = vx_load(src + x);
1499
v_uint32 v_int_0, v_int_1;
1500
v_expand(v_src, v_int_0, v_int_1);
1501
1502
v_int32 v_int0 = v_reinterpret_as_s32(v_int_0);
1503
v_int32 v_int1 = v_reinterpret_as_s32(v_int_1);
1504
1505
v_float64 v_src0 = v_cvt_f64(v_int0);
1506
v_float64 v_src1 = v_cvt_f64_high(v_int0);
1507
v_float64 v_src2 = v_cvt_f64(v_int1);
1508
v_float64 v_src3 = v_cvt_f64_high(v_int1);
1509
1510
v_float64 v_dst0 = vx_load(dst + x);
1511
v_float64 v_dst1 = vx_load(dst + x + step);
1512
v_float64 v_dst2 = vx_load(dst + x + step * 2);
1513
v_float64 v_dst3 = vx_load(dst + x + step * 3);
1514
1515
v_dst0 = v_fma(v_src0, v_src0, v_dst0);
1516
v_dst1 = v_fma(v_src1, v_src1, v_dst1);
1517
v_dst2 = v_fma(v_src2, v_src2, v_dst2);
1518
v_dst3 = v_fma(v_src3, v_src3, v_dst3);
1519
1520
v_store(dst + x, v_dst0);
1521
v_store(dst + x + step, v_dst1);
1522
v_store(dst + x + step * 2, v_dst2);
1523
v_store(dst + x + step * 3, v_dst3);
1524
}
1525
}
1526
else
1527
{
1528
v_uint16 v_0 = vx_setzero_u16();
1529
if (cn == 1)
1530
{
1531
for (; x <= len - cVectorWidth; x += cVectorWidth)
1532
{
1533
v_uint16 v_mask = vx_load_expand(mask + x);
1534
v_mask = ~(v_mask == v_0);
1535
v_uint16 v_src = vx_load(src + x);
1536
v_src = v_src & v_mask;
1537
v_uint32 v_int_0, v_int_1;
1538
v_expand(v_src, v_int_0, v_int_1);
1539
1540
v_int32 v_int0 = v_reinterpret_as_s32(v_int_0);
1541
v_int32 v_int1 = v_reinterpret_as_s32(v_int_1);
1542
1543
v_float64 v_src0 = v_cvt_f64(v_int0);
1544
v_float64 v_src1 = v_cvt_f64_high(v_int0);
1545
v_float64 v_src2 = v_cvt_f64(v_int1);
1546
v_float64 v_src3 = v_cvt_f64_high(v_int1);
1547
1548
v_float64 v_dst0 = vx_load(dst + x);
1549
v_float64 v_dst1 = vx_load(dst + x + step);
1550
v_float64 v_dst2 = vx_load(dst + x + step * 2);
1551
v_float64 v_dst3 = vx_load(dst + x + step * 3);
1552
1553
v_dst0 = v_fma(v_src0, v_src0, v_dst0);
1554
v_dst1 = v_fma(v_src1, v_src1, v_dst1);
1555
v_dst2 = v_fma(v_src2, v_src2, v_dst2);
1556
v_dst3 = v_fma(v_src3, v_src3, v_dst3);
1557
1558
v_store(dst + x, v_dst0);
1559
v_store(dst + x + step, v_dst1);
1560
v_store(dst + x + step * 2, v_dst2);
1561
v_store(dst + x + step * 3, v_dst3);
1562
}
1563
}
1564
else if (cn == 3)
1565
{
1566
for (; x <= len - cVectorWidth; x += cVectorWidth)
1567
{
1568
v_uint16 v_mask = vx_load_expand(mask + x);
1569
v_mask = ~(v_mask == v_0);
1570
v_uint16 v_src0, v_src1, v_src2;
1571
v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
1572
v_src0 = v_src0 & v_mask;
1573
v_src1 = v_src1 & v_mask;
1574
v_src2 = v_src2 & v_mask;
1575
v_uint32 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21;
1576
v_expand(v_src0, v_int00, v_int01);
1577
v_expand(v_src1, v_int10, v_int11);
1578
v_expand(v_src2, v_int20, v_int21);
1579
1580
v_float64 v_src00 = v_cvt_f64(v_reinterpret_as_s32(v_int00));
1581
v_float64 v_src01 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00));
1582
v_float64 v_src02 = v_cvt_f64(v_reinterpret_as_s32(v_int01));
1583
v_float64 v_src03 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01));
1584
v_float64 v_src10 = v_cvt_f64(v_reinterpret_as_s32(v_int10));
1585
v_float64 v_src11 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10));
1586
v_float64 v_src12 = v_cvt_f64(v_reinterpret_as_s32(v_int11));
1587
v_float64 v_src13 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11));
1588
v_float64 v_src20 = v_cvt_f64(v_reinterpret_as_s32(v_int20));
1589
v_float64 v_src21 = v_cvt_f64_high(v_reinterpret_as_s32(v_int20));
1590
v_float64 v_src22 = v_cvt_f64(v_reinterpret_as_s32(v_int21));
1591
v_float64 v_src23 = v_cvt_f64_high(v_reinterpret_as_s32(v_int21));
1592
1593
v_float64 v_dst00, v_dst01, v_dst02, v_dst03;
1594
v_float64 v_dst10, v_dst11, v_dst12, v_dst13;
1595
v_float64 v_dst20, v_dst21, v_dst22, v_dst23;
1596
v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
1597
v_load_deinterleave(dst + (x + step)* cn, v_dst01, v_dst11, v_dst21);
1598
v_load_deinterleave(dst + (x + step * 2)* cn, v_dst02, v_dst12, v_dst22);
1599
v_load_deinterleave(dst + (x + step * 3)* cn, v_dst03, v_dst13, v_dst23);
1600
1601
v_dst00 = v_fma(v_src00, v_src00, v_dst00);
1602
v_dst01 = v_fma(v_src01, v_src01, v_dst01);
1603
v_dst02 = v_fma(v_src02, v_src02, v_dst02);
1604
v_dst03 = v_fma(v_src03, v_src03, v_dst03);
1605
v_dst10 = v_fma(v_src10, v_src10, v_dst10);
1606
v_dst11 = v_fma(v_src11, v_src11, v_dst11);
1607
v_dst12 = v_fma(v_src12, v_src12, v_dst12);
1608
v_dst13 = v_fma(v_src13, v_src13, v_dst13);
1609
v_dst20 = v_fma(v_src20, v_src20, v_dst20);
1610
v_dst21 = v_fma(v_src21, v_src21, v_dst21);
1611
v_dst22 = v_fma(v_src22, v_src22, v_dst22);
1612
v_dst23 = v_fma(v_src23, v_src23, v_dst23);
1613
1614
v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
1615
v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
1616
v_store_interleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22);
1617
v_store_interleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23);
1618
}
1619
}
1620
}
1621
#endif // CV_SIMD_64F
1622
accSqr_general_(src, dst, mask, len, cn, x);
1623
}
1624
1625
void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int cn)
1626
{
1627
int x = 0;
1628
#if CV_SIMD_64F
1629
const int cVectorWidth = v_float32::nlanes;
1630
const int step = v_float64::nlanes;
1631
1632
if (!mask)
1633
{
1634
int size = len * cn;
1635
#if CV_AVX && !CV_AVX2
1636
for (; x <= size - 8 ; x += 8)
1637
{
1638
__m256 v_src = _mm256_loadu_ps(src + x);
1639
__m256d v_src0 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src,0));
1640
__m256d v_src1 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src,1));
1641
__m256d v_dst0 = _mm256_loadu_pd(dst + x);
1642
__m256d v_dst1 = _mm256_loadu_pd(dst + x + 4);
1643
v_src0 = _mm256_mul_pd(v_src0, v_src0);
1644
v_src1 = _mm256_mul_pd(v_src1, v_src1);
1645
v_dst0 = _mm256_add_pd(v_src0, v_dst0);
1646
v_dst1 = _mm256_add_pd(v_src1, v_dst1);
1647
_mm256_storeu_pd(dst + x, v_dst0);
1648
_mm256_storeu_pd(dst + x + 4, v_dst1);
1649
}
1650
#else
1651
for (; x <= size - cVectorWidth; x += cVectorWidth)
1652
{
1653
v_float32 v_src = vx_load(src + x);
1654
v_float64 v_src0 = v_cvt_f64(v_src);
1655
v_float64 v_src1 = v_cvt_f64_high(v_src);
1656
1657
v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x)));
1658
v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step)));
1659
}
1660
#endif // CV_AVX && !CV_AVX2
1661
}
1662
else
1663
{
1664
v_uint32 v_0 = vx_setzero_u32();
1665
if (cn == 1)
1666
{
1667
for (; x <= len - cVectorWidth; x += cVectorWidth)
1668
{
1669
v_uint32 v_mask = vx_load_expand_q(mask + x);;
1670
v_mask = ~(v_mask == v_0);
1671
v_float32 v_src = vx_load(src + x);
1672
v_src = v_src & v_reinterpret_as_f32(v_mask);
1673
v_float64 v_src0 = v_cvt_f64(v_src);
1674
v_float64 v_src1 = v_cvt_f64_high(v_src);
1675
1676
v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x)));
1677
v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step)));
1678
}
1679
}
1680
else if (cn == 3)
1681
{
1682
for (; x <= len - cVectorWidth; x += cVectorWidth)
1683
{
1684
v_uint32 v_mask = vx_load_expand_q(mask + x);
1685
v_mask = ~(v_mask == v_0);
1686
1687
v_float32 v_src0, v_src1, v_src2;
1688
v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
1689
v_src0 = v_src0 & v_reinterpret_as_f32(v_mask);
1690
v_src1 = v_src1 & v_reinterpret_as_f32(v_mask);
1691
v_src2 = v_src2 & v_reinterpret_as_f32(v_mask);
1692
1693
v_float64 v_src00 = v_cvt_f64(v_src0);
1694
v_float64 v_src01 = v_cvt_f64_high(v_src0);
1695
v_float64 v_src10 = v_cvt_f64(v_src1);
1696
v_float64 v_src11 = v_cvt_f64_high(v_src1);
1697
v_float64 v_src20 = v_cvt_f64(v_src2);
1698
v_float64 v_src21 = v_cvt_f64_high(v_src2);
1699
1700
v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
1701
v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
1702
v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
1703
1704
v_dst00 = v_fma(v_src00, v_src00, v_dst00);
1705
v_dst01 = v_fma(v_src01, v_src01, v_dst01);
1706
v_dst10 = v_fma(v_src10, v_src10, v_dst10);
1707
v_dst11 = v_fma(v_src11, v_src11, v_dst11);
1708
v_dst20 = v_fma(v_src20, v_src20, v_dst20);
1709
v_dst21 = v_fma(v_src21, v_src21, v_dst21);
1710
1711
v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
1712
v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
1713
}
1714
}
1715
}
1716
#endif // CV_SIMD_64F
1717
accSqr_general_(src, dst, mask, len, cn, x);
1718
}
1719
1720
void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, int cn)
1721
{
1722
int x = 0;
1723
#if CV_SIMD_64F
1724
const int cVectorWidth = v_float64::nlanes * 2;
1725
const int step = v_float64::nlanes;
1726
1727
if (!mask)
1728
{
1729
int size = len * cn;
1730
#if CV_AVX && !CV_AVX2
1731
for (; x <= size - 4 ; x += 4)
1732
{
1733
__m256d v_src = _mm256_loadu_pd(src + x);
1734
__m256d v_dst = _mm256_loadu_pd(dst + x);
1735
v_src = _mm256_mul_pd(v_src, v_src);
1736
v_dst = _mm256_add_pd(v_dst, v_src);
1737
_mm256_storeu_pd(dst + x, v_dst);
1738
}
1739
#else
1740
for (; x <= size - cVectorWidth; x += cVectorWidth)
1741
{
1742
v_float64 v_src0 = vx_load(src + x);
1743
v_float64 v_src1 = vx_load(src + x + step);
1744
v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x)));
1745
v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step)));
1746
}
1747
#endif // CV_AVX && !CV_AVX2
1748
}
1749
else
1750
{
1751
v_uint64 v_0 = vx_setzero_u64();
1752
if (cn == 1)
1753
{
1754
for (; x <= len - cVectorWidth; x += cVectorWidth)
1755
{
1756
v_uint32 v_mask32 = vx_load_expand_q(mask + x);
1757
v_uint64 v_masku640, v_masku641;
1758
v_expand(v_mask32, v_masku640, v_masku641);
1759
v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
1760
v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
1761
v_float64 v_src0 = vx_load(src + x);
1762
v_float64 v_src1 = vx_load(src + x + step);
1763
v_src0 = v_src0 & v_mask0;
1764
v_src1 = v_src1 & v_mask1;
1765
v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x)));
1766
v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step)));
1767
}
1768
}
1769
else if (cn == 3)
1770
{
1771
for (; x <= len - cVectorWidth; x += cVectorWidth)
1772
{
1773
v_uint32 v_mask32 = vx_load_expand_q(mask + x);
1774
v_uint64 v_masku640, v_masku641;
1775
v_expand(v_mask32, v_masku640, v_masku641);
1776
v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
1777
v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
1778
1779
v_float64 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
1780
v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20);
1781
v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21);
1782
v_src00 = v_src00 & v_mask0;
1783
v_src01 = v_src01 & v_mask1;
1784
v_src10 = v_src10 & v_mask0;
1785
v_src11 = v_src11 & v_mask1;
1786
v_src20 = v_src20 & v_mask0;
1787
v_src21 = v_src21 & v_mask1;
1788
1789
v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
1790
v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
1791
v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
1792
1793
v_dst00 = v_fma(v_src00, v_src00, v_dst00);
1794
v_dst01 = v_fma(v_src01, v_src01, v_dst01);
1795
v_dst10 = v_fma(v_src10, v_src10, v_dst10);
1796
v_dst11 = v_fma(v_src11, v_src11, v_dst11);
1797
v_dst20 = v_fma(v_src20, v_src20, v_dst20);
1798
v_dst21 = v_fma(v_src21, v_src21, v_dst21);
1799
1800
v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
1801
v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
1802
}
1803
}
1804
}
1805
#endif // CV_SIMD_64F
1806
accSqr_general_(src, dst, mask, len, cn, x);
1807
}
1808
1809
// product accumulate optimized by universal intrinsic
1810
void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar* mask, int len, int cn)
1811
{
1812
int x = 0;
1813
#if CV_SIMD
1814
const int cVectorWidth = v_uint8::nlanes;
1815
const int step = v_uint32::nlanes;
1816
1817
if (!mask)
1818
{
1819
int size = len * cn;
1820
for (; x <= size - cVectorWidth; x += cVectorWidth)
1821
{
1822
v_uint8 v_1src = vx_load(src1 + x);
1823
v_uint8 v_2src = vx_load(src2 + x);
1824
1825
v_uint16 v_src0, v_src1;
1826
v_mul_expand(v_1src, v_2src, v_src0, v_src1);
1827
1828
v_uint32 v_src00, v_src01, v_src10, v_src11;
1829
v_expand(v_src0, v_src00, v_src01);
1830
v_expand(v_src1, v_src10, v_src11);
1831
1832
v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
1833
v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
1834
v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
1835
v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
1836
}
1837
}
1838
else
1839
{
1840
v_uint8 v_0 = vx_setzero_u8();
1841
if (cn == 1)
1842
{
1843
for (; x <= len - cVectorWidth; x += cVectorWidth)
1844
{
1845
v_uint8 v_mask = vx_load(mask + x);
1846
v_mask = ~(v_mask == v_0);
1847
v_uint8 v_1src = vx_load(src1 + x);
1848
v_uint8 v_2src = vx_load(src2 + x);
1849
v_1src = v_1src & v_mask;
1850
v_2src = v_2src & v_mask;
1851
1852
v_uint16 v_src0, v_src1;
1853
v_mul_expand(v_1src, v_2src, v_src0, v_src1);
1854
1855
v_uint32 v_src00, v_src01, v_src10, v_src11;
1856
v_expand(v_src0, v_src00, v_src01);
1857
v_expand(v_src1, v_src10, v_src11);
1858
1859
v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
1860
v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
1861
v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
1862
v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
1863
}
1864
}
1865
else if (cn == 3)
1866
{
1867
for (; x <= len - cVectorWidth; x += cVectorWidth)
1868
{
1869
v_uint8 v_mask = vx_load(mask + x);
1870
v_mask = ~(v_mask == v_0);
1871
v_uint8 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2;
1872
v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2);
1873
v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2);
1874
v_1src0 = v_1src0 & v_mask;
1875
v_1src1 = v_1src1 & v_mask;
1876
v_1src2 = v_1src2 & v_mask;
1877
v_2src0 = v_2src0 & v_mask;
1878
v_2src1 = v_2src1 & v_mask;
1879
v_2src2 = v_2src2 & v_mask;
1880
1881
v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
1882
v_mul_expand(v_1src0, v_2src0, v_src00, v_src01);
1883
v_mul_expand(v_1src1, v_2src1, v_src10, v_src11);
1884
v_mul_expand(v_1src2, v_2src2, v_src20, v_src21);
1885
1886
v_uint32 v_src000, v_src001, v_src002, v_src003, v_src100, v_src101, v_src102, v_src103, v_src200, v_src201, v_src202, v_src203;
1887
v_expand(v_src00, v_src000, v_src001);
1888
v_expand(v_src01, v_src002, v_src003);
1889
v_expand(v_src10, v_src100, v_src101);
1890
v_expand(v_src11, v_src102, v_src103);
1891
v_expand(v_src20, v_src200, v_src201);
1892
v_expand(v_src21, v_src202, v_src203);
1893
1894
v_float32 v_dst000, v_dst001, v_dst002, v_dst003, v_dst100, v_dst101, v_dst102, v_dst103, v_dst200, v_dst201, v_dst202, v_dst203;
1895
v_load_deinterleave(dst + x * cn, v_dst000, v_dst100, v_dst200);
1896
v_load_deinterleave(dst + (x + step) * cn, v_dst001, v_dst101, v_dst201);
1897
v_load_deinterleave(dst + (x + step * 2) * cn, v_dst002, v_dst102, v_dst202);
1898
v_load_deinterleave(dst + (x + step * 3) * cn, v_dst003, v_dst103, v_dst203);
1899
v_dst000 = v_dst000 + v_cvt_f32(v_reinterpret_as_s32(v_src000));
1900
v_dst001 = v_dst001 + v_cvt_f32(v_reinterpret_as_s32(v_src001));
1901
v_dst002 = v_dst002 + v_cvt_f32(v_reinterpret_as_s32(v_src002));
1902
v_dst003 = v_dst003 + v_cvt_f32(v_reinterpret_as_s32(v_src003));
1903
v_dst100 = v_dst100 + v_cvt_f32(v_reinterpret_as_s32(v_src100));
1904
v_dst101 = v_dst101 + v_cvt_f32(v_reinterpret_as_s32(v_src101));
1905
v_dst102 = v_dst102 + v_cvt_f32(v_reinterpret_as_s32(v_src102));
1906
v_dst103 = v_dst103 + v_cvt_f32(v_reinterpret_as_s32(v_src103));
1907
v_dst200 = v_dst200 + v_cvt_f32(v_reinterpret_as_s32(v_src200));
1908
v_dst201 = v_dst201 + v_cvt_f32(v_reinterpret_as_s32(v_src201));
1909
v_dst202 = v_dst202 + v_cvt_f32(v_reinterpret_as_s32(v_src202));
1910
v_dst203 = v_dst203 + v_cvt_f32(v_reinterpret_as_s32(v_src203));
1911
1912
v_store_interleave(dst + x * cn, v_dst000, v_dst100, v_dst200);
1913
v_store_interleave(dst + (x + step) * cn, v_dst001, v_dst101, v_dst201);
1914
v_store_interleave(dst + (x + step * 2) * cn, v_dst002, v_dst102, v_dst202);
1915
v_store_interleave(dst + (x + step * 3) * cn, v_dst003, v_dst103, v_dst203);
1916
}
1917
}
1918
}
1919
#endif // CV_SIMD
1920
accProd_general_(src1, src2, dst, mask, len, cn, x);
1921
}
1922
1923
void accProd_simd_(const ushort* src1, const ushort* src2, float* dst, const uchar* mask, int len, int cn)
1924
{
1925
int x = 0;
1926
#if CV_SIMD
1927
const int cVectorWidth = v_uint16::nlanes;
1928
const int step = v_float32::nlanes;
1929
1930
if (!mask)
1931
{
1932
int size = len * cn;
1933
for (; x <= size - cVectorWidth; x += cVectorWidth)
1934
{
1935
v_uint16 v_1src = vx_load(src1 + x);
1936
v_uint16 v_2src = vx_load(src2 + x);
1937
1938
v_uint32 v_1src0, v_1src1, v_2src0, v_2src1;
1939
v_expand(v_1src, v_1src0, v_1src1);
1940
v_expand(v_2src, v_2src0, v_2src1);
1941
1942
v_float32 v_1float0 = v_cvt_f32(v_reinterpret_as_s32(v_1src0));
1943
v_float32 v_1float1 = v_cvt_f32(v_reinterpret_as_s32(v_1src1));
1944
v_float32 v_2float0 = v_cvt_f32(v_reinterpret_as_s32(v_2src0));
1945
v_float32 v_2float1 = v_cvt_f32(v_reinterpret_as_s32(v_2src1));
1946
1947
v_store(dst + x, v_fma(v_1float0, v_2float0, vx_load(dst + x)));
1948
v_store(dst + x + step, v_fma(v_1float1, v_2float1, vx_load(dst + x + step)));
1949
}
1950
}
1951
else
1952
{
1953
v_uint16 v_0 = vx_setzero_u16();
1954
if (cn == 1)
1955
{
1956
for (; x <= len - cVectorWidth; x += cVectorWidth)
1957
{
1958
v_uint16 v_mask = vx_load_expand(mask + x);
1959
v_mask = ~(v_0 == v_mask);
1960
1961
v_uint16 v_1src = vx_load(src1 + x) & v_mask;
1962
v_uint16 v_2src = vx_load(src2 + x) & v_mask;
1963
1964
v_uint32 v_1src0, v_1src1, v_2src0, v_2src1;
1965
v_expand(v_1src, v_1src0, v_1src1);
1966
v_expand(v_2src, v_2src0, v_2src1);
1967
1968
v_float32 v_1float0 = v_cvt_f32(v_reinterpret_as_s32(v_1src0));
1969
v_float32 v_1float1 = v_cvt_f32(v_reinterpret_as_s32(v_1src1));
1970
v_float32 v_2float0 = v_cvt_f32(v_reinterpret_as_s32(v_2src0));
1971
v_float32 v_2float1 = v_cvt_f32(v_reinterpret_as_s32(v_2src1));
1972
1973
v_store(dst + x, v_fma(v_1float0, v_2float0, vx_load(dst + x)));
1974
v_store(dst + x + step, v_fma(v_1float1, v_2float1, vx_load(dst + x + step)));
1975
}
1976
}
1977
else if (cn == 3)
1978
{
1979
for (; x <= len - cVectorWidth; x += cVectorWidth)
1980
{
1981
v_uint16 v_mask = vx_load_expand(mask + x);
1982
v_mask = ~(v_0 == v_mask);
1983
1984
v_uint16 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2;
1985
v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2);
1986
v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2);
1987
v_1src0 = v_1src0 & v_mask;
1988
v_1src1 = v_1src1 & v_mask;
1989
v_1src2 = v_1src2 & v_mask;
1990
v_2src0 = v_2src0 & v_mask;
1991
v_2src1 = v_2src1 & v_mask;
1992
v_2src2 = v_2src2 & v_mask;
1993
1994
v_uint32 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21, v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21;
1995
v_expand(v_1src0, v_1src00, v_1src01);
1996
v_expand(v_1src1, v_1src10, v_1src11);
1997
v_expand(v_1src2, v_1src20, v_1src21);
1998
v_expand(v_2src0, v_2src00, v_2src01);
1999
v_expand(v_2src1, v_2src10, v_2src11);
2000
v_expand(v_2src2, v_2src20, v_2src21);
2001
2002
v_float32 v_1float00 = v_cvt_f32(v_reinterpret_as_s32(v_1src00));
2003
v_float32 v_1float01 = v_cvt_f32(v_reinterpret_as_s32(v_1src01));
2004
v_float32 v_1float10 = v_cvt_f32(v_reinterpret_as_s32(v_1src10));
2005
v_float32 v_1float11 = v_cvt_f32(v_reinterpret_as_s32(v_1src11));
2006
v_float32 v_1float20 = v_cvt_f32(v_reinterpret_as_s32(v_1src20));
2007
v_float32 v_1float21 = v_cvt_f32(v_reinterpret_as_s32(v_1src21));
2008
v_float32 v_2float00 = v_cvt_f32(v_reinterpret_as_s32(v_2src00));
2009
v_float32 v_2float01 = v_cvt_f32(v_reinterpret_as_s32(v_2src01));
2010
v_float32 v_2float10 = v_cvt_f32(v_reinterpret_as_s32(v_2src10));
2011
v_float32 v_2float11 = v_cvt_f32(v_reinterpret_as_s32(v_2src11));
2012
v_float32 v_2float20 = v_cvt_f32(v_reinterpret_as_s32(v_2src20));
2013
v_float32 v_2float21 = v_cvt_f32(v_reinterpret_as_s32(v_2src21));
2014
2015
v_float32 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
2016
v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
2017
v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
2018
2019
v_dst00 = v_fma(v_1float00, v_2float00, v_dst00);
2020
v_dst01 = v_fma(v_1float01, v_2float01, v_dst01);
2021
v_dst10 = v_fma(v_1float10, v_2float10, v_dst10);
2022
v_dst11 = v_fma(v_1float11, v_2float11, v_dst11);
2023
v_dst20 = v_fma(v_1float20, v_2float20, v_dst20);
2024
v_dst21 = v_fma(v_1float21, v_2float21, v_dst21);
2025
2026
v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
2027
v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
2028
}
2029
}
2030
}
2031
#endif // CV_SIMD
2032
accProd_general_(src1, src2, dst, mask, len, cn, x);
2033
}
2034
2035
void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar* mask, int len, int cn)
2036
{
2037
int x = 0;
2038
#if CV_SIMD
2039
const int cVectorWidth = v_uint16::nlanes;
2040
const int step = v_float32::nlanes;
2041
2042
if (!mask)
2043
{
2044
int size = len * cn;
2045
#if CV_AVX && !CV_AVX2
2046
for (; x <= size - 8 ; x += 8)
2047
{
2048
__m256 v_src0 = _mm256_loadu_ps(src1 + x);
2049
__m256 v_src1 = _mm256_loadu_ps(src2 + x);
2050
__m256 v_dst = _mm256_loadu_ps(dst + x);
2051
__m256 v_src = _mm256_mul_ps(v_src0, v_src1);
2052
v_dst = _mm256_add_ps(v_src, v_dst);
2053
_mm256_storeu_ps(dst + x, v_dst);
2054
}
2055
#else
2056
for (; x <= size - cVectorWidth; x += cVectorWidth)
2057
{
2058
v_store(dst + x, v_fma(vx_load(src1 + x), vx_load(src2 + x), vx_load(dst + x)));
2059
v_store(dst + x + step, v_fma(vx_load(src1 + x + step), vx_load(src2 + x + step), vx_load(dst + x + step)));
2060
}
2061
#endif // CV_AVX && !CV_AVX2
2062
}
2063
else
2064
{
2065
v_uint32 v_0 = vx_setzero_u32();
2066
if (cn == 1)
2067
{
2068
for (; x <= len - cVectorWidth; x += cVectorWidth)
2069
{
2070
v_uint32 v_mask32_0 = vx_load_expand_q(mask + x);
2071
v_uint32 v_mask32_1 = vx_load_expand_q(mask + x + step);
2072
v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask32_0 == v_0));
2073
v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask32_1 == v_0));
2074
2075
v_store(dst + x, vx_load(dst + x) + ((vx_load(src1 + x) * vx_load(src2 + x)) & v_mask0));
2076
v_store(dst + x + step, vx_load(dst + x + step) + ((vx_load(src1 + x + step) * vx_load(src2 + x + step)) & v_mask1));
2077
}
2078
}
2079
else if (cn == 3)
2080
{
2081
for (; x <= len - cVectorWidth; x += cVectorWidth)
2082
{
2083
v_uint32 v_mask32_0 = vx_load_expand_q(mask + x);
2084
v_uint32 v_mask32_1 = vx_load_expand_q(mask + x + step);
2085
v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask32_0 == v_0));
2086
v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask32_1 == v_0));
2087
2088
v_float32 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21;
2089
v_float32 v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21;
2090
v_load_deinterleave(src1 + x * cn, v_1src00, v_1src10, v_1src20);
2091
v_load_deinterleave(src2 + x * cn, v_2src00, v_2src10, v_2src20);
2092
v_load_deinterleave(src1 + (x + step) * cn, v_1src01, v_1src11, v_1src21);
2093
v_load_deinterleave(src2 + (x + step) * cn, v_2src01, v_2src11, v_2src21);
2094
2095
v_float32 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
2096
v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
2097
v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
2098
2099
v_store_interleave(dst + x * cn, v_dst00 + ((v_1src00 * v_2src00) & v_mask0), v_dst10 + ((v_1src10 * v_2src10) & v_mask0), v_dst20 + ((v_1src20 * v_2src20) & v_mask0));
2100
v_store_interleave(dst + (x + step) * cn, v_dst01 + ((v_1src01 * v_2src01) & v_mask1), v_dst11 + ((v_1src11 * v_2src11) & v_mask1), v_dst21 + ((v_1src21 * v_2src21) & v_mask1));
2101
}
2102
}
2103
}
2104
#endif // CV_SIMD
2105
accProd_general_(src1, src2, dst, mask, len, cn, x);
2106
}
2107
2108
void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const uchar* mask, int len, int cn)
2109
{
2110
int x = 0;
2111
#if CV_SIMD_64F
2112
const int cVectorWidth = v_uint16::nlanes;
2113
const int step = v_float64::nlanes;
2114
2115
if (!mask)
2116
{
2117
int size = len * cn;
2118
for (; x <= size - cVectorWidth; x += cVectorWidth)
2119
{
2120
v_uint16 v_1int = vx_load_expand(src1 + x);
2121
v_uint16 v_2int = vx_load_expand(src2 + x);
2122
2123
v_uint32 v_1int_0, v_1int_1, v_2int_0, v_2int_1;
2124
v_expand(v_1int, v_1int_0, v_1int_1);
2125
v_expand(v_2int, v_2int_0, v_2int_1);
2126
2127
v_int32 v_1int0 = v_reinterpret_as_s32(v_1int_0);
2128
v_int32 v_1int1 = v_reinterpret_as_s32(v_1int_1);
2129
v_int32 v_2int0 = v_reinterpret_as_s32(v_2int_0);
2130
v_int32 v_2int1 = v_reinterpret_as_s32(v_2int_1);
2131
2132
v_float64 v_dst0 = vx_load(dst + x);
2133
v_float64 v_dst1 = vx_load(dst + x + step);
2134
v_float64 v_dst2 = vx_load(dst + x + step * 2);
2135
v_float64 v_dst3 = vx_load(dst + x + step * 3);
2136
2137
v_dst0 = v_fma(v_cvt_f64(v_1int0), v_cvt_f64(v_2int0), v_dst0);
2138
v_dst1 = v_fma(v_cvt_f64_high(v_1int0), v_cvt_f64_high(v_2int0), v_dst1);
2139
v_dst2 = v_fma(v_cvt_f64(v_1int1), v_cvt_f64(v_2int1), v_dst2);
2140
v_dst3 = v_fma(v_cvt_f64_high(v_1int1), v_cvt_f64_high(v_2int1), v_dst3);
2141
2142
v_store(dst + x, v_dst0);
2143
v_store(dst + x + step, v_dst1);
2144
v_store(dst + x + step * 2, v_dst2);
2145
v_store(dst + x + step * 3, v_dst3);
2146
}
2147
}
2148
else
2149
{
2150
v_uint16 v_0 = vx_setzero_u16();
2151
if (cn == 1)
2152
{
2153
for (; x <= len - cVectorWidth; x += cVectorWidth)
2154
{
2155
v_uint16 v_mask = vx_load_expand(mask + x);
2156
v_mask = ~(v_mask == v_0);
2157
v_uint16 v_1int = vx_load_expand(src1 + x) & v_mask;
2158
v_uint16 v_2int = vx_load_expand(src2 + x) & v_mask;
2159
2160
v_uint32 v_1int_0, v_1int_1, v_2int_0, v_2int_1;
2161
v_expand(v_1int, v_1int_0, v_1int_1);
2162
v_expand(v_2int, v_2int_0, v_2int_1);
2163
2164
v_int32 v_1int0 = v_reinterpret_as_s32(v_1int_0);
2165
v_int32 v_1int1 = v_reinterpret_as_s32(v_1int_1);
2166
v_int32 v_2int0 = v_reinterpret_as_s32(v_2int_0);
2167
v_int32 v_2int1 = v_reinterpret_as_s32(v_2int_1);
2168
2169
v_float64 v_dst0 = vx_load(dst + x);
2170
v_float64 v_dst1 = vx_load(dst + x + step);
2171
v_float64 v_dst2 = vx_load(dst + x + step * 2);
2172
v_float64 v_dst3 = vx_load(dst + x + step * 3);
2173
2174
v_dst0 = v_fma(v_cvt_f64(v_1int0), v_cvt_f64(v_2int0), v_dst0);
2175
v_dst1 = v_fma(v_cvt_f64_high(v_1int0), v_cvt_f64_high(v_2int0), v_dst1);
2176
v_dst2 = v_fma(v_cvt_f64(v_1int1), v_cvt_f64(v_2int1), v_dst2);
2177
v_dst3 = v_fma(v_cvt_f64_high(v_1int1), v_cvt_f64_high(v_2int1), v_dst3);
2178
2179
v_store(dst + x, v_dst0);
2180
v_store(dst + x + step, v_dst1);
2181
v_store(dst + x + step * 2, v_dst2);
2182
v_store(dst + x + step * 3, v_dst3);
2183
}
2184
}
2185
else if (cn == 3)
2186
{
2187
for (; x <= len - cVectorWidth * 2; x += cVectorWidth)
2188
{
2189
v_uint8 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2;
2190
v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2);
2191
v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2);
2192
2193
v_uint16 v_1int0 = v_expand_low(v_1src0);
2194
v_uint16 v_1int1 = v_expand_low(v_1src1);
2195
v_uint16 v_1int2 = v_expand_low(v_1src2);
2196
v_uint16 v_2int0 = v_expand_low(v_2src0);
2197
v_uint16 v_2int1 = v_expand_low(v_2src1);
2198
v_uint16 v_2int2 = v_expand_low(v_2src2);
2199
2200
v_uint16 v_mask = vx_load_expand(mask + x);
2201
v_mask = ~(v_mask == v_0);
2202
v_1int0 = v_1int0 & v_mask;
2203
v_1int1 = v_1int1 & v_mask;
2204
v_1int2 = v_1int2 & v_mask;
2205
v_2int0 = v_2int0 & v_mask;
2206
v_2int1 = v_2int1 & v_mask;
2207
v_2int2 = v_2int2 & v_mask;
2208
2209
v_uint32 v_1int00, v_1int01, v_1int10, v_1int11, v_1int20, v_1int21;
2210
v_uint32 v_2int00, v_2int01, v_2int10, v_2int11, v_2int20, v_2int21;
2211
v_expand(v_1int0, v_1int00, v_1int01);
2212
v_expand(v_1int1, v_1int10, v_1int11);
2213
v_expand(v_1int2, v_1int20, v_1int21);
2214
v_expand(v_2int0, v_2int00, v_2int01);
2215
v_expand(v_2int1, v_2int10, v_2int11);
2216
v_expand(v_2int2, v_2int20, v_2int21);
2217
2218
v_float64 v_dst00, v_dst01, v_dst02, v_dst03, v_dst10, v_dst11, v_dst12, v_dst13, v_dst20, v_dst21, v_dst22, v_dst23;
2219
v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
2220
v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
2221
v_load_deinterleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22);
2222
v_load_deinterleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23);
2223
2224
v_dst00 = v_fma(v_cvt_f64(v_reinterpret_as_s32(v_1int00)), v_cvt_f64(v_reinterpret_as_s32(v_2int00)), v_dst00);
2225
v_dst01 = v_fma(v_cvt_f64_high(v_reinterpret_as_s32(v_1int00)), v_cvt_f64_high(v_reinterpret_as_s32(v_2int00)), v_dst01);
2226
v_dst02 = v_fma(v_cvt_f64(v_reinterpret_as_s32(v_1int01)), v_cvt_f64(v_reinterpret_as_s32(v_2int01)), v_dst02);
2227
v_dst03 = v_fma(v_cvt_f64_high(v_reinterpret_as_s32(v_1int01)), v_cvt_f64_high(v_reinterpret_as_s32(v_2int01)), v_dst03);
2228
v_dst10 = v_fma(v_cvt_f64(v_reinterpret_as_s32(v_1int10)), v_cvt_f64(v_reinterpret_as_s32(v_2int10)), v_dst10);
2229
v_dst11 = v_fma(v_cvt_f64_high(v_reinterpret_as_s32(v_1int10)), v_cvt_f64_high(v_reinterpret_as_s32(v_2int10)), v_dst11);
2230
v_dst12 = v_fma(v_cvt_f64(v_reinterpret_as_s32(v_1int11)), v_cvt_f64(v_reinterpret_as_s32(v_2int11)), v_dst12);
2231
v_dst13 = v_fma(v_cvt_f64_high(v_reinterpret_as_s32(v_1int11)), v_cvt_f64_high(v_reinterpret_as_s32(v_2int11)), v_dst13);
2232
v_dst20 = v_fma(v_cvt_f64(v_reinterpret_as_s32(v_1int20)), v_cvt_f64(v_reinterpret_as_s32(v_2int20)), v_dst20);
2233
v_dst21 = v_fma(v_cvt_f64_high(v_reinterpret_as_s32(v_1int20)), v_cvt_f64_high(v_reinterpret_as_s32(v_2int20)), v_dst21);
2234
v_dst22 = v_fma(v_cvt_f64(v_reinterpret_as_s32(v_1int21)), v_cvt_f64(v_reinterpret_as_s32(v_2int21)), v_dst22);
2235
v_dst23 = v_fma(v_cvt_f64_high(v_reinterpret_as_s32(v_1int21)), v_cvt_f64_high(v_reinterpret_as_s32(v_2int21)), v_dst23);
2236
2237
v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
2238
v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
2239
v_store_interleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22);
2240
v_store_interleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23);
2241
}
2242
}
2243
}
2244
#endif // CV_SIMD_64F
2245
accProd_general_(src1, src2, dst, mask, len, cn, x);
2246
}
2247
2248
void accProd_simd_(const ushort* src1, const ushort* src2, double* dst, const uchar* mask, int len, int cn)
2249
{
2250
int x = 0;
2251
#if CV_SIMD_64F
2252
const int cVectorWidth = v_uint16::nlanes;
2253
const int step = v_float64::nlanes;
2254
2255
if (!mask)
2256
{
2257
int size = len * cn;
2258
for (; x <= size - cVectorWidth; x += cVectorWidth)
2259
{
2260
v_uint16 v_1src = vx_load(src1 + x);
2261
v_uint16 v_2src = vx_load(src2 + x);
2262
2263
v_uint32 v_1int_0, v_1int_1, v_2int_0, v_2int_1;
2264
v_expand(v_1src, v_1int_0, v_1int_1);
2265
v_expand(v_2src, v_2int_0, v_2int_1);
2266
2267
v_int32 v_1int0 = v_reinterpret_as_s32(v_1int_0);
2268
v_int32 v_1int1 = v_reinterpret_as_s32(v_1int_1);
2269
v_int32 v_2int0 = v_reinterpret_as_s32(v_2int_0);
2270
v_int32 v_2int1 = v_reinterpret_as_s32(v_2int_1);
2271
2272
v_float64 v_dst0 = vx_load(dst + x);
2273
v_float64 v_dst1 = vx_load(dst + x + step);
2274
v_float64 v_dst2 = vx_load(dst + x + step * 2);
2275
v_float64 v_dst3 = vx_load(dst + x + step * 3);
2276
2277
v_dst0 = v_fma(v_cvt_f64(v_1int0), v_cvt_f64(v_2int0), v_dst0);
2278
v_dst1 = v_fma(v_cvt_f64_high(v_1int0), v_cvt_f64_high(v_2int0), v_dst1);
2279
v_dst2 = v_fma(v_cvt_f64(v_1int1), v_cvt_f64(v_2int1), v_dst2);
2280
v_dst3 = v_fma(v_cvt_f64_high(v_1int1), v_cvt_f64_high(v_2int1), v_dst3);
2281
2282
v_store(dst + x, v_dst0);
2283
v_store(dst + x + step, v_dst1);
2284
v_store(dst + x + step * 2, v_dst2);
2285
v_store(dst + x + step * 3, v_dst3);
2286
}
2287
}
2288
else
2289
{
2290
v_uint16 v_0 = vx_setzero_u16();
2291
if (cn == 1)
2292
{
2293
for (; x <= len - cVectorWidth; x += cVectorWidth)
2294
{
2295
v_uint16 v_mask = vx_load_expand(mask + x);
2296
v_mask = ~(v_mask == v_0);
2297
v_uint16 v_1src = vx_load(src1 + x);
2298
v_uint16 v_2src = vx_load(src2 + x);
2299
v_1src = v_1src & v_mask;
2300
v_2src = v_2src & v_mask;
2301
2302
v_uint32 v_1int_0, v_1int_1, v_2int_0, v_2int_1;
2303
v_expand(v_1src, v_1int_0, v_1int_1);
2304
v_expand(v_2src, v_2int_0, v_2int_1);
2305
2306
v_int32 v_1int0 = v_reinterpret_as_s32(v_1int_0);
2307
v_int32 v_1int1 = v_reinterpret_as_s32(v_1int_1);
2308
v_int32 v_2int0 = v_reinterpret_as_s32(v_2int_0);
2309
v_int32 v_2int1 = v_reinterpret_as_s32(v_2int_1);
2310
2311
v_float64 v_dst0 = vx_load(dst + x);
2312
v_float64 v_dst1 = vx_load(dst + x + step);
2313
v_float64 v_dst2 = vx_load(dst + x + step * 2);
2314
v_float64 v_dst3 = vx_load(dst + x + step * 3);
2315
2316
v_dst0 = v_fma(v_cvt_f64(v_1int0), v_cvt_f64(v_2int0), v_dst0);
2317
v_dst1 = v_fma(v_cvt_f64_high(v_1int0), v_cvt_f64_high(v_2int0), v_dst1);
2318
v_dst2 = v_fma(v_cvt_f64(v_1int1), v_cvt_f64(v_2int1), v_dst2);
2319
v_dst3 = v_fma(v_cvt_f64_high(v_1int1), v_cvt_f64_high(v_2int1), v_dst3);
2320
2321
v_store(dst + x, v_dst0);
2322
v_store(dst + x + step, v_dst1);
2323
v_store(dst + x + step * 2, v_dst2);
2324
v_store(dst + x + step * 3, v_dst3);
2325
}
2326
}
2327
else if (cn == 3)
2328
{
2329
for (; x <= len - cVectorWidth; x += cVectorWidth)
2330
{
2331
v_uint16 v_mask = vx_load_expand(mask + x);
2332
v_mask = ~(v_mask == v_0);
2333
v_uint16 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2;
2334
v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2);
2335
v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2);
2336
v_1src0 = v_1src0 & v_mask;
2337
v_1src1 = v_1src1 & v_mask;
2338
v_1src2 = v_1src2 & v_mask;
2339
v_2src0 = v_2src0 & v_mask;
2340
v_2src1 = v_2src1 & v_mask;
2341
v_2src2 = v_2src2 & v_mask;
2342
2343
v_uint32 v_1int_00, v_1int_01, v_2int_00, v_2int_01;
2344
v_uint32 v_1int_10, v_1int_11, v_2int_10, v_2int_11;
2345
v_uint32 v_1int_20, v_1int_21, v_2int_20, v_2int_21;
2346
v_expand(v_1src0, v_1int_00, v_1int_01);
2347
v_expand(v_1src1, v_1int_10, v_1int_11);
2348
v_expand(v_1src2, v_1int_20, v_1int_21);
2349
v_expand(v_2src0, v_2int_00, v_2int_01);
2350
v_expand(v_2src1, v_2int_10, v_2int_11);
2351
v_expand(v_2src2, v_2int_20, v_2int_21);
2352
2353
v_int32 v_1int00 = v_reinterpret_as_s32(v_1int_00);
2354
v_int32 v_1int01 = v_reinterpret_as_s32(v_1int_01);
2355
v_int32 v_1int10 = v_reinterpret_as_s32(v_1int_10);
2356
v_int32 v_1int11 = v_reinterpret_as_s32(v_1int_11);
2357
v_int32 v_1int20 = v_reinterpret_as_s32(v_1int_20);
2358
v_int32 v_1int21 = v_reinterpret_as_s32(v_1int_21);
2359
v_int32 v_2int00 = v_reinterpret_as_s32(v_2int_00);
2360
v_int32 v_2int01 = v_reinterpret_as_s32(v_2int_01);
2361
v_int32 v_2int10 = v_reinterpret_as_s32(v_2int_10);
2362
v_int32 v_2int11 = v_reinterpret_as_s32(v_2int_11);
2363
v_int32 v_2int20 = v_reinterpret_as_s32(v_2int_20);
2364
v_int32 v_2int21 = v_reinterpret_as_s32(v_2int_21);
2365
2366
v_float64 v_dst00, v_dst01, v_dst02, v_dst03;
2367
v_float64 v_dst10, v_dst11, v_dst12, v_dst13;
2368
v_float64 v_dst20, v_dst21, v_dst22, v_dst23;
2369
v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
2370
v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
2371
v_load_deinterleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22);
2372
v_load_deinterleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23);
2373
2374
v_dst00 = v_fma(v_cvt_f64(v_1int00), v_cvt_f64(v_2int00), v_dst00);
2375
v_dst01 = v_fma(v_cvt_f64_high(v_1int00), v_cvt_f64_high(v_2int00), v_dst01);
2376
v_dst02 = v_fma(v_cvt_f64(v_1int01), v_cvt_f64(v_2int01), v_dst02);
2377
v_dst03 = v_fma(v_cvt_f64_high(v_1int01), v_cvt_f64_high(v_2int01), v_dst03);
2378
v_dst10 = v_fma(v_cvt_f64(v_1int10), v_cvt_f64(v_2int10), v_dst10);
2379
v_dst11 = v_fma(v_cvt_f64_high(v_1int10), v_cvt_f64_high(v_2int10), v_dst11);
2380
v_dst12 = v_fma(v_cvt_f64(v_1int11), v_cvt_f64(v_2int11), v_dst12);
2381
v_dst13 = v_fma(v_cvt_f64_high(v_1int11), v_cvt_f64_high(v_2int11), v_dst13);
2382
v_dst20 = v_fma(v_cvt_f64(v_1int20), v_cvt_f64(v_2int20), v_dst20);
2383
v_dst21 = v_fma(v_cvt_f64_high(v_1int20), v_cvt_f64_high(v_2int20), v_dst21);
2384
v_dst22 = v_fma(v_cvt_f64(v_1int21), v_cvt_f64(v_2int21), v_dst22);
2385
v_dst23 = v_fma(v_cvt_f64_high(v_1int21), v_cvt_f64_high(v_2int21), v_dst23);
2386
2387
v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
2388
v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
2389
v_store_interleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22);
2390
v_store_interleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23);
2391
}
2392
}
2393
}
2394
#endif // CV_SIMD_64F
2395
accProd_general_(src1, src2, dst, mask, len, cn, x);
2396
}
2397
2398
void accProd_simd_(const float* src1, const float* src2, double* dst, const uchar* mask, int len, int cn)
2399
{
2400
int x = 0;
2401
#if CV_SIMD_64F
2402
const int cVectorWidth = v_float32::nlanes;
2403
const int step = v_float64::nlanes;
2404
2405
if (!mask)
2406
{
2407
int size = len * cn;
2408
#if CV_AVX && !CV_AVX2
2409
for ( ; x <= size - 8 ; x += 8)
2410
{
2411
__m256 v_1src = _mm256_loadu_ps(src1 + x);
2412
__m256 v_2src = _mm256_loadu_ps(src2 + x);
2413
__m256d v_src00 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_1src,0));
2414
__m256d v_src01 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_1src,1));
2415
__m256d v_src10 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_2src,0));
2416
__m256d v_src11 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_2src,1));
2417
__m256d v_dst0 = _mm256_loadu_pd(dst + x);
2418
__m256d v_dst1 = _mm256_loadu_pd(dst + x + 4);
2419
__m256d v_src0 = _mm256_mul_pd(v_src00, v_src10);
2420
__m256d v_src1 = _mm256_mul_pd(v_src01, v_src11);
2421
v_dst0 = _mm256_add_pd(v_src0, v_dst0);
2422
v_dst1 = _mm256_add_pd(v_src1, v_dst1);
2423
_mm256_storeu_pd(dst + x, v_dst0);
2424
_mm256_storeu_pd(dst + x + 4, v_dst1);
2425
}
2426
#else
2427
for (; x <= size - cVectorWidth; x += cVectorWidth)
2428
{
2429
v_float32 v_1src = vx_load(src1 + x);
2430
v_float32 v_2src = vx_load(src2 + x);
2431
2432
v_float64 v_1src0 = v_cvt_f64(v_1src);
2433
v_float64 v_1src1 = v_cvt_f64_high(v_1src);
2434
v_float64 v_2src0 = v_cvt_f64(v_2src);
2435
v_float64 v_2src1 = v_cvt_f64_high(v_2src);
2436
2437
v_store(dst + x, v_fma(v_1src0, v_2src0, vx_load(dst + x)));
2438
v_store(dst + x + step, v_fma(v_1src1, v_2src1, vx_load(dst + x + step)));
2439
}
2440
#endif // CV_AVX && !CV_AVX2
2441
}
2442
else
2443
{
2444
v_uint32 v_0 = vx_setzero_u32();
2445
if (cn == 1)
2446
{
2447
for (; x <= len - cVectorWidth; x += cVectorWidth)
2448
{
2449
v_uint32 v_mask = vx_load_expand_q(mask + x);
2450
v_mask = ~(v_mask == v_0);
2451
v_float32 v_1src = vx_load(src1 + x);
2452
v_float32 v_2src = vx_load(src2 + x);
2453
v_1src = v_1src & v_reinterpret_as_f32(v_mask);
2454
v_2src = v_2src & v_reinterpret_as_f32(v_mask);
2455
2456
v_float64 v_1src0 = v_cvt_f64(v_1src);
2457
v_float64 v_1src1 = v_cvt_f64_high(v_1src);
2458
v_float64 v_2src0 = v_cvt_f64(v_2src);
2459
v_float64 v_2src1 = v_cvt_f64_high(v_2src);
2460
2461
v_store(dst + x, v_fma(v_1src0, v_2src0, vx_load(dst + x)));
2462
v_store(dst + x + step, v_fma(v_1src1, v_2src1, vx_load(dst + x + step)));
2463
}
2464
}
2465
else if (cn == 3)
2466
{
2467
for (; x <= len - cVectorWidth; x += cVectorWidth)
2468
{
2469
v_uint32 v_mask = vx_load_expand_q(mask + x);
2470
v_mask = ~(v_mask == v_0);
2471
v_float32 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2;
2472
v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2);
2473
v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2);
2474
v_1src0 = v_1src0 & v_reinterpret_as_f32(v_mask);
2475
v_1src1 = v_1src1 & v_reinterpret_as_f32(v_mask);
2476
v_1src2 = v_1src2 & v_reinterpret_as_f32(v_mask);
2477
v_2src0 = v_2src0 & v_reinterpret_as_f32(v_mask);
2478
v_2src1 = v_2src1 & v_reinterpret_as_f32(v_mask);
2479
v_2src2 = v_2src2 & v_reinterpret_as_f32(v_mask);
2480
2481
v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
2482
v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
2483
v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
2484
2485
v_dst00 = v_fma(v_cvt_f64(v_1src0), v_cvt_f64(v_2src0), v_dst00);
2486
v_dst01 = v_fma(v_cvt_f64_high(v_1src0), v_cvt_f64_high(v_2src0), v_dst01);
2487
v_dst10 = v_fma(v_cvt_f64(v_1src1), v_cvt_f64(v_2src1), v_dst10);
2488
v_dst11 = v_fma(v_cvt_f64_high(v_1src1), v_cvt_f64_high(v_2src1), v_dst11);
2489
v_dst20 = v_fma(v_cvt_f64(v_1src2), v_cvt_f64(v_2src2), v_dst20);
2490
v_dst21 = v_fma(v_cvt_f64_high(v_1src2), v_cvt_f64_high(v_2src2), v_dst21);
2491
2492
v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
2493
v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
2494
}
2495
}
2496
}
2497
#endif // CV_SIMD_64F
2498
accProd_general_(src1, src2, dst, mask, len, cn, x);
2499
}
2500
2501
// Product accumulation for double input into a double accumulator:
//   dst[j] += src1[j] * src2[j]
// optionally gated per pixel by an 8-bit mask (mask[pixel] != 0 enables the pixel,
// covering all cn channels of that pixel). The SIMD paths below handle the bulk of
// the data; accProd_general_ finishes the scalar tail from index x onward and any
// masked channel count other than 1 or 3.
void accProd_simd_(const double* src1, const double* src2, double* dst, const uchar* mask, int len, int cn)
{
    int x = 0;
#if CV_SIMD_64F
    const int cVectorWidth = v_float64::nlanes * 2;  // two f64 vectors processed per iteration
    const int step = v_float64::nlanes;

    if (!mask)
    {
        // Unmasked: channels are irrelevant, process len*cn doubles linearly.
        int size = len * cn;
#if CV_AVX && !CV_AVX2
        // AVX-only build (no AVX2): universal intrinsics stay 128-bit, so use raw
        // 256-bit AVX mul/add for better throughput.
        for ( ; x <= size - 4 ; x += 4)
        {
            __m256d v_src0 = _mm256_loadu_pd(src1 + x);
            __m256d v_src1 = _mm256_loadu_pd(src2 + x);
            __m256d v_dst = _mm256_loadu_pd(dst + x);
            v_src0 = _mm256_mul_pd(v_src0, v_src1);
            v_dst = _mm256_add_pd(v_dst, v_src0);
            _mm256_storeu_pd(dst + x, v_dst);
        }
#else
        for (; x <= size - cVectorWidth; x += cVectorWidth)
        {
            v_float64 v_src00 = vx_load(src1 + x);
            v_float64 v_src01 = vx_load(src1 + x + step);
            v_float64 v_src10 = vx_load(src2 + x);
            v_float64 v_src11 = vx_load(src2 + x + step);

            // dst += src1 * src2 (fused where the target supports it)
            v_store(dst + x, v_fma(v_src00, v_src10, vx_load(dst + x)));
            v_store(dst + x + step, v_fma(v_src01, v_src11, vx_load(dst + x + step)));
        }
#endif
    }
    else
    {
        // Masked: x counts PIXELS here (not doubles). Mask bytes are widened to
        // 64-bit lanes and turned into all-ones/all-zeros with ~(m == 0), so a
        // bitwise AND zeroes out the contribution of masked-off pixels.
        // todo: try fma
        v_uint64 v_0 = vx_setzero_u64();
        if (cn == 1)
        {
            for (; x <= len - cVectorWidth; x += cVectorWidth)
            {
                v_uint32 v_mask32 = vx_load_expand_q(mask + x);
                v_uint64 v_masku640, v_masku641;
                v_expand(v_mask32, v_masku640, v_masku641);
                v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
                v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));

                v_float64 v_src00 = vx_load(src1 + x);
                v_float64 v_src01 = vx_load(src1 + x + step);
                v_float64 v_src10 = vx_load(src2 + x);
                v_float64 v_src11 = vx_load(src2 + x + step);

                // Zero the masked products, then accumulate.
                v_store(dst + x, vx_load(dst + x) + ((v_src00 * v_src10) & v_mask0));
                v_store(dst + x + step, vx_load(dst + x + step) + ((v_src01 * v_src11) & v_mask1));
            }
        }
        else if (cn == 3)
        {
            for (; x <= len - cVectorWidth; x += cVectorWidth)
            {
                v_uint32 v_mask32 = vx_load_expand_q(mask + x);
                v_uint64 v_masku640, v_masku641;
                v_expand(v_mask32, v_masku640, v_masku641);
                v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
                v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));

                // Deinterleave BGR-style triples so the same pixel mask applies
                // to each channel vector independently.
                v_float64 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21;
                v_float64 v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21;
                v_load_deinterleave(src1 + x * cn, v_1src00, v_1src10, v_1src20);
                v_load_deinterleave(src1 + (x + step) * cn, v_1src01, v_1src11, v_1src21);
                v_load_deinterleave(src2 + x * cn, v_2src00, v_2src10, v_2src20);
                v_load_deinterleave(src2 + (x + step) * cn, v_2src01, v_2src11, v_2src21);
                // Masking one factor is enough to zero the whole product.
                v_float64 v_src00 = (v_1src00 & v_mask0) * v_2src00;
                v_float64 v_src01 = (v_1src01 & v_mask1) * v_2src01;
                v_float64 v_src10 = (v_1src10 & v_mask0) * v_2src10;
                v_float64 v_src11 = (v_1src11 & v_mask1) * v_2src11;
                v_float64 v_src20 = (v_1src20 & v_mask0) * v_2src20;
                v_float64 v_src21 = (v_1src21 & v_mask1) * v_2src21;

                v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
                v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
                v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);

                v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
                v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
            }
        }
    }
#endif // CV_SIMD_64F
    // Scalar fallback for the remaining elements/pixels (and unsupported cn).
    accProd_general_(src1, src2, dst, mask, len, cn, x);
}
// running weight accumulate optimized by universal intrinsic
//
// accW computes the running weighted average dst = dst*(1-alpha) + src*alpha.
// This overload: uchar source, float accumulator. Only the unmasked case is
// vectorized; the masked case and the tail past the last full vector are
// handled by accW_general_ starting at index x.
void accW_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn, double alpha)
{
    int x = 0;
#if CV_SIMD
    const v_float32 v_alpha = vx_setall_f32((float)alpha);
    const v_float32 v_beta = vx_setall_f32((float)(1.0f - alpha));
    const int cVectorWidth = v_uint8::nlanes;  // one full u8 vector -> four f32 vectors
    const int step = v_float32::nlanes;

    if (!mask)
    {
        int size = len * cn;
        for (; x <= size - cVectorWidth; x += cVectorWidth)
        {
            v_uint8 v_src = vx_load(src + x);

            // Widen u8 -> u16 -> u32 so each lane can be converted to float.
            v_uint16 v_src0, v_src1;
            v_expand(v_src, v_src0, v_src1);

            v_uint32 v_src00, v_src01, v_src10, v_src11;
            v_expand(v_src0, v_src00, v_src01);
            v_expand(v_src1, v_src10, v_src11);

            v_float32 v_dst00 = vx_load(dst + x);
            v_float32 v_dst01 = vx_load(dst + x + step);
            v_float32 v_dst10 = vx_load(dst + x + step * 2);
            v_float32 v_dst11 = vx_load(dst + x + step * 3);

            // Reinterpret u32 as s32 is safe: values fit in [0, 255].
            v_dst00 = v_fma(v_dst00, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src00)) * v_alpha);
            v_dst01 = v_fma(v_dst01, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src01)) * v_alpha);
            v_dst10 = v_fma(v_dst10, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src10)) * v_alpha);
            v_dst11 = v_fma(v_dst11, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src11)) * v_alpha);

            v_store(dst + x, v_dst00);
            v_store(dst + x + step, v_dst01);
            v_store(dst + x + step * 2, v_dst10);
            v_store(dst + x + step * 3, v_dst11);
        }
    }
#endif // CV_SIMD
    accW_general_(src, dst, mask, len, cn, alpha, x);
}
void accW_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn, double alpha)
2638
{
2639
int x = 0;
2640
#if CV_SIMD
2641
const v_float32 v_alpha = vx_setall_f32((float)alpha);
2642
const v_float32 v_beta = vx_setall_f32((float)(1.0f - alpha));
2643
const int cVectorWidth = v_uint16::nlanes;
2644
const int step = v_float32::nlanes;
2645
2646
if (!mask)
2647
{
2648
int size = len * cn;
2649
for (; x <= size - cVectorWidth; x += cVectorWidth)
2650
{
2651
v_uint16 v_src = vx_load(src + x);
2652
v_uint32 v_int0, v_int1;
2653
v_expand(v_src, v_int0, v_int1);
2654
2655
v_float32 v_dst0 = vx_load(dst + x);
2656
v_float32 v_dst1 = vx_load(dst + x + step);
2657
v_dst0 = v_fma(v_dst0, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_int0)) * v_alpha);
2658
v_dst1 = v_fma(v_dst1, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_int1)) * v_alpha);
2659
2660
v_store(dst + x, v_dst0);
2661
v_store(dst + x + step, v_dst1);
2662
}
2663
}
2664
#endif // CV_SIMD
2665
accW_general_(src, dst, mask, len, cn, alpha, x);
2666
}
2667
2668
// accW overload: float source, float accumulator.
//   dst = dst*(1-alpha) + src*alpha over len*cn contiguous floats.
// AVX-only builds (AVX without AVX2) take a hand-written 256-bit path since the
// universal intrinsics are 128-bit there; otherwise universal intrinsics are used.
// Masked input and the tail fall through to accW_general_ starting at x.
void accW_simd_(const float* src, float* dst, const uchar* mask, int len, int cn, double alpha)
{
    int x = 0;
#if CV_AVX && !CV_AVX2
    const __m256 v_alpha = _mm256_set1_ps((float)alpha);
    const __m256 v_beta = _mm256_set1_ps((float)(1.0f - alpha));
    const int cVectorWidth = 16;  // 2 x 8 floats per iteration

    if (!mask)
    {
        int size = len * cn;
        for ( ; x <= size - cVectorWidth ; x += cVectorWidth)
        {
            _mm256_storeu_ps(dst + x, _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(dst + x), v_beta), _mm256_mul_ps(_mm256_loadu_ps(src + x), v_alpha)));
            _mm256_storeu_ps(dst + x + 8, _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(dst + x + 8), v_beta), _mm256_mul_ps(_mm256_loadu_ps(src + x + 8), v_alpha)));
        }
    }
#elif CV_SIMD
    const v_float32 v_alpha = vx_setall_f32((float)alpha);
    const v_float32 v_beta = vx_setall_f32((float)(1.0f - alpha));
    const int cVectorWidth = v_uint16::nlanes;  // == 2 * v_float32::nlanes: two f32 vectors per iteration
    const int step = v_float32::nlanes;

    if (!mask)
    {
        int size = len * cn;
        for (; x <= size - cVectorWidth; x += cVectorWidth)
        {
            v_float32 v_dst0 = vx_load(dst + x);
            v_float32 v_dst1 = vx_load(dst + x + step);

            v_dst0 = v_fma(v_dst0, v_beta, vx_load(src + x) * v_alpha);
            v_dst1 = v_fma(v_dst1, v_beta, vx_load(src + x + step) * v_alpha);

            v_store(dst + x, v_dst0);
            v_store(dst + x + step, v_dst1);
        }
    }
#endif // CV_SIMD
    accW_general_(src, dst, mask, len, cn, alpha, x);
}
// accW overload: uchar source, double accumulator.
//   dst = dst*(1-alpha) + src*alpha.
// Requires native f64 SIMD support (CV_SIMD_64F); unmasked case only, with
// accW_general_ finishing the masked case and the tail from index x.
void accW_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn, double alpha)
{
    int x = 0;
#if CV_SIMD_64F
    const v_float64 v_alpha = vx_setall_f64(alpha);
    // 1.0f promotes to double before the subtraction, so this is exact.
    const v_float64 v_beta = vx_setall_f64(1.0f - alpha);
    const int cVectorWidth = v_uint16::nlanes;  // u16 lanes expanded into four f64 vectors
    const int step = v_float64::nlanes;

    if (!mask)
    {
        int size = len * cn;
        for (; x <= size - cVectorWidth; x += cVectorWidth)
        {
            // Widen u8 -> u16 -> u32 -> f64 (reinterpret to s32 is safe: values <= 255).
            v_uint16 v_src16 = vx_load_expand(src + x);

            v_uint32 v_int_0, v_int_1;
            v_expand(v_src16, v_int_0, v_int_1);

            v_int32 v_int0 = v_reinterpret_as_s32(v_int_0);
            v_int32 v_int1 = v_reinterpret_as_s32(v_int_1);

            v_float64 v_src0 = v_cvt_f64(v_int0);
            v_float64 v_src1 = v_cvt_f64_high(v_int0);
            v_float64 v_src2 = v_cvt_f64(v_int1);
            v_float64 v_src3 = v_cvt_f64_high(v_int1);

            v_float64 v_dst0 = vx_load(dst + x);
            v_float64 v_dst1 = vx_load(dst + x + step);
            v_float64 v_dst2 = vx_load(dst + x + step * 2);
            v_float64 v_dst3 = vx_load(dst + x + step * 3);

            v_dst0 = v_fma(v_dst0, v_beta, v_src0 * v_alpha);
            v_dst1 = v_fma(v_dst1, v_beta, v_src1 * v_alpha);
            v_dst2 = v_fma(v_dst2, v_beta, v_src2 * v_alpha);
            v_dst3 = v_fma(v_dst3, v_beta, v_src3 * v_alpha);

            v_store(dst + x, v_dst0);
            v_store(dst + x + step, v_dst1);
            v_store(dst + x + step * 2, v_dst2);
            v_store(dst + x + step * 3, v_dst3);
        }
    }
#endif // CV_SIMD_64F
    accW_general_(src, dst, mask, len, cn, alpha, x);
}
// accW overload: ushort source, double accumulator.
//   dst = dst*(1-alpha) + src*alpha.
// Requires native f64 SIMD support (CV_SIMD_64F); unmasked case only, with
// accW_general_ finishing the masked case and the tail from index x.
void accW_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn, double alpha)
{
    int x = 0;
#if CV_SIMD_64F
    const v_float64 v_alpha = vx_setall_f64(alpha);
    // 1.0f promotes to double before the subtraction, so this is exact.
    const v_float64 v_beta = vx_setall_f64(1.0f - alpha);
    const int cVectorWidth = v_uint16::nlanes;  // one u16 vector -> four f64 vectors
    const int step = v_float64::nlanes;

    if (!mask)
    {
        int size = len * cn;
        for (; x <= size - cVectorWidth; x += cVectorWidth)
        {
            // Widen u16 -> u32 -> f64 (reinterpret to s32 is safe: values <= 65535).
            v_uint16 v_src = vx_load(src + x);
            v_uint32 v_int_0, v_int_1;
            v_expand(v_src, v_int_0, v_int_1);

            v_int32 v_int0 = v_reinterpret_as_s32(v_int_0);
            v_int32 v_int1 = v_reinterpret_as_s32(v_int_1);

            v_float64 v_src00 = v_cvt_f64(v_int0);
            v_float64 v_src01 = v_cvt_f64_high(v_int0);
            v_float64 v_src10 = v_cvt_f64(v_int1);
            v_float64 v_src11 = v_cvt_f64_high(v_int1);

            v_float64 v_dst00 = vx_load(dst + x);
            v_float64 v_dst01 = vx_load(dst + x + step);
            v_float64 v_dst10 = vx_load(dst + x + step * 2);
            v_float64 v_dst11 = vx_load(dst + x + step * 3);

            v_dst00 = v_fma(v_dst00, v_beta, v_src00 * v_alpha);
            v_dst01 = v_fma(v_dst01, v_beta, v_src01 * v_alpha);
            v_dst10 = v_fma(v_dst10, v_beta, v_src10 * v_alpha);
            v_dst11 = v_fma(v_dst11, v_beta, v_src11 * v_alpha);

            v_store(dst + x, v_dst00);
            v_store(dst + x + step, v_dst01);
            v_store(dst + x + step * 2, v_dst10);
            v_store(dst + x + step * 3, v_dst11);
        }
    }
#endif // CV_SIMD_64F
    accW_general_(src, dst, mask, len, cn, alpha, x);
}
// accW overload: float source, double accumulator.
//   dst = dst*(1-alpha) + src*alpha.
// AVX-only builds widen each 128-bit half of a 256-bit float vector to doubles
// with _mm256_cvtps_pd; otherwise universal intrinsics (requires CV_SIMD_64F).
// Unmasked case only; accW_general_ finishes the masked case and tail from x.
void accW_simd_(const float* src, double* dst, const uchar* mask, int len, int cn, double alpha)
{
    int x = 0;
#if CV_AVX && !CV_AVX2
    const __m256d v_alpha = _mm256_set1_pd(alpha);
    // 1.0f promotes to double before the subtraction, so this is exact.
    const __m256d v_beta = _mm256_set1_pd(1.0f - alpha);
    const int cVectorWidth = 16;  // 16 floats -> 4 x 4 doubles per iteration

    if (!mask)
    {
        int size = len * cn;
        for ( ; x <= size - cVectorWidth ; x += cVectorWidth)
        {
            __m256 v_src0 = _mm256_loadu_ps(src + x);
            __m256 v_src1 = _mm256_loadu_ps(src + x + 8);
            __m256d v_src00 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src0,0));
            __m256d v_src01 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src0,1));
            __m256d v_src10 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src1,0));
            __m256d v_src11 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src1,1));

            _mm256_storeu_pd(dst + x, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x), v_beta), _mm256_mul_pd(v_src00, v_alpha)));
            _mm256_storeu_pd(dst + x + 4, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 4), v_beta), _mm256_mul_pd(v_src01, v_alpha)));
            _mm256_storeu_pd(dst + x + 8, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 8), v_beta), _mm256_mul_pd(v_src10, v_alpha)));
            _mm256_storeu_pd(dst + x + 12, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 12), v_beta), _mm256_mul_pd(v_src11, v_alpha)));
        }
    }
#elif CV_SIMD_64F
    const v_float64 v_alpha = vx_setall_f64(alpha);
    const v_float64 v_beta = vx_setall_f64(1.0f - alpha);
    const int cVectorWidth = v_float32::nlanes * 2;  // two f32 vectors -> four f64 vectors
    const int step = v_float64::nlanes;

    if (!mask)
    {
        int size = len * cn;
        for (; x <= size - cVectorWidth; x += cVectorWidth)
        {
            v_float32 v_src0 = vx_load(src + x);
            v_float32 v_src1 = vx_load(src + x + v_float32::nlanes);
            v_float64 v_src00 = v_cvt_f64(v_src0);
            v_float64 v_src01 = v_cvt_f64_high(v_src0);
            v_float64 v_src10 = v_cvt_f64(v_src1);
            v_float64 v_src11 = v_cvt_f64_high(v_src1);

            v_float64 v_dst00 = vx_load(dst + x);
            v_float64 v_dst01 = vx_load(dst + x + step);
            v_float64 v_dst10 = vx_load(dst + x + step * 2);
            v_float64 v_dst11 = vx_load(dst + x + step * 3);

            v_dst00 = v_fma(v_dst00, v_beta, v_src00 * v_alpha);
            v_dst01 = v_fma(v_dst01, v_beta, v_src01 * v_alpha);
            v_dst10 = v_fma(v_dst10, v_beta, v_src10 * v_alpha);
            v_dst11 = v_fma(v_dst11, v_beta, v_src11 * v_alpha);

            v_store(dst + x, v_dst00);
            v_store(dst + x + step, v_dst01);
            v_store(dst + x + step * 2, v_dst10);
            v_store(dst + x + step * 3, v_dst11);
        }
    }
#endif // CV_SIMD_64F
    accW_general_(src, dst, mask, len, cn, alpha, x);
}
void accW_simd_(const double* src, double* dst, const uchar* mask, int len, int cn, double alpha)
2868
{
2869
int x = 0;
2870
#if CV_AVX && !CV_AVX2
2871
const __m256d v_alpha = _mm256_set1_pd(alpha);
2872
const __m256d v_beta = _mm256_set1_pd(1.0f - alpha);
2873
const int cVectorWidth = 8;
2874
2875
if (!mask)
2876
{
2877
int size = len * cn;
2878
for ( ; x <= size - cVectorWidth ; x += cVectorWidth)
2879
{
2880
__m256d v_src0 = _mm256_loadu_pd(src + x);
2881
__m256d v_src1 = _mm256_loadu_pd(src + x + 4);
2882
2883
_mm256_storeu_pd(dst + x, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x), v_beta), _mm256_mul_pd(v_src0, v_alpha)));
2884
_mm256_storeu_pd(dst + x + 4, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 4), v_beta), _mm256_mul_pd(v_src1, v_alpha)));
2885
}
2886
}
2887
#elif CV_SIMD_64F
2888
const v_float64 v_alpha = vx_setall_f64(alpha);
2889
const v_float64 v_beta = vx_setall_f64(1.0f - alpha);
2890
const int cVectorWidth = v_float64::nlanes * 2;
2891
const int step = v_float64::nlanes;
2892
2893
if (!mask)
2894
{
2895
int size = len * cn;
2896
for (; x <= size - cVectorWidth; x += cVectorWidth)
2897
{
2898
v_float64 v_src0 = vx_load(src + x);
2899
v_float64 v_src1 = vx_load(src + x + step);
2900
2901
v_float64 v_dst0 = vx_load(dst + x);
2902
v_float64 v_dst1 = vx_load(dst + x + step);
2903
2904
v_dst0 = v_fma(v_dst0, v_beta, v_src0 * v_alpha);
2905
v_dst1 = v_fma(v_dst1, v_beta, v_src1 * v_alpha);
2906
2907
v_store(dst + x, v_dst0);
2908
v_store(dst + x + step, v_dst1);
2909
}
2910
}
2911
#endif // CV_SIMD_64F
2912
accW_general_(src, dst, mask, len, cn, alpha, x);
2913
}
2914
2915
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
2916
2917
CV_CPU_OPTIMIZATION_NAMESPACE_END
2918
2919
} // namespace cv
2920
2921
///* End of file. */
2922
2923