// GitHub Repository: Tetragramm/opencv
// Path: blob/master/modules/core/src/arithm_simd.hpp
1
/*M///////////////////////////////////////////////////////////////////////////////////////
2
//
3
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4
//
5
// By downloading, copying, installing or using the software you agree to this license.
6
// If you do not agree to this license, do not download, install,
7
// copy or use the software.
8
//
9
//
10
// License Agreement
11
// For Open Source Computer Vision Library
12
//
13
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16
// Copyright (C) 2015, Itseez Inc., all rights reserved.
17
// Third party copyrights are property of their respective owners.
18
//
19
// Redistribution and use in source and binary forms, with or without modification,
20
// are permitted provided that the following conditions are met:
21
//
22
// * Redistribution's of source code must retain the above copyright notice,
23
// this list of conditions and the following disclaimer.
24
//
25
// * Redistribution's in binary form must reproduce the above copyright notice,
26
// this list of conditions and the following disclaimer in the documentation
27
// and/or other materials provided with the distribution.
28
//
29
// * The name of the copyright holders may not be used to endorse or promote products
30
// derived from this software without specific prior written permission.
31
//
32
// This software is provided by the copyright holders and contributors "as is" and
33
// any express or implied warranties, including, but not limited to, the implied
34
// warranties of merchantability and fitness for a particular purpose are disclaimed.
35
// In no event shall the Intel Corporation or contributors be liable for any direct,
36
// indirect, incidental, special, exemplary, or consequential damages
37
// (including, but not limited to, procurement of substitute goods or services;
38
// loss of use, data, or profits; or business interruption) however caused
39
// and on any theory of liability, whether in contract, strict liability,
40
// or tort (including negligence or otherwise) arising in any way out of
41
// the use of this software, even if advised of the possibility of such damage.
42
//
43
//M*/
44
45
#ifndef __OPENCV_ARITHM_SIMD_HPP__
46
#define __OPENCV_ARITHM_SIMD_HPP__
47
48
namespace cv {
49
50
struct NOP {};
51
52
#if CV_SSE2 || CV_NEON
53
#define IF_SIMD(op) op
54
#else
55
#define IF_SIMD(op) NOP
56
#endif
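// NOP is a placeholder vector-op type: IF_SIMD(op) expands to the real SIMD
// functor `op` when a vector ISA (SSE2 or NEON) is compiled in, and to NOP
// otherwise, so templates that take a vector-op parameter still instantiate
// in scalar-only builds (the callers that consume IF_SIMD live outside this
// header).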
57
58
59
#if CV_SSE2 || CV_NEON
60
61
#define FUNCTOR_TEMPLATE(name) \
62
template<typename T> struct name {}
63
64
FUNCTOR_TEMPLATE(VLoadStore128);
65
#if CV_SSE2
66
FUNCTOR_TEMPLATE(VLoadStore64);
67
FUNCTOR_TEMPLATE(VLoadStore128Aligned);
68
#if CV_AVX2
69
FUNCTOR_TEMPLATE(VLoadStore256);
70
FUNCTOR_TEMPLATE(VLoadStore256Aligned);
71
#endif
72
#endif
73
74
#endif
75
76
#if CV_AVX2
77
78
#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body) \
79
template <> \
80
struct name<template_arg>{ \
81
typedef register_type reg_type; \
82
static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \
83
static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); } \
84
}
85
86
#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body) \
87
template <> \
88
struct name<template_arg>{ \
89
typedef register_type reg_type; \
90
static reg_type load(const template_arg * p) { return load_body (p); } \
91
static void store(template_arg * p, reg_type v) { store_body (p, v); } \
92
}
93
94
#define FUNCTOR_CLOSURE_2arg(name, template_arg, body) \
95
template<> \
96
struct name<template_arg> \
97
{ \
98
VLoadStore256<template_arg>::reg_type operator()( \
99
const VLoadStore256<template_arg>::reg_type & a, \
100
const VLoadStore256<template_arg>::reg_type & b) const \
101
{ \
102
body; \
103
} \
104
}
105
106
#define FUNCTOR_CLOSURE_1arg(name, template_arg, body) \
107
template<> \
108
struct name<template_arg> \
109
{ \
110
VLoadStore256<template_arg>::reg_type operator()( \
111
const VLoadStore256<template_arg>::reg_type & a, \
112
const VLoadStore256<template_arg>::reg_type & ) const \
113
{ \
114
body; \
115
} \
116
}
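// The two macro families above generate matching pieces: VLoadStore256<T>
// moves 256-bit blocks between memory and a register, and the closure functors
// instantiated below (VAdd<T>, VSub<T>, ...) combine two such registers.
// A caller would typically drive them in an element-wise loop; a minimal
// sketch (hypothetical helper, not part of this header):
//
//     template<typename T, class VOp>
//     void vbinop256(const T* a, const T* b, T* dst, int n)
//     {
//         VOp op;
//         const int step = 32 / (int)sizeof(T);   // elements per 256-bit register
//         int i = 0;
//         for( ; i <= n - step; i += step )
//             VLoadStore256<T>::store(dst + i,
//                 op(VLoadStore256<T>::load(a + i), VLoadStore256<T>::load(b + i)));
//         // the remaining (n % step) elements are left to the scalar kernel
//     }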
117
118
FUNCTOR_LOADSTORE_CAST(VLoadStore256, uchar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
119
FUNCTOR_LOADSTORE_CAST(VLoadStore256, schar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
120
FUNCTOR_LOADSTORE_CAST(VLoadStore256, ushort, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
121
FUNCTOR_LOADSTORE_CAST(VLoadStore256, short, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
122
FUNCTOR_LOADSTORE_CAST(VLoadStore256, int, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
123
FUNCTOR_LOADSTORE( VLoadStore256, float, __m256 , _mm256_loadu_ps , _mm256_storeu_ps );
124
FUNCTOR_LOADSTORE( VLoadStore256, double, __m256d, _mm256_loadu_pd , _mm256_storeu_pd );
125
126
FUNCTOR_LOADSTORE_CAST(VLoadStore256Aligned, int, __m256i, _mm256_load_si256, _mm256_store_si256);
127
FUNCTOR_LOADSTORE( VLoadStore256Aligned, float, __m256 , _mm256_load_ps , _mm256_store_ps );
128
FUNCTOR_LOADSTORE( VLoadStore256Aligned, double, __m256d, _mm256_load_pd , _mm256_store_pd );
129
130
FUNCTOR_TEMPLATE(VAdd);
131
FUNCTOR_CLOSURE_2arg(VAdd, uchar, return _mm256_adds_epu8 (a, b));
132
FUNCTOR_CLOSURE_2arg(VAdd, schar, return _mm256_adds_epi8 (a, b));
133
FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm256_adds_epu16(a, b));
134
FUNCTOR_CLOSURE_2arg(VAdd, short, return _mm256_adds_epi16(a, b));
135
FUNCTOR_CLOSURE_2arg(VAdd, int, return _mm256_add_epi32 (a, b));
136
FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm256_add_ps (a, b));
137
FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm256_add_pd (a, b));
138
139
FUNCTOR_TEMPLATE(VSub);
140
FUNCTOR_CLOSURE_2arg(VSub, uchar, return _mm256_subs_epu8 (a, b));
141
FUNCTOR_CLOSURE_2arg(VSub, schar, return _mm256_subs_epi8 (a, b));
142
FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm256_subs_epu16(a, b));
143
FUNCTOR_CLOSURE_2arg(VSub, short, return _mm256_subs_epi16(a, b));
144
FUNCTOR_CLOSURE_2arg(VSub, int, return _mm256_sub_epi32 (a, b));
145
FUNCTOR_CLOSURE_2arg(VSub, float, return _mm256_sub_ps (a, b));
146
FUNCTOR_CLOSURE_2arg(VSub, double, return _mm256_sub_pd (a, b));
147
148
FUNCTOR_TEMPLATE(VMin);
149
FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm256_min_epu8 (a, b));
150
FUNCTOR_CLOSURE_2arg(VMin, schar, return _mm256_min_epi8 (a, b));
151
FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm256_min_epu16(a, b));
152
FUNCTOR_CLOSURE_2arg(VMin, short, return _mm256_min_epi16(a, b));
153
FUNCTOR_CLOSURE_2arg(VMin, int, return _mm256_min_epi32(a, b));
154
FUNCTOR_CLOSURE_2arg(VMin, float, return _mm256_min_ps (a, b));
155
FUNCTOR_CLOSURE_2arg(VMin, double, return _mm256_min_pd (a, b));
156
157
FUNCTOR_TEMPLATE(VMax);
158
FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm256_max_epu8 (a, b));
159
FUNCTOR_CLOSURE_2arg(VMax, schar, return _mm256_max_epi8 (a, b));
160
FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm256_max_epu16(a, b));
161
FUNCTOR_CLOSURE_2arg(VMax, short, return _mm256_max_epi16(a, b));
162
FUNCTOR_CLOSURE_2arg(VMax, int, return _mm256_max_epi32(a, b));
163
FUNCTOR_CLOSURE_2arg(VMax, float, return _mm256_max_ps (a, b));
164
FUNCTOR_CLOSURE_2arg(VMax, double, return _mm256_max_pd (a, b));
165
166
167
static unsigned int CV_DECL_ALIGNED(32) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff,
168
0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
169
static unsigned int CV_DECL_ALIGNED(32) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff,
170
0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff };
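// Both masks clear only the IEEE-754 sign bit, so ANDing (a - b) with them
// yields |a - b| in a single instruction.  The 64-bit constant
// 0x7fffffffffffffff is spelled as little-endian 32-bit halves (0xffffffff
// low word, 0x7fffffff high word), which is why the entries alternate.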
171
172
FUNCTOR_TEMPLATE(VAbsDiff);
173
FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar,
174
return _mm256_add_epi8(_mm256_subs_epu8(a, b), _mm256_subs_epu8(b, a));
175
);
176
FUNCTOR_CLOSURE_2arg(VAbsDiff, schar,
177
__m256i d = _mm256_subs_epi8(a, b);
178
__m256i m = _mm256_cmpgt_epi8(b, a);
179
return _mm256_subs_epi8(_mm256_xor_si256(d, m), m);
180
);
181
FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort,
182
return _mm256_add_epi16(_mm256_subs_epu16(a, b), _mm256_subs_epu16(b, a));
183
);
184
FUNCTOR_CLOSURE_2arg(VAbsDiff, short,
185
__m256i M = _mm256_max_epi16(a, b);
186
__m256i m = _mm256_min_epi16(a, b);
187
return _mm256_subs_epi16(M, m);
188
);
189
FUNCTOR_CLOSURE_2arg(VAbsDiff, int,
190
__m256i d = _mm256_sub_epi32(a, b);
191
__m256i m = _mm256_cmpgt_epi32(b, a);
192
return _mm256_sub_epi32(_mm256_xor_si256(d, m), m);
193
);
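// The signed 8- and 32-bit absolute differences above use the branchless
// conditional-negate identity: with m = (b > a) ? ~0 : 0, the expression
// (d ^ m) - m equals -d when the mask is set and d otherwise, i.e. |a - b|
// without a compare-and-select (the saturating _epi8 variant also clamps
// results that would overflow the signed 8-bit range).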
194
FUNCTOR_CLOSURE_2arg(VAbsDiff, float,
195
return _mm256_and_ps(_mm256_sub_ps(a, b), *(const __m256*)v32f_absmask);
196
);
197
FUNCTOR_CLOSURE_2arg(VAbsDiff, double,
198
return _mm256_and_pd(_mm256_sub_pd(a, b), *(const __m256d*)v64f_absmask);
199
);
200
201
FUNCTOR_TEMPLATE(VAnd);
202
FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm256_and_si256(a, b));
203
FUNCTOR_TEMPLATE(VOr);
204
FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm256_or_si256 (a, b));
205
FUNCTOR_TEMPLATE(VXor);
206
FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm256_xor_si256(a, b));
207
FUNCTOR_TEMPLATE(VNot);
208
FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm256_xor_si256(_mm256_set1_epi32(-1), a));
209
210
#elif CV_SSE2
211
212
#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body)\
213
template <> \
214
struct name<template_arg>{ \
215
typedef register_type reg_type; \
216
static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \
217
static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); } \
218
}
219
220
#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\
221
template <> \
222
struct name<template_arg>{ \
223
typedef register_type reg_type; \
224
static reg_type load(const template_arg * p) { return load_body (p); } \
225
static void store(template_arg * p, reg_type v) { store_body (p, v); } \
226
}
227
228
#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\
229
template<> \
230
struct name<template_arg> \
231
{ \
232
VLoadStore128<template_arg>::reg_type operator()( \
233
const VLoadStore128<template_arg>::reg_type & a, \
234
const VLoadStore128<template_arg>::reg_type & b) const \
235
{ \
236
body; \
237
} \
238
}
239
240
#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\
241
template<> \
242
struct name<template_arg> \
243
{ \
244
VLoadStore128<template_arg>::reg_type operator()( \
245
const VLoadStore128<template_arg>::reg_type & a, \
246
const VLoadStore128<template_arg>::reg_type & ) const \
247
{ \
248
body; \
249
} \
250
}
251
252
FUNCTOR_LOADSTORE_CAST(VLoadStore128, uchar, __m128i, _mm_loadu_si128, _mm_storeu_si128);
253
FUNCTOR_LOADSTORE_CAST(VLoadStore128, schar, __m128i, _mm_loadu_si128, _mm_storeu_si128);
254
FUNCTOR_LOADSTORE_CAST(VLoadStore128, ushort, __m128i, _mm_loadu_si128, _mm_storeu_si128);
255
FUNCTOR_LOADSTORE_CAST(VLoadStore128, short, __m128i, _mm_loadu_si128, _mm_storeu_si128);
256
FUNCTOR_LOADSTORE_CAST(VLoadStore128, int, __m128i, _mm_loadu_si128, _mm_storeu_si128);
257
FUNCTOR_LOADSTORE( VLoadStore128, float, __m128 , _mm_loadu_ps , _mm_storeu_ps );
258
FUNCTOR_LOADSTORE( VLoadStore128, double, __m128d, _mm_loadu_pd , _mm_storeu_pd );
259
260
FUNCTOR_LOADSTORE_CAST(VLoadStore64, uchar, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
261
FUNCTOR_LOADSTORE_CAST(VLoadStore64, schar, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
262
FUNCTOR_LOADSTORE_CAST(VLoadStore64, ushort, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
263
FUNCTOR_LOADSTORE_CAST(VLoadStore64, short, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
264
265
FUNCTOR_LOADSTORE_CAST(VLoadStore128Aligned, int, __m128i, _mm_load_si128, _mm_store_si128);
266
FUNCTOR_LOADSTORE( VLoadStore128Aligned, float, __m128 , _mm_load_ps , _mm_store_ps );
267
FUNCTOR_LOADSTORE( VLoadStore128Aligned, double, __m128d, _mm_load_pd , _mm_store_pd );
268
269
FUNCTOR_TEMPLATE(VAdd);
270
FUNCTOR_CLOSURE_2arg(VAdd, uchar, return _mm_adds_epu8 (a, b));
271
FUNCTOR_CLOSURE_2arg(VAdd, schar, return _mm_adds_epi8 (a, b));
272
FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm_adds_epu16(a, b));
273
FUNCTOR_CLOSURE_2arg(VAdd, short, return _mm_adds_epi16(a, b));
274
FUNCTOR_CLOSURE_2arg(VAdd, int, return _mm_add_epi32 (a, b));
275
FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm_add_ps (a, b));
276
FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm_add_pd (a, b));
277
278
FUNCTOR_TEMPLATE(VSub);
279
FUNCTOR_CLOSURE_2arg(VSub, uchar, return _mm_subs_epu8 (a, b));
280
FUNCTOR_CLOSURE_2arg(VSub, schar, return _mm_subs_epi8 (a, b));
281
FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm_subs_epu16(a, b));
282
FUNCTOR_CLOSURE_2arg(VSub, short, return _mm_subs_epi16(a, b));
283
FUNCTOR_CLOSURE_2arg(VSub, int, return _mm_sub_epi32 (a, b));
284
FUNCTOR_CLOSURE_2arg(VSub, float, return _mm_sub_ps (a, b));
285
FUNCTOR_CLOSURE_2arg(VSub, double, return _mm_sub_pd (a, b));
286
287
FUNCTOR_TEMPLATE(VMin);
288
FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm_min_epu8(a, b));
289
FUNCTOR_CLOSURE_2arg(VMin, schar,
290
__m128i m = _mm_cmpgt_epi8(a, b);
291
return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
292
);
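// SSE2 has no 8-bit signed or 32-bit packed min/max instruction, so the schar
// and int min/max specializations in this block build them from a compare mask
// m and the branchless select  a ^ ((a ^ b) & m),  which yields b wherever m
// is all-ones and a elsewhere.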
293
FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm_subs_epu16(a, _mm_subs_epu16(a, b)));
294
FUNCTOR_CLOSURE_2arg(VMin, short, return _mm_min_epi16(a, b));
295
FUNCTOR_CLOSURE_2arg(VMin, int,
296
__m128i m = _mm_cmpgt_epi32(a, b);
297
return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
298
);
299
FUNCTOR_CLOSURE_2arg(VMin, float, return _mm_min_ps(a, b));
300
FUNCTOR_CLOSURE_2arg(VMin, double, return _mm_min_pd(a, b));
301
302
FUNCTOR_TEMPLATE(VMax);
303
FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm_max_epu8(a, b));
304
FUNCTOR_CLOSURE_2arg(VMax, schar,
305
__m128i m = _mm_cmpgt_epi8(b, a);
306
return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
307
);
308
FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm_adds_epu16(_mm_subs_epu16(a, b), b));
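// Unsigned 16-bit min/max are likewise synthesized from saturating arithmetic:
// min(a,b) = a - sat(a - b) and max(a,b) = sat(a - b) + b, because the
// saturating subtraction already yields 0 whenever b >= a.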
309
FUNCTOR_CLOSURE_2arg(VMax, short, return _mm_max_epi16(a, b));
310
FUNCTOR_CLOSURE_2arg(VMax, int,
311
__m128i m = _mm_cmpgt_epi32(b, a);
312
return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
313
);
314
FUNCTOR_CLOSURE_2arg(VMax, float, return _mm_max_ps(a, b));
315
FUNCTOR_CLOSURE_2arg(VMax, double, return _mm_max_pd(a, b));
316
317
318
static unsigned int CV_DECL_ALIGNED(16) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
319
static unsigned int CV_DECL_ALIGNED(16) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff };
320
321
FUNCTOR_TEMPLATE(VAbsDiff);
322
FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar,
323
return _mm_add_epi8(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
324
);
325
FUNCTOR_CLOSURE_2arg(VAbsDiff, schar,
326
__m128i d = _mm_subs_epi8(a, b);
327
__m128i m = _mm_cmpgt_epi8(b, a);
328
return _mm_subs_epi8(_mm_xor_si128(d, m), m);
329
);
330
FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort,
331
return _mm_add_epi16(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
332
);
333
FUNCTOR_CLOSURE_2arg(VAbsDiff, short,
334
__m128i M = _mm_max_epi16(a, b);
335
__m128i m = _mm_min_epi16(a, b);
336
return _mm_subs_epi16(M, m);
337
);
338
FUNCTOR_CLOSURE_2arg(VAbsDiff, int,
339
__m128i d = _mm_sub_epi32(a, b);
340
__m128i m = _mm_cmpgt_epi32(b, a);
341
return _mm_sub_epi32(_mm_xor_si128(d, m), m);
342
);
343
FUNCTOR_CLOSURE_2arg(VAbsDiff, float,
344
return _mm_and_ps(_mm_sub_ps(a,b), *(const __m128*)v32f_absmask);
345
);
346
FUNCTOR_CLOSURE_2arg(VAbsDiff, double,
347
return _mm_and_pd(_mm_sub_pd(a,b), *(const __m128d*)v64f_absmask);
348
);
349
350
FUNCTOR_TEMPLATE(VAnd);
351
FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm_and_si128(a, b));
352
FUNCTOR_TEMPLATE(VOr);
353
FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm_or_si128 (a, b));
354
FUNCTOR_TEMPLATE(VXor);
355
FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm_xor_si128(a, b));
356
FUNCTOR_TEMPLATE(VNot);
357
FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm_xor_si128(_mm_set1_epi32(-1), a));
358
#endif
359
360
#if CV_NEON
361
362
#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\
363
template <> \
364
struct name<template_arg>{ \
365
typedef register_type reg_type; \
366
static reg_type load(const template_arg * p) { return load_body (p);}; \
367
static void store(template_arg * p, reg_type v) { store_body (p, v);}; \
368
}
369
370
#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\
371
template<> \
372
struct name<template_arg> \
373
{ \
374
VLoadStore128<template_arg>::reg_type operator()( \
375
VLoadStore128<template_arg>::reg_type a, \
376
VLoadStore128<template_arg>::reg_type b) const \
377
{ \
378
return body; \
379
}; \
380
}
381
382
#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\
383
template<> \
384
struct name<template_arg> \
385
{ \
386
VLoadStore128<template_arg>::reg_type operator()( \
387
VLoadStore128<template_arg>::reg_type a, \
388
VLoadStore128<template_arg>::reg_type ) const \
389
{ \
390
return body; \
391
}; \
392
}
393
394
FUNCTOR_LOADSTORE(VLoadStore128, uchar, uint8x16_t, vld1q_u8 , vst1q_u8 );
395
FUNCTOR_LOADSTORE(VLoadStore128, schar, int8x16_t, vld1q_s8 , vst1q_s8 );
396
FUNCTOR_LOADSTORE(VLoadStore128, ushort, uint16x8_t, vld1q_u16, vst1q_u16);
397
FUNCTOR_LOADSTORE(VLoadStore128, short, int16x8_t, vld1q_s16, vst1q_s16);
398
FUNCTOR_LOADSTORE(VLoadStore128, int, int32x4_t, vld1q_s32, vst1q_s32);
399
FUNCTOR_LOADSTORE(VLoadStore128, float, float32x4_t, vld1q_f32, vst1q_f32);
400
401
FUNCTOR_TEMPLATE(VAdd);
402
FUNCTOR_CLOSURE_2arg(VAdd, uchar, vqaddq_u8 (a, b));
403
FUNCTOR_CLOSURE_2arg(VAdd, schar, vqaddq_s8 (a, b));
404
FUNCTOR_CLOSURE_2arg(VAdd, ushort, vqaddq_u16(a, b));
405
FUNCTOR_CLOSURE_2arg(VAdd, short, vqaddq_s16(a, b));
406
FUNCTOR_CLOSURE_2arg(VAdd, int, vaddq_s32 (a, b));
407
FUNCTOR_CLOSURE_2arg(VAdd, float, vaddq_f32 (a, b));
408
409
FUNCTOR_TEMPLATE(VSub);
410
FUNCTOR_CLOSURE_2arg(VSub, uchar, vqsubq_u8 (a, b));
411
FUNCTOR_CLOSURE_2arg(VSub, schar, vqsubq_s8 (a, b));
412
FUNCTOR_CLOSURE_2arg(VSub, ushort, vqsubq_u16(a, b));
413
FUNCTOR_CLOSURE_2arg(VSub, short, vqsubq_s16(a, b));
414
FUNCTOR_CLOSURE_2arg(VSub, int, vsubq_s32 (a, b));
415
FUNCTOR_CLOSURE_2arg(VSub, float, vsubq_f32 (a, b));
416
417
FUNCTOR_TEMPLATE(VMin);
418
FUNCTOR_CLOSURE_2arg(VMin, uchar, vminq_u8 (a, b));
419
FUNCTOR_CLOSURE_2arg(VMin, schar, vminq_s8 (a, b));
420
FUNCTOR_CLOSURE_2arg(VMin, ushort, vminq_u16(a, b));
421
FUNCTOR_CLOSURE_2arg(VMin, short, vminq_s16(a, b));
422
FUNCTOR_CLOSURE_2arg(VMin, int, vminq_s32(a, b));
423
FUNCTOR_CLOSURE_2arg(VMin, float, vminq_f32(a, b));
424
425
FUNCTOR_TEMPLATE(VMax);
426
FUNCTOR_CLOSURE_2arg(VMax, uchar, vmaxq_u8 (a, b));
427
FUNCTOR_CLOSURE_2arg(VMax, schar, vmaxq_s8 (a, b));
428
FUNCTOR_CLOSURE_2arg(VMax, ushort, vmaxq_u16(a, b));
429
FUNCTOR_CLOSURE_2arg(VMax, short, vmaxq_s16(a, b));
430
FUNCTOR_CLOSURE_2arg(VMax, int, vmaxq_s32(a, b));
431
FUNCTOR_CLOSURE_2arg(VMax, float, vmaxq_f32(a, b));
432
433
FUNCTOR_TEMPLATE(VAbsDiff);
434
FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, vabdq_u8 (a, b));
435
FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, vqabsq_s8 (vqsubq_s8(a, b)));
436
FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, vabdq_u16 (a, b));
437
FUNCTOR_CLOSURE_2arg(VAbsDiff, short, vqabsq_s16(vqsubq_s16(a, b)));
438
FUNCTOR_CLOSURE_2arg(VAbsDiff, int, vabdq_s32 (a, b));
439
FUNCTOR_CLOSURE_2arg(VAbsDiff, float, vabdq_f32 (a, b));
440
441
FUNCTOR_TEMPLATE(VAnd);
442
FUNCTOR_CLOSURE_2arg(VAnd, uchar, vandq_u8(a, b));
443
FUNCTOR_TEMPLATE(VOr);
444
FUNCTOR_CLOSURE_2arg(VOr , uchar, vorrq_u8(a, b));
445
FUNCTOR_TEMPLATE(VXor);
446
FUNCTOR_CLOSURE_2arg(VXor, uchar, veorq_u8(a, b));
447
FUNCTOR_TEMPLATE(VNot);
448
FUNCTOR_CLOSURE_1arg(VNot, uchar, vmvnq_u8(a ));
449
#endif
450
451
452
template <typename T>
453
struct Cmp_SIMD
454
{
455
explicit Cmp_SIMD(int)
456
{
457
}
458
459
int operator () (const T *, const T *, uchar *, int) const
460
{
461
return 0;
462
}
463
};
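// Each Cmp_SIMD<T> specialization below compares up to `width` elements of
// src1 and src2, writes 0x00/0xFF per element into the uchar dst, and returns
// how many elements it processed; only CMP_GT, CMP_LE, CMP_EQ and CMP_NE are
// vectorized, anything else returns 0.  A minimal sketch of the assumed
// calling pattern (cmpFn is a hypothetical scalar comparison helper):
//
//     Cmp_SIMD<schar> vop(code);
//     int x = vop(src1, src2, dst, width);      // vectorized prefix
//     for( ; x < width; x++ )                   // scalar tail
//         dst[x] = (uchar)(cmpFn(src1[x], src2[x], code) ? 255 : 0);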
464
465
#if CV_NEON
466
467
template <>
468
struct Cmp_SIMD<schar>
469
{
470
explicit Cmp_SIMD(int code_) :
471
code(code_)
472
{
473
// CV_Assert(code == CMP_GT || code == CMP_LE ||
474
// code == CMP_EQ || code == CMP_NE);
475
476
v_mask = vdupq_n_u8(255);
477
}
478
479
int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const
480
{
481
int x = 0;
482
483
if (code == CMP_GT)
484
for ( ; x <= width - 16; x += 16)
485
vst1q_u8(dst + x, vcgtq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)));
486
else if (code == CMP_LE)
487
for ( ; x <= width - 16; x += 16)
488
vst1q_u8(dst + x, vcleq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)));
489
else if (code == CMP_EQ)
490
for ( ; x <= width - 16; x += 16)
491
vst1q_u8(dst + x, vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)));
492
else if (code == CMP_NE)
493
for ( ; x <= width - 16; x += 16)
494
vst1q_u8(dst + x, veorq_u8(vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)), v_mask));
495
496
return x;
497
}
498
499
int code;
500
uint8x16_t v_mask;
501
};
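// For element types wider than 8 bits the comparisons below produce 16- or
// 32-bit lane masks, which must be narrowed (vmovn_* / vcombine_*) down to
// one 0x00/0xFF byte per element before they can be stored into the uchar
// destination.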
502
503
template <>
504
struct Cmp_SIMD<ushort>
505
{
506
explicit Cmp_SIMD(int code_) :
507
code(code_)
508
{
509
// CV_Assert(code == CMP_GT || code == CMP_LE ||
510
// code == CMP_EQ || code == CMP_NE);
511
512
v_mask = vdup_n_u8(255);
513
}
514
515
int operator () (const ushort * src1, const ushort * src2, uchar * dst, int width) const
516
{
517
int x = 0;
518
519
if (code == CMP_GT)
520
for ( ; x <= width - 8; x += 8)
521
{
522
uint16x8_t v_dst = vcgtq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
523
vst1_u8(dst + x, vmovn_u16(v_dst));
524
}
525
else if (code == CMP_LE)
526
for ( ; x <= width - 8; x += 8)
527
{
528
uint16x8_t v_dst = vcleq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
529
vst1_u8(dst + x, vmovn_u16(v_dst));
530
}
531
else if (code == CMP_EQ)
532
for ( ; x <= width - 8; x += 8)
533
{
534
uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
535
vst1_u8(dst + x, vmovn_u16(v_dst));
536
}
537
else if (code == CMP_NE)
538
for ( ; x <= width - 8; x += 8)
539
{
540
uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
541
vst1_u8(dst + x, veor_u8(vmovn_u16(v_dst), v_mask));
542
}
543
544
return x;
545
}
546
547
int code;
548
uint8x8_t v_mask;
549
};
550
551
template <>
552
struct Cmp_SIMD<int>
553
{
554
explicit Cmp_SIMD(int code_) :
555
code(code_)
556
{
557
// CV_Assert(code == CMP_GT || code == CMP_LE ||
558
// code == CMP_EQ || code == CMP_NE);
559
560
v_mask = vdup_n_u8(255);
561
}
562
563
int operator () (const int * src1, const int * src2, uchar * dst, int width) const
564
{
565
int x = 0;
566
567
if (code == CMP_GT)
568
for ( ; x <= width - 8; x += 8)
569
{
570
uint32x4_t v_dst1 = vcgtq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
571
uint32x4_t v_dst2 = vcgtq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
572
vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
573
}
574
else if (code == CMP_LE)
575
for ( ; x <= width - 8; x += 8)
576
{
577
uint32x4_t v_dst1 = vcleq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
578
uint32x4_t v_dst2 = vcleq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
579
vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
580
}
581
else if (code == CMP_EQ)
582
for ( ; x <= width - 8; x += 8)
583
{
584
uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
585
uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
586
vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
587
}
588
else if (code == CMP_NE)
589
for ( ; x <= width - 8; x += 8)
590
{
591
uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
592
uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
593
uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)));
594
vst1_u8(dst + x, veor_u8(v_dst, v_mask));
595
}
596
597
return x;
598
}
599
600
int code;
601
uint8x8_t v_mask;
602
};
603
604
template <>
605
struct Cmp_SIMD<float>
606
{
607
explicit Cmp_SIMD(int code_) :
608
code(code_)
609
{
610
// CV_Assert(code == CMP_GT || code == CMP_LE ||
611
// code == CMP_EQ || code == CMP_NE);
612
613
v_mask = vdup_n_u8(255);
614
}
615
616
int operator () (const float * src1, const float * src2, uchar * dst, int width) const
617
{
618
int x = 0;
619
620
if (code == CMP_GT)
621
for ( ; x <= width - 8; x += 8)
622
{
623
uint32x4_t v_dst1 = vcgtq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
624
uint32x4_t v_dst2 = vcgtq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
625
vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
626
}
627
else if (code == CMP_LE)
628
for ( ; x <= width - 8; x += 8)
629
{
630
uint32x4_t v_dst1 = vcleq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
631
uint32x4_t v_dst2 = vcleq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
632
vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
633
}
634
else if (code == CMP_EQ)
635
for ( ; x <= width - 8; x += 8)
636
{
637
uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
638
uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
639
vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
640
}
641
else if (code == CMP_NE)
642
for ( ; x <= width - 8; x += 8)
643
{
644
uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
645
uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
646
uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)));
647
vst1_u8(dst + x, veor_u8(v_dst, v_mask));
648
}
649
650
return x;
651
}
652
653
int code;
654
uint8x8_t v_mask;
655
};
656
657
#elif CV_SSE2
658
659
template <>
660
struct Cmp_SIMD<schar>
661
{
662
explicit Cmp_SIMD(int code_) :
663
code(code_)
664
{
665
// CV_Assert(code == CMP_GT || code == CMP_LE ||
666
// code == CMP_EQ || code == CMP_NE);
667
668
haveSSE = checkHardwareSupport(CV_CPU_SSE2);
669
670
v_mask = _mm_set1_epi8(-1);
671
}
672
673
int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const
674
{
675
int x = 0;
676
677
if (!haveSSE)
678
return x;
679
680
if (code == CMP_GT)
681
for ( ; x <= width - 16; x += 16)
682
_mm_storeu_si128((__m128i *)(dst + x), _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
683
_mm_loadu_si128((const __m128i *)(src2 + x))));
684
else if (code == CMP_LE)
685
for ( ; x <= width - 16; x += 16)
686
{
687
__m128i v_gt = _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
688
_mm_loadu_si128((const __m128i *)(src2 + x)));
689
_mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_gt));
690
}
691
else if (code == CMP_EQ)
692
for ( ; x <= width - 16; x += 16)
693
_mm_storeu_si128((__m128i *)(dst + x), _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
694
_mm_loadu_si128((const __m128i *)(src2 + x))));
695
else if (code == CMP_NE)
696
for ( ; x <= width - 16; x += 16)
697
{
698
__m128i v_eq = _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
699
_mm_loadu_si128((const __m128i *)(src2 + x)));
700
_mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_eq));
701
}
702
703
return x;
704
}
705
706
int code;
707
__m128i v_mask;
708
bool haveSSE;
709
};
710
711
template <>
712
struct Cmp_SIMD<int>
713
{
714
explicit Cmp_SIMD(int code_) :
715
code(code_)
716
{
717
// CV_Assert(code == CMP_GT || code == CMP_LE ||
718
// code == CMP_EQ || code == CMP_NE);
719
720
haveSSE = checkHardwareSupport(CV_CPU_SSE2);
721
722
v_mask = _mm_set1_epi32(0xffffffff);
723
}
724
725
int operator () (const int * src1, const int * src2, uchar * dst, int width) const
726
{
727
int x = 0;
728
729
if (!haveSSE)
730
return x;
731
732
if (code == CMP_GT)
733
for ( ; x <= width - 8; x += 8)
734
{
735
__m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
736
_mm_loadu_si128((const __m128i *)(src2 + x)));
737
__m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
738
_mm_loadu_si128((const __m128i *)(src2 + x + 4)));
739
740
_mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask));
741
}
742
else if (code == CMP_LE)
743
for ( ; x <= width - 8; x += 8)
744
{
745
__m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
746
_mm_loadu_si128((const __m128i *)(src2 + x)));
747
__m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
748
_mm_loadu_si128((const __m128i *)(src2 + x + 4)));
749
750
_mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask), v_mask));
751
}
752
else if (code == CMP_EQ)
753
for ( ; x <= width - 8; x += 8)
754
{
755
__m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
756
_mm_loadu_si128((const __m128i *)(src2 + x)));
757
__m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
758
_mm_loadu_si128((const __m128i *)(src2 + x + 4)));
759
760
_mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask));
761
}
762
else if (code == CMP_NE)
763
for ( ; x <= width - 8; x += 8)
764
{
765
__m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
766
_mm_loadu_si128((const __m128i *)(src2 + x)));
767
__m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
768
_mm_loadu_si128((const __m128i *)(src2 + x + 4)));
769
770
_mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)));
771
}
772
773
return x;
774
}
775
776
int code;
777
__m128i v_mask;
778
bool haveSSE;
779
};
780
781
#endif
782
783
784
template <typename T, typename WT>
785
struct Mul_SIMD
786
{
787
int operator() (const T *, const T *, T *, int, WT) const
788
{
789
return 0;
790
}
791
};
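// Mul_SIMD<T, WT> follows the same partial-processing convention as Cmp_SIMD:
// each specialization widens the inputs to float, computes src1 * src2 * scale
// (with saturation for the integer types), stores as many elements as fit its
// vector width, and returns that count (0 when the required ISA is missing),
// leaving the tail to the scalar caller.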
792
793
#if CV_NEON
794
795
template <>
796
struct Mul_SIMD<uchar, float>
797
{
798
int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, float scale) const
799
{
800
int x = 0;
801
802
if( scale == 1.0f )
803
for ( ; x <= width - 8; x += 8)
804
{
805
uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x));
806
uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x));
807
808
float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
809
vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
810
float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
811
vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
812
813
uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
814
vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
815
vst1_u8(dst + x, vqmovn_u16(v_dst));
816
}
817
else
818
{
819
float32x4_t v_scale = vdupq_n_f32(scale);
820
for ( ; x <= width - 8; x += 8)
821
{
822
uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x));
823
uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x));
824
825
float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
826
vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
827
v_dst1 = vmulq_f32(v_dst1, v_scale);
828
float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
829
vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
830
v_dst2 = vmulq_f32(v_dst2, v_scale);
831
832
uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
833
vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
834
vst1_u8(dst + x, vqmovn_u16(v_dst));
835
}
836
}
837
838
return x;
839
}
840
};
841
842
template <>
843
struct Mul_SIMD<schar, float>
844
{
845
int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const
846
{
847
int x = 0;
848
849
if( scale == 1.0f )
850
for ( ; x <= width - 8; x += 8)
851
{
852
int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x));
853
int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x));
854
855
float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
856
vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
857
float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
858
vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
859
860
int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
861
vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
862
vst1_s8(dst + x, vqmovn_s16(v_dst));
863
}
864
else
865
{
866
float32x4_t v_scale = vdupq_n_f32(scale);
867
for ( ; x <= width - 8; x += 8)
868
{
869
int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x));
870
int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x));
871
872
float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
873
vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
874
v_dst1 = vmulq_f32(v_dst1, v_scale);
875
float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
876
vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
877
v_dst2 = vmulq_f32(v_dst2, v_scale);
878
879
int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
880
vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
881
vst1_s8(dst + x, vqmovn_s16(v_dst));
882
}
883
}
884
885
return x;
886
}
887
};
888
889
template <>
890
struct Mul_SIMD<ushort, float>
891
{
892
int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const
893
{
894
int x = 0;
895
896
if( scale == 1.0f )
897
for ( ; x <= width - 8; x += 8)
898
{
899
uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x);
900
901
float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
902
vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
903
float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
904
vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
905
906
uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
907
vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
908
vst1q_u16(dst + x, v_dst);
909
}
910
else
911
{
912
float32x4_t v_scale = vdupq_n_f32(scale);
913
for ( ; x <= width - 8; x += 8)
914
{
915
uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x);
916
917
float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
918
vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
919
v_dst1 = vmulq_f32(v_dst1, v_scale);
920
float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
921
vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
922
v_dst2 = vmulq_f32(v_dst2, v_scale);
923
924
uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
925
vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
926
vst1q_u16(dst + x, v_dst);
927
}
928
}
929
930
return x;
931
}
932
};
933
934
template <>
935
struct Mul_SIMD<short, float>
936
{
937
int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const
938
{
939
int x = 0;
940
941
if( scale == 1.0f )
942
for ( ; x <= width - 8; x += 8)
943
{
944
int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x);
945
946
float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
947
vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
948
float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
949
vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
950
951
int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
952
vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
953
vst1q_s16(dst + x, v_dst);
954
}
955
else
956
{
957
float32x4_t v_scale = vdupq_n_f32(scale);
958
for ( ; x <= width - 8; x += 8)
959
{
960
int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x);
961
962
float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
963
vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
964
v_dst1 = vmulq_f32(v_dst1, v_scale);
965
float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
966
vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
967
v_dst2 = vmulq_f32(v_dst2, v_scale);
968
969
int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
970
vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
971
vst1q_s16(dst + x, v_dst);
972
}
973
}
974
975
return x;
976
}
977
};
978
979
template <>
980
struct Mul_SIMD<float, float>
981
{
982
int operator() (const float * src1, const float * src2, float * dst, int width, float scale) const
983
{
984
int x = 0;
985
986
if( scale == 1.0f )
987
for ( ; x <= width - 8; x += 8)
988
{
989
float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
990
float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
991
vst1q_f32(dst + x, v_dst1);
992
vst1q_f32(dst + x + 4, v_dst2);
993
}
994
else
995
{
996
float32x4_t v_scale = vdupq_n_f32(scale);
997
for ( ; x <= width - 8; x += 8)
998
{
999
float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
1000
v_dst1 = vmulq_f32(v_dst1, v_scale);
1001
1002
float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
1003
v_dst2 = vmulq_f32(v_dst2, v_scale);
1004
1005
vst1q_f32(dst + x, v_dst1);
1006
vst1q_f32(dst + x + 4, v_dst2);
1007
}
1008
}
1009
1010
return x;
1011
}
1012
};
1013
1014
#elif CV_SSE2
1015
1016
#if CV_SSE4_1
1017
1018
template <>
1019
struct Mul_SIMD<ushort, float>
1020
{
1021
Mul_SIMD()
1022
{
1023
haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
1024
}
1025
1026
int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const
1027
{
1028
int x = 0;
1029
1030
if (!haveSSE)
1031
return x;
1032
1033
__m128i v_zero = _mm_setzero_si128();
1034
1035
if( scale != 1.0f )
1036
{
1037
__m128 v_scale = _mm_set1_ps(scale);
1038
for ( ; x <= width - 8; x += 8)
1039
{
1040
__m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x));
1041
__m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x));
1042
1043
__m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)),
1044
_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero)));
1045
v_dst1 = _mm_mul_ps(v_dst1, v_scale);
1046
1047
__m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)),
1048
_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero)));
1049
v_dst2 = _mm_mul_ps(v_dst2, v_scale);
1050
1051
__m128i v_dsti = _mm_packus_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
1052
_mm_storeu_si128((__m128i *)(dst + x), v_dsti);
1053
}
1054
}
1055
1056
return x;
1057
}
1058
1059
bool haveSSE;
1060
};
1061
1062
#endif
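// The ushort specialization above requires SSE4.1 because packing 32-bit
// results back to unsigned 16-bit with saturation (_mm_packus_epi32) only
// exists from SSE4.1 onwards; the remaining specializations need plain SSE2.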
1063
1064
template <>
1065
struct Mul_SIMD<schar, float>
1066
{
1067
Mul_SIMD()
1068
{
1069
haveSSE = checkHardwareSupport(CV_CPU_SSE2);
1070
}
1071
1072
int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const
1073
{
1074
int x = 0;
1075
1076
if (!haveSSE)
1077
return x;
1078
1079
__m128i v_zero = _mm_setzero_si128();
1080
1081
if( scale == 1.0f )
1082
for ( ; x <= width - 8; x += 8)
1083
{
1084
__m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x));
1085
__m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x));
1086
1087
v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8);
1088
v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);
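// The two lines above sign-extend the eight schar values to 16 bits without
// SSE4.1: the bytes are unpacked into the HIGH half of each 16-bit lane (low
// byte zero) and then arithmetic-shifted right by 8.  The unpacklo/unpackhi_epi16
// + _mm_srai_epi32(..., 16) pairs below repeat the trick for 16 -> 32 bits.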
1089
1090
__m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
1091
_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));
1092
1093
__m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
1094
_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));
1095
1096
__m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
1097
_mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero));
1098
}
1099
else
1100
{
1101
__m128 v_scale = _mm_set1_ps(scale);
1102
for ( ; x <= width - 8; x += 8)
1103
{
1104
__m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x));
1105
__m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x));
1106
1107
v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8);
1108
v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);
1109
1110
__m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
1111
_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));
1112
v_dst1 = _mm_mul_ps(v_dst1, v_scale);
1113
1114
__m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
1115
_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));
1116
v_dst2 = _mm_mul_ps(v_dst2, v_scale);
1117
1118
__m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
1119
_mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero));
1120
}
1121
}
1122
1123
return x;
1124
}
1125
1126
bool haveSSE;
1127
};
1128
1129
template <>
1130
struct Mul_SIMD<short, float>
1131
{
1132
Mul_SIMD()
1133
{
1134
haveSSE = checkHardwareSupport(CV_CPU_SSE2);
1135
}
1136
1137
int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const
1138
{
1139
int x = 0;
1140
1141
if (!haveSSE)
1142
return x;
1143
1144
__m128i v_zero = _mm_setzero_si128();
1145
1146
if( scale != 1.0f )
1147
{
1148
__m128 v_scale = _mm_set1_ps(scale);
1149
for ( ; x <= width - 8; x += 8)
1150
{
1151
__m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x));
1152
__m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x));
1153
1154
__m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
1155
_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));
1156
v_dst1 = _mm_mul_ps(v_dst1, v_scale);
1157
1158
__m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
1159
_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));
1160
v_dst2 = _mm_mul_ps(v_dst2, v_scale);
1161
1162
__m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
1163
_mm_storeu_si128((__m128i *)(dst + x), v_dsti);
1164
}
1165
}
1166
1167
return x;
1168
}
1169
1170
bool haveSSE;
1171
};
1172
1173
#endif
1174
1175
template <typename T>
1176
struct Div_SIMD
1177
{
1178
int operator() (const T *, const T *, T *, int, double) const
1179
{
1180
return 0;
1181
}
1182
};
1183
1184
template <typename T>
1185
struct Recip_SIMD
1186
{
1187
int operator() (const T *, T *, int, double) const
1188
{
1189
return 0;
1190
}
1191
};
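// The Div_SIMD / Recip_SIMD specializations below are written with OpenCV's
// universal intrinsics (v_float32x4, v_load, v_round, ...) instead of raw
// SSE/NEON calls, so one body serves every CV_SIMD128 backend.  For the
// integer types a zero divisor lane is masked out with v_select so that it
// produces 0 (OpenCV's x/0 == 0 convention); the float/double paths simply
// follow IEEE-754.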
1192
1193
1194
#if CV_SIMD128
1195
1196
template <>
1197
struct Div_SIMD<uchar>
1198
{
1199
bool haveSIMD;
1200
Div_SIMD() { haveSIMD = hasSIMD128(); }
1201
1202
int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, double scale) const
1203
{
1204
int x = 0;
1205
1206
if (!haveSIMD)
1207
return x;
1208
1209
v_float32x4 v_scale = v_setall_f32((float)scale);
1210
v_uint16x8 v_zero = v_setzero_u16();
1211
1212
for ( ; x <= width - 8; x += 8)
1213
{
1214
v_uint16x8 v_src1 = v_load_expand(src1 + x);
1215
v_uint16x8 v_src2 = v_load_expand(src2 + x);
1216
1217
v_uint32x4 t0, t1, t2, t3;
1218
v_expand(v_src1, t0, t1);
1219
v_expand(v_src2, t2, t3);
1220
1221
v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
1222
v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
1223
1224
v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2));
1225
v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3));
1226
1227
f0 = f0 * v_scale / f2;
1228
f1 = f1 * v_scale / f3;
1229
1230
v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
1231
v_uint16x8 res = v_pack_u(i0, i1);
1232
1233
res = v_select(v_src2 == v_zero, v_zero, res);
1234
v_pack_store(dst + x, res);
1235
}
1236
1237
return x;
1238
}
1239
};
1240
1241
1242
template <>
1243
struct Div_SIMD<schar>
1244
{
1245
bool haveSIMD;
1246
Div_SIMD() { haveSIMD = hasSIMD128(); }
1247
1248
int operator() (const schar * src1, const schar * src2, schar * dst, int width, double scale) const
1249
{
1250
int x = 0;
1251
1252
if (!haveSIMD)
1253
return x;
1254
1255
v_float32x4 v_scale = v_setall_f32((float)scale);
1256
v_int16x8 v_zero = v_setzero_s16();
1257
1258
for ( ; x <= width - 8; x += 8)
1259
{
1260
v_int16x8 v_src1 = v_load_expand(src1 + x);
1261
v_int16x8 v_src2 = v_load_expand(src2 + x);
1262
1263
v_int32x4 t0, t1, t2, t3;
1264
v_expand(v_src1, t0, t1);
1265
v_expand(v_src2, t2, t3);
1266
1267
v_float32x4 f0 = v_cvt_f32(t0);
1268
v_float32x4 f1 = v_cvt_f32(t1);
1269
1270
v_float32x4 f2 = v_cvt_f32(t2);
1271
v_float32x4 f3 = v_cvt_f32(t3);
1272
1273
f0 = f0 * v_scale / f2;
1274
f1 = f1 * v_scale / f3;
1275
1276
v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
1277
v_int16x8 res = v_pack(i0, i1);
1278
1279
res = v_select(v_src2 == v_zero, v_zero, res);
1280
v_pack_store(dst + x, res);
1281
}
1282
1283
return x;
1284
}
1285
};
1286
1287
1288
template <>
1289
struct Div_SIMD<ushort>
1290
{
1291
bool haveSIMD;
1292
Div_SIMD() { haveSIMD = hasSIMD128(); }
1293
1294
int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, double scale) const
1295
{
1296
int x = 0;
1297
1298
if (!haveSIMD)
1299
return x;
1300
1301
v_float32x4 v_scale = v_setall_f32((float)scale);
1302
v_uint16x8 v_zero = v_setzero_u16();
1303
1304
for ( ; x <= width - 8; x += 8)
1305
{
1306
v_uint16x8 v_src1 = v_load(src1 + x);
1307
v_uint16x8 v_src2 = v_load(src2 + x);
1308
1309
v_uint32x4 t0, t1, t2, t3;
1310
v_expand(v_src1, t0, t1);
1311
v_expand(v_src2, t2, t3);
1312
1313
v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
1314
v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
1315
1316
v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2));
1317
v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3));
1318
1319
f0 = f0 * v_scale / f2;
1320
f1 = f1 * v_scale / f3;
1321
1322
v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
1323
v_uint16x8 res = v_pack_u(i0, i1);
1324
1325
res = v_select(v_src2 == v_zero, v_zero, res);
1326
v_store(dst + x, res);
1327
}
1328
1329
return x;
1330
}
1331
};
1332
1333
template <>
1334
struct Div_SIMD<short>
1335
{
1336
bool haveSIMD;
1337
Div_SIMD() { haveSIMD = hasSIMD128(); }
1338
1339
int operator() (const short * src1, const short * src2, short * dst, int width, double scale) const
1340
{
1341
int x = 0;
1342
1343
if (!haveSIMD)
1344
return x;
1345
1346
v_float32x4 v_scale = v_setall_f32((float)scale);
1347
v_int16x8 v_zero = v_setzero_s16();
1348
1349
for ( ; x <= width - 8; x += 8)
1350
{
1351
v_int16x8 v_src1 = v_load(src1 + x);
1352
v_int16x8 v_src2 = v_load(src2 + x);
1353
1354
v_int32x4 t0, t1, t2, t3;
1355
v_expand(v_src1, t0, t1);
1356
v_expand(v_src2, t2, t3);
1357
1358
v_float32x4 f0 = v_cvt_f32(t0);
1359
v_float32x4 f1 = v_cvt_f32(t1);
1360
1361
v_float32x4 f2 = v_cvt_f32(t2);
1362
v_float32x4 f3 = v_cvt_f32(t3);
1363
1364
f0 = f0 * v_scale / f2;
1365
f1 = f1 * v_scale / f3;
1366
1367
v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
1368
v_int16x8 res = v_pack(i0, i1);
1369
1370
res = v_select(v_src2 == v_zero, v_zero, res);
1371
v_store(dst + x, res);
1372
}
1373
1374
return x;
1375
}
1376
};
1377
1378
template <>
1379
struct Div_SIMD<int>
1380
{
1381
bool haveSIMD;
1382
Div_SIMD() { haveSIMD = hasSIMD128(); }
1383
1384
int operator() (const int * src1, const int * src2, int * dst, int width, double scale) const
1385
{
1386
int x = 0;
1387
1388
if (!haveSIMD)
1389
return x;
1390
1391
v_float32x4 v_scale = v_setall_f32((float)scale);
1392
v_int32x4 v_zero = v_setzero_s32();
1393
1394
for ( ; x <= width - 8; x += 8)
1395
{
1396
v_int32x4 t0 = v_load(src1 + x);
1397
v_int32x4 t1 = v_load(src1 + x + 4);
1398
v_int32x4 t2 = v_load(src2 + x);
1399
v_int32x4 t3 = v_load(src2 + x + 4);
1400
1401
v_float32x4 f0 = v_cvt_f32(t0);
1402
v_float32x4 f1 = v_cvt_f32(t1);
1403
v_float32x4 f2 = v_cvt_f32(t2);
1404
v_float32x4 f3 = v_cvt_f32(t3);
1405
1406
f0 = f0 * v_scale / f2;
1407
f1 = f1 * v_scale / f3;
1408
1409
v_int32x4 res0 = v_round(f0), res1 = v_round(f1);
1410
1411
res0 = v_select(t2 == v_zero, v_zero, res0);
1412
res1 = v_select(t3 == v_zero, v_zero, res1);
1413
v_store(dst + x, res0);
1414
v_store(dst + x + 4, res1);
1415
}
1416
1417
return x;
1418
}
1419
};
1420
1421
1422
template <>
1423
struct Div_SIMD<float>
1424
{
1425
bool haveSIMD;
1426
Div_SIMD() { haveSIMD = hasSIMD128(); }
1427
1428
int operator() (const float * src1, const float * src2, float * dst, int width, double scale) const
1429
{
1430
int x = 0;
1431
1432
if (!haveSIMD)
1433
return x;
1434
1435
v_float32x4 v_scale = v_setall_f32((float)scale);
1436
1437
for ( ; x <= width - 8; x += 8)
1438
{
1439
v_float32x4 f0 = v_load(src1 + x);
1440
v_float32x4 f1 = v_load(src1 + x + 4);
1441
v_float32x4 f2 = v_load(src2 + x);
1442
v_float32x4 f3 = v_load(src2 + x + 4);
1443
1444
v_float32x4 res0 = f0 * v_scale / f2;
1445
v_float32x4 res1 = f1 * v_scale / f3;
1446
1447
v_store(dst + x, res0);
1448
v_store(dst + x + 4, res1);
1449
}
1450
1451
return x;
1452
}
1453
};
1454
1455
1456
///////////////////////// RECIPROCAL //////////////////////
1457
1458
template <>
1459
struct Recip_SIMD<uchar>
1460
{
1461
bool haveSIMD;
1462
Recip_SIMD() { haveSIMD = hasSIMD128(); }
1463
1464
int operator() (const uchar * src2, uchar * dst, int width, double scale) const
1465
{
1466
int x = 0;
1467
1468
if (!haveSIMD)
1469
return x;
1470
1471
v_float32x4 v_scale = v_setall_f32((float)scale);
1472
v_uint16x8 v_zero = v_setzero_u16();
1473
1474
for ( ; x <= width - 8; x += 8)
1475
{
1476
v_uint16x8 v_src2 = v_load_expand(src2 + x);
1477
1478
v_uint32x4 t0, t1;
1479
v_expand(v_src2, t0, t1);
1480
1481
v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
1482
v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
1483
1484
f0 = v_scale / f0;
1485
f1 = v_scale / f1;
1486
1487
v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
1488
v_uint16x8 res = v_pack_u(i0, i1);
1489
1490
res = v_select(v_src2 == v_zero, v_zero, res);
1491
v_pack_store(dst + x, res);
1492
}
1493
1494
return x;
1495
}
1496
};
1497
1498
1499
template <>
1500
struct Recip_SIMD<schar>
1501
{
1502
bool haveSIMD;
1503
Recip_SIMD() { haveSIMD = hasSIMD128(); }
1504
1505
int operator() (const schar * src2, schar * dst, int width, double scale) const
1506
{
1507
int x = 0;
1508
1509
if (!haveSIMD)
1510
return x;
1511
1512
v_float32x4 v_scale = v_setall_f32((float)scale);
1513
v_int16x8 v_zero = v_setzero_s16();
1514
1515
for ( ; x <= width - 8; x += 8)
1516
{
1517
v_int16x8 v_src2 = v_load_expand(src2 + x);
1518
1519
v_int32x4 t0, t1;
1520
v_expand(v_src2, t0, t1);
1521
1522
v_float32x4 f0 = v_cvt_f32(t0);
1523
v_float32x4 f1 = v_cvt_f32(t1);
1524
1525
f0 = v_scale / f0;
1526
f1 = v_scale / f1;
1527
1528
v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
1529
v_int16x8 res = v_pack(i0, i1);
1530
1531
res = v_select(v_src2 == v_zero, v_zero, res);
1532
v_pack_store(dst + x, res);
1533
}
1534
1535
return x;
1536
}
1537
};
1538
1539
1540
template <>
1541
struct Recip_SIMD<ushort>
1542
{
1543
bool haveSIMD;
1544
Recip_SIMD() { haveSIMD = hasSIMD128(); }
1545
1546
int operator() (const ushort * src2, ushort * dst, int width, double scale) const
1547
{
1548
int x = 0;
1549
1550
if (!haveSIMD)
1551
return x;
1552
1553
v_float32x4 v_scale = v_setall_f32((float)scale);
1554
v_uint16x8 v_zero = v_setzero_u16();
1555
1556
for ( ; x <= width - 8; x += 8)
1557
{
1558
v_uint16x8 v_src2 = v_load(src2 + x);
1559
1560
v_uint32x4 t0, t1;
1561
v_expand(v_src2, t0, t1);
1562
1563
v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
1564
v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
1565
1566
f0 = v_scale / f0;
1567
f1 = v_scale / f1;
1568
1569
v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
1570
v_uint16x8 res = v_pack_u(i0, i1);
1571
1572
res = v_select(v_src2 == v_zero, v_zero, res);
1573
v_store(dst + x, res);
1574
}
1575
1576
return x;
1577
}
1578
};
1579
1580
template <>
1581
struct Recip_SIMD<short>
1582
{
1583
bool haveSIMD;
1584
Recip_SIMD() { haveSIMD = hasSIMD128(); }
1585
1586
int operator() (const short * src2, short * dst, int width, double scale) const
1587
{
1588
int x = 0;
1589
1590
if (!haveSIMD)
1591
return x;
1592
1593
v_float32x4 v_scale = v_setall_f32((float)scale);
1594
v_int16x8 v_zero = v_setzero_s16();
1595
1596
for ( ; x <= width - 8; x += 8)
1597
{
1598
v_int16x8 v_src2 = v_load(src2 + x);
1599
1600
v_int32x4 t0, t1;
1601
v_expand(v_src2, t0, t1);
1602
1603
v_float32x4 f0 = v_cvt_f32(t0);
1604
v_float32x4 f1 = v_cvt_f32(t1);
1605
1606
f0 = v_scale / f0;
1607
f1 = v_scale / f1;
1608
1609
v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
1610
v_int16x8 res = v_pack(i0, i1);
1611
1612
res = v_select(v_src2 == v_zero, v_zero, res);
1613
v_store(dst + x, res);
1614
}
1615
1616
return x;
1617
}
1618
};
1619
1620
template <>
1621
struct Recip_SIMD<int>
1622
{
1623
bool haveSIMD;
1624
Recip_SIMD() { haveSIMD = hasSIMD128(); }
1625
1626
int operator() (const int * src2, int * dst, int width, double scale) const
1627
{
1628
int x = 0;
1629
1630
if (!haveSIMD)
1631
return x;
1632
1633
v_float32x4 v_scale = v_setall_f32((float)scale);
1634
v_int32x4 v_zero = v_setzero_s32();
1635
1636
for ( ; x <= width - 8; x += 8)
1637
{
1638
v_int32x4 t0 = v_load(src2 + x);
1639
v_int32x4 t1 = v_load(src2 + x + 4);
1640
1641
v_float32x4 f0 = v_cvt_f32(t0);
1642
v_float32x4 f1 = v_cvt_f32(t1);
1643
1644
f0 = v_scale / f0;
1645
f1 = v_scale / f1;
1646
1647
v_int32x4 res0 = v_round(f0), res1 = v_round(f1);
1648
1649
res0 = v_select(t0 == v_zero, v_zero, res0);
1650
res1 = v_select(t1 == v_zero, v_zero, res1);
1651
v_store(dst + x, res0);
1652
v_store(dst + x + 4, res1);
1653
}
1654
1655
return x;
1656
}
1657
};
1658
1659
1660
template <>
1661
struct Recip_SIMD<float>
1662
{
1663
bool haveSIMD;
1664
Recip_SIMD() { haveSIMD = hasSIMD128(); }
1665
1666
int operator() (const float * src2, float * dst, int width, double scale) const
1667
{
1668
int x = 0;
1669
1670
if (!haveSIMD)
1671
return x;
1672
1673
v_float32x4 v_scale = v_setall_f32((float)scale);
1674
1675
for ( ; x <= width - 8; x += 8)
1676
{
1677
v_float32x4 f0 = v_load(src2 + x);
1678
v_float32x4 f1 = v_load(src2 + x + 4);
1679
1680
v_float32x4 res0 = v_scale / f0;
1681
v_float32x4 res1 = v_scale / f1;
1682
1683
v_store(dst + x, res0);
1684
v_store(dst + x + 4, res1);
1685
}
1686
1687
return x;
1688
}
1689
};
1690
1691
#if CV_SIMD128_64F
1692
1693
template <>
1694
struct Div_SIMD<double>
1695
{
1696
bool haveSIMD;
1697
Div_SIMD() { haveSIMD = hasSIMD128(); }
1698
1699
int operator() (const double * src1, const double * src2, double * dst, int width, double scale) const
1700
{
1701
int x = 0;
1702
1703
if (!haveSIMD)
1704
return x;
1705
1706
v_float64x2 v_scale = v_setall_f64(scale);
1707
1708
for ( ; x <= width - 4; x += 4)
1709
{
1710
v_float64x2 f0 = v_load(src1 + x);
1711
v_float64x2 f1 = v_load(src1 + x + 2);
1712
v_float64x2 f2 = v_load(src2 + x);
1713
v_float64x2 f3 = v_load(src2 + x + 2);
1714
1715
v_float64x2 res0 = f0 * v_scale / f2;
1716
v_float64x2 res1 = f1 * v_scale / f3;
1717
1718
v_store(dst + x, res0);
1719
v_store(dst + x + 2, res1);
1720
}
1721
1722
return x;
1723
}
1724
};
1725
1726
template <>
1727
struct Recip_SIMD<double>
1728
{
1729
bool haveSIMD;
1730
Recip_SIMD() { haveSIMD = hasSIMD128(); }
1731
1732
int operator() (const double * src2, double * dst, int width, double scale) const
1733
{
1734
int x = 0;
1735
1736
if (!haveSIMD)
1737
return x;
1738
1739
v_float64x2 v_scale = v_setall_f64(scale);
1740
1741
for ( ; x <= width - 4; x += 4)
1742
{
1743
v_float64x2 f0 = v_load(src2 + x);
1744
v_float64x2 f1 = v_load(src2 + x + 2);
1745
1746
v_float64x2 res0 = v_scale / f0;
1747
v_float64x2 res1 = v_scale / f1;
1748
1749
v_store(dst + x, res0);
1750
v_store(dst + x + 2, res1);
1751
}
1752
1753
return x;
1754
}
1755
};
1756
1757
#endif
1758
1759
#endif
1760
1761
1762
template <typename T, typename WT>
1763
struct AddWeighted_SIMD
1764
{
1765
int operator() (const T *, const T *, T *, int, WT, WT, WT) const
1766
{
1767
return 0;
1768
}
1769
};
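// AddWeighted_SIMD<T, WT> vectorizes dst[i] = saturate_cast<T>(src1[i]*alpha +
// src2[i]*beta + gamma) and, like the other *_SIMD functors, returns the number
// of elements handled.  A sketch of the assumed calling pattern:
//
//     AddWeighted_SIMD<short, float> vop;
//     int x = vop(src1, src2, dst, width, alpha, beta, gamma);
//     for( ; x < width; x++ )                   // scalar tail
//         dst[x] = saturate_cast<short>(src1[x]*alpha + src2[x]*beta + gamma);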
1770
1771
#if CV_SSE2
1772
1773
template <>
1774
struct AddWeighted_SIMD<schar, float>
1775
{
1776
AddWeighted_SIMD()
1777
{
1778
haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
1779
}
1780
1781
int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const
1782
{
1783
int x = 0;
1784
1785
if (!haveSSE2)
1786
return x;
1787
1788
__m128i v_zero = _mm_setzero_si128();
1789
__m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
1790
v_gamma = _mm_set1_ps(gamma);
1791
1792
for( ; x <= width - 8; x += 8 )
1793
{
1794
__m128i v_src1 = _mm_loadl_epi64((const __m128i *)(src1 + x));
1795
__m128i v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x));
1796
1797
__m128i v_src1_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8);
1798
__m128i v_src2_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);
1799
1800
__m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1_p), 16)), v_alpha);
1801
v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
1802
_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2_p), 16)), v_beta));
1803
1804
__m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1_p), 16)), v_alpha);
1805
v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
1806
_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2_p), 16)), v_beta));
1807
1808
__m128i v_dst16 = _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0),
1809
_mm_cvtps_epi32(v_dstf1));
1810
1811
_mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst16, v_zero));
1812
}
1813
1814
return x;
1815
}
1816
1817
bool haveSSE2;
1818
};
1819
1820
template <>
1821
struct AddWeighted_SIMD<short, float>
1822
{
1823
AddWeighted_SIMD()
1824
{
1825
haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
1826
}
1827
1828
int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const
1829
{
1830
int x = 0;
1831
1832
if (!haveSSE2)
1833
return x;
1834
1835
__m128i v_zero = _mm_setzero_si128();
1836
__m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
1837
v_gamma = _mm_set1_ps(gamma);
1838
1839
for( ; x <= width - 8; x += 8 )
1840
{
1841
__m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x));
1842
__m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
1843
1844
__m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), v_alpha);
1845
v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
1846
_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)), v_beta));
1847
1848
__m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), v_alpha);
1849
v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
1850
_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)), v_beta));
1851
1852
_mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0),
1853
_mm_cvtps_epi32(v_dstf1)));
1854
}
1855
1856
return x;
1857
}
1858
1859
bool haveSSE2;
1860
};
1861
1862
#if CV_SSE4_1
1863
1864
template <>
1865
struct AddWeighted_SIMD<ushort, float>
1866
{
1867
AddWeighted_SIMD()
1868
{
1869
haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
1870
}
1871
1872
int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const
1873
{
1874
int x = 0;
1875
1876
if (!haveSSE4_1)
1877
return x;
1878
1879
__m128i v_zero = _mm_setzero_si128();
1880
__m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
1881
v_gamma = _mm_set1_ps(gamma);
1882
1883
for( ; x <= width - 8; x += 8 )
1884
{
1885
__m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x));
1886
__m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
1887
1888
__m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)), v_alpha);
1889
v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
1890
_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero)), v_beta));
1891
1892
__m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)), v_alpha);
1893
v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
1894
_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero)), v_beta));
1895
1896
_mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(_mm_cvtps_epi32(v_dstf0),
1897
_mm_cvtps_epi32(v_dstf1)));
1898
}
1899
1900
return x;
1901
}
1902
1903
bool haveSSE4_1;
1904
};
1905
1906
#endif
1907
1908
#elif CV_NEON
1909
1910
template <>
1911
struct AddWeighted_SIMD<schar, float>
1912
{
1913
int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const
1914
{
1915
int x = 0;
1916
1917
float32x4_t g = vdupq_n_f32 (gamma);
1918
1919
for( ; x <= width - 8; x += 8 )
1920
{
1921
int8x8_t in1 = vld1_s8(src1 + x);
1922
int16x8_t in1_16 = vmovl_s8(in1);
1923
float32x4_t in1_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in1_16)));
1924
float32x4_t in1_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in1_16)));
1925
1926
int8x8_t in2 = vld1_s8(src2+x);
1927
int16x8_t in2_16 = vmovl_s8(in2);
1928
float32x4_t in2_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in2_16)));
1929
float32x4_t in2_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in2_16)));
1930
1931
float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta));
1932
float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta));
1933
out_f_l = vaddq_f32(out_f_l, g);
1934
out_f_h = vaddq_f32(out_f_h, g);
1935
1936
int16x4_t out_16_l = vqmovn_s32(cv_vrndq_s32_f32(out_f_l));
1937
int16x4_t out_16_h = vqmovn_s32(cv_vrndq_s32_f32(out_f_h));
1938
1939
int16x8_t out_16 = vcombine_s16(out_16_l, out_16_h);
1940
int8x8_t out = vqmovn_s16(out_16);
1941
1942
vst1_s8(dst + x, out);
1943
}
1944
1945
return x;
1946
}
1947
};
1948
1949
template <>
1950
struct AddWeighted_SIMD<ushort, float>
1951
{
1952
int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const
1953
{
1954
int x = 0;
1955
1956
float32x4_t g = vdupq_n_f32(gamma);
1957
1958
for( ; x <= width - 8; x += 8 )
1959
{
1960
uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x);
1961
1962
float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), alpha);
1963
float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))), beta);
1964
uint16x4_t v_dst1 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));
1965
1966
v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), alpha);
1967
v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))), beta);
1968
uint16x4_t v_dst2 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));
1969
1970
vst1q_u16(dst + x, vcombine_u16(v_dst1, v_dst2));
1971
}
1972
1973
return x;
1974
}
1975
};
1976
1977
template <>
1978
struct AddWeighted_SIMD<short, float>
1979
{
1980
int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const
1981
{
1982
int x = 0;
1983
1984
float32x4_t g = vdupq_n_f32(gamma);
1985
1986
for( ; x <= width - 8; x += 8 )
1987
{
1988
int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x);
1989
1990
float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), alpha);
1991
float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))), beta);
1992
int16x4_t v_dst1 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));
1993
1994
v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), alpha);
1995
v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))), beta);
1996
int16x4_t v_dst2 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));
1997
1998
vst1q_s16(dst + x, vcombine_s16(v_dst1, v_dst2));
1999
}
2000
2001
return x;
2002
}
2003
};
2004
2005
#endif
2006
2007
}
2008
2009
#endif // __OPENCV_ARITHM_SIMD_HPP__
2010
2011