Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/3rdparty/carotene/src/accumulate.cpp
16337 views
1
/*
2
* By downloading, copying, installing or using the software you agree to this license.
3
* If you do not agree to this license, do not download, install,
4
* copy or use the software.
5
*
6
*
7
* License Agreement
8
* For Open Source Computer Vision Library
9
* (3-clause BSD License)
10
*
11
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
12
* Third party copyrights are property of their respective owners.
13
*
14
* Redistribution and use in source and binary forms, with or without modification,
15
* are permitted provided that the following conditions are met:
16
*
17
* * Redistributions of source code must retain the above copyright notice,
18
* this list of conditions and the following disclaimer.
19
*
20
* * Redistributions in binary form must reproduce the above copyright notice,
21
* this list of conditions and the following disclaimer in the documentation
22
* and/or other materials provided with the distribution.
23
*
24
* * Neither the names of the copyright holders nor the names of the contributors
25
* may be used to endorse or promote products derived from this software
26
* without specific prior written permission.
27
*
28
* This software is provided by the copyright holders and contributors "as is" and
29
* any express or implied warranties, including, but not limited to, the implied
30
* warranties of merchantability and fitness for a particular purpose are disclaimed.
31
* In no event shall copyright holders or contributors be liable for any direct,
32
* indirect, incidental, special, exemplary, or consequential damages
33
* (including, but not limited to, procurement of substitute goods or services;
34
* loss of use, data, or profits; or business interruption) however caused
35
* and on any theory of liability, whether in contract, strict liability,
36
* or tort (including negligence or otherwise) arising in any way out of
37
* the use of this software, even if advised of the possibility of such damage.
38
*/
39
40
41
#include "common.hpp"
42
#include "vtransform.hpp"
43
44
#include <cstring>
45
46
namespace CAROTENE_NS {
47
48
void accumulate(const Size2D &size,
49
const u8 *srcBase, ptrdiff_t srcStride,
50
s16 *dstBase, ptrdiff_t dstStride)
51
{
52
internal::assertSupportedConfiguration();
53
#ifdef CAROTENE_NEON
54
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
55
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
56
57
for (size_t i = 0; i < size.height; ++i)
58
{
59
const u8* src = internal::getRowPtr(srcBase, srcStride, i);
60
s16* dst = internal::getRowPtr(dstBase, dstStride, i);
61
size_t j = 0;
62
63
for (; j < roiw16; j += 16)
64
{
65
internal::prefetch(src + j);
66
internal::prefetch(dst + j);
67
uint8x16_t v_src = vld1q_u8(src + j);
68
int16x8_t v_dst0 = vld1q_s16(dst + j);
69
int16x8_t v_dst1 = vld1q_s16(dst + j + 8);
70
int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
71
int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
72
v_dst0 = vqaddq_s16(v_dst0, v_src0);
73
v_dst1 = vqaddq_s16(v_dst1, v_src1);
74
vst1q_s16(dst + j, v_dst0);
75
vst1q_s16(dst + j + 8, v_dst1);
76
}
77
for (; j < roiw8; j += 8)
78
{
79
uint8x8_t v_src = vld1_u8(src + j);
80
int16x8_t v_src16 = vreinterpretq_s16_u16(vmovl_u8(v_src));
81
int16x8_t v_dst = vld1q_s16(dst + j);
82
v_dst = vqaddq_s16(v_dst, v_src16);
83
vst1q_s16(dst + j, v_dst);
84
}
85
86
for (; j < size.width; j++)
87
dst[j] = internal::saturate_cast<s16>(src[j] + dst[j]);
88
}
89
#else
90
(void)size;
91
(void)srcBase;
92
(void)srcStride;
93
(void)dstBase;
94
(void)dstStride;
95
#endif
96
}
97
98
#ifdef CAROTENE_NEON
99
100
namespace {
101
102
template <int shift>
103
void accumulateSquareConst(const Size2D &size,
104
const u8 *srcBase, ptrdiff_t srcStride,
105
s16 *dstBase, ptrdiff_t dstStride)
106
{
107
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
108
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
109
110
for (size_t i = 0; i < size.height; ++i)
111
{
112
const u8* src = internal::getRowPtr(srcBase, srcStride, i);
113
s16* dst = internal::getRowPtr(dstBase, dstStride, i);
114
size_t j = 0;
115
116
for (; j < roiw16; j += 16)
117
{
118
internal::prefetch(src + j);
119
internal::prefetch(dst + j);
120
uint8x16_t v_src = vld1q_u8(src + j);
121
int16x8_t v_dst0 = vld1q_s16(dst + j), v_dst1 = vld1q_s16(dst + j + 8);
122
int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
123
int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
124
125
int16x4_t v_srclo = vget_low_s16(v_src0), v_srchi = vget_high_s16(v_src0);
126
v_dst0 = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst0))),
127
vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst0))));
128
129
v_srclo = vget_low_s16(v_src1);
130
v_srchi = vget_high_s16(v_src1);
131
v_dst1 = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst1))),
132
vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst1))));
133
134
vst1q_s16(dst + j, v_dst0);
135
vst1q_s16(dst + j + 8, v_dst1);
136
}
137
for (; j < roiw8; j += 8)
138
{
139
int16x8_t v_src = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));
140
int16x8_t v_dst = vld1q_s16(dst + j);
141
int16x4_t v_srclo = vget_low_s16(v_src), v_srchi = vget_high_s16(v_src);
142
v_dst = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst))),
143
vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst))));
144
vst1q_s16(dst + j, v_dst);
145
}
146
147
for (; j < size.width; j++)
148
{
149
s32 srcVal = src[j];
150
dst[j] = internal::saturate_cast<s16>(dst[j] + ((srcVal * srcVal) >> shift));
151
}
152
}
153
}
154
155
template <>
156
void accumulateSquareConst<0>(const Size2D &size,
157
const u8 *srcBase, ptrdiff_t srcStride,
158
s16 *dstBase, ptrdiff_t dstStride)
159
{
160
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
161
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
162
163
for (size_t i = 0; i < size.height; ++i)
164
{
165
const u8* src = internal::getRowPtr(srcBase, srcStride, i);
166
s16* dst = internal::getRowPtr(dstBase, dstStride, i);
167
size_t j = 0;
168
169
for (; j < roiw16; j += 16)
170
{
171
internal::prefetch(src + j);
172
internal::prefetch(dst + j);
173
uint8x16_t v_src = vld1q_u8(src + j);
174
int16x8_t v_dst0 = vld1q_s16(dst + j), v_dst1 = vld1q_s16(dst + j + 8);
175
int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
176
int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
177
178
int16x4_t v_srclo = vget_low_s16(v_src0), v_srchi = vget_high_s16(v_src0);
179
v_dst0 = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst0))),
180
vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst0))));
181
182
v_srclo = vget_low_s16(v_src1);
183
v_srchi = vget_high_s16(v_src1);
184
v_dst1 = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst1))),
185
vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst1))));
186
187
vst1q_s16(dst + j, v_dst0);
188
vst1q_s16(dst + j + 8, v_dst1);
189
}
190
for (; j < roiw8; j += 8)
191
{
192
int16x8_t v_src = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));
193
int16x8_t v_dst = vld1q_s16(dst + j);
194
int16x4_t v_srclo = vget_low_s16(v_src), v_srchi = vget_high_s16(v_src);
195
v_dst = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst))),
196
vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst))));
197
vst1q_s16(dst + j, v_dst);
198
}
199
200
for (; j < size.width; j++)
201
{
202
s32 srcVal = src[j];
203
dst[j] = internal::saturate_cast<s16>(dst[j] + srcVal * srcVal);
204
}
205
}
206
}
207
208
// Common signature of the accumulateSquareConst<> instantiations above; used
// to build a runtime dispatch table indexed by the shift value.
typedef void (* accumulateSquareConstFunc)(const Size2D &size,
                                           const u8 *srcBase, ptrdiff_t srcStride,
                                           s16 *dstBase, ptrdiff_t dstStride);
211
212
} // namespace
213
214
#endif
215
216
void accumulateSquare(const Size2D &size,
217
const u8 *srcBase, ptrdiff_t srcStride,
218
s16 *dstBase, ptrdiff_t dstStride,
219
u32 shift)
220
{
221
if (shift >= 16)
222
{
223
for (size_t i = 0; i < size.height; ++i)
224
{
225
s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
226
std::memset(dst, 0, sizeof(s16) * size.width);
227
}
228
return;
229
}
230
231
internal::assertSupportedConfiguration();
232
233
#ifdef CAROTENE_NEON
234
// this ugly contruction is needed to avoid:
235
// /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant
236
// return (int16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 1);
237
238
accumulateSquareConstFunc funcs[16] =
239
{
240
accumulateSquareConst<0>,
241
accumulateSquareConst<1>,
242
accumulateSquareConst<2>,
243
accumulateSquareConst<3>,
244
accumulateSquareConst<4>,
245
accumulateSquareConst<5>,
246
accumulateSquareConst<6>,
247
accumulateSquareConst<7>,
248
accumulateSquareConst<8>,
249
accumulateSquareConst<9>,
250
accumulateSquareConst<10>,
251
accumulateSquareConst<11>,
252
accumulateSquareConst<12>,
253
accumulateSquareConst<13>,
254
accumulateSquareConst<14>,
255
accumulateSquareConst<15>
256
}, func = funcs[shift];
257
258
func(size, srcBase, srcStride, dstBase, dstStride);
259
#else
260
(void)size;
261
(void)srcBase;
262
(void)srcStride;
263
(void)dstBase;
264
(void)dstStride;
265
(void)shift;
266
#endif
267
}
268
269
#ifdef CAROTENE_NEON
270
271
namespace {
272
273
struct AccumulateWeightedHalf
274
{
275
typedef u8 type;
276
277
void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1,
278
uint8x16_t & v_dst) const
279
{
280
v_dst = vhaddq_u8(v_src0, v_src1);
281
}
282
283
void operator() (const uint8x8_t & v_src0, const uint8x8_t & v_src1,
284
uint8x8_t & v_dst) const
285
{
286
v_dst = vhadd_u8(v_src0, v_src1);
287
}
288
289
void operator() (const u8 * src0, const u8 * src1, u8 * dst) const
290
{
291
dst[0] = ((u16)(src0[0]) + src1[0]) >> 1;
292
}
293
};
294
295
struct AccumulateWeighted
296
{
297
typedef u8 type;
298
299
float alpha, beta;
300
float32x4_t v_alpha, v_beta;
301
302
explicit AccumulateWeighted(float _alpha) :
303
alpha(_alpha), beta(1 - _alpha)
304
{
305
v_alpha = vdupq_n_f32(alpha);
306
v_beta = vdupq_n_f32(beta);
307
}
308
309
void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1,
310
uint8x16_t & v_dst) const
311
{
312
uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0));
313
uint16x8_t v_src1_p = vmovl_u8(vget_low_u8(v_src1));
314
float32x4_t v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p))), v_beta),
315
v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))));
316
float32x4_t v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p))), v_beta),
317
v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))));
318
uint16x8_t v_dst0 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),
319
vmovn_u32(vcvtq_u32_f32(v_dst1f)));
320
321
v_src0_p = vmovl_u8(vget_high_u8(v_src0));
322
v_src1_p = vmovl_u8(vget_high_u8(v_src1));
323
v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p))), v_beta),
324
v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))));
325
v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p))), v_beta),
326
v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))));
327
uint16x8_t v_dst1 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),
328
vmovn_u32(vcvtq_u32_f32(v_dst1f)));
329
330
v_dst = vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1));
331
}
332
333
void operator() (const uint8x8_t & _v_src0, const uint8x8_t & _v_src1,
334
uint8x8_t & v_dst) const
335
{
336
uint16x8_t v_src0 = vmovl_u8(_v_src0), v_src1 = vmovl_u8(_v_src1);
337
338
float32x4_t v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), v_beta),
339
v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))));
340
float32x4_t v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), v_beta),
341
v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))));
342
uint16x8_t _v_dst = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),
343
vmovn_u32(vcvtq_u32_f32(v_dst1f)));
344
345
v_dst = vmovn_u16(_v_dst);
346
}
347
348
void operator() (const u8 * src0, const u8 * src1, u8 * dst) const
349
{
350
dst[0] = beta * src1[0] + alpha * src0[0];
351
}
352
};
353
354
} // namespace
355
356
#endif
357
358
void accumulateWeighted(const Size2D &size,
359
const u8 *srcBase, ptrdiff_t srcStride,
360
u8 *dstBase, ptrdiff_t dstStride,
361
f32 alpha)
362
{
363
if (alpha == 0.0f)
364
return;
365
if (alpha == 1.0f)
366
{
367
for (size_t i = 0; i < size.height; ++i)
368
{
369
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
370
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
371
std::memcpy(dst, src, sizeof(u8) * size.width);
372
}
373
return;
374
}
375
376
internal::assertSupportedConfiguration();
377
378
#ifdef CAROTENE_NEON
379
// in this case we can use the following scheme:
380
// dst[p] = (src[p] + dst[p]) >> 1
381
// which is faster
382
if (alpha == 0.5f)
383
{
384
internal::vtransform(size,
385
srcBase, srcStride,
386
dstBase, dstStride,
387
dstBase, dstStride,
388
AccumulateWeightedHalf());
389
390
return;
391
}
392
393
internal::vtransform(size,
394
srcBase, srcStride,
395
dstBase, dstStride,
396
dstBase, dstStride,
397
AccumulateWeighted(alpha));
398
#else
399
(void)size;
400
(void)srcBase;
401
(void)srcStride;
402
(void)dstBase;
403
(void)dstStride;
404
(void)alpha;
405
#endif
406
}
407
408
} //namespace CAROTENE_NS
409
410