CoCalc -- accumulate.cpp

GitHub Repository: Tetragramm/opencv
Path: blob/master/3rdparty/carotene/src/accumulate.cpp
¹⁶³³⁷ views
1
/*
2
 * By downloading, copying, installing or using the software you agree to this license.
3
 * If you do not agree to this license, do not download, install,
4
 * copy or use the software.
5
 *
6
 *
7
 *                           License Agreement
8
 *                For Open Source Computer Vision Library
9
 *                        (3-clause BSD License)
10
 *
11
 * Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
12
 * Third party copyrights are property of their respective owners.
13
 *
14
 * Redistribution and use in source and binary forms, with or without modification,
15
 * are permitted provided that the following conditions are met:
16
 *
17
 *   * Redistributions of source code must retain the above copyright notice,
18
 *     this list of conditions and the following disclaimer.
19
 *
20
 *   * Redistributions in binary form must reproduce the above copyright notice,
21
 *     this list of conditions and the following disclaimer in the documentation
22
 *     and/or other materials provided with the distribution.
23
 *
24
 *   * Neither the names of the copyright holders nor the names of the contributors
25
 *     may be used to endorse or promote products derived from this software
26
 *     without specific prior written permission.
27
 *
28
 * This software is provided by the copyright holders and contributors "as is" and
29
 * any express or implied warranties, including, but not limited to, the implied
30
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
31
 * In no event shall copyright holders or contributors be liable for any direct,
32
 * indirect, incidental, special, exemplary, or consequential damages
33
 * (including, but not limited to, procurement of substitute goods or services;
34
 * loss of use, data, or profits; or business interruption) however caused
35
 * and on any theory of liability, whether in contract, strict liability,
36
 * or tort (including negligence or otherwise) arising in any way out of
37
 * the use of this software, even if advised of the possibility of such damage.
38
 */
39

40

41
#include "common.hpp"
42
#include "vtransform.hpp"
43

44
#include <cstring>
45

46
namespace CAROTENE_NS {
47

48
void accumulate(const Size2D &size,
49
                const u8 *srcBase, ptrdiff_t srcStride,
50
                s16 *dstBase, ptrdiff_t dstStride)
51
{
52
    internal::assertSupportedConfiguration();
53
#ifdef CAROTENE_NEON
54
    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
55
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
56

57
    for (size_t i = 0; i < size.height; ++i)
58
    {
59
        const u8* src = internal::getRowPtr(srcBase, srcStride, i);
60
        s16* dst = internal::getRowPtr(dstBase, dstStride, i);
61
        size_t j = 0;
62

63
        for (; j < roiw16; j += 16)
64
        {
65
            internal::prefetch(src + j);
66
            internal::prefetch(dst + j);
67
            uint8x16_t v_src = vld1q_u8(src + j);
68
            int16x8_t v_dst0 = vld1q_s16(dst + j);
69
            int16x8_t v_dst1 = vld1q_s16(dst + j + 8);
70
            int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
71
            int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
72
            v_dst0 = vqaddq_s16(v_dst0, v_src0);
73
            v_dst1 = vqaddq_s16(v_dst1, v_src1);
74
            vst1q_s16(dst + j, v_dst0);
75
            vst1q_s16(dst + j + 8, v_dst1);
76
        }
77
        for (; j < roiw8; j += 8)
78
        {
79
            uint8x8_t v_src = vld1_u8(src + j);
80
            int16x8_t v_src16 = vreinterpretq_s16_u16(vmovl_u8(v_src));
81
            int16x8_t v_dst = vld1q_s16(dst + j);
82
            v_dst = vqaddq_s16(v_dst, v_src16);
83
            vst1q_s16(dst + j, v_dst);
84
        }
85

86
        for (; j < size.width; j++)
87
            dst[j] = internal::saturate_cast<s16>(src[j] + dst[j]);
88
    }
89
#else
90
    (void)size;
91
    (void)srcBase;
92
    (void)srcStride;
93
    (void)dstBase;
94
    (void)dstStride;
95
#endif
96
}
97

98
#ifdef CAROTENE_NEON
99

100
namespace {
101

102
template <int shift>
103
void accumulateSquareConst(const Size2D &size,
104
                           const u8 *srcBase, ptrdiff_t srcStride,
105
                           s16 *dstBase, ptrdiff_t dstStride)
106
{
107
    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
108
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
109

110
    for (size_t i = 0; i < size.height; ++i)
111
    {
112
        const u8* src = internal::getRowPtr(srcBase, srcStride, i);
113
        s16* dst = internal::getRowPtr(dstBase, dstStride, i);
114
        size_t j = 0;
115

116
        for (; j < roiw16; j += 16)
117
        {
118
            internal::prefetch(src + j);
119
            internal::prefetch(dst + j);
120
            uint8x16_t v_src = vld1q_u8(src + j);
121
            int16x8_t v_dst0 = vld1q_s16(dst + j), v_dst1 = vld1q_s16(dst + j + 8);
122
            int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
123
            int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
124

125
            int16x4_t v_srclo = vget_low_s16(v_src0), v_srchi = vget_high_s16(v_src0);
126
            v_dst0 = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst0))),
127
                                  vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst0))));
128

129
            v_srclo = vget_low_s16(v_src1);
130
            v_srchi = vget_high_s16(v_src1);
131
            v_dst1 = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst1))),
132
                                  vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst1))));
133

134
            vst1q_s16(dst + j, v_dst0);
135
            vst1q_s16(dst + j + 8, v_dst1);
136
        }
137
        for (; j < roiw8; j += 8)
138
        {
139
            int16x8_t v_src = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));
140
            int16x8_t v_dst = vld1q_s16(dst + j);
141
            int16x4_t v_srclo = vget_low_s16(v_src), v_srchi = vget_high_s16(v_src);
142
            v_dst = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst))),
143
                                 vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst))));
144
            vst1q_s16(dst + j, v_dst);
145
        }
146

147
        for (; j < size.width; j++)
148
        {
149
            s32 srcVal = src[j];
150
            dst[j] = internal::saturate_cast<s16>(dst[j] + ((srcVal * srcVal) >> shift));
151
        }
152
    }
153
}
154

155
template <>
156
void accumulateSquareConst<0>(const Size2D &size,
157
                              const u8 *srcBase, ptrdiff_t srcStride,
158
                              s16 *dstBase, ptrdiff_t dstStride)
159
{
160
    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
161
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
162

163
    for (size_t i = 0; i < size.height; ++i)
164
    {
165
        const u8* src = internal::getRowPtr(srcBase, srcStride, i);
166
        s16* dst = internal::getRowPtr(dstBase, dstStride, i);
167
        size_t j = 0;
168

169
        for (; j < roiw16; j += 16)
170
        {
171
            internal::prefetch(src + j);
172
            internal::prefetch(dst + j);
173
            uint8x16_t v_src = vld1q_u8(src + j);
174
            int16x8_t v_dst0 = vld1q_s16(dst + j), v_dst1 = vld1q_s16(dst + j + 8);
175
            int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
176
            int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
177

178
            int16x4_t v_srclo = vget_low_s16(v_src0), v_srchi = vget_high_s16(v_src0);
179
            v_dst0 = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst0))),
180
                                  vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst0))));
181

182
            v_srclo = vget_low_s16(v_src1);
183
            v_srchi = vget_high_s16(v_src1);
184
            v_dst1 = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst1))),
185
                                  vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst1))));
186

187
            vst1q_s16(dst + j, v_dst0);
188
            vst1q_s16(dst + j + 8, v_dst1);
189
        }
190
        for (; j < roiw8; j += 8)
191
        {
192
            int16x8_t v_src = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));
193
            int16x8_t v_dst = vld1q_s16(dst + j);
194
            int16x4_t v_srclo = vget_low_s16(v_src), v_srchi = vget_high_s16(v_src);
195
            v_dst = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst))),
196
                                 vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst))));
197
            vst1q_s16(dst + j, v_dst);
198
        }
199

200
        for (; j < size.width; j++)
201
        {
202
            s32 srcVal = src[j];
203
            dst[j] = internal::saturate_cast<s16>(dst[j] + srcVal * srcVal);
204
        }
205
    }
206
}
207

208
typedef void (* accumulateSquareConstFunc)(const Size2D &size,
209
                                           const u8 *srcBase, ptrdiff_t srcStride,
210
                                           s16 *dstBase, ptrdiff_t dstStride);
211

212
} // namespace
213

214
#endif
215

216
void accumulateSquare(const Size2D &size,
217
                      const u8 *srcBase, ptrdiff_t srcStride,
218
                      s16 *dstBase, ptrdiff_t dstStride,
219
                      u32 shift)
220
{
221
    if (shift >= 16)
222
    {
223
        for (size_t i = 0; i < size.height; ++i)
224
        {
225
            s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
226
            std::memset(dst, 0, sizeof(s16) * size.width);
227
        }
228
        return;
229
    }
230

231
    internal::assertSupportedConfiguration();
232

233
#ifdef CAROTENE_NEON
234
    // this ugly contruction is needed to avoid:
235
    // /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant
236
    // return (int16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 1);
237

238
    accumulateSquareConstFunc funcs[16] =
239
    {
240
        accumulateSquareConst<0>,
241
        accumulateSquareConst<1>,
242
        accumulateSquareConst<2>,
243
        accumulateSquareConst<3>,
244
        accumulateSquareConst<4>,
245
        accumulateSquareConst<5>,
246
        accumulateSquareConst<6>,
247
        accumulateSquareConst<7>,
248
        accumulateSquareConst<8>,
249
        accumulateSquareConst<9>,
250
        accumulateSquareConst<10>,
251
        accumulateSquareConst<11>,
252
        accumulateSquareConst<12>,
253
        accumulateSquareConst<13>,
254
        accumulateSquareConst<14>,
255
        accumulateSquareConst<15>
256
    }, func = funcs[shift];
257

258
    func(size, srcBase, srcStride, dstBase, dstStride);
259
#else
260
    (void)size;
261
    (void)srcBase;
262
    (void)srcStride;
263
    (void)dstBase;
264
    (void)dstStride;
265
    (void)shift;
266
#endif
267
}
268

269
#ifdef CAROTENE_NEON
270

271
namespace {
272

273
struct AccumulateWeightedHalf
274
{
275
    typedef u8 type;
276

277
    void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1,
278
                     uint8x16_t & v_dst) const
279
    {
280
        v_dst = vhaddq_u8(v_src0, v_src1);
281
    }
282

283
    void operator() (const uint8x8_t & v_src0, const uint8x8_t & v_src1,
284
                     uint8x8_t & v_dst) const
285
    {
286
        v_dst = vhadd_u8(v_src0, v_src1);
287
    }
288

289
    void operator() (const u8 * src0, const u8 * src1, u8 * dst) const
290
    {
291
        dst[0] = ((u16)(src0[0]) + src1[0]) >> 1;
292
    }
293
};
294

295
struct AccumulateWeighted
296
{
297
    typedef u8 type;
298

299
    float alpha, beta;
300
    float32x4_t v_alpha, v_beta;
301

302
    explicit AccumulateWeighted(float _alpha) :
303
        alpha(_alpha), beta(1 - _alpha)
304
    {
305
        v_alpha = vdupq_n_f32(alpha);
306
        v_beta = vdupq_n_f32(beta);
307
    }
308

309
    void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1,
310
                     uint8x16_t & v_dst) const
311
    {
312
        uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0));
313
        uint16x8_t v_src1_p = vmovl_u8(vget_low_u8(v_src1));
314
        float32x4_t v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p))), v_beta),
315
                                        v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))));
316
        float32x4_t v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p))), v_beta),
317
                                        v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))));
318
        uint16x8_t v_dst0 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),
319
                                         vmovn_u32(vcvtq_u32_f32(v_dst1f)));
320

321
        v_src0_p = vmovl_u8(vget_high_u8(v_src0));
322
        v_src1_p = vmovl_u8(vget_high_u8(v_src1));
323
        v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p))), v_beta),
324
                            v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))));
325
        v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p))), v_beta),
326
                            v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))));
327
        uint16x8_t v_dst1 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),
328
                                         vmovn_u32(vcvtq_u32_f32(v_dst1f)));
329

330
        v_dst = vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1));
331
    }
332

333
    void operator() (const uint8x8_t & _v_src0, const uint8x8_t & _v_src1,
334
                     uint8x8_t & v_dst) const
335
    {
336
        uint16x8_t v_src0 = vmovl_u8(_v_src0), v_src1 = vmovl_u8(_v_src1);
337

338
        float32x4_t v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), v_beta),
339
                                        v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))));
340
        float32x4_t v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), v_beta),
341
                                        v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))));
342
        uint16x8_t _v_dst = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),
343
                                        vmovn_u32(vcvtq_u32_f32(v_dst1f)));
344

345
        v_dst = vmovn_u16(_v_dst);
346
    }
347

348
    void operator() (const u8 * src0, const u8 * src1, u8 * dst) const
349
    {
350
        dst[0] = beta * src1[0] + alpha * src0[0];
351
    }
352
};
353

354
} // namespace
355

356
#endif
357

358
void accumulateWeighted(const Size2D &size,
359
                        const u8 *srcBase, ptrdiff_t srcStride,
360
                        u8 *dstBase, ptrdiff_t dstStride,
361
                        f32 alpha)
362
{
363
    if (alpha == 0.0f)
364
        return;
365
    if (alpha == 1.0f)
366
    {
367
        for (size_t i = 0; i < size.height; ++i)
368
        {
369
            const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
370
            u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
371
            std::memcpy(dst, src, sizeof(u8) * size.width);
372
        }
373
        return;
374
    }
375

376
    internal::assertSupportedConfiguration();
377

378
#ifdef CAROTENE_NEON
379
    // in this case we can use the following scheme:
380
    // dst[p] = (src[p] + dst[p]) >> 1
381
    // which is faster
382
    if (alpha == 0.5f)
383
    {
384
        internal::vtransform(size,
385
                             srcBase, srcStride,
386
                             dstBase, dstStride,
387
                             dstBase, dstStride,
388
                             AccumulateWeightedHalf());
389

390
        return;
391
    }
392

393
    internal::vtransform(size,
394
                     srcBase, srcStride,
395
                     dstBase, dstStride,
396
                     dstBase, dstStride,
397
                     AccumulateWeighted(alpha));
398
#else
399
    (void)size;
400
    (void)srcBase;
401
    (void)srcStride;
402
    (void)dstBase;
403
    (void)dstStride;
404
    (void)alpha;
405
#endif
406
}
407

408
} //namespace CAROTENE_NS
409

410
Product

Resources

Company