CoCalc -- convolution.cpp

GitHub Repository: Tetragramm/opencv
Path: blob/master/3rdparty/carotene/src/convolution.cpp
¹⁶³³⁷ views
/*
 * By downloading, copying, installing or using the software you agree to this license.
 * If you do not agree to this license, do not download, install,
 * copy or use the software.
 *
 *
 *                           License Agreement
 *                For Open Source Computer Vision Library
 *                        (3-clause BSD License)
 *
 * Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
 * Third party copyrights are property of their respective owners.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the names of the copyright holders nor the names of the contributors
 *     may be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * This software is provided by the copyright holders and contributors "as is" and
 * any express or implied warranties, including, but not limited to, the implied
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall copyright holders or contributors be liable for any direct,
 * indirect, incidental, special, exemplary, or consequential damages
 * (including, but not limited to, procurement of substitute goods or services;
 * loss of use, data, or profits; or business interruption) however caused
 * and on any theory of liability, whether in contract, strict liability,
 * or tort (including negligence or otherwise) arising in any way out of
 * the use of this software, even if advised of the possibility of such damage.
 */
39

40
#include "common.hpp"
41
#include "saturate_cast.hpp"
42

43
namespace CAROTENE_NS {
44

45
bool isConvolutionSupported(const Size2D &size, const Size2D &ksize,
46
                            BORDER_MODE border)
47
{
48
    return isSupportedConfiguration() && size.width >= 8 &&
49
        (border == BORDER_MODE_CONSTANT ||
50
            border == BORDER_MODE_REPLICATE) &&
51
        (ksize.width == 3) && (ksize.height == 3);
52
}
53

54
#ifdef CAROTENE_NEON
55

56
namespace {
57

58
template <int shift>
59
int32x4_t vshrq_s32(int32x4_t value)
60
{
61
    return vshrq_n_s32(value, shift);
62
}
63

64
template <>
65
int32x4_t vshrq_s32<0>(int32x4_t value)
66
{
67
    return value;
68
}
69

70
} // namespace
71

72
// Pointer to a function that maps one int32x4_t vector to another; used to
// select a vshrq_s32<N> instantiation at run time.
typedef int32x4_t (*vshrq_s32_func)(int32x4_t);
73

74
#endif
75

76
void convolution(const Size2D &size,
77
                 const u8 * srcBase, ptrdiff_t srcStride,
78
                 u8 * dstBase, ptrdiff_t dstStride,
79
                 BORDER_MODE border, u8 borderValue,
80
                 const Size2D & ksize, s16 * kernelBase, u32 scale)
81
{
82
    internal::assertSupportedConfiguration(isConvolutionSupported(size, ksize, border));
83
#ifdef CAROTENE_NEON
84
    const uint8x8_t v_zero_u8 = vdup_n_u8(0);
85
    const uint8x8_t v_border = vdup_n_u8(borderValue);
86
    const int32x4_t v_zero_s32 = vdupq_n_s32(0);
87

88
    uint8x8_t tprev[3] = { v_zero_u8, v_zero_u8, v_zero_u8 },
89
              tcurr[3] = { v_zero_u8, v_zero_u8, v_zero_u8 },
90
              tnext[3] = { v_zero_u8, v_zero_u8, v_zero_u8 };
91
    uint8x8_t t0 = v_zero_u8, t1 = v_zero_u8, t2 = v_zero_u8;
92

93
    ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;
94
    static const vshrq_s32_func vshrq_s32_a[33] =
95
    {
96
        vshrq_s32<0>,
97
        vshrq_s32<1>,
98
        vshrq_s32<2>,
99
        vshrq_s32<3>,
100
        vshrq_s32<4>,
101
        vshrq_s32<5>,
102
        vshrq_s32<6>,
103
        vshrq_s32<7>,
104
        vshrq_s32<8>,
105
        vshrq_s32<9>,
106
        vshrq_s32<10>,
107
        vshrq_s32<11>,
108
        vshrq_s32<12>,
109
        vshrq_s32<13>,
110
        vshrq_s32<14>,
111
        vshrq_s32<15>,
112
        vshrq_s32<16>,
113
        vshrq_s32<17>,
114
        vshrq_s32<18>,
115
        vshrq_s32<19>,
116
        vshrq_s32<20>,
117
        vshrq_s32<21>,
118
        vshrq_s32<22>,
119
        vshrq_s32<23>,
120
        vshrq_s32<24>,
121
        vshrq_s32<25>,
122
        vshrq_s32<26>,
123
        vshrq_s32<27>,
124
        vshrq_s32<28>,
125
        vshrq_s32<29>,
126
        vshrq_s32<30>,
127
        vshrq_s32<31>,
128
        vshrq_s32<32>
129
    };
130
    vshrq_s32_func vshrq_s32_p = vshrq_s32_a[scale];
131

132
    for (ptrdiff_t y = 0; y < height; ++y)
133
    {
134
        const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
135
        const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
136
        const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
137
        u8 * drow = internal::getRowPtr(dstBase, dstStride, y);
138

139
        u8 prevx[3] = { 0, 0, 0 },
140
           currx[3] = { 0, 0, 0 },
141
           nextx[3] = { 0, 0, 0 };
142
        ptrdiff_t x = 0;
143
        const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8);
144

145
        // perform vertical convolution
146
        for ( ; x <= bwidth; x += 8)
147
        {
148
            internal::prefetch(srow0 + x);
149
            internal::prefetch(srow1 + x);
150
            internal::prefetch(srow2 + x);
151

152
            uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x);
153
            uint8x8_t x1 = vld1_u8(srow1 + x);
154
            uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x);
155

156
            // calculate values for plain CPU part below if needed
157
            if (x + 8 >= bwidth)
158
            {
159
                ptrdiff_t x3 = x == width ? width - 1 : x;
160
                ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);
161

162
                if (border == BORDER_MODE_CONSTANT && x4 < 0)
163
                    prevx[0] = prevx[1] = prevx[2] = borderValue;
164
                else
165
                {
166
                    prevx[0] = srow0 ? srow0[x4] : borderValue;
167
                    prevx[1] =         srow1[x4]              ;
168
                    prevx[2] = srow2 ? srow2[x4] : borderValue;
169
                }
170

171
                currx[0] = srow0 ? srow0[x3] : borderValue;
172
                currx[1] =         srow1[x3]              ;
173
                currx[2] = srow2 ? srow2[x3] : borderValue;
174
            }
175

176
            // make shift
177
            if (x)
178
            {
179
                tprev[0] = tcurr[0];
180
                tcurr[0] = tnext[0];
181

182
                tprev[1] = tcurr[1];
183
                tcurr[1] = tnext[1];
184

185
                tprev[2] = tcurr[2];
186
                tcurr[2] = tnext[2];
187
            }
188

189
            tnext[0] = x0;
190
            tnext[1] = x1;
191
            tnext[2] = x2;
192

193
            // make extrapolation for the first elements
194
            if (!x)
195
            {
196
                // make border
197
                if (border == BORDER_MODE_CONSTANT)
198
                    tcurr[0] = tcurr[1] = tcurr[2] = v_border;
199
                else if (border == BORDER_MODE_REPLICATE)
200
                {
201
                    tcurr[0] = vdup_n_u8(vget_lane_u8(tnext[0], 0));
202
                    tcurr[1] = vdup_n_u8(vget_lane_u8(tnext[1], 0));
203
                    tcurr[2] = vdup_n_u8(vget_lane_u8(tnext[2], 0));
204
                }
205

206
                continue;
207
            }
208

209
            int32x4_t v_dst0 = v_zero_s32, v_dst1 = v_zero_s32;
210

211
            {
212
                // combine 3 "shifted" vectors
213
                t0 = vext_u8(tprev[0], tcurr[0], 7);
214
                t1 = tcurr[0];
215
                t2 = vext_u8(tcurr[0], tnext[0], 1);
216

217
                int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));
218
                int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));
219
                int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));
220

221
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[8]);
222
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[7]);
223
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[6]);
224

225
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[8]);
226
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[7]);
227
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[6]);
228
            }
229

230
            {
231
                // combine 3 "shifted" vectors
232
                t0 = vext_u8(tprev[1], tcurr[1], 7);
233
                t1 = tcurr[1];
234
                t2 = vext_u8(tcurr[1], tnext[1], 1);
235

236
                int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));
237
                int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));
238
                int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));
239

240
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[5]);
241
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[4]);
242
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[3]);
243

244
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[5]);
245
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[4]);
246
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[3]);
247
            }
248

249
            {
250
                // combine 3 "shifted" vectors
251
                t0 = vext_u8(tprev[2], tcurr[2], 7);
252
                t1 = tcurr[2];
253
                t2 = vext_u8(tcurr[2], tnext[2], 1);
254

255
                int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));
256
                int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));
257
                int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));
258

259
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[2]);
260
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[1]);
261
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[0]);
262

263
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[2]);
264
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[1]);
265
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[0]);
266
            }
267

268

269
            // make scale
270
            v_dst0 = vshrq_s32_p(v_dst0);
271
            v_dst1 = vshrq_s32_p(v_dst1);
272

273
            // and add them
274
            vst1_u8(drow + x - 8, vqmovn_u16(vcombine_u16(vqmovun_s32(v_dst0),
275
                                                          vqmovun_s32(v_dst1))));
276
        }
277

278
        x -= 8;
279
        if (x == width)
280
            --x;
281

282
        for ( ; x < width; ++x)
283
        {
284
            // make extrapolation for the last elements
285
            if (x + 1 >= width)
286
            {
287
                if (border == BORDER_MODE_CONSTANT)
288
                {
289
                    nextx[0] = borderValue;
290
                    nextx[1] = borderValue;
291
                    nextx[2] = borderValue;
292
                }
293
                else if (border == BORDER_MODE_REPLICATE)
294
                {
295
                    nextx[0] = srow0[x];
296
                    nextx[1] = srow1[x];
297
                    nextx[2] = srow2[x];
298
                }
299
            }
300
            else
301
            {
302
                nextx[0] = srow0 ? srow0[x + 1] : borderValue;
303
                nextx[1] =         srow1[x + 1]              ;
304
                nextx[2] = srow2 ? srow2[x + 1] : borderValue;
305
            }
306

307
            s32 val = 0;
308
            for (s32 _y = 0; _y < 3; ++_y)
309
                val += prevx[_y] * kernelBase[(2 - _y) * 3 + 2] +
310
                       currx[_y] * kernelBase[(2 - _y) * 3 + 1] +
311
                       nextx[_y] * kernelBase[(2 - _y) * 3 + 0];
312

313
            drow[x] = internal::saturate_cast<u8>(val >> scale);
314

315
            // make shift
316
            prevx[0] = currx[0];
317
            currx[0] = nextx[0];
318

319
            prevx[1] = currx[1];
320
            currx[1] = nextx[1];
321

322
            prevx[2] = currx[2];
323
            currx[2] = nextx[2];
324
        }
325
    }
326
#else
327
    (void)size;
328
    (void)srcBase;
329
    (void)srcStride;
330
    (void)dstBase;
331
    (void)dstStride;
332
    (void)border;
333
    (void)borderValue;
334
    (void)ksize;
335
    (void)kernelBase;
336
    (void)scale;
337
#endif
338
}
339

340
} // namespace CAROTENE_NS
341

342
Product

Resources

Company