Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/3rdparty/carotene/src/gaussian_blur.cpp
16337 views
1
/*
 * By downloading, copying, installing or using the software you agree to this license.
 * If you do not agree to this license, do not download, install,
 * copy or use the software.
 *
 *
 *                           License Agreement
 *                For Open Source Computer Vision Library
 *                        (3-clause BSD License)
 *
 * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
 * Third party copyrights are property of their respective owners.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the names of the copyright holders nor the names of the contributors
 *     may be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * This software is provided by the copyright holders and contributors "as is" and
 * any express or implied warranties, including, but not limited to, the implied
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall copyright holders or contributors be liable for any direct,
 * indirect, incidental, special, exemplary, or consequential damages
 * (including, but not limited to, procurement of substitute goods or services;
 * loss of use, data, or profits; or business interruption) however caused
 * and on any theory of liability, whether in contract, strict liability,
 * or tort (including negligence or otherwise) arising in any way out of
 * the use of this software, even if advised of the possibility of such damage.
 */
39
40
#include "common.hpp"
41
#include "saturate_cast.hpp"
42
#include "separable_filter.hpp"
43
44
namespace CAROTENE_NS {
45
46
bool isGaussianBlur3x3Supported(const Size2D &size, BORDER_MODE border)
47
{
48
return isSupportedConfiguration() && size.width >= 8 &&
49
(border == BORDER_MODE_CONSTANT ||
50
border == BORDER_MODE_REPLICATE);
51
}
52
53
void gaussianBlur3x3(const Size2D &size,
54
const u8 * srcBase, ptrdiff_t srcStride,
55
u8 * dstBase, ptrdiff_t dstStride,
56
BORDER_MODE border, u8 borderValue)
57
{
58
internal::assertSupportedConfiguration(isGaussianBlur3x3Supported(size, border));
59
#ifdef CAROTENE_NEON
60
const uint16x8_t v_border_x4 = vdupq_n_u16(borderValue << 2);
61
const uint16x8_t v_zero = vdupq_n_u16(0);
62
const uint8x8_t v_border = vdup_n_u8(borderValue);
63
64
uint16x8_t tprev = v_zero, tcurr = v_zero, tnext = v_zero;
65
uint16x8_t t0 = v_zero, t1 = v_zero, t2 = v_zero;
66
67
ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;
68
69
for (ptrdiff_t y = 0; y < height; ++y)
70
{
71
const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
72
const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
73
const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
74
u8 * drow = internal::getRowPtr(dstBase, dstStride, y);
75
76
s16 prevx = 0, currx = 0, nextx = 0;
77
ptrdiff_t x = 0;
78
const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8);
79
80
// perform vertical convolution
81
for ( ; x <= bwidth; x += 8)
82
{
83
internal::prefetch(srow0 + x);
84
internal::prefetch(srow1 + x);
85
internal::prefetch(srow2 + x);
86
87
uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x);
88
uint8x8_t x1 = vld1_u8(srow1 + x);
89
uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x);
90
91
// calculate values for plain CPU part below if needed
92
if (x + 8 >= bwidth)
93
{
94
ptrdiff_t x3 = x == width ? width - 1 : x;
95
ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);
96
97
if (border == BORDER_MODE_CONSTANT && x4 < 0)
98
prevx = borderValue;
99
else
100
prevx = (srow2 ? srow2[x4] : borderValue) + (srow1[x4] << 1) + (srow0 ? srow0[x4] : borderValue);
101
102
currx = (srow2 ? srow2[x3] : borderValue) + (srow1[x3] << 1) + (srow0 ? srow0[x3] : borderValue);
103
}
104
105
// make shift
106
if (x)
107
{
108
tprev = tcurr;
109
tcurr = tnext;
110
}
111
112
// and calculate next value
113
tnext = vaddq_u16(vaddl_u8(x0, x2), vshll_n_u8(x1, 1));
114
115
// make extrapolation for the first elements
116
if (!x)
117
{
118
// make border
119
if (border == BORDER_MODE_CONSTANT)
120
tcurr = v_border_x4;
121
else if (border == BORDER_MODE_REPLICATE)
122
tcurr = vdupq_n_u16(vgetq_lane_u16(tnext, 0));
123
124
continue;
125
}
126
127
// combine 3 "shifted" vectors
128
t0 = vextq_u16(tprev, tcurr, 7);
129
t1 = tcurr;
130
t2 = vextq_u16(tcurr, tnext, 1);
131
132
// and add them
133
t0 = vqaddq_u16(vshlq_n_u16(t1, 1), vqaddq_u16(t0, t2));
134
vst1_u8(drow + x - 8, vshrn_n_u16(t0, 4));
135
}
136
137
x -= 8;
138
if (x == width)
139
--x;
140
141
for ( ; x < width; ++x)
142
{
143
// make extrapolation for the last elements
144
if (x + 1 >= width)
145
{
146
if (border == BORDER_MODE_CONSTANT)
147
nextx = borderValue << 2;
148
else if (border == BORDER_MODE_REPLICATE)
149
nextx = srow2[x] + (srow1[x] << 1) + srow0[x];
150
}
151
else
152
nextx = (srow2 ? srow2[x + 1] : borderValue) +
153
(srow1[x + 1] << 1) +
154
(srow0 ? srow0[x + 1] : borderValue);
155
156
f32 val = (prevx + (currx << 1) + nextx) >> 4;
157
drow[x] = internal::saturate_cast<u8>((s32)val);
158
159
// make shift
160
prevx = currx;
161
currx = nextx;
162
}
163
}
164
#else
165
(void)srcBase;
166
(void)srcStride;
167
(void)dstBase;
168
(void)dstStride;
169
(void)borderValue;
170
#endif
171
}
172
173
bool isGaussianBlur3x3MarginSupported(const Size2D &size, BORDER_MODE border, Margin borderMargin)
174
{
175
return isSeparableFilter3x3Supported(size, border, 0, 0, borderMargin);
176
}
177
178
void gaussianBlur3x3Margin(const Size2D &size,
179
const u8 * srcBase, ptrdiff_t srcStride,
180
u8 * dstBase, ptrdiff_t dstStride,
181
BORDER_MODE border, u8 borderValue, Margin borderMargin)
182
{
183
internal::assertSupportedConfiguration(isGaussianBlur3x3MarginSupported(size, border, borderMargin));
184
#ifdef CAROTENE_NEON
185
internal::sepFilter3x3<internal::RowFilter3x3S16_121, internal::ColFilter3x3U8_121>::process(
186
size, srcBase, srcStride, dstBase, dstStride,
187
0, 0, border, borderValue, borderMargin);
188
#else
189
(void)srcBase;
190
(void)srcStride;
191
(void)dstBase;
192
(void)dstStride;
193
(void)borderValue;
194
#endif
195
}
196
197
bool isGaussianBlur5x5Supported(const Size2D &size, s32 cn, BORDER_MODE border)
198
{
199
return isSupportedConfiguration() &&
200
cn > 0 && cn <= 4 &&
201
size.width >= 8 && size.height >= 2 &&
202
(border == BORDER_MODE_CONSTANT ||
203
border == BORDER_MODE_REFLECT101 ||
204
border == BORDER_MODE_REFLECT ||
205
border == BORDER_MODE_REPLICATE ||
206
border == BORDER_MODE_WRAP);
207
}
208
209
/*
 * 5x5 Gaussian blur for interleaved u8 data, cn = 1..4 channels.
 *
 * Separable implementation with the binomial kernel [1 4 6 4 1] in each
 * direction (total weight 16*16 = 256, hence the rounding >> 8 at the end).
 * Each image row is processed in two passes:
 *   1) vertical:   5-row weighted sum into a u16 "lane" scratch buffer;
 *   2) horizontal: 5-tap weighted sum over the lane buffer, narrowed to u8.
 * The lane buffer has 2*cn extra elements on each side so the horizontal
 * pass can read x-2..x+2 without branching; those pads are filled from the
 * border mode (constant value or interpolated indices idx_l*/idx_r*).
 *
 * Per-channel-count specializations of the horizontal pass use interleaving
 * vld2/vld3/vld4 loads; on 32-bit ARM with older GCC some of them fall back
 * to hand-written inline asm.
 */
void gaussianBlur5x5(const Size2D &size, s32 cn,
                     const u8 * srcBase, ptrdiff_t srcStride,
                     u8 * dstBase, ptrdiff_t dstStride,
                     BORDER_MODE borderType, u8 borderValue, Margin borderMargin)
{
    internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType));
#ifdef CAROTENE_NEON
    size_t colsn = size.width * cn;  // row length in elements (pixels * channels)

    // For CONSTANT borders, a whole fake source row filled with borderValue;
    // rows outside the image are redirected to it below.
    std::vector<u8> _tmp;
    u8 *tmp = 0;
    if (borderType == BORDER_MODE_CONSTANT)
    {
        _tmp.assign(colsn + 4*cn, borderValue);
        tmp = &_tmp[cn << 1];
    }

    // Horizontal border source indices (in elements) for columns -1, -2,
    // width, width+1 under the requested border mode and margins.
    ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;

    //1-line buffer (u16 vertical sums, 2*cn pad on each side, 32-byte aligned)
    std::vector<u16> _buf(cn * (size.width + 4) + 32 / sizeof(u16));
    u16* lane = internal::alignPtr(&_buf[cn << 1], 32);

    // CONSTANT pads never change per row, so fill them once up front.
    if (borderType == BORDER_MODE_CONSTANT)
        for (s32 k = 0; k < cn; ++k)
        {
            lane[-cn+k] = borderValue;
            lane[-cn-cn+k] = borderValue;
            lane[colsn+k] = borderValue;
            lane[colsn+cn+k] = borderValue;
        }

    // Kernel weights 6 (center) and 4 (+-1); the +-2 taps have weight 1.
    uint8x8_t vc6u8 = vmov_n_u8(6);
    uint16x8_t vc6u16 = vmovq_n_u16(6);
    uint16x8_t vc4u16 = vmovq_n_u16(4);

    for (size_t i = 0; i < size.height; ++i)
    {
        u8* dst = internal::getRowPtr(dstBase, dstStride, i);
        //vertical convolution
        ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom);

        // The five source rows i-2..i+2; out-of-range rows (CONSTANT mode
        // signals them via borderInterpolate) map to the borderValue row.
        const u8* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp;
        const u8* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;
        const u8* ln2 = internal::getRowPtr(srcBase, srcStride, i);
        const u8* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp;
        const u8* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp;

        size_t x = 0;
        for (; x <= colsn - 8; x += 8)
        {
            // NOTE(review): prefetch address is ln2 offset by row x%5-2 --
            // appears to stagger prefetches across the five rows; confirm.
            internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2));
            uint8x8_t v0 = vld1_u8(ln0+x);
            uint8x8_t v1 = vld1_u8(ln1+x);
            uint8x8_t v2 = vld1_u8(ln2+x);
            uint8x8_t v3 = vld1_u8(ln3+x);
            uint8x8_t v4 = vld1_u8(ln4+x);

            // lane[x] = v0 + v4 + 6*v2 + 4*(v1 + v3), widened to u16
            uint16x8_t v = vaddl_u8(v0, v4);
            uint16x8_t v13 = vaddl_u8(v1, v3);

            v = vmlal_u8(v, v2, vc6u8);
            v = vmlaq_u16(v, v13, vc4u16);

            vst1q_u16(lane + x, v);
        }
        // scalar tail of the vertical pass
        for (; x < colsn; ++x)
            lane[x] = ln0[x] + ln4[x] + u16(4) * (ln1[x] + ln3[x]) + u16(6) * ln2[x];

        //left&right borders: refresh pads from this row's vertical sums
        if (borderType != BORDER_MODE_CONSTANT)
            for (s32 k = 0; k < cn; ++k)
            {
                lane[-cn+k] = lane[idx_l1 + k];
                lane[-cn-cn+k] = lane[idx_l2 + k];

                lane[colsn+k] = lane[idx_r1 + k];
                lane[colsn+cn+k] = lane[idx_r2 + k];
            }

        //horizontal convolution (per-channel-count vector specializations)
        x = 0;
        switch(cn)
        {
        case 1:
            for (; x <= colsn - 8; x += 8)
            {
                internal::prefetch(lane + x);

                uint16x8_t lane0 = vld1q_u16(lane + x - 2);
                uint16x8_t lane4 = vld1q_u16(lane + x + 2);
                uint16x8_t lane1 = vld1q_u16(lane + x - 1);
                uint16x8_t lane3 = vld1q_u16(lane + x + 1);
                uint16x8_t lane2 = vld1q_u16(lane + x + 0);

                uint16x8_t ln04 = vaddq_u16(lane0, lane4);
                uint16x8_t ln13 = vaddq_u16(lane1, lane3);

                uint16x8_t ln042 = vmlaq_u16(ln04, lane2, vc6u16);
                uint16x8_t lsw = vmlaq_u16(ln042, ln13, vc4u16);

                // rounding narrow: (sum + 128) >> 8 back to u8
                uint8x8_t ls = vrshrn_n_u16(lsw, 8);

                vst1_u8(dst + x, ls);
            }
            break;
        case 2:
            // Two interleaved channels: vld2/vst2 deinterleave so each
            // q register holds one channel's 5-tap window.
            for (; x <= colsn - 8*2; x += 8*2)
            {
                internal::prefetch(lane + x);

                u16* lidx0 = lane + x - 2*2;
                u16* lidx1 = lane + x - 1*2;
                u16* lidx3 = lane + x + 1*2;
                u16* lidx4 = lane + x + 2*2;
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
                // Hand-scheduled asm for old 32-bit GCC (< 4.7) only.
                __asm__ __volatile__ (
                    "vld2.16 {d0, d2}, [%[in0]]! \n\t"
                    "vld2.16 {d1, d3}, [%[in0]] \n\t"
                    "vld2.16 {d8, d10}, [%[in4]]! \n\t"
                    "vld2.16 {d9, d11}, [%[in4]] \n\t"
                    "vadd.i16 q0, q4 \n\t"
                    "vadd.i16 q1, q5 \n\t"
                    "vld2.16 {d16, d18}, [%[in1]]! \n\t"
                    "vld2.16 {d17, d19}, [%[in1]] \n\t"
                    "vld2.16 {d8, d10}, [%[in3]]! \n\t"
                    "vld2.16 {d9, d11}, [%[in3]] \n\t"
                    "vadd.i16 q4, q8 \n\t"
                    "vadd.i16 q5, q9 \n\t"
                    "vld2.16 {d16, d18}, [%[in2]] \n\t"
                    "vld2.16 {d17, d19}, [%[in22]] \n\t"
                    "vmla.i16 q0, q4, %q[c4] \n\t"
                    "vmla.i16 q1, q5, %q[c4] \n\t"
                    "vmla.i16 q0, q8, %q[c6] \n\t"
                    "vmla.i16 q1, q9, %q[c6] \n\t"
                    "vrshrn.u16 d8, q0, #8 \n\t"
                    "vrshrn.u16 d9, q1, #8 \n\t"
                    "vst2.8 {d8-d9}, [%[out]] \n\t"
                    : [in0] "=r" (lidx0),
                      [in1] "=r" (lidx1),
                      [in3] "=r" (lidx3),
                      [in4] "=r" (lidx4)
                    : [out] "r" (dst + x),
                      "0" (lidx0),
                      "1" (lidx1),
                      "2" (lidx3),
                      "3" (lidx4),
                      [in2] "r" (lane + x),
                      [in22] "r" (lane + x + 4*2),
                      [c4] "w" (vc4u16), [c6] "w" (vc6u16)
                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"
                );
#else
                uint16x8x2_t vLane0 = vld2q_u16(lidx0);
                uint16x8x2_t vLane1 = vld2q_u16(lidx1);
                uint16x8x2_t vLane2 = vld2q_u16(lane + x);
                uint16x8x2_t vLane3 = vld2q_u16(lidx3);
                uint16x8x2_t vLane4 = vld2q_u16(lidx4);

                // Per channel c: out = l0 + l4 + 4*(l1 + l3) + 6*l2
                uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane4.val[0]);
                uint16x8_t vSum_1_5 = vaddq_u16(vLane0.val[1], vLane4.val[1]);

                uint16x8_t vSum_4_8 = vaddq_u16(vLane1.val[0], vLane3.val[0]);
                uint16x8_t vSum_5_9 = vaddq_u16(vLane1.val[1], vLane3.val[1]);

                vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_4_8, vc4u16);
                vSum_1_5 = vmlaq_u16(vSum_1_5, vSum_5_9, vc4u16);
                vSum_0_4 = vmlaq_u16(vSum_0_4, vLane2.val[0], vc6u16);
                vSum_1_5 = vmlaq_u16(vSum_1_5, vLane2.val[1], vc6u16);

                uint8x8x2_t vRes;
                vRes.val[0] = vrshrn_n_u16(vSum_0_4, 8);
                vRes.val[1] = vrshrn_n_u16(vSum_1_5, 8);
                vst2_u8(dst + x, vRes);
#endif
            }
            break;
        case 3:
            // Three interleaved channels via vld3/vst3.
            for (; x <= colsn - 8*3; x += 8*3)
            {
                internal::prefetch(lane + x);

                u16* lidx0 = lane + x - 2*3;
                u16* lidx1 = lane + x - 1*3;
                u16* lidx3 = lane + x + 1*3;
                u16* lidx4 = lane + x + 2*3;
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
                // 32-bit ARM GCC: hand-written asm variant.
                __asm__ __volatile__ (
                    "vld3.16 {d0, d2, d4}, [%[in0]]! \n\t"
                    "vld3.16 {d1, d3, d5}, [%[in0]] \n\t"
                    "vld3.16 {d8, d10, d12}, [%[in4]]! \n\t"
                    "vld3.16 {d9, d11, d13}, [%[in4]] \n\t"
                    "vadd.i16 q0, q4 \n\t"
                    "vadd.i16 q1, q5 \n\t"
                    "vadd.i16 q2, q6 \n\t"
                    "vld3.16 {d16, d18, d20}, [%[in1]]! \n\t"
                    "vld3.16 {d17, d19, d21}, [%[in1]] \n\t"
                    "vld3.16 {d8, d10, d12}, [%[in3]]! \n\t"
                    "vld3.16 {d9, d11, d13}, [%[in3]] \n\t"
                    "vadd.i16 q4, q8 \n\t"
                    "vadd.i16 q5, q9 \n\t"
                    "vadd.i16 q6, q10 \n\t"
                    "vld3.16 {d16, d18, d20}, [%[in2]] \n\t"
                    "vld3.16 {d17, d19, d21}, [%[in22]] \n\t"
                    "vmla.i16 q0, q4, %q[c4] \n\t"
                    "vmla.i16 q1, q5, %q[c4] \n\t"
                    "vmla.i16 q2, q6, %q[c4] \n\t"
                    "vmla.i16 q0, q8, %q[c6] \n\t"
                    "vmla.i16 q1, q9, %q[c6] \n\t"
                    "vmla.i16 q2, q10, %q[c6] \n\t"
                    "vrshrn.u16 d8, q0, #8 \n\t"
                    "vrshrn.u16 d9, q1, #8 \n\t"
                    "vrshrn.u16 d10, q2, #8 \n\t"
                    "vst3.8 {d8-d10}, [%[out]] \n\t"
                    : [in0] "=r" (lidx0),
                      [in1] "=r" (lidx1),
                      [in3] "=r" (lidx3),
                      [in4] "=r" (lidx4)
                    : [out] "r" (dst + x),
                      "0" (lidx0),
                      "1" (lidx1),
                      "2" (lidx3),
                      "3" (lidx4),
                      [in2] "r" (lane + x),
                      [in22] "r" (lane + x + 4*3),
                      [c4] "w" (vc4u16), [c6] "w" (vc6u16)
                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"
                );
#else
                uint16x8x3_t vLane0 = vld3q_u16(lidx0);
                uint16x8x3_t vLane1 = vld3q_u16(lidx1);
                uint16x8x3_t vLane2 = vld3q_u16(lane + x);
                uint16x8x3_t vLane3 = vld3q_u16(lidx3);
                uint16x8x3_t vLane4 = vld3q_u16(lidx4);

                uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane4.val[0]);
                uint16x8_t vSum_1_5 = vaddq_u16(vLane0.val[1], vLane4.val[1]);
                uint16x8_t vSum_2_6 = vaddq_u16(vLane0.val[2], vLane4.val[2]);

                uint16x8_t vSum_3_1 = vaddq_u16(vLane3.val[0], vLane1.val[0]);
                uint16x8_t vSum_4_2 = vaddq_u16(vLane3.val[1], vLane1.val[1]);
                uint16x8_t vSum_5_6 = vaddq_u16(vLane3.val[2], vLane1.val[2]);

                vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_3_1, vc4u16);
                vSum_1_5 = vmlaq_u16(vSum_1_5, vSum_4_2, vc4u16);
                vSum_2_6 = vmlaq_u16(vSum_2_6, vSum_5_6, vc4u16);

                vSum_0_4 = vmlaq_u16(vSum_0_4, vLane2.val[0], vc6u16);
                vSum_1_5 = vmlaq_u16(vSum_1_5, vLane2.val[1], vc6u16);
                vSum_2_6 = vmlaq_u16(vSum_2_6, vLane2.val[2], vc6u16);

                uint8x8x3_t vRes;
                vRes.val[0] = vrshrn_n_u16(vSum_0_4, 8);
                vRes.val[1] = vrshrn_n_u16(vSum_1_5, 8);
                vRes.val[2] = vrshrn_n_u16(vSum_2_6, 8);

                vst3_u8(dst + x, vRes);
#endif
            }
            break;
        case 4:
            // Four interleaved channels via vld4/vst4.
            for (; x <= colsn - 8*4; x += 8*4)
            {
                internal::prefetch(lane + x);
                internal::prefetch(lane + x + 16);

                u16* lidx0 = lane + x - 2*4;
                u16* lidx1 = lane + x - 1*4;
                u16* lidx3 = lane + x + 1*4;
                u16* lidx4 = lane + x + 2*4;
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
                __asm__ __volatile__ (
                    "vld4.16 {d0, d2, d4, d6}, [%[in0]]! \n\t"
                    "vld4.16 {d1, d3, d5, d7}, [%[in0]] \n\t"
                    "vld4.16 {d8, d10, d12, d14}, [%[in4]]! \n\t"
                    "vld4.16 {d9, d11, d13, d15}, [%[in4]] \n\t"
                    "vadd.i16 q0, q4 \n\t"
                    "vadd.i16 q1, q5 \n\t"
                    "vadd.i16 q2, q6 \n\t"
                    "vadd.i16 q3, q7 \n\t"
                    "vld4.16 {d16, d18, d20, d22}, [%[in1]]! \n\t"
                    "vld4.16 {d17, d19, d21, d23}, [%[in1]] \n\t"
                    "vld4.16 {d8, d10, d12, d14}, [%[in3]]! \n\t"
                    "vld4.16 {d9, d11, d13, d15}, [%[in3]] \n\t"
                    "vadd.i16 q4, q8 \n\t"
                    "vadd.i16 q5, q9 \n\t"
                    "vadd.i16 q6, q10 \n\t"
                    "vadd.i16 q7, q11 \n\t"
                    "vld4.16 {d16, d18, d20, d22}, [%[in2],:256] \n\t"
                    "vld4.16 {d17, d19, d21, d23}, [%[in22],:256] \n\t"
                    "vmla.i16 q0, q4, %q[c4] \n\t"
                    "vmla.i16 q1, q5, %q[c4] \n\t"
                    "vmla.i16 q2, q6, %q[c4] \n\t"
                    "vmla.i16 q3, q7, %q[c4] \n\t"
                    "vmla.i16 q0, q8, %q[c6] \n\t"
                    "vmla.i16 q1, q9, %q[c6] \n\t"
                    "vmla.i16 q2, q10, %q[c6] \n\t"
                    "vmla.i16 q3, q11, %q[c6] \n\t"
                    "vrshrn.u16 d8, q0, #8 \n\t"
                    "vrshrn.u16 d9, q1, #8 \n\t"
                    "vrshrn.u16 d10, q2, #8 \n\t"
                    "vrshrn.u16 d11, q3, #8 \n\t"
                    "vst4.8 {d8-d11}, [%[out]] \n\t"
                    : [in0] "=r" (lidx0),
                      [in1] "=r" (lidx1),
                      [in3] "=r" (lidx3),
                      [in4] "=r" (lidx4)
                    : [out] "r" (dst + x),
                      "0" (lidx0),
                      "1" (lidx1),
                      "2" (lidx3),
                      "3" (lidx4),
                      [in2] "r" (lane + x),
                      [in22] "r" (lane + x + 4*4),
                      [c4] "w" (vc4u16), [c6] "w" (vc6u16)
                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"
                );
#else
                // NOTE(review): names are misleading here -- vLane2 loads the
                // +2 taps (lidx4), vLane4 the -1 taps (lidx1), vLane6 the +1
                // taps (lidx3), vLane8 the center; arithmetic below is correct.
                uint16x8x4_t vLane0 = vld4q_u16(lidx0);
                uint16x8x4_t vLane2 = vld4q_u16(lidx4);
                uint16x8x4_t vLane4 = vld4q_u16(lidx1);
                uint16x8x4_t vLane6 = vld4q_u16(lidx3);
                uint16x8x4_t vLane8 = vld4q_u16(lane + x);

                uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane2.val[0]);
                uint16x8_t vSum_1_5 = vaddq_u16(vLane0.val[1], vLane2.val[1]);
                uint16x8_t vSum_2_6 = vaddq_u16(vLane0.val[2], vLane2.val[2]);
                uint16x8_t vSum_3_7 = vaddq_u16(vLane0.val[3], vLane2.val[3]);

                uint16x8_t vSum_4_8 = vaddq_u16(vLane4.val[0], vLane6.val[0]);
                uint16x8_t vSum_5_9 = vaddq_u16(vLane4.val[1], vLane6.val[1]);
                uint16x8_t vSum_6_10 = vaddq_u16(vLane4.val[2], vLane6.val[2]);
                uint16x8_t vSum_7_11 = vaddq_u16(vLane4.val[3], vLane6.val[3]);

                vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_4_8, vc4u16);
                vSum_1_5 = vmlaq_u16(vSum_1_5, vSum_5_9, vc4u16);
                vSum_2_6 = vmlaq_u16(vSum_2_6, vSum_6_10, vc4u16);
                vSum_3_7 = vmlaq_u16(vSum_3_7, vSum_7_11, vc4u16);

                vSum_0_4 = vmlaq_u16(vSum_0_4, vLane8.val[0], vc6u16);
                vSum_1_5 = vmlaq_u16(vSum_1_5, vLane8.val[1], vc6u16);
                vSum_2_6 = vmlaq_u16(vSum_2_6, vLane8.val[2], vc6u16);
                vSum_3_7 = vmlaq_u16(vSum_3_7, vLane8.val[3], vc6u16);

                uint8x8x4_t vRes;
                vRes.val[0] = vrshrn_n_u16(vSum_0_4, 8);
                vRes.val[1] = vrshrn_n_u16(vSum_1_5, 8);
                vRes.val[2] = vrshrn_n_u16(vSum_2_6, 8);
                vRes.val[3] = vrshrn_n_u16(vSum_3_7, 8);

                vst4_u8(dst + x, vRes);
#endif
            }
            break;
        }
        // Scalar tail of the horizontal pass, one channel at a time,
        // with explicit rounding (+128) before the >> 8.
        for (s32 h = 0; h < cn; ++h)
        {
            u16* ln = lane + h;
            u8* dt = dst + h;
            for (size_t k = x; k < colsn; k += cn)
            {
                dt[k] = (u8)((ln[k-2*cn] + ln[k+2*cn]
                              + u16(4) * (ln[k-cn] + ln[k+cn])
                              + u16(6) * ln[k] + (1 << 7)) >> 8);
            }
        }
    }
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
    (void)borderMargin;
#endif
}
591
592
/*
 * 5x5 Gaussian blur for interleaved u16 data, cn = 1..4 channels.
 *
 * Same separable [1 4 6 4 1] scheme as the u8 overload: the vertical pass
 * accumulates five rows into a u32 "lane" scratch buffer (padded by 2*cn
 * elements per side for branch-free horizontal reads), the horizontal pass
 * applies the same taps and narrows back to u16 with rounding >> 8.
 * Unlike the u8 overload there are no per-channel-count specializations:
 * a single vld1q/vst1 loop processes 4 elements at a time regardless of cn.
 */
void gaussianBlur5x5(const Size2D &size, s32 cn,
                     const u16 * srcBase, ptrdiff_t srcStride,
                     u16 * dstBase, ptrdiff_t dstStride,
                     BORDER_MODE borderType, u16 borderValue, Margin borderMargin)
{
    internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType));
#ifdef CAROTENE_NEON
    size_t colsn = size.width * cn;  // row length in elements

    // CONSTANT mode: fake source row filled with borderValue.
    std::vector<u16> _tmp;
    u16 *tmp = 0;
    if (borderType == BORDER_MODE_CONSTANT)
    {
        _tmp.assign(colsn + 4*cn, borderValue);
        tmp = &_tmp[cn << 1];
    }

    // Horizontal border source indices for columns -1, -2, width, width+1.
    ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;

    //1-line buffer (u32 vertical sums, 2*cn pad each side, 32-byte aligned)
    std::vector<u32> _buf(cn * (size.width + 4) + 32 / sizeof(u32));
    u32* lane = internal::alignPtr(&_buf[cn << 1], 32);

    // CONSTANT pads are row-independent; fill once.
    if (borderType == BORDER_MODE_CONSTANT)
        for (s32 k = 0; k < cn; ++k)
        {
            lane[-cn+k] = borderValue;
            lane[-cn-cn+k] = borderValue;
            lane[colsn+k] = borderValue;
            lane[colsn+cn+k] = borderValue;
        }

    // Kernel weights: 6 (center), 4 (+-1); +-2 taps have weight 1.
    uint16x4_t vc6u16 = vmov_n_u16(6);
    uint32x4_t vc6u32 = vmovq_n_u32(6);
    uint32x4_t vc4u32 = vmovq_n_u32(4);

    for (size_t i = 0; i < size.height; ++i)
    {
        u16* dst = internal::getRowPtr(dstBase, dstStride, i);
        //vertical convolution
        ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom);

        // Rows i-2..i+2; out-of-range rows redirect to the borderValue row.
        const u16* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp;
        const u16* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;
        const u16* ln2 = internal::getRowPtr(srcBase, srcStride, i);
        const u16* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp;
        const u16* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp;

        size_t x = 0;
        for (; x <= colsn - 4; x += 4)
        {
            // NOTE(review): prefetch row offset x%5-2 -- staggers prefetches
            // across the five rows; confirm intent.
            internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2));
            uint16x4_t v0 = vld1_u16(ln0+x);
            uint16x4_t v1 = vld1_u16(ln1+x);
            uint16x4_t v2 = vld1_u16(ln2+x);
            uint16x4_t v3 = vld1_u16(ln3+x);
            uint16x4_t v4 = vld1_u16(ln4+x);

            // lane[x] = v0 + v4 + 6*v2 + 4*(v1 + v3), widened to u32
            uint32x4_t v = vaddl_u16(v0, v4);
            uint32x4_t v13 = vaddl_u16(v1, v3);

            v = vmlal_u16(v, v2, vc6u16);
            v = vmlaq_u32(v, v13, vc4u32);

            vst1q_u32(lane + x, v);
        }
        // scalar tail of the vertical pass
        for (; x < colsn; ++x)
            lane[x] = ln0[x] + ln4[x] + 4*(ln1[x] + ln3[x]) + 6*ln2[x];

        //left&right borders: refresh pads from this row's vertical sums
        if (borderType != BORDER_MODE_CONSTANT)
            for (s32 k = 0; k < cn; ++k)
            {
                lane[-cn+k] = lane[idx_l1 + k];
                lane[-cn-cn+k] = lane[idx_l2 + k];

                lane[colsn+k] = lane[idx_r1 + k];
                lane[colsn+cn+k] = lane[idx_r2 + k];
            }

        //horizontal convolution
        x = 0;
        for (; x <= colsn - 4; x += 4)
        {
            internal::prefetch(lane + x);

            uint32x4_t lane0 = vld1q_u32(lane + x - 2);
            uint32x4_t lane4 = vld1q_u32(lane + x + 2);
            uint32x4_t lane1 = vld1q_u32(lane + x - 1);
            uint32x4_t lane3 = vld1q_u32(lane + x + 1);
            uint32x4_t lane2 = vld1q_u32(lane + x + 0);

            uint32x4_t ln04 = vaddq_u32(lane0, lane4);
            uint32x4_t ln13 = vaddq_u32(lane1, lane3);

            uint32x4_t ln042 = vmlaq_u32(ln04, lane2, vc6u32);
            uint32x4_t lsw = vmlaq_u32(ln042, ln13, vc4u32);

            // rounding narrow: (sum + 128) >> 8 back to u16
            uint16x4_t ls = vrshrn_n_u32(lsw, 8);

            vst1_u16(dst + x, ls);
        }
        // Scalar tail, one channel at a time, explicit rounding (+128).
        for (s32 h = 0; h < cn; ++h)
        {
            u32* ln = lane + h;
            u16* dt = dst + h;
            for (size_t k = x; k < colsn; k += cn)
            {
                dt[k] = (u16)((ln[k-2*cn] + ln[k+2*cn] + 4*(ln[k-cn] + ln[k+cn]) + 6*ln[k] + (1<<7))>>8);
            }
        }
    }
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
    (void)borderMargin;
#endif
}
719
720
/*
 * 5x5 Gaussian blur for interleaved s16 data, cn = 1..4 channels.
 *
 * Signed counterpart of the u16 overload: the vertical [1 4 6 4 1] pass
 * accumulates into an s32 "lane" scratch buffer (2*cn pad per side), the
 * horizontal pass applies the same taps and narrows to s16 with rounding
 * >> 8. The cn == 4 case currently shares the generic 4-at-a-time loop;
 * a hand-written vld4.32 asm specialization is kept commented out below.
 */
void gaussianBlur5x5(const Size2D &size, s32 cn,
                     const s16 * srcBase, ptrdiff_t srcStride,
                     s16 * dstBase, ptrdiff_t dstStride,
                     BORDER_MODE borderType, s16 borderValue, Margin borderMargin)
{
    internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType));
#ifdef CAROTENE_NEON
    size_t colsn = size.width * cn;  // row length in elements

    // CONSTANT mode: fake source row filled with borderValue.
    std::vector<s16> _tmp;
    s16 *tmp = 0;
    if (borderType == BORDER_MODE_CONSTANT)
    {
        _tmp.assign(colsn + 4*cn, borderValue);
        tmp = &_tmp[cn << 1];
    }

    // Horizontal border source indices for columns -1, -2, width, width+1.
    ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;

    //1-line buffer (s32 vertical sums, 2*cn pad each side, 32-byte aligned)
    std::vector<s32> _buf(cn * (size.width + 4) + 32 / sizeof(s32));
    s32* lane = internal::alignPtr(&_buf[cn << 1], 32);

    // CONSTANT pads are row-independent; fill once.
    if (borderType == BORDER_MODE_CONSTANT)
        for (s32 k = 0; k < cn; ++k)
        {
            lane[-cn+k] = borderValue;
            lane[-cn-cn+k] = borderValue;
            lane[colsn+k] = borderValue;
            lane[colsn+cn+k] = borderValue;
        }

    // Kernel weights: 6 (center), 4 (+-1); +-2 taps have weight 1.
    int16x4_t vc6s16 = vmov_n_s16(6);
    int32x4_t vc6s32 = vmovq_n_s32(6);
    int32x4_t vc4s32 = vmovq_n_s32(4);

    for (size_t i = 0; i < size.height; ++i)
    {
        s16* dst = internal::getRowPtr(dstBase, dstStride, i);
        //vertical convolution
        ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom);

        // Rows i-2..i+2; out-of-range rows redirect to the borderValue row.
        const s16* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp;
        const s16* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;
        const s16* ln2 = internal::getRowPtr(srcBase, srcStride, i);
        const s16* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp;
        const s16* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp;

        size_t x = 0;
        for (; x <= colsn - 4; x += 4)
        {
            // NOTE(review): prefetch row offset x%5-2 -- staggers prefetches
            // across the five rows; confirm intent.
            internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2));
            int16x4_t v0 = vld1_s16(ln0+x);
            int16x4_t v1 = vld1_s16(ln1+x);
            int16x4_t v2 = vld1_s16(ln2+x);
            int16x4_t v3 = vld1_s16(ln3+x);
            int16x4_t v4 = vld1_s16(ln4+x);

            // lane[x] = v0 + v4 + 6*v2 + 4*(v1 + v3), widened to s32
            int32x4_t v = vaddl_s16(v0, v4);
            int32x4_t v13 = vaddl_s16(v1, v3);

            v = vmlal_s16(v, v2, vc6s16);
            v = vmlaq_s32(v, v13, vc4s32);

            vst1q_s32(lane + x, v);
        }
        // scalar tail of the vertical pass
        for (; x < colsn; ++x)
            lane[x] = ln0[x] + ln4[x] + 4*(ln1[x] + ln3[x]) + 6*ln2[x];

        //left&right borders: refresh pads from this row's vertical sums
        if (borderType != BORDER_MODE_CONSTANT)
            for (s32 k = 0; k < cn; ++k)
            {
                lane[-cn+k] = lane[idx_l1 + k];
                lane[-cn-cn+k] = lane[idx_l2 + k];

                lane[colsn+k] = lane[idx_r1 + k];
                lane[colsn+cn+k] = lane[idx_r2 + k];
            }

        //horizontal convolution
        x = 0;
        switch(cn)
        {
        case 1:
        case 2:
        case 3:
            for (; x <= colsn - 4; x += 4)
            {
                internal::prefetch(lane + x);

                int32x4_t lane0 = vld1q_s32(lane + x - 2);
                int32x4_t lane4 = vld1q_s32(lane + x + 2);
                int32x4_t lane1 = vld1q_s32(lane + x - 1);
                int32x4_t lane3 = vld1q_s32(lane + x + 1);
                int32x4_t lane2 = vld1q_s32(lane + x + 0);

                int32x4_t ln04 = vaddq_s32(lane0, lane4);
                int32x4_t ln13 = vaddq_s32(lane1, lane3);

                int32x4_t ln042 = vmlaq_s32(ln04, lane2, vc6s32);
                int32x4_t lsw = vmlaq_s32(ln042, ln13, vc4s32);

                // rounding narrow: (sum + 128) >> 8 back to s16
                int16x4_t ls = vrshrn_n_s32(lsw, 8);

                vst1_s16(dst + x, ls);
            }
            break;
        case 4:
            // Disabled vld4.32 asm specialization kept for reference;
            // the generic loop below is used instead.
/*          for (; x <= colsn - 4*4; x += 4*4)
            {
                internal::prefetch(lane + x);
                internal::prefetch(lane + x + 16);

                ptrdiff_t* lidx0 = lane + x - 2*4;
                ptrdiff_t* lidx1 = lane + x - 1*4;
                ptrdiff_t* lidx3 = lane + x + 1*4;
                ptrdiff_t* lidx4 = lane + x + 2*4;

                __asm__ __volatile__ (
                    "vld4.32 {d0, d2, d4, d6}, [%[in0]]! \n\t"
                    "vld4.32 {d1, d3, d5, d7}, [%[in0]] \n\t"
                    "vld4.32 {d8, d10, d12, d14}, [%[in4]]! \n\t"
                    "vld4.32 {d9, d11, d13, d15}, [%[in4]] \n\t"
                    "vadd.i32 q0, q4 \n\t"
                    "vadd.i32 q1, q5 \n\t"
                    "vadd.i32 q2, q6 \n\t"
                    "vadd.i32 q3, q7 \n\t"
                    "vld4.32 {d16, d18, d20, d22}, [%[in1]]! \n\t"
                    "vld4.32 {d17, d19, d21, d23}, [%[in1]] \n\t"
                    "vld4.32 {d8, d10, d12, d14}, [%[in3]]! \n\t"
                    "vld4.32 {d9, d11, d13, d15}, [%[in3]] \n\t"
                    "vadd.i32 q4, q8 \n\t"
                    "vadd.i32 q5, q9 \n\t"
                    "vadd.i32 q6, q10 \n\t"
                    "vadd.i32 q7, q11 \n\t"
                    "vld4.32 {d16, d18, d20, d22}, [%[in2],:256] \n\t"
                    "vld4.32 {d17, d19, d21, d23}, [%[in22],:256] \n\t"
                    "vmla.i32 q0, q4, %q[c4] \n\t"
                    "vmla.i32 q1, q5, %q[c4] \n\t"
                    "vmla.i32 q2, q6, %q[c4] \n\t"
                    "vmla.i32 q3, q7, %q[c4] \n\t"
                    "vmla.i32 q0, q8, %q[c6] \n\t"
                    "vmla.i32 q1, q9, %q[c6] \n\t"
                    "vmla.i32 q2, q10, %q[c6] \n\t"
                    "vmla.i32 q3, q11, %q[c6] \n\t"
                    "vrshrn.i32 d8, q0, #8 \n\t"
                    "vrshrn.i32 d9, q1, #8 \n\t"
                    "vrshrn.i32 d10, q2, #8 \n\t"
                    "vrshrn.i32 d11, q3, #8 \n\t"
                    "vst4.16 {d8-d11}, [%[out]] \n\t"
                    : [in0] "=r" (lidx0),
                      [in1] "=r" (lidx1),
                      [in3] "=r" (lidx3),
                      [in4] "=r" (lidx4)
                    : [out] "r" (dst + x),
                      "0" (lidx0),
                      "1" (lidx1),
                      "2" (lidx3),
                      "3" (lidx4),
                      [in2] "r" (lane + x),
                      [in22] "r" (lane + x + 4*2),
                      [c4] "w" (vc4s32), [c6] "w" (vc6s32)
                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"
                );
*/
            for (; x <= colsn - 4; x += 4)
            {
                internal::prefetch(lane + x);

                int32x4_t lane0 = vld1q_s32(lane + x - 2);
                int32x4_t lane4 = vld1q_s32(lane + x + 2);
                int32x4_t lane1 = vld1q_s32(lane + x - 1);
                int32x4_t lane3 = vld1q_s32(lane + x + 1);
                int32x4_t lane2 = vld1q_s32(lane + x + 0);

                int32x4_t ln04 = vaddq_s32(lane0, lane4);
                int32x4_t ln13 = vaddq_s32(lane1, lane3);

                int32x4_t ln042 = vmlaq_s32(ln04, lane2, vc6s32);
                int32x4_t lsw = vmlaq_s32(ln042, ln13, vc4s32);

                int16x4_t ls = vrshrn_n_s32(lsw, 8);

                vst1_s16(dst + x, ls);
            }
            break;
        }
        // Scalar tail, one channel at a time, explicit rounding (+128).
        for (s32 h = 0; h < cn; ++h)
        {
            s32* ln = lane + h;
            s16* dt = dst + h;
            for (size_t k = x; k < colsn; k += cn)
            {
                dt[k] = (s16)((ln[k-2*cn] + ln[k+2*cn] + 4*(ln[k-cn] + ln[k+cn]) + 6*ln[k] + (1<<7))>>8);
            }
        }
    }
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
    (void)borderMargin;
#endif
}
933
934
/*
 * 5x5 Gaussian blur for interleaved s32 data, using the separable kernel
 * [1 4 6 4 1] applied first vertically (into a one-row accumulator) and
 * then horizontally.
 *
 * NOTE: unlike the s16-output variant above, this overload stores the RAW
 * weighted sums (total kernel weight 256) -- no rounding or >>8
 * normalization is applied before writing to dst.
 *
 * size         - image size in pixels
 * cn           - number of interleaved channels
 * srcBase/srcStride - source image base pointer and row stride
 * dstBase/dstStride - destination image base pointer and row stride
 * borderType   - how out-of-image samples are synthesized
 * borderValue  - sample value used for BORDER_MODE_CONSTANT
 * borderMargin - extra valid image data available around the ROI
 */
void gaussianBlur5x5(const Size2D &size, s32 cn,
                     const s32 * srcBase, ptrdiff_t srcStride,
                     s32 * dstBase, ptrdiff_t dstStride,
                     BORDER_MODE borderType, s32 borderValue, Margin borderMargin)
{
    internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType));
#ifdef CAROTENE_NEON
    size_t colsn = size.width * cn;  // row length in elements (pixels * channels)

    // For a constant border, build one full row filled with borderValue;
    // source rows that fall outside the accessible area are redirected to it.
    std::vector<s32> _tmp;
    s32 *tmp = 0;
    if (borderType == BORDER_MODE_CONSTANT)
    {
        _tmp.assign(colsn + 4*cn, borderValue);
        tmp = &_tmp[cn << 1];  // skip the 2-pixel left padding
    }

    // Horizontal border indices (in elements): which in-row columns stand in
    // for the two out-of-range columns on each side (non-constant borders).
    ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;

    //1-line buffer
    // Holds one row of vertical-pass sums, with 2 pixels of padding on each
    // side for the horizontal pass; 'lane' is 32-byte aligned for NEON.
    std::vector<s32> _buf(cn * (size.width + 4) + 32 / sizeof(s32));
    s32* lane = internal::alignPtr(&_buf[cn << 1], 32);

    // Constant border: the lane padding never changes, fill it once up front.
    if (borderType == BORDER_MODE_CONSTANT)
        for (s32 k = 0; k < cn; ++k)
        {
            lane[-cn+k] = borderValue;
            lane[-cn-cn+k] = borderValue;
            lane[colsn+k] = borderValue;
            lane[colsn+cn+k] = borderValue;
        }

    // Kernel weights broadcast into vectors (center weight 6, inner weight 4;
    // the outer weight 1 is implicit in the plain adds below).
    int32x4_t vc6s32 = vmovq_n_s32(6);
    int32x4_t vc4s32 = vmovq_n_s32(4);

    for (size_t i = 0; i < size.height; ++i)
    {
        s32* dst = internal::getRowPtr(dstBase, dstStride, i);
        //vertical convolution
        // Resolve the five source rows i-2 .. i+2, honoring the border mode.
        ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom);

        // Rows outside the accessible margin fall back to the
        // borderValue-filled tmp row (constant border case).
        const s32* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp;
        const s32* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;
        const s32* ln2 = internal::getRowPtr(srcBase, srcStride, i);
        const s32* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp;
        const s32* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp;

        // Column pass: lane[x] = ln0 + 4*ln1 + 6*ln2 + 4*ln3 + ln4,
        // 4 elements per iteration.
        // NOTE(review): colsn is size_t, so 'x <= colsn - 4' wraps if
        // colsn < 4 -- presumably ruled out by isGaussianBlur5x5Supported;
        // TODO confirm.
        size_t x = 0;
        for (; x <= colsn - 4; x += 4)
        {
            // NOTE(review): the prefetch target uses 'x % 5 - 2' as a row
            // offset from ln2 -- looks suspicious; confirm this is the
            // intended prefetch address.
            internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2));
            int32x4_t v0 = vld1q_s32(ln0+x);
            int32x4_t v1 = vld1q_s32(ln1+x);
            int32x4_t v2 = vld1q_s32(ln2+x);
            int32x4_t v3 = vld1q_s32(ln3+x);
            int32x4_t v4 = vld1q_s32(ln4+x);

            int32x4_t v = vaddq_s32(v0, v4);       // outer taps, weight 1
            int32x4_t v13 = vaddq_s32(v1, v3);     // inner taps

            v = vmlaq_s32(v, v2, vc6s32);          // + 6*center
            v = vmlaq_s32(v, v13, vc4s32);         // + 4*(inner sum)

            vst1q_s32(lane + x, v);
        }
        // Scalar tail of the column pass.
        for (; x < colsn; ++x)
            lane[x] = ln0[x] + ln4[x] + 4*(ln1[x] + ln3[x]) + 6*ln2[x];

        //left&right borders
        // Replicate the resolved border columns into the lane padding
        // (the constant-border padding was already filled above).
        if (borderType != BORDER_MODE_CONSTANT)
            for (s32 k = 0; k < cn; ++k)
            {
                lane[-cn+k] = lane[idx_l1 + k];
                lane[-cn-cn+k] = lane[idx_l2 + k];

                lane[colsn+k] = lane[idx_r1 + k];
                lane[colsn+cn+k] = lane[idx_r2 + k];
            }

        //horizontal convolution
        // Row pass over the lane buffer: unaligned loads at offsets
        // -2 .. +2 combine the same [1 4 6 4 1] weights; the raw sums are
        // stored to dst without any normalization shift.
        x = 0;
        for (; x <= colsn - 4; x += 4)
        {
            internal::prefetch(lane + x);

            int32x4_t lane0 = vld1q_s32(lane + x - 2);
            int32x4_t lane4 = vld1q_s32(lane + x + 2);
            int32x4_t lane1 = vld1q_s32(lane + x - 1);
            int32x4_t lane3 = vld1q_s32(lane + x + 1);
            int32x4_t lane2 = vld1q_s32(lane + x + 0);

            int32x4_t ln04 = vaddq_s32(lane0, lane4);   // outer taps
            int32x4_t ln13 = vaddq_s32(lane1, lane3);   // inner taps

            int32x4_t ln042 = vmlaq_s32(ln04, lane2, vc6s32);  // + 6*center
            int32x4_t lsw = vmlaq_s32(ln042, ln13, vc4s32);    // + 4*inner

            vst1q_s32(dst + x, lsw);
        }
        // Scalar tail of the row pass, per channel (offsets step by cn).
        for (s32 h = 0; h < cn; ++h)
        {
            s32* ln = lane + h;
            s32* dt = dst + h;
            for (size_t k = x; k < colsn; k += cn)
            {
                dt[k] = ln[k-2*cn] + ln[k+2*cn] + 4*(ln[k-cn] + ln[k+cn]) + 6*ln[k];
            }
        }
    }
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
    (void)borderMargin;
#endif
}
1058
1059
} // namespace CAROTENE_NS
1060
1061