Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/3rdparty/carotene/src/pyramid.cpp
16337 views
1
/*
2
* By downloading, copying, installing or using the software you agree to this license.
3
* If you do not agree to this license, do not download, install,
4
* copy or use the software.
5
*
6
*
7
* License Agreement
8
* For Open Source Computer Vision Library
9
* (3-clause BSD License)
10
*
11
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
12
* Third party copyrights are property of their respective owners.
13
*
14
* Redistribution and use in source and binary forms, with or without modification,
15
* are permitted provided that the following conditions are met:
16
*
17
* * Redistributions of source code must retain the above copyright notice,
18
* this list of conditions and the following disclaimer.
19
*
20
* * Redistributions in binary form must reproduce the above copyright notice,
21
* this list of conditions and the following disclaimer in the documentation
22
* and/or other materials provided with the distribution.
23
*
24
* * Neither the names of the copyright holders nor the names of the contributors
25
* may be used to endorse or promote products derived from this software
26
* without specific prior written permission.
27
*
28
* This software is provided by the copyright holders and contributors "as is" and
29
* any express or implied warranties, including, but not limited to, the implied
30
* warranties of merchantability and fitness for a particular purpose are disclaimed.
31
* In no event shall copyright holders or contributors be liable for any direct,
32
* indirect, incidental, special, exemplary, or consequential damages
33
* (including, but not limited to, procurement of substitute goods or services;
34
* loss of use, data, or profits; or business interruption) however caused
35
* and on any theory of liability, whether in contract, strict liability,
36
* or tort (including negligence or otherwise) arising in any way out of
37
* the use of this software, even if advised of the possibility of such damage.
38
*/
39
40
#include "common.hpp"
41
42
#include <vector>
43
44
namespace CAROTENE_NS {
45
46
bool isGaussianPyramidDownRTZSupported(const Size2D &srcSize, const Size2D &dstSize, BORDER_MODE border_mode)
47
{
48
if (!isSupportedConfiguration())
49
return false;
50
// Need at least 8 pixels for vectorization.
51
// Need to make sure dst width is half the src width.
52
// Don't care about dst height.
53
if ( dstSize.width < 8 || std::abs((ptrdiff_t)dstSize.width*2 - (ptrdiff_t)srcSize.width) > 2 )
54
return false;
55
56
// Current implementation only supports Reflect101 (ie: UNDEFINED mode)
57
if (border_mode != BORDER_MODE_UNDEFINED)
58
return false;
59
60
return true;
61
}
62
63
bool isGaussianPyramidDownU8Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn)
64
{
65
if (!isSupportedConfiguration())
66
return false;
67
if ( (dstSize.width * cn) < 8 ||
68
(cn != 1 && cn !=3 && cn!=4) ||
69
std::abs((ptrdiff_t)dstSize.width*2 - (ptrdiff_t)srcSize.width) > 2 ||
70
std::abs((ptrdiff_t)dstSize.height*2 - (ptrdiff_t)srcSize.height) > 2 )
71
return false;
72
73
return true;
74
}
75
76
bool isGaussianPyramidDownS16Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn)
77
{
78
if (!isSupportedConfiguration())
79
return false;
80
if ( (dstSize.width * cn) < 4 ||
81
(cn != 1 && cn !=3 && cn!=4) ||
82
std::abs((ptrdiff_t)dstSize.width*2 - (ptrdiff_t)srcSize.width) > 2 ||
83
std::abs((ptrdiff_t)dstSize.height*2 - (ptrdiff_t)srcSize.height) > 2 )
84
return false;
85
86
return true;
87
}
88
89
bool isGaussianPyramidDownF32Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn)
90
{
91
if (!isSupportedConfiguration())
92
return false;
93
if ( (dstSize.width * cn) < 4 ||
94
(cn != 1 && cn !=3 && cn!=4) ||
95
std::abs((ptrdiff_t)dstSize.width*2 - (ptrdiff_t)srcSize.width) > 2 ||
96
std::abs((ptrdiff_t)dstSize.height*2 - (ptrdiff_t)srcSize.height) > 2 )
97
return false;
98
99
return true;
100
}
101
102
bool isGaussianPyramidUpU8Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn)
103
{
104
if (!isSupportedConfiguration())
105
return false;
106
if ( (srcSize.width * cn) < 8 ||
107
(cn != 1 && cn !=3 && cn!=4) ||
108
std::abs((ptrdiff_t)dstSize.width - (ptrdiff_t)srcSize.width*2) != (ptrdiff_t)dstSize.width % 2 ||
109
std::abs((ptrdiff_t)dstSize.height - (ptrdiff_t)srcSize.height*2) != (ptrdiff_t)dstSize.height % 2 )
110
return false;
111
112
return true;
113
}
114
115
bool isGaussianPyramidUpS16Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn)
116
{
117
if (!isSupportedConfiguration())
118
return false;
119
if ( (srcSize.width * cn) < 12 ||
120
(cn != 1 && cn !=3 && cn!=4) ||
121
std::abs((ptrdiff_t)dstSize.width - (ptrdiff_t)srcSize.width*2) != (ptrdiff_t)dstSize.width % 2 ||
122
std::abs((ptrdiff_t)dstSize.height - (ptrdiff_t)srcSize.height*2) != (ptrdiff_t)dstSize.height % 2 )
123
return false;
124
125
return true;
126
}
127
128
#ifdef CAROTENE_NEON
129
130
namespace {
131
132
ptrdiff_t borderInterpolate101(ptrdiff_t p, ptrdiff_t len)
{
    // Map an out-of-range coordinate p into [0, len) using Reflect101
    // (BORDER_REFLECT_101) semantics: the edge sample is not duplicated,
    // e.g. for len == 5: -1 -> 1, -2 -> 2, 5 -> 3, 6 -> 2.
    // A length-1 axis always maps to index 0.
    if (len == 1)
        return 0;

    while (p < 0 || p >= len)
        p = (p < 0) ? -p : (len - 1) * 2 - p;

    return p;
}
148
149
} // namespace
150
151
#endif
152
153
void gaussianPyramidDownRTZ(const Size2D &srcSize,
154
const u8 *srcBase, ptrdiff_t srcStride,
155
const Size2D &dstSize,
156
u8 *dstBase, ptrdiff_t dstStride,
157
BORDER_MODE border, u8 borderValue)
158
{
159
internal::assertSupportedConfiguration(isGaussianPyramidDownRTZSupported(srcSize, dstSize, border));
160
#ifdef CAROTENE_NEON
161
// Single-core NEON code
162
const size_t dwidth = dstSize.width;
163
const size_t dheight = dstSize.height;
164
const size_t swidth = srcSize.width;
165
const size_t sheight = srcSize.height;
166
167
ptrdiff_t idx_l1 = borderInterpolate101(-1, swidth);
168
ptrdiff_t idx_l2 = borderInterpolate101(-2, swidth);
169
ptrdiff_t idx_r1 = borderInterpolate101(swidth + 0, swidth);
170
ptrdiff_t idx_r2 = borderInterpolate101(swidth + 1, swidth);
171
172
//1-line buffer
173
std::vector<u16> _buf((swidth + 4) + 32/sizeof(u16));
174
u16* lane = internal::alignPtr(&_buf[2], 32);
175
176
uint8x8_t vc6u8 = vmov_n_u8(6);
177
uint16x8_t vc6u16 = vmovq_n_u16(6);
178
uint16x8_t vc4u16 = vmovq_n_u16(4);
179
180
u8* dst = dstBase;
181
182
for (size_t i = 0; i < dheight; ++i, dst += dstStride)
183
{
184
//vertical convolution
185
const u8* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-2, sheight));
186
const u8* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-1, sheight));
187
const u8* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+0, sheight));
188
const u8* ln3 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+1, sheight));
189
const u8* ln4 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+2, sheight));
190
191
size_t x = 0;
192
for (; x <= swidth - 8; x += 8)
193
{
194
internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2));
195
uint8x8_t v0 = vld1_u8(ln0+x);
196
uint8x8_t v1 = vld1_u8(ln1+x);
197
uint8x8_t v2 = vld1_u8(ln2+x);
198
uint8x8_t v3 = vld1_u8(ln3+x);
199
uint8x8_t v4 = vld1_u8(ln4+x);
200
201
uint16x8_t v = vaddl_u8(v0, v4);
202
uint16x8_t v13 = vaddl_u8(v1, v3);
203
204
v = vmlal_u8(v, v2, vc6u8);
205
v = vmlaq_u16(v, v13, vc4u16);
206
207
vst1q_u16(lane + x, v);
208
}
209
for (; x < swidth; ++x)
210
{
211
lane[x] = ln0[x] + ln4[x] + 4u * (ln1[x] + ln3[x]) + 6u * ln2[x];
212
}
213
214
//left&right borders
215
lane[-1] = lane[idx_l1];
216
lane[-2] = lane[idx_l2];
217
218
lane[swidth] = lane[idx_r1];
219
lane[swidth+1] = lane[idx_r2];
220
221
//horizontal convolution
222
x = 0;
223
size_t vw = (swidth/2) - 7; // Using 7 instead of 8 allows swidth of 14 or 15.
224
for (; x < vw; x += 8)
225
{
226
internal::prefetch(lane + 2 * x);
227
uint16x8x2_t vLane0 = vld2q_u16(lane + 2*x-2); // L0[0] = x0 x2 x4 x6 x8 x10 x12 x14 L0[1] = x1 x3 x5 x7 x9 x11 x13 x15
228
uint16x8x2_t vLane1 = vld2q_u16(lane + 2*x-1); // L1[0] = x1 x3 x5 x7 x9 x11 x13 x15 L1[1] = x2 x4 x6 x8 x10 x12 x14 x16
229
uint16x8x2_t vLane2 = vld2q_u16(lane + 2*x+0); // L2[0] = x2 x4 x6 x8 x10 x12 x14 x16 L2[1] = x3 x5 x7 x9 x11 x13 x15 x17
230
uint16x8x2_t vLane3 = vld2q_u16(lane + 2*x+1); // L3[0] = x3 x5 x7 x9 x11 x13 x15 x17 L3[1] = x4 x6 x8 x10 x12 x14 x16 x18
231
uint16x8x2_t vLane4 = vld2q_u16(lane + 2*x+2); // L4[0] = x4 x6 x8 x10 x12 x14 x16 x18 L4[1] = x5 x7 x9 x11 x13 x15 x17 x19
232
uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane4.val[0]);
233
uint16x8_t vSum_1_3 = vaddq_u16(vLane1.val[0], vLane3.val[0]);
234
vSum_0_4 = vmlaq_u16(vSum_0_4, vLane2.val[0], vc6u16);
235
vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_1_3, vc4u16);
236
uint8x8_t vRes = vshrn_n_u16(vSum_0_4, 8);
237
238
vst1_u8(dst + x, vRes);
239
}
240
241
for (; x < dwidth; x++)
242
{
243
dst[x] = u8((lane[2*x-2] + lane[2*x+2] + 4u * (lane[2*x-1] + lane[2*x+1]) + 6u * lane[2*x]) >> 8);
244
}
245
}
246
#else
247
// Remove 'unused parameter' warnings.
248
(void)srcSize;
249
(void)srcBase;
250
(void)srcStride;
251
(void)dstSize;
252
(void)dstBase;
253
(void)dstStride;
254
(void)border;
255
#endif
256
(void)borderValue;
257
}
258
259
void gaussianPyramidDown(const Size2D &srcSize,
                         const u8 *srcBase, ptrdiff_t srcStride,
                         const Size2D &dstSize,
                         u8 *dstBase, ptrdiff_t dstStride, u8 cn)
{
    // 2x Gaussian pyramid downsampling for interleaved u8 data (cn = 1, 3 or 4).
    // Separable [1 4 6 4 1] kernel: destination row i is built from source rows
    // 2i-2..2i+2 by a vertical pass into a u16 line buffer, then a horizontal
    // pass over every second column; the 16-bit sums are divided by 256 with
    // round-to-nearest. Out-of-image taps use Reflect101 indexing.
    internal::assertSupportedConfiguration(isGaussianPyramidDownU8Supported(srcSize, dstSize, cn));
#ifdef CAROTENE_NEON
    size_t dcolcn = dstSize.width*cn;   // dst row length in interleaved elements
    size_t scolcn = srcSize.width*cn;   // src row length in interleaved elements
    size_t roiw8 = dcolcn - 7;          // last start where an 8-byte store stays in bounds

    // Reflect101 element indices (scaled by cn) for the border taps.
    size_t idx_l1 = borderInterpolate101(-1, srcSize.width) * cn;
    size_t idx_l2 = borderInterpolate101(-2, srcSize.width) * cn;
    size_t idx_r1 = borderInterpolate101(srcSize.width + 0, srcSize.width) * cn;
    size_t idx_r2 = borderInterpolate101(srcSize.width + 1, srcSize.width) * cn;

    //1-line buffer: vertical-pass result with 2*cn guard cells on each side.
    std::vector<u16> _buf(cn*(srcSize.width + 4) + 32/sizeof(u16));
    u16* lane = internal::alignPtr(&_buf[2*cn], 32);

    uint8x8_t vc6u8 = vmov_n_u8(6);
    uint16x8_t vc6u16 = vmovq_n_u16(6);
    uint16x8_t vc4u16 = vmovq_n_u16(4);

    for (size_t i = 0; i < dstSize.height; ++i)
    {
        u8* dst = internal::getRowPtr(dstBase, dstStride, i);
        //vertical convolution: lane[x] = r0 + r4 + 4*(r1 + r3) + 6*r2
        const u8* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-2, srcSize.height));
        const u8* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-1, srcSize.height));
        const u8* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+0, srcSize.height));
        const u8* ln3 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+1, srcSize.height));
        const u8* ln4 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+2, srcSize.height));

        size_t x = 0;
        for (; x <= scolcn - 8; x += 8)
        {
            // Prefetch a row at a cycling offset (-2..+2) around ln2.
            internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, (ptrdiff_t)x % 5 - 2));
            uint8x8_t v0 = vld1_u8(ln0+x);
            uint8x8_t v1 = vld1_u8(ln1+x);
            uint8x8_t v2 = vld1_u8(ln2+x);
            uint8x8_t v3 = vld1_u8(ln3+x);
            uint8x8_t v4 = vld1_u8(ln4+x);

            uint16x8_t v = vaddl_u8(v0, v4);
            uint16x8_t v13 = vaddl_u8(v1, v3);

            v = vmlal_u8(v, v2, vc6u8);
            v = vmlaq_u16(v, v13, vc4u16);

            vst1q_u16(lane + x, v);
        }
        // Scalar tail of the vertical pass.
        for (; x < scolcn; ++x)
        {
            lane[x] = ln0[x] + ln4[x] + 4u * (ln1[x] + ln3[x]) + 6u * ln2[x];
        }

        //left&right borders: replicate Reflect101 pixels channel by channel.
        for (u32 k = 0; k < cn; ++k)
        {
            lane[(s32)(-cn+k)] = lane[idx_l1 + k];
            lane[(s32)(-cn-cn+k)] = lane[idx_l2 + k];

            lane[scolcn+k] = lane[idx_r1 + k];
            lane[scolcn+cn+k] = lane[idx_r2 + k];
        }

        //horizontal convolution over every second pixel; layout differs per cn.
        x = 0;
        switch(cn)
        {
        case 1:
            for (; x < roiw8; x += 8)
            {
                internal::prefetch(lane + 2 * x);
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
                // Hand-scheduled ARMv7 path for old GCC; vld2.16 de-interleaves
                // so the first register of each pair holds the even-column taps.
                __asm__ (
                    "vld2.16 {d0-d3}, [%[in0]] \n\t"
                    "vld2.16 {d4-d7}, [%[in4]] \n\t"
                    "vld2.16 {d12-d15}, [%[in1]] \n\t"
                    "vld2.16 {d16-d19}, [%[in3]] \n\t"
                    "vld2.16 {d8-d11}, [%[in2],:256] \n\t"
                    "vadd.i16 q0, q2 /*q0 = v0 + v4*/ \n\t"
                    "vadd.i16 q6, q8 /*q6 = v1 + v3*/ \n\t"
                    "vmla.i16 q0, q4, %q[c6] /*q0 += v2 * 6*/ \n\t"
                    "vmla.i16 q0, q6, %q[c4] /*q1 += (v1+v3) * 4*/ \n\t"
                    "vrshrn.u16 d8, q0, #8 \n\t"
                    "vst1.8 {d8}, [%[out]] \n\t"
                    : /*no output*/
                    : [out] "r" (dst + x),
                      [in0] "r" (lane + 2*x-2),
                      [in1] "r" (lane + 2*x-1),
                      [in2] "r" (lane + 2*x+0),
                      [in3] "r" (lane + 2*x+1),
                      [in4] "r" (lane + 2*x+2),
                      [c4] "w" (vc4u16), [c6] "w" (vc6u16)
                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"
                );
#else
                // Intrinsics path: val[0] of each de-interleaved load holds the
                // taps at offsets -2..+2 around column 2*x.
                uint16x8x2_t vLane0 = vld2q_u16(lane + 2*x-2);
                uint16x8x2_t vLane1 = vld2q_u16(lane + 2*x-1);
                uint16x8x2_t vLane2 = vld2q_u16(lane + 2*x+0);
                uint16x8x2_t vLane3 = vld2q_u16(lane + 2*x+1);
                uint16x8x2_t vLane4 = vld2q_u16(lane + 2*x+2);

                uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane4.val[0]);
                uint16x8_t vSum_1_3 = vaddq_u16(vLane1.val[0], vLane3.val[0]);
                vSum_0_4 = vmlaq_u16(vSum_0_4, vLane2.val[0], vc6u16);
                vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_1_3, vc4u16);
                // Rounding narrow: (sum + 128) >> 8.
                uint8x8_t vRes = vrshrn_n_u16(vSum_0_4, 8);

                vst1_u8(dst + x, vRes);
#endif
            }
            break;
        case 3:
        {
            // 3-channel path: two output pixels (6 elements) per iteration.
            // v0..v4 are assembled from 4-lane loads; the 4th lane of each
            // half is junk and is dropped by the vtbl shuffle below.
            uint16x4_t vx1 = vld1_u16(lane - 2*3);
            uint16x4_t vx2 = vld1_u16(lane - 1*3);
            uint16x4_t vx3 = vld1_u16(lane + 0*3);
            uint16x8_t v0 = vcombine_u16(vx1, vx3);

            // Table selects result bytes 0,1,2 and 4,5,6 (index 0xFF -> 0),
            // compacting two RGB pixels into 6 contiguous bytes.
            uint8x8_t map = vreinterpret_u8_u64(vmov_n_u64(0xFFFF060504020100ULL));
            for (; x < roiw8; x += 6)
            {
                internal::prefetch(lane + 2 * x + 12);

                uint16x4_t vx_ = vld1_u16(lane + 2*x-1*3 + 6);
                uint16x4_t vx4 = vld1_u16(lane + 2*x+0*3 + 6);
                uint16x4_t vx5 = vld1_u16(lane + 2*x+1*3 + 6);
                uint16x4_t vx6 = vld1_u16(lane + 2*x+2*3 + 6);

                uint16x8_t v1 = vcombine_u16(vx2, vx_);
                uint16x8_t v2 = vcombine_u16(vget_high_u16(v0), vx4);
                uint16x8_t v3 = vcombine_u16(vx_, vx5);
                uint16x8_t v4 = vcombine_u16(vx4, vx6);
                vx2 = vx5;  // carry taps forward to the next iteration

                uint16x8_t v = vaddq_u16(v0, v4);
                uint16x8_t v13 = vaddq_u16(v1, v3);

                v = vmlaq_u16(v, v2, vc6u16);
                v = vmlaq_u16(v, v13, vc4u16);

                uint8x8_t v8 = vrshrn_n_u16(v, 8);

                v0 = v4;  // carry taps forward

                // Stores 8 bytes (2 junk bytes are overwritten next iteration;
                // the loop bound keeps the store in range).
                vst1_u8(dst + x, vtbl1_u8(v8, map));
            }
        }
        break;
        case 4:
        {
            // 4-channel path: two output pixels (8 elements) per iteration;
            // lanes align naturally so no shuffle is needed.
            uint16x4_t vx1 = vld1_u16(lane - 2*4);
            uint16x4_t vx2 = vld1_u16(lane - 1*4);
            uint16x4_t vx3 = vld1_u16(lane + 0*4);
            uint16x8_t v0 = vcombine_u16(vx1, vx3);

            for (; x < roiw8; x += 8)
            {
                internal::prefetch(lane + 2 * x + 16);

                uint16x4_t vx_ = vld1_u16(lane + 2 * x - 1*4 + 8);
                uint16x4_t vx4 = vld1_u16(lane + 2 * x + 0*4 + 8);
                uint16x4_t vx5 = vld1_u16(lane + 2 * x + 1*4 + 8);
                uint16x4_t vx6 = vld1_u16(lane + 2 * x + 2*4 + 8);

                uint16x8_t v1 = vcombine_u16(vx2, vx_);
                uint16x8_t v2 = vcombine_u16(vget_high_u16(v0), vx4);
                uint16x8_t v3 = vcombine_u16(vx_, vx5);
                uint16x8_t v4 = vcombine_u16(vx4, vx6);
                vx2 = vx5;  // carry taps forward to the next iteration

                uint16x8_t v = vaddq_u16(v0, v4);
                uint16x8_t v13 = vaddq_u16(v1, v3);

                v = vmlaq_u16(v, v2, vc6u16);
                v = vmlaq_u16(v, v13, vc4u16);

                uint8x8_t v8 = vrshrn_n_u16(v, 8);

                v0 = v4;  // carry taps forward

                vst1_u8(dst + x, v8);
            }
        }
        break;
        }

        // Scalar tail of the horizontal pass, one channel at a time;
        // (1 << 7) is the rounding bias matching vrshrn's round-to-nearest.
        for (u32 h = 0; h < cn; ++h)
        {
            u16* ln = lane + h;
            u8* dt = dst + h;
            for (size_t k = x; k < dcolcn; k += cn)
                dt[k] = u8((ln[2*k-2*cn] + ln[2*k+2*cn] + 4u * (ln[2*k-cn] + ln[2*k+cn]) + 6u * ln[2*k] + (1 << 7)) >> 8);
        }
    }
#else
    // Remove 'unused parameter' warnings.
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
#endif
}
465
466
void gaussianPyramidDown(const Size2D &srcSize,
                         const s16 *srcBase, ptrdiff_t srcStride,
                         const Size2D &dstSize,
                         s16 *dstBase, ptrdiff_t dstStride, u8 cn)
{
    // 2x Gaussian pyramid downsampling for interleaved s16 data (cn = 1, 3 or 4).
    // Separable [1 4 6 4 1] kernel: a vertical pass into an s32 line buffer,
    // then a horizontal pass over every second column; sums are divided by 256
    // with round-to-nearest. Out-of-image taps use Reflect101 indexing.
    // Works 4 elements at a time (s32 intermediates fill a q-register).
    internal::assertSupportedConfiguration(isGaussianPyramidDownS16Supported(srcSize, dstSize, cn));
#ifdef CAROTENE_NEON
    size_t dcolcn = dstSize.width*cn;   // dst row length in interleaved elements
    size_t scolcn = srcSize.width*cn;   // src row length in interleaved elements
    size_t roiw4 = dcolcn - 3;          // last start where a 4-element store fits

    // Reflect101 element indices (scaled by cn) for the border taps.
    size_t idx_l1 = borderInterpolate101(-1, srcSize.width) * cn;
    size_t idx_l2 = borderInterpolate101(-2, srcSize.width) * cn;
    size_t idx_r1 = borderInterpolate101(srcSize.width + 0, srcSize.width) * cn;
    size_t idx_r2 = borderInterpolate101(srcSize.width + 1, srcSize.width) * cn;

    //1-line buffer: vertical-pass result with 2*cn guard cells on each side.
    std::vector<s32> _buf(cn*(srcSize.width + 4) + 32/sizeof(s32));
    s32* lane = internal::alignPtr(&_buf[2*cn], 32);

    int16x4_t vc6s16 = vmov_n_s16(6);
    int32x4_t vc6s32 = vmovq_n_s32(6);
    int32x4_t vc4s32 = vmovq_n_s32(4);

    for (size_t i = 0; i < dstSize.height; ++i)
    {
        s16* dst = internal::getRowPtr(dstBase, dstStride, i);
        //vertical convolution: lane[x] = r0 + r4 + 4*(r1 + r3) + 6*r2
        const s16* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-2, srcSize.height));
        const s16* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-1, srcSize.height));
        const s16* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+0, srcSize.height));
        const s16* ln3 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+1, srcSize.height));
        const s16* ln4 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+2, srcSize.height));

        size_t x = 0;
        for (; x <= scolcn - 4; x += 4)
        {
            // Prefetch a row at a cycling offset (-2..+2) around ln2.
            internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, (ptrdiff_t)x % 5 - 2));
            int16x4_t v0 = vld1_s16(ln0 + x);
            int16x4_t v1 = vld1_s16(ln1 + x);
            int16x4_t v2 = vld1_s16(ln2 + x);
            int16x4_t v3 = vld1_s16(ln3 + x);
            int16x4_t v4 = vld1_s16(ln4 + x);

            int32x4_t v = vaddl_s16(v0, v4);
            int32x4_t v13 = vaddl_s16(v1, v3);

            v = vmlal_s16(v, v2, vc6s16);
            v = vmlaq_s32(v, v13, vc4s32);

            vst1q_s32(lane + x, v);
        }
        // Scalar tail of the vertical pass.
        for (; x < scolcn; ++x)
        {
            lane[x] = ln0[x] + ln4[x] + 4 * (ln1[x] + ln3[x]) + 6 * ln2[x];
        }

        //left&right borders: replicate Reflect101 pixels channel by channel.
        for (u32 k = 0; k < cn; ++k)
        {
            lane[(s32)(-cn+k)] = lane[idx_l1 + k];
            lane[(s32)(-cn-cn+k)] = lane[idx_l2 + k];

            lane[scolcn+k] = lane[idx_r1 + k];
            lane[scolcn+cn+k] = lane[idx_r2 + k];
        }

        //horizontal convolution over every second pixel; layout differs per cn.
        x = 0;
        switch(cn)
        {
        case 1:
            for (; x < roiw4; x += 4)
            {
                internal::prefetch(lane + 2 * x);
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
                // Hand-scheduled ARMv7 path for old GCC; vld2.32 de-interleaves
                // so the first register of each pair holds the even-column taps.
                __asm__ (
                    "vld2.32 {d0-d3}, [%[in0]] \n\t"
                    "vld2.32 {d4-d7}, [%[in4]] \n\t"
                    "vld2.32 {d12-d15}, [%[in1]] \n\t"
                    "vld2.32 {d16-d19}, [%[in3]] \n\t"
                    "vld2.32 {d8-d11}, [%[in2],:256] \n\t"
                    "vadd.i32 q0, q2 \n\t"
                    "vadd.i32 q6, q8 \n\t"
                    "vmla.i32 q0, q4, %q[c6] \n\t"
                    "vmla.i32 q0, q6, %q[c4] \n\t"
                    "vrshrn.s32 d8, q0, #8 \n\t"
                    "vst1.16 {d8}, [%[out]] \n\t"
                    : /*no output*/
                    : [out] "r" (dst + x),
                      [in0] "r" (lane + 2*x-2),
                      [in1] "r" (lane + 2*x-1),
                      [in2] "r" (lane + 2*x+0),
                      [in3] "r" (lane + 2*x+1),
                      [in4] "r" (lane + 2*x+2),
                      [c4] "w" (vc4s32), [c6] "w" (vc6s32)
                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"
                );
#else
                // Intrinsics path: val[0] of each de-interleaved load holds the
                // taps at offsets -2..+2 around column 2*x.
                int32x4x2_t vLane0 = vld2q_s32(lane + 2*x-2);
                int32x4x2_t vLane1 = vld2q_s32(lane + 2*x-1);
                int32x4x2_t vLane2 = vld2q_s32(lane + 2*x+0);
                int32x4x2_t vLane3 = vld2q_s32(lane + 2*x+1);
                int32x4x2_t vLane4 = vld2q_s32(lane + 2*x+2);

                int32x4_t vSum_0_4 = vaddq_s32(vLane0.val[0], vLane4.val[0]);
                int32x4_t vSum_1_3 = vaddq_s32(vLane1.val[0], vLane3.val[0]);
                vSum_0_4 = vmlaq_s32(vSum_0_4, vLane2.val[0], vc6s32);
                vSum_0_4 = vmlaq_s32(vSum_0_4, vSum_1_3, vc4s32);
                // Rounding narrow: (sum + 128) >> 8.
                int16x4_t vRes = vrshrn_n_s32(vSum_0_4, 8);

                vst1_s16(dst + x, vRes);
#endif
            }
            break;
        case 3:
        {
            // 3-channel path: each 4-lane load spans one pixel plus one extra
            // channel; x advances by 3 so consecutive stores overlap by one
            // element (later stores overwrite the overlap).
            int32x4_t v0 = vld1q_s32(lane - 2*3);
            int32x4_t v1 = vld1q_s32(lane - 1*3);
            int32x4_t v2 = vld1q_s32(lane + 0*3);
            for (; x < roiw4; x += 3)
            {
                internal::prefetch(lane + 2 * x);

                int32x4_t v3 = vld1q_s32(lane + 2 * x + 1*3);
                int32x4_t v4 = vld1q_s32(lane + 2 * x + 2*3);

                int32x4_t v = vaddq_s32(v0, v4);
                int32x4_t v13 = vaddq_s32(v1, v3);

                v = vmlaq_s32(v, v2, vc6s32);
                v = vmlaq_s32(v, v13, vc4s32);

                int16x4_t vv = vrshrn_n_s32(v, 8);

                // Slide the tap window forward by one output pixel.
                v0 = v2;
                v1 = v3;
                v2 = v4;

                vst1_s16(dst + x, vv);
            }
        }
        break;
        case 4:
        {
            // 4-channel path: one 4-lane vector is exactly one pixel.
            int32x4_t v0 = vld1q_s32(lane - 2*4);
            int32x4_t v1 = vld1q_s32(lane - 1*4);
            int32x4_t v2 = vld1q_s32(lane + 0*4);
            for (; x < roiw4; x += 4)
            {
                internal::prefetch(lane + 2 * x + 8);
                int32x4_t v3 = vld1q_s32(lane + 2 * x + 1*4);
                int32x4_t v4 = vld1q_s32(lane + 2 * x + 2*4);

                int32x4_t v = vaddq_s32(v0, v4);
                int32x4_t v13 = vaddq_s32(v1, v3);

                v = vmlaq_s32(v, v2, vc6s32);
                v = vmlaq_s32(v, v13, vc4s32);

                int16x4_t vv = vrshrn_n_s32(v, 8);

                // Slide the tap window forward by one output pixel.
                v0 = v2;
                v1 = v3;
                v2 = v4;

                vst1_s16(dst + x, vv);
            }
        }
        break;
        }

        // Scalar tail of the horizontal pass, one channel at a time;
        // (1 << 7) is the rounding bias matching vrshrn's round-to-nearest.
        for (u32 h = 0; h < cn; ++h)
        {
            s32* ln = lane + h;
            s16* dt = dst + h;
            for (size_t k = x; k < dcolcn; k += cn)
                dt[k] = s16((ln[2*k-2*cn] + ln[2*k+2*cn] + 4 * (ln[2*k-cn] + ln[2*k+cn]) + 6 * ln[2*k] + (1 << 7)) >> 8);
        }
    }
#else
    // Remove 'unused parameter' warnings.
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
#endif
}
654
655
void gaussianPyramidDown(const Size2D &srcSize,
                         const f32 *srcBase, ptrdiff_t srcStride,
                         const Size2D &dstSize,
                         f32 *dstBase, ptrdiff_t dstStride, u8 cn)
{
    // 2x Gaussian pyramid downsampling for interleaved f32 data (cn = 1, 3 or 4).
    // Separable [1 4 6 4 1]/16 kernel split into floating-point factors: the
    // vertical pass applies [1 4 6 4 1]/4 (constants 0.25 and 1.5), and the
    // horizontal pass applies [1 4 6 4 1]/64 (0.015625, 0.0625, 0.09375), so
    // the combined normalization is 1/256. Out-of-image taps use Reflect101.
    internal::assertSupportedConfiguration(isGaussianPyramidDownF32Supported(srcSize, dstSize, cn));
#ifdef CAROTENE_NEON
    size_t dcolcn = dstSize.width*cn;   // dst row length in interleaved elements
    size_t scolcn = srcSize.width*cn;   // src row length in interleaved elements
    size_t roiw4 = dcolcn - 3;          // last start where a 4-element store fits

    // Reflect101 element indices (scaled by cn) for the border taps.
    size_t idx_l1 = borderInterpolate101(-1, srcSize.width) * cn;
    size_t idx_l2 = borderInterpolate101(-2, srcSize.width) * cn;
    size_t idx_r1 = borderInterpolate101(srcSize.width + 0, srcSize.width) * cn;
    size_t idx_r2 = borderInterpolate101(srcSize.width + 1, srcSize.width) * cn;

    //1-line buffer: vertical-pass result with 2*cn guard cells on each side.
    std::vector<f32> _buf(cn*(srcSize.width + 4) + 32/sizeof(f32));
    f32* lane = internal::alignPtr(&_buf[2*cn], 32);

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
    // Pin the kernel constants to q11-q15 for the inline-asm path below,
    // keeping them out of the registers the asm block clobbers.
    register float32x4_t vc6d4f32 asm ("q11") = vmovq_n_f32(1.5f); // 6/4
    register float32x4_t vc1d4f32 asm ("q12") = vmovq_n_f32(0.25f); // 1/4

    register float32x4_t vc1d64f32 asm ("q13") = vmovq_n_f32(0.015625f); //1/4/16
    register float32x4_t vc4d64f32 asm ("q14") = vmovq_n_f32(0.0625f); //4/4/16
    register float32x4_t vc6d64f32 asm ("q15") = vmovq_n_f32(0.09375f); //6/4/16
#else
    float32x4_t vc6d4f32 = vmovq_n_f32(1.5f); // 6/4
    float32x4_t vc1d4f32 = vmovq_n_f32(0.25f); // 1/4

    float32x4_t vc1d64f32 = vmovq_n_f32(0.015625f); //1/4/16
    float32x4_t vc4d64f32 = vmovq_n_f32(0.0625f); //4/4/16
    float32x4_t vc6d64f32 = vmovq_n_f32(0.09375f); //6/4/16
#endif

    for (size_t i = 0; i < dstSize.height; ++i)
    {
        f32* dst = internal::getRowPtr(dstBase, dstStride, i);
        //vertical convolution: lane[x] = 0.25*(r0+r4) + (r1+r3) + 1.5*r2
        const f32* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-2, srcSize.height));
        const f32* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-1, srcSize.height));
        const f32* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+0, srcSize.height));
        const f32* ln3 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+1, srcSize.height));
        const f32* ln4 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+2, srcSize.height));

        size_t x = 0;
        for (; x <= scolcn - 4; x += 4)
        {
            // Prefetch a row at a cycling offset (-2..+2) around ln2.
            internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, (ptrdiff_t)x % 5 - 2));
            float32x4_t v0 = vld1q_f32((const float32_t*)ln0 + x);
            float32x4_t v1 = vld1q_f32((const float32_t*)ln1 + x);
            float32x4_t v2 = vld1q_f32((const float32_t*)ln2 + x);
            float32x4_t v3 = vld1q_f32((const float32_t*)ln3 + x);
            float32x4_t v4 = vld1q_f32((const float32_t*)ln4 + x);

            float32x4_t v = vaddq_f32(v1, v3);
            float32x4_t v04 = vaddq_f32(v0, v4);

            v = vmlaq_f32(v, v2, vc6d4f32);
            v = vmlaq_f32(v, v04, vc1d4f32);

            vst1q_f32(lane + x, v);
        }
        // Scalar tail of the vertical pass.
        for (; x < scolcn; ++x)
        {
            lane[x] = 0.25f*(ln0[x] + ln4[x]) + (ln1[x] + ln3[x]) + 1.5f * ln2[x];
        }

        //left&right borders: replicate Reflect101 pixels channel by channel.
        for (u32 k = 0; k < cn; ++k)
        {
            lane[(s32)(-cn+k)] = lane[idx_l1 + k];
            lane[(s32)(-cn-cn+k)] = lane[idx_l2 + k];

            lane[scolcn+k] = lane[idx_r1 + k];
            lane[scolcn+cn+k] = lane[idx_r2 + k];
        }

        //horizontal convolution over every second pixel; layout differs per cn.
        x = 0;
        switch(cn)
        {
        case 1:
            for (; x < roiw4; x += 4)
            {
                internal::prefetch(lane + 2 * x);
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
                // Hand-scheduled ARMv7 path for old GCC; vld2.32 de-interleaves
                // so the first register of each pair holds the even-column taps.
                __asm__ __volatile__ (
                    "vld2.32 {d0-d3}, [%[in0]] \n\t"
                    "vld2.32 {d8-d11}, [%[in4]] \n\t"
                    "vld2.32 {d14-d17}, [%[in2],:256] \n\t"
                    "vld2.32 {d10-d13}, [%[in1]] \n\t"
                    "vld2.32 {d16-d19}, [%[in3]] \n\t"
                    "vmul.f32 q7, %q[c6d64] \n\t"
                    "vadd.f32 q0, q4 @v04 \n\t"
                    "vadd.f32 q5, q8 @v13 \n\t"
                    "vmla.f32 q7, q0, %q[c1d64] \n\t"
                    "vmla.f32 q7, q5, %q[c4d64] \n\t"
                    "vst1.32 {d14-d15}, [%[out]] \n\t"
                    :
                    : [out] "r" (dst + x),
                      [in0] "r" (lane + 2*x-2),
                      [in1] "r" (lane + 2*x-1),
                      [in2] "r" (lane + 2*x+0),
                      [in3] "r" (lane + 2*x+1),
                      [in4] "r" (lane + 2*x+2),
                      [c4d64] "w" (vc4d64f32), [c6d64] "w" (vc6d64f32), [c1d64] "w" (vc1d64f32)
                    : "d0","d1","d2","d3","d4",/*"d5","d6","d7",*/"d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19" //ugly compiler "bug" - can't touch d5-d7
                );
#else
                // Intrinsics path: val[0] of each de-interleaved load holds the
                // taps at offsets -2..+2 around column 2*x.
                float32x4x2_t vLane0 = vld2q_f32(lane + 2*x-2);
                float32x4x2_t vLane1 = vld2q_f32(lane + 2*x-1);
                float32x4x2_t vLane2 = vld2q_f32(lane + 2*x+0);
                float32x4x2_t vLane3 = vld2q_f32(lane + 2*x+1);
                float32x4x2_t vLane4 = vld2q_f32(lane + 2*x+2);

                float32x4_t vSum_0_4 = vaddq_f32(vLane0.val[0], vLane4.val[0]);
                float32x4_t vSum_1_3 = vaddq_f32(vLane1.val[0], vLane3.val[0]);
                float32x4_t vRes = vmulq_f32(vLane2.val[0], vc6d64f32);
                vRes = vmlaq_f32(vRes, vSum_0_4, vc1d64f32);
                vRes = vmlaq_f32(vRes, vSum_1_3, vc4d64f32);

                vst1q_f32(dst + x, vRes);
#endif
            }
            break;
        case 3:
        {
            // 3-channel path: each 4-lane load spans one pixel plus one extra
            // channel; x advances by 3 so consecutive stores overlap by one
            // element (later stores overwrite the overlap).
            float32x4_t v0 = vld1q_f32((const float32_t*)lane - 2*3);
            float32x4_t v1 = vld1q_f32((const float32_t*)lane - 1*3);
            float32x4_t v2 = vld1q_f32((const float32_t*)lane + 0*3);

            for (; x < roiw4; x += 3)
            {
                internal::prefetch(lane + 2 * x);

                float32x4_t v3 = vld1q_f32((const float32_t*)lane + 2 * x + 1*3);
                float32x4_t v4 = vld1q_f32((const float32_t*)lane + 2 * x + 2*3);

                float32x4_t v04 = vaddq_f32(v0, v4);
                float32x4_t v13 = vaddq_f32(v1, v3);

                float32x4_t v = vmulq_f32(v2, vc6d64f32);
                v = vmlaq_f32(v, v04, vc1d64f32);
                v = vmlaq_f32(v, v13, vc4d64f32);

                // Slide the tap window forward by one output pixel.
                v0 = v2;
                v1 = v3;
                v2 = v4;

                vst1q_f32(dst + x, v);
            }
        }
        break;
        case 4:
        {
            // 4-channel path: one 4-lane vector is exactly one pixel.
            float32x4_t v0 = vld1q_f32((const float32_t*)lane - 2*4);
            float32x4_t v1 = vld1q_f32((const float32_t*)lane - 1*4);
            float32x4_t v2 = vld1q_f32((const float32_t*)lane + 0*4);

            for (; x < roiw4; x += 4)
            {
                internal::prefetch(lane + 2 * x + 8);

                float32x4_t v3 = vld1q_f32((const float32_t*)lane + 2 * x + 1*4);
                float32x4_t v4 = vld1q_f32((const float32_t*)lane + 2 * x + 2*4);

                float32x4_t v04 = vaddq_f32(v0, v4);
                float32x4_t v13 = vaddq_f32(v1, v3);

                float32x4_t v = vmulq_f32(v2, vc6d64f32);
                v = vmlaq_f32(v, v04, vc1d64f32);
                v = vmlaq_f32(v, v13, vc4d64f32);

                // Slide the tap window forward by one output pixel.
                v0 = v2;
                v1 = v3;
                v2 = v4;

                vst1q_f32(dst + x, v);
            }
        }
        break;
        }

        // Scalar tail of the horizontal pass, one channel at a time
        // (no rounding term - floating point).
        for (u32 h = 0; h < cn; ++h)
        {
            f32* ln = lane + h;
            f32* dt = dst + h;
            for (size_t k = x; k < dcolcn; k += cn)
                dt[k] = 0.015625f * (ln[2*k-2*cn] + ln[2*k+2*cn]) + 0.0625f * (ln[2*k-cn] + ln[2*k+cn]) + 0.09375f * ln[2*k];
        }
    }
#else
    // Remove 'unused parameter' warnings.
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
#endif
}
856
857
void gaussianPyramidUp(const Size2D &srcSize,
858
const u8 *srcBase, ptrdiff_t srcStride,
859
const Size2D &dstSize,
860
u8 *dstBase, ptrdiff_t dstStride, u8 cn)
861
{
862
internal::assertSupportedConfiguration(isGaussianPyramidUpU8Supported(srcSize, dstSize, cn));
863
#ifdef CAROTENE_NEON
864
size_t dcolshn = (dstSize.width/2) * cn;
865
size_t dcolshw = ((dstSize.width+1)/2) * cn;
866
size_t scolsn = srcSize.width*cn;
867
868
size_t idx_l = (borderInterpolate101(-2, 2 * srcSize.width)/2) * cn;
869
size_t idx_r1 = (borderInterpolate101(2 * srcSize.width + 0, 2 * srcSize.width)/2) * cn;
870
size_t idx_r2 = (borderInterpolate101(2 * srcSize.width + 2, 2 * srcSize.width + 2)/2) * cn;
871
872
//2-lines buffer
873
std::vector<u16> _buf(2*(cn*(srcSize.width + 3) + 32/sizeof(u16)));
874
u16* lane0 = internal::alignPtr(&_buf[cn], 32);
875
u16* lane1 = internal::alignPtr(lane0 + (3 + srcSize.width)*cn, 32);
876
877
uint8x8_t vc6u8 = vmov_n_u8(6);
878
uint16x8_t vc6u16 = vmovq_n_u16(6);
879
880
for (size_t i = 0; i < (dstSize.height + 1)/2; ++i)
881
{
882
u8* dst = internal::getRowPtr(dstBase, dstStride, 2*i);
883
//vertical convolution
884
const u8* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 - 2, srcSize.height * 2)/2);
885
const u8* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 + 0, srcSize.height * 2)/2);
886
const u8* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 + 2, srcSize.height * 2)/2);
887
888
size_t x = 0;
889
for (; x <= scolsn - 8; x += 8)
890
{
891
internal::prefetch(internal::getRowPtr(ln1 + x, srcStride, (ptrdiff_t)x % 3 - 1));
892
uint8x8_t v0 = vld1_u8(ln0+x);
893
uint8x8_t v2 = vld1_u8(ln2+x);
894
uint8x8_t v1 = vld1_u8(ln1+x);
895
896
uint16x8_t vl0 = vaddl_u8(v0, v2);
897
uint16x8_t vl1 = vaddl_u8(v1, v2);
898
899
vl0 = vmlal_u8(vl0, v1, vc6u8);
900
vl1 = vshlq_n_u16(vl1, 2);
901
902
vst1q_u16(lane0 + x, vl0);
903
vst1q_u16(lane1 + x, vl1);
904
}
905
for (; x < scolsn; ++x)
906
{
907
lane0[x] = ln0[x] + ln2[x] + 6u * ln1[x];
908
lane1[x] = 4u * (ln1[x] + ln2[x]);
909
}
910
911
//left&right borders
912
for (u32 k = 0; k < cn; ++k)
913
{
914
lane0[(s32)(-cn+k)] = lane0[idx_l + k];
915
lane1[(s32)(-cn+k)] = lane1[idx_l + k];
916
917
lane0[scolsn+k] = lane0[idx_r1 + k];
918
lane0[scolsn+cn+k] = lane0[idx_r2 + k];
919
lane1[scolsn+k] = lane1[idx_r1 + k];
920
lane1[scolsn+cn+k] = lane1[idx_r2 + k];
921
}
922
923
//horizontal convolution
924
const u16* lane = lane0;
925
pyrUp8uHorizontalConvolution:
926
x = 0;
927
size_t lim;
928
switch(cn)
929
{
930
case 1:
931
lim = dcolshn > 7 ? dcolshn - 7 : 0;
932
for (; x < lim; x += 8)
933
{
934
internal::prefetch(lane + x);
935
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
936
__asm__ (
937
"vld1.16 {d0-d1}, [%[in0]] /*q0 = v0*/ \n\t"
938
"vld1.16 {d2-d3}, [%[in2]] /*q1 = v2*/ \n\t"
939
"vld1.16 {d4-d5}, [%[in1],:128] /*q2 = v1*/ \n\t"
940
"vadd.i16 q0, q1 /*q0 = v0 + v2*/ \n\t"
941
"vadd.i16 q3, q1, q2 /*q3 = v1 + v2*/ \n\t"
942
"vmla.i16 q0, q2, %q[c6] /*q0 += v1*6*/ \n\t"
943
"vrshrn.u16 d9, q3, #4 \n\t"
944
"vrshrn.u16 d8, q0, #6 \n\t"
945
"vst2.8 {d8-d9}, [%[out]] \n\t"
946
: /*no output*/
947
: [out] "r" (dst + x*2),
948
[in0] "r" (lane + x - 1),
949
[in1] "r" (lane + x + 0),
950
[in2] "r" (lane + x + 1),
951
[c6] "w" (vc6u16)
952
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9"
953
);
954
#else
955
uint16x8_t vLane0 = vld1q_u16(lane + x - 1);
956
uint16x8_t vLane1 = vld1q_u16(lane + x + 0);
957
uint16x8_t vLane2 = vld1q_u16(lane + x + 1);
958
959
vLane0 = vaddq_u16(vLane0, vLane2);
960
vLane2 = vaddq_u16(vLane2, vLane1);
961
vLane0 = vmlaq_u16(vLane0, vLane1, vc6u16);
962
uint8x8x2_t vRes;
963
vRes.val[0] = vrshrn_n_u16(vLane0, 6);
964
vRes.val[1] = vrshrn_n_u16(vLane2, 4);
965
966
vst2_u8(dst + x*2, vRes);
967
#endif
968
}
969
break;
970
case 3:
971
{
972
lim = dcolshn > 23 ? dcolshn - 23 : 0;
973
for (; x < lim; x += 24)
974
{
975
internal::prefetch(lane + x);
976
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
977
__asm__ (
978
"vmov.u16 q9, #6 \n\t"
979
"vld3.16 {d0, d2, d4}, [%[in0]] /*v0*/ \n\t"
980
"vld3.16 {d1, d3, d5}, [%[in02]] \n\t"
981
"vld3.16 {d6, d8, d10}, [%[in2]] /*v2*/ \n\t"
982
"vld3.16 {d7, d9, d11}, [%[in22]] \n\t"
983
"vld3.16 {d12, d14, d16}, [%[in1]] /*v1*/ \n\t"
984
"vld3.16 {d13, d15, d17}, [%[in12]] \n\t"
985
"vadd.i16 q0, q3 /*v0 + v2*/ \n\t"
986
"vadd.i16 q1, q4 /*v0 + v2*/ \n\t"
987
"vadd.i16 q2, q5 /*v0 + v2*/ \n\t"
988
"vadd.i16 q3, q6 /*v1 + v2*/ \n\t"
989
"vadd.i16 q4, q7 /*v1 + v2*/ \n\t"
990
"vadd.i16 q5, q8 /*v1 + v2*/ \n\t"
991
"vmla.i16 q0, q6, q9 /*v0 + v2 + v1*6 */ \n\t"
992
"vmla.i16 q1, q7, q9 /*v0 + v2 + v1*6 */ \n\t"
993
"vmla.i16 q2, q8, q9 /*v0 + v2 + v1*6 */ \n\t"
994
"vrshrn.u16 d19, q3, #4 \n\t"
995
"vrshrn.u16 d21, q4, #4 \n\t"
996
"vrshrn.u16 d23, q5, #4 \n\t"
997
"vrshrn.u16 d18, q0, #6 \n\t"
998
"vrshrn.u16 d20, q1, #6 \n\t"
999
"vrshrn.u16 d22, q2, #6 \n\t"
1000
"vzip.8 d18, d19 \n\t"
1001
"vzip.8 d20, d21 \n\t"
1002
"vzip.8 d22, d23 \n\t"
1003
"vst3.8 {d18, d20, d22}, [%[out1]] \n\t"
1004
"vst3.8 {d19, d21, d23}, [%[out2]] \n\t"
1005
: /*no output*/
1006
: [out1] "r" (dst + 2 * x),
1007
[out2] "r" (dst + 2 * x + 24),
1008
[in0] "r" (lane + x - 3),
1009
[in02] "r" (lane + x + 9),
1010
[in1] "r" (lane + x),
1011
[in12] "r" (lane + x + 12),
1012
[in2] "r" (lane + x + 3),
1013
[in22] "r" (lane + x + 15)
1014
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"
1015
);
1016
#else
1017
uint16x8_t vc6 = vmovq_n_u16(6);
1018
uint16x8x3_t vLane0 = vld3q_u16(lane + x - 3);
1019
uint16x8x3_t vLane1 = vld3q_u16(lane + x + 0);
1020
uint16x8x3_t vLane2 = vld3q_u16(lane + x + 3);
1021
1022
uint16x8_t vSum_0_3 = vaddq_u16(vLane0.val[0], vLane2.val[0]);
1023
uint16x8_t vSum_1_4 = vaddq_u16(vLane0.val[1], vLane2.val[1]);
1024
uint16x8_t vSum_2_5 = vaddq_u16(vLane0.val[2], vLane2.val[2]);
1025
uint16x8_t vSum_3_6 = vaddq_u16(vLane2.val[0], vLane1.val[0]);
1026
uint16x8_t vSum_4_7 = vaddq_u16(vLane2.val[1], vLane1.val[1]);
1027
uint16x8_t vSum_5_8 = vaddq_u16(vLane2.val[2], vLane1.val[2]);
1028
1029
vSum_0_3 = vmlaq_u16(vSum_0_3, vLane1.val[0], vc6);
1030
vSum_1_4 = vmlaq_u16(vSum_1_4, vLane1.val[1], vc6);
1031
vSum_2_5 = vmlaq_u16(vSum_2_5, vLane1.val[2], vc6);
1032
1033
uint8x8x2_t vSumShr3;
1034
vSumShr3.val[0] = vrshrn_n_u16(vSum_3_6, 4);
1035
vSumShr3.val[1] = vrshrn_n_u16(vSum_0_3, 6);;
1036
uint8x8x2_t vSumShr4;
1037
vSumShr4.val[0] = vrshrn_n_u16(vSum_4_7, 4);
1038
vSumShr4.val[1] = vrshrn_n_u16(vSum_1_4, 6);
1039
uint8x8x2_t vSumShr5;
1040
vSumShr5.val[0] = vrshrn_n_u16(vSum_5_8, 4);
1041
vSumShr5.val[1] = vrshrn_n_u16(vSum_2_5, 6);
1042
1043
vSumShr3 = vzip_u8(vSumShr3.val[1], vSumShr3.val[0]);
1044
vSumShr4 = vzip_u8(vSumShr4.val[1], vSumShr4.val[0]);
1045
vSumShr5 = vzip_u8(vSumShr5.val[1], vSumShr5.val[0]);
1046
1047
uint8x8x3_t vRes1;
1048
vRes1.val[0] = vSumShr3.val[0];
1049
vRes1.val[1] = vSumShr4.val[0];
1050
vRes1.val[2] = vSumShr5.val[0];
1051
vst3_u8(dst + 2 * x, vRes1);
1052
1053
uint8x8x3_t vRes2;
1054
vRes2.val[0] = vSumShr3.val[1];
1055
vRes2.val[1] = vSumShr4.val[1];
1056
vRes2.val[2] = vSumShr5.val[1];
1057
vst3_u8(dst + 2 * x + 24, vRes2);
1058
#endif
1059
}
1060
}
1061
break;
1062
case 4:
1063
lim = dcolshn > 7 ? dcolshn - 7 : 0;
1064
for (; x < lim; x += 8)
1065
{
1066
internal::prefetch(lane + x);
1067
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
1068
__asm__ (
1069
"vld1.16 {d0-d1}, [%[in0]] /*q0 = v0*/ \n\t"
1070
"vld1.16 {d2-d3}, [%[in2]] /*q1 = v2*/ \n\t"
1071
"vld1.16 {d4-d5}, [%[in1],:128] /*q2 = v1*/ \n\t"
1072
"vadd.i16 q0, q1 /*q0 = v0 + v2*/ \n\t"
1073
"vadd.i16 q3, q1, q2 /*q3 = v1 + v2*/ \n\t"
1074
"vmla.i16 q0, q2, %q[c6] /*q0 += v1*6*/ \n\t"
1075
"vrshrn.u16 d9, q3, #4 \n\t"
1076
"vrshrn.u16 d8, q0, #6 \n\t"
1077
"vst2.32 {d8-d9}, [%[out]] \n\t"
1078
: /*no output*/
1079
: [out] "r" (dst + x*2),
1080
[in0] "r" (lane + x-4),
1081
[in1] "r" (lane + x),
1082
[in2] "r" (lane + x+4),
1083
[c6] "w" (vc6u16)
1084
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9"
1085
);
1086
#else
1087
uint16x8_t vLane0 = vld1q_u16(lane + x-4);
1088
uint16x8_t vLane1 = vld1q_u16(lane + x+0);
1089
uint16x8_t vLane2 = vld1q_u16(lane + x+4);
1090
1091
vLane0 = vaddq_u16(vLane0, vLane2);
1092
vLane2 = vaddq_u16(vLane2, vLane1);
1093
vLane0 = vmlaq_u16(vLane0, vLane1, vc6u16);
1094
uint32x2x2_t vRes;
1095
vRes.val[1] = vreinterpret_u32_u8(vrshrn_n_u16(vLane2, 4));
1096
vRes.val[0] = vreinterpret_u32_u8(vrshrn_n_u16(vLane0, 6));
1097
1098
vst2_u32((uint32_t*)(dst + x*2), vRes);
1099
#endif
1100
}
1101
break;
1102
};
1103
1104
for (u32 h = 0; h < cn; ++h)
1105
{
1106
const u16* ln = lane + h;
1107
u8* dt = dst + h;
1108
size_t k = x;
1109
for (; k < dcolshn; k += cn)
1110
{
1111
dt[2*k+0] = u8((ln[(ptrdiff_t)(k-cn)] + ln[k+cn] + 6u * ln[k] + (1 << 5)) >> 6);
1112
dt[2*k+cn] = u8(((ln[k] + ln[k+cn]) * 4u + (1 << 5)) >> 6);
1113
}
1114
for (; k < dcolshw; k += cn)
1115
dt[2*k] = u8((ln[(ptrdiff_t)(k-cn)] + ln[k+cn] + 6u * ln[k] + (1 << 5)) >> 6);
1116
}
1117
dst = internal::getRowPtr(dstBase, dstStride, 2*i+1);
1118
1119
//second row
1120
if (lane == lane0 && 2*i+1 < dstSize.height)
1121
{
1122
lane = lane1;
1123
goto pyrUp8uHorizontalConvolution;
1124
}
1125
}
1126
#else
1127
// Remove 'unused parameter' warnings.
1128
(void)srcBase;
1129
(void)srcStride;
1130
(void)dstBase;
1131
(void)dstStride;
1132
#endif
1133
}
1134
1135
/*
 * Gaussian pyramid up-sampling (pyrUp) for s16 images, NEON accelerated.
 *
 * srcBase/srcStride : source image (roughly half the destination size)
 * dstBase/dstStride : destination image
 * cn                : channel count (values accepted by the capability check)
 *
 * For each pair of destination rows the function:
 *   1) runs the vertical 1-6-1 (even row) / 4-4 (odd row) convolution into
 *      two s32 scratch lines, lane0 and lane1, keeping sums unnormalised;
 *   2) mirrors the scratch-line borders (reflect-101);
 *   3) runs the horizontal convolution over lane0, then jumps back via the
 *      goto/label to repeat it over lane1 for the odd output row; results
 *      are rounded and normalised with >>6 (even) and >>4 (odd) narrowing.
 */
void gaussianPyramidUp(const Size2D &srcSize,
                       const s16 *srcBase, ptrdiff_t srcStride,
                       const Size2D &dstSize,
                       s16 *dstBase, ptrdiff_t dstStride, u8 cn)
{
    internal::assertSupportedConfiguration(isGaussianPyramidUpS16Supported(srcSize, dstSize, cn));
#ifdef CAROTENE_NEON
    // Destination row lengths in elements (pixels * cn): dcolshn is the part
    // handled as even/odd column pairs; dcolshw additionally covers the final
    // even column when dstSize.width is odd.
    size_t dcolshn = (dstSize.width/2) * cn;
    size_t dcolshw = ((dstSize.width+1)/2) * cn;
    size_t scolsn = srcSize.width*cn; // source row length in elements

    // Reflect-101 border source indices, computed in up-sampled (2x)
    // coordinates and divided back down to source columns.
    size_t idx_l = (borderInterpolate101(-2, 2 * srcSize.width)/2) * cn;
    size_t idx_r1 = (borderInterpolate101(2 * srcSize.width + 0, 2 * srcSize.width)/2) * cn;
    // NOTE(review): the length argument here is 2*srcSize.width + 2, unlike
    // the 2*srcSize.width used for idx_l/idx_r1; with this length idx_r2
    // resolves to scolsn, i.e. the border cell that is filled from idx_r1
    // just before it is read below. Confirm against the u8 variant whether
    // this asymmetry is intentional.
    size_t idx_r2 = (borderInterpolate101(2 * srcSize.width + 2, 2 * srcSize.width + 2)/2) * cn;

    //2-lines buffer (s32 holds the unnormalised 1-6-1 sums); each lane keeps
    //extra elements around the payload for the mirrored borders.
    std::vector<s32> _buf(2*(cn*(srcSize.width + 3) + 32/sizeof(s32)));
    s32* lane0 = internal::alignPtr(&_buf[cn], 32);
    s32* lane1 = internal::alignPtr(lane0 + (3 + srcSize.width)*cn, 32);

    int16x4_t vc6s16 = vmov_n_s16(6);  // kernel centre weight, vertical pass
    int32x4_t vc6s32 = vmovq_n_s32(6); // kernel centre weight, horizontal pass

    for (size_t i = 0; i < (dstSize.height + 1)/2; ++i)
    {
        s16* dst = internal::getRowPtr(dstBase, dstStride, 2*i);
        //vertical convolution: neighbour rows selected via reflect-101 on the
        //up-sampled row index (2*i - 2, 2*i, 2*i + 2)
        const s16* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 - 2, srcSize.height * 2)/2);
        const s16* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 + 0, srcSize.height * 2)/2);
        const s16* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 + 2, srcSize.height * 2)/2);

        size_t x = 0;
        for (; x <= scolsn - 4; x += 4)
        {
            internal::prefetch(internal::getRowPtr(ln1 + x, srcStride, (ptrdiff_t)x % 3 - 1));
            int16x4_t v0 = vld1_s16(ln0 + x);
            int16x4_t v2 = vld1_s16(ln2 + x);
            int16x4_t v1 = vld1_s16(ln1 + x);

            // lane0 = v0 + 6*v1 + v2  (even output row, unnormalised)
            // lane1 = 4*(v1 + v2)     (odd output row, unnormalised)
            int32x4_t vl0 = vaddl_s16(v0, v2);
            int32x4_t vl1 = vaddl_s16(v1, v2);

            vl0 = vmlal_s16(vl0, v1, vc6s16);
            vl1 = vshlq_n_s32(vl1, 2);

            vst1q_s32(lane0 + x, vl0);
            vst1q_s32(lane1 + x, vl1);
        }
        for (; x < scolsn; ++x) // scalar tail of the vertical pass
        {
            lane0[x] = ln0[x] + ln2[x] + 6 * ln1[x];
            lane1[x] = 4 * (ln1[x] + ln2[x]);
        }

        //left&right borders: per-channel mirroring of the scratch lines
        for (u32 k = 0; k < cn; ++k)
        {
            lane0[(s32)(-cn+k)] = lane0[idx_l + k];
            lane1[(s32)(-cn+k)] = lane1[idx_l + k];

            lane0[scolsn+k] = lane0[idx_r1 + k];
            lane0[scolsn+cn+k] = lane0[idx_r2 + k];
            lane1[scolsn+k] = lane1[idx_r1 + k];
            lane1[scolsn+cn+k] = lane1[idx_r2 + k];
        }

        //horizontal convolution; executed twice per iteration (lane0, then
        //lane1 via the goto at the bottom of the loop body)
        const s32* lane = lane0;
pyrUp16sHorizontalConvolution:
        x = 0;
        size_t lim;
        switch(cn)
        {
        case 1:
            lim = dcolshn > 3 ? dcolshn - 3 : 0;
            for (; x < lim; x += 4)
            {
                internal::prefetch(lane + x);
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
                __asm__ (
                    "vld1.32 {d0-d1}, [%[in0]] /*q0 = v0*/ \n\t"
                    "vld1.32 {d2-d3}, [%[in2]] /*q1 = v2*/ \n\t"
                    "vld1.32 {d4-d5}, [%[in1],:128] /*q2 = v1*/ \n\t"
                    "vadd.i32 q0, q0, q1 /*q0 = v0 + v2*/ \n\t"
                    "vadd.i32 q3, q1, q2 /*q3 = v1 + v2*/ \n\t"
                    "vmla.i32 q0, q2, %q[c6] /*q0 += v1*6*/ \n\t"
                    "vrshrn.s32 d9, q3, #4 \n\t"
                    "vrshrn.s32 d8, q0, #6 \n\t"
                    "vst2.16 {d8-d9}, [%[out]] \n\t"
                    : /*no output*/
                    : [out] "r" (dst + x * 2),
                      [in0] "r" (lane + x - 1),
                      [in1] "r" (lane + x),
                      [in2] "r" (lane + x + 1),
                      [c6] "w" (vc6s32)
                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9"
                );
#else
                int32x4_t vLane0 = vld1q_s32(lane + x - 1);
                int32x4_t vLane1 = vld1q_s32(lane + x);
                int32x4_t vLane2 = vld1q_s32(lane + x + 1);

                vLane0 = vaddq_s32(vLane0, vLane2);         // l[-1] + l[+1]
                vLane2 = vaddq_s32(vLane2, vLane1);         // l[0] + l[+1] (odd columns)
                vLane0 = vmlaq_s32(vLane0, vLane1, vc6s32); // += 6*l[0] (even columns)
                int16x4x2_t vRes;
                vRes.val[0] = vrshrn_n_s32(vLane0, 6); // even columns, rounded
                vRes.val[1] = vrshrn_n_s32(vLane2, 4); // odd columns: (l0+l1)*4 >> 6

                vst2_s16(dst + x * 2, vRes); // interleave even/odd columns
#endif
            }
            break;
        case 3:
        {
            lim = dcolshn > 11 ? dcolshn - 11 : 0;
            for (; x < lim; x += 12)
            {
                internal::prefetch(lane + x + 3);
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
                // The second 4-pixel half of v0 starts at lane + x - 3 + 6,
                // which equals [in2] (lane + x + 3), so [in2] is used for
                // both that half-load and the first half of v2.
                __asm__ (
                    "vmov.s32 q9, #6 \n\t"
                    "vld3.32 {d0, d2, d4}, [%[in0]] /*v0*/ \n\t"
                    "vld3.32 {d1, d3, d5}, [%[in2]] \n\t"
                    "vld3.32 {d6, d8, d10}, [%[in2]] /*v2*/ \n\t"
                    "vld3.32 {d7, d9, d11}, [%[in22]] \n\t"
                    "vld3.32 {d12, d14, d16}, [%[in1]] /*v1*/ \n\t"
                    "vld3.32 {d13, d15, d17}, [%[in12]] \n\t"
                    "vadd.i32 q0, q3 /*v0 + v2*/ \n\t"
                    "vadd.i32 q1, q4 /*v0 + v2*/ \n\t"
                    "vadd.i32 q2, q5 /*v0 + v2*/ \n\t"
                    "vadd.i32 q3, q6 /*v1 + v2*/ \n\t"
                    "vadd.i32 q4, q7 /*v1 + v2*/ \n\t"
                    "vadd.i32 q5, q8 /*v1 + v2*/ \n\t"
                    "vmla.i32 q0, q6, q9 /*v0 + v2 + v1*6 */ \n\t"
                    "vmla.i32 q1, q7, q9 /*v0 + v2 + v1*6 */ \n\t"
                    "vmla.i32 q2, q8, q9 /*v0 + v2 + v1*6 */ \n\t"
                    "vrshrn.s32 d19, q3, #4 \n\t"
                    "vrshrn.s32 d21, q4, #4 \n\t"
                    "vrshrn.s32 d23, q5, #4 \n\t"
                    "vrshrn.s32 d18, q0, #6 \n\t"
                    "vrshrn.s32 d20, q1, #6 \n\t"
                    "vrshrn.s32 d22, q2, #6 \n\t"
                    "vzip.16 d18, d19 \n\t"
                    "vzip.16 d20, d21 \n\t"
                    "vzip.16 d22, d23 \n\t"
                    "vst3.16 {d18, d20, d22}, [%[out1]] \n\t"
                    "vst3.16 {d19, d21, d23}, [%[out2]] \n\t"
                    : /*no output*/
                    : [out1] "r" (dst + 2*x),
                      [out2] "r" (dst + 2*x + 12),
                      [in0] "r" (lane + x - 3),
                      [in1] "r" (lane + x),
                      [in12] "r" (lane + x + 6),
                      [in2] "r" (lane + x + 3),
                      [in22] "r" (lane + x + 9)
                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"
                );
#else
                int32x4_t vc6 = vmovq_n_s32(6);
                // De-interleaved loads: val[0..2] hold the three channels.
                int32x4x3_t vLane0 = vld3q_s32(lane + x - 3);
                int32x4x3_t vLane1 = vld3q_s32(lane + x);
                int32x4x3_t vLane2 = vld3q_s32(lane + x + 3);

                int32x4_t vSum_0_3 = vaddq_s32(vLane0.val[0], vLane2.val[0]); // l[-1]+l[+1], per channel
                int32x4_t vSum_1_4 = vaddq_s32(vLane0.val[1], vLane2.val[1]);
                int32x4_t vSum_2_5 = vaddq_s32(vLane0.val[2], vLane2.val[2]);
                int32x4_t vSum_3_6 = vaddq_s32(vLane2.val[0], vLane1.val[0]); // l[0]+l[+1], per channel
                int32x4_t vSum_4_7 = vaddq_s32(vLane2.val[1], vLane1.val[1]);
                int32x4_t vSum_5_8 = vaddq_s32(vLane2.val[2], vLane1.val[2]);

                vSum_0_3 = vmlaq_s32(vSum_0_3, vLane1.val[0], vc6); // += 6*l[0]
                vSum_1_4 = vmlaq_s32(vSum_1_4, vLane1.val[1], vc6);
                vSum_2_5 = vmlaq_s32(vSum_2_5, vLane1.val[2], vc6);

                // Narrow with rounding: val[0] = even columns, val[1] = odd.
                int16x4x2_t vSumShr1;
                vSumShr1.val[1] = vrshrn_n_s32(vSum_3_6, 4);
                vSumShr1.val[0] = vrshrn_n_s32(vSum_0_3, 6);

                int16x4x2_t vSumShr2;
                vSumShr2.val[1] = vrshrn_n_s32(vSum_4_7, 4);
                vSumShr2.val[0] = vrshrn_n_s32(vSum_1_4, 6);

                int16x4x2_t vSumShr3;
                vSumShr3.val[1] = vrshrn_n_s32(vSum_5_8, 4);
                vSumShr3.val[0] = vrshrn_n_s32(vSum_2_5, 6);

                // Interleave even/odd results per channel before the stores.
                vSumShr1 = vzip_s16(vSumShr1.val[0], vSumShr1.val[1]);
                vSumShr2 = vzip_s16(vSumShr2.val[0], vSumShr2.val[1]);
                vSumShr3 = vzip_s16(vSumShr3.val[0], vSumShr3.val[1]);

                int16x4x3_t vRes1;
                vRes1.val[0] = vSumShr1.val[0];
                vRes1.val[1] = vSumShr2.val[0];
                vRes1.val[2] = vSumShr3.val[0];
                vst3_s16((int16_t*)(dst + 2 * x), vRes1);

                int16x4x3_t vRes2;
                vRes2.val[0] = vSumShr1.val[1];
                vRes2.val[1] = vSumShr2.val[1];
                vRes2.val[2] = vSumShr3.val[1];
                vst3_s16(dst + 2 * x + 12, vRes2);
#endif
            }
        }
            break;
        case 4:
            lim = dcolshn > 3 ? dcolshn - 3 : 0;
            for (; x < lim; x += 4)
            {
                internal::prefetch(lane + x);
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
                __asm__ (
                    "vld1.32 {d0-d1}, [%[in0]] /*q0 = v0*/ \n\t"
                    "vld1.32 {d2-d3}, [%[in2]] /*q1 = v2*/ \n\t"
                    "vld1.32 {d4-d5}, [%[in1],:128] /*q2 = v1*/ \n\t"
                    "vadd.i32 q0, q1 /*q0 = v0 + v2*/ \n\t"
                    "vadd.i32 q3, q1, q2 /*q3 = v1 + v2*/ \n\t"
                    "vmla.i32 q0, q2, %q[c6] /*q0 += v1*6*/ \n\t"
                    "vrshrn.s32 d9, q3, #4 \n\t"
                    "vrshrn.s32 d8, q0, #6 \n\t"
                    "vst1.16 {d8-d9}, [%[out]] \n\t"
                    : /*no output*/
                    : [out] "r" (dst + x * 2),
                      [in0] "r" (lane + x - 4),
                      [in1] "r" (lane + x),
                      [in2] "r" (lane + x + 4),
                      [c6] "w" (vc6s32)
                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9"
                );
#else
                int32x4_t vLane0 = vld1q_s32(lane + x - 4);
                int32x4_t vLane1 = vld1q_s32(lane + x);
                int32x4_t vLane2 = vld1q_s32(lane + x + 4);

                vLane0 = vaddq_s32(vLane0, vLane2);
                vLane2 = vaddq_s32(vLane2, vLane1);
                vLane0 = vmlaq_s32(vLane0, vLane1, vc6s32);
                int16x4x2_t vRes;
                vRes.val[0] = vrshrn_n_s32(vLane0, 6);
                vRes.val[1] = vrshrn_n_s32(vLane2, 4);

                // cn == 4: one even pixel then one odd pixel are 4 contiguous
                // elements each, so a plain (non-interleaving) store suffices.
                vst1q_s16(dst + x * 2, vcombine_s16(vRes.val[0], vRes.val[1]));
#endif
            }
            break;
        };

        // Scalar tail of the horizontal pass: remaining even/odd column pairs,
        // then the final even column when the destination width is odd.
        for (u32 h = 0; h < cn; ++h)
        {
            const s32* ln = lane + h;
            s16* dt = dst + h;
            size_t k = x;
            for (; k < dcolshn; k += cn)
            {
                dt[2*k+0] = s16((ln[(ptrdiff_t)(k-cn)] + ln[k+cn] + 6 * ln[k] + (1 << 5)) >> 6);
                dt[2*k+cn] = s16(((ln[k] + ln[k+cn]) * 4 + (1 << 5)) >> 6);
            }
            for (; k < dcolshw; k += cn)
                dt[2*k] = s16((ln[(ptrdiff_t)(k-cn)] + ln[k+cn] + 6 * ln[k] + (1 << 5)) >> 6);
        }
        dst = internal::getRowPtr(dstBase, dstStride, 2*i+1);

        //second row: redo the horizontal pass over lane1 for the odd output row
        if (lane == lane0 && 2*i+1 < dstSize.height)
        {
            lane = lane1;
            goto pyrUp16sHorizontalConvolution;
        }
    }
#else
    // Remove 'unused parameter' warnings.
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
#endif
}
1413
1414
} // namespace CAROTENE_NS
1415
1416