CoCalc -- scharr.cpp

GitHub Repository: Tetragramm/opencv
Path: blob/master/3rdparty/carotene/src/scharr.cpp
16337 views
1
/*
2
 * By downloading, copying, installing or using the software you agree to this license.
3
 * If you do not agree to this license, do not download, install,
4
 * copy or use the software.
5
 *
6
 *
7
 *                           License Agreement
8
 *                For Open Source Computer Vision Library
9
 *                        (3-clause BSD License)
10
 *
11
 * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
12
 * Third party copyrights are property of their respective owners.
13
 *
14
 * Redistribution and use in source and binary forms, with or without modification,
15
 * are permitted provided that the following conditions are met:
16
 *
17
 *   * Redistributions of source code must retain the above copyright notice,
18
 *     this list of conditions and the following disclaimer.
19
 *
20
 *   * Redistributions in binary form must reproduce the above copyright notice,
21
 *     this list of conditions and the following disclaimer in the documentation
22
 *     and/or other materials provided with the distribution.
23
 *
24
 *   * Neither the names of the copyright holders nor the names of the contributors
25
 *     may be used to endorse or promote products derived from this software
26
 *     without specific prior written permission.
27
 *
28
 * This software is provided by the copyright holders and contributors "as is" and
29
 * any express or implied warranties, including, but not limited to, the implied
30
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
31
 * In no event shall copyright holders or contributors be liable for any direct,
32
 * indirect, incidental, special, exemplary, or consequential damages
33
 * (including, but not limited to, procurement of substitute goods or services;
34
 * loss of use, data, or profits; or business interruption) however caused
35
 * and on any theory of liability, whether in contract, strict liability,
36
 * or tort (including negligence or otherwise) arising in any way out of
37
 * the use of this software, even if advised of the possibility of such damage.
38
 */
39

40
#include <vector>
41

42
#include "common.hpp"
43

44
namespace CAROTENE_NS {
45

46
bool isScharr3x3Supported(const Size2D &size, BORDER_MODE border, s32 dx, s32 dy, Margin borderMargin)
47
{
48
    return (dx == 0 && dy == 1 &&
49
                   isSeparableFilter3x3Supported(size, border, 3, 1, borderMargin)) ||
50
           (dx == 1 && dy == 0 &&
51
                   isSeparableFilter3x3Supported(size, border, 1, 3, borderMargin));
52
}
53

54
void Scharr3x3(const Size2D &size,
55
               const u8 * srcBase, ptrdiff_t srcStride,
56
               s16 * dstBase, ptrdiff_t dstStride,
57
               s32 dx, s32 dy,
58
               BORDER_MODE border, u8 borderValue, Margin borderMargin)
59
{
60
    internal::assertSupportedConfiguration(isScharr3x3Supported(size, border, dx, dy, borderMargin));
61
#ifdef CAROTENE_NEON
62
    static s16 dw[] = {3, 10, 3};
63

64
    if (dy == 1)
65
        SeparableFilter3x3(size, srcBase, srcStride, dstBase, dstStride,
66
                           3, 1, dw, 0,
67
                           border, borderValue, borderMargin);
68
    else
69
        SeparableFilter3x3(size, srcBase, srcStride, dstBase, dstStride,
70
                           1, 3, 0, dw,
71
                           border, borderValue, borderMargin);
72
#else
73
    (void)srcBase;
74
    (void)srcStride;
75
    (void)dstBase;
76
    (void)dstStride;
77
    (void)borderValue;
78
#endif
79
}
80

81
void ScharrDeriv(const Size2D &size, s32 cn,
82
                 const u8 * srcBase, ptrdiff_t srcStride,
83
                 s16 * dstBase, ptrdiff_t dstStride)
84
{
85
    internal::assertSupportedConfiguration();
86
#ifdef CAROTENE_NEON
87
    size_t colsn = size.width*cn;
88
    size_t roiw8 = colsn > 7 ? colsn - 7 : 0;
89

90
    ptrdiff_t delta = (ptrdiff_t)(((size.width + 2)*cn + 15) & -16);//align size
91
    std::vector<s16> _tempBuf((delta << 1) + 64);
92
    s16 *trow0 = internal::alignPtr(&_tempBuf[cn], 16), *trow1 = internal::alignPtr(trow0 + delta, 16);
93

94
    int16x8_t vc3 = vmovq_n_s16(3);
95
    int16x8_t vc10 = vmovq_n_s16(10);
96
    uint8x8_t v8c10 = vmov_n_u8(10);
97

98
    for(size_t y = 0; y < size.height; y++ )
99
    {
100
        const u8* srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : size.height > 1 ? 1 : 0);
101
        const u8* srow1 = internal::getRowPtr(srcBase, srcStride, y);
102
        const u8* srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height > 1 ? size.height-2 : 0);
103
        s16* drow = internal::getRowPtr(dstBase, dstStride, y);
104

105
        // do vertical convolution
106
        size_t x = 0;
107
        for( ; x < roiw8; x += 8 )
108
        {
109
            internal::prefetch(srow0 + x);
110
            internal::prefetch(srow1 + x);
111
            internal::prefetch(srow2 + x);
112
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
113
            __asm__ (
114
                "vld1.8 {d0}, [%[src0]]                                \n\t"
115
                "vld1.8 {d2}, [%[src2]]                                \n\t"
116
                "vld1.8 {d1}, [%[src1]]                                \n\t"
117
                "vaddl.u8 q2, d2, d0                                   \n\t"
118
                "vmull.u8 q3, d1, %[vc10]                              \n\t"
119
                "vsubl.u8 q4, d2, d0                                   \n\t"
120
                "vmla.s16 q3, q2, %q[vc3]                              \n\t"
121
                "vst1.16 {d8-d9}, [%[out1],:128]                       \n\t"
122
                "vst1.16 {d6-d7}, [%[out0],:128]                       \n\t"
123
                :
124
                : [out0] "r" (trow0 + x),
125
                  [out1] "r" (trow1 + x),
126
                  [src0] "r" (srow0 + x),
127
                  [src1] "r" (srow1 + x),
128
                  [src2] "r" (srow2 + x),
129
                  [vc10] "w" (v8c10), [vc3] "w" (vc3)
130
                : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15"
131
            );
132
#else
133
            uint8x8_t s0 = vld1_u8(srow0 + x);
134
            uint8x8_t s1 = vld1_u8(srow1 + x);
135
            uint8x8_t s2 = vld1_u8(srow2 + x);
136

137
            int16x8_t s1x10 = vreinterpretq_s16_u16(vmull_u8(s1, v8c10));
138
            int16x8_t s02 = vreinterpretq_s16_u16(vaddl_u8(s2, s0));
139
            int16x8_t t1 = vreinterpretq_s16_u16(vsubl_u8(s2, s0));
140
            int16x8_t t0 = vmlaq_s16(s1x10, s02, vc3);
141

142
            vst1q_s16(trow1 + x, t1);
143
            vst1q_s16(trow0 + x, t0);
144
#endif
145
        }
146
        for( ; x < colsn; x++ )
147
        {
148
            trow0[x] = (s16)((srow0[x] + srow2[x])*3 + srow1[x]*10);
149
            trow1[x] = (s16)(srow2[x] - srow0[x]);
150
        }
151

152
        // make border
153
        size_t x0 = (size.width > 1 ? cn : 0), x1 = (size.width > 1 ? (size.width-2)*cn : 0);
154
        for( s32 k = 0; k < cn; k++ )
155
        {
156
            trow0[-cn + k] = trow0[x0 + k]; trow0[colsn + k] = trow0[x1 + k];
157
            trow1[-cn + k] = trow1[x0 + k]; trow1[colsn + k] = trow1[x1 + k];
158
        }
159

160
        // do horizontal convolution, interleave the results and store them to dst
161
        x = 0;
162
        for( ; x < roiw8; x += 8 )
163
        {
164
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 6 && !defined(__clang__)
165
            __asm__ (
166
                "vld1.16 {d4-d5}, [%[s2ptr]]                           \n\t"
167
                "vld1.16 {d8-d9}, [%[s4ptr]]                           \n\t"
168
                "vld1.16 {d6-d7}, [%[s3ptr],:128]                      \n\t"
169
                "vld1.16 {d0-d1}, [%[s0ptr]]                           \n\t"
170
                "vld1.16 {d2-d3}, [%[s1ptr]]                           \n\t"
171
                "vadd.i16 q7, q2, q4                                   \n\t"
172
                "vmul.s16 q6, q3, %q[vc10]                             \n\t"
173
                "vsub.s16 q5, q1, q0                                   \n\t"
174
                "vmla.s16 q6, q7, %q[vc3]                              \n\t"
175
                "vst2.16 {d10-d13}, [%[out]]                           \n\t"
176
                :
177
                : [out] "r" (drow + x * 2),
178
                  [s0ptr] "r" (trow0 + x - cn),
179
                  [s1ptr] "r" (trow0 + x + cn),
180
                  [s2ptr] "r" (trow1 + x - cn),
181
                  [s3ptr] "r" (trow1 + x),
182
                  [s4ptr] "r" (trow1 + x + cn),
183
                  [vc10] "w" (vc10), [vc3] "w" (vc3)
184
                : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15"
185
            );
186
#else
187
            int16x8_t s0 = vld1q_s16(trow0 + x - cn);
188
            int16x8_t s1 = vld1q_s16(trow0 + x + cn);
189
            int16x8_t s2 = vld1q_s16(trow1 + x - cn);
190
            int16x8_t s3 = vld1q_s16(trow1 + x);
191
            int16x8_t s4 = vld1q_s16(trow1 + x + cn);
192

193
            int16x8_t s3x10 = vmulq_s16(s3, vc10);
194
            int16x8_t s24 = vaddq_s16(s2, s4);
195

196
            int16x8x2_t vr;
197
            vr.val[0] = vsubq_s16(s1, s0);
198
            vr.val[1] = vmlaq_s16(s3x10, s24, vc3);
199

200
            vst2q_s16(drow + x*2, vr);
201
#endif
202
        }
203
        for( ; x < colsn; x++ )
204
        {
205
            drow[x*2] = (s16)(trow0[x+cn] - trow0[x-cn]);
206
            drow[x*2+1] = (s16)((trow1[x+cn] + trow1[x-cn])*3 + trow1[x]*10);
207
        }
208
    }
209
#else
210
    (void)size;
211
    (void)cn;
212
    (void)srcBase;
213
    (void)srcStride;
214
    (void)dstBase;
215
    (void)dstStride;
216
#endif
217
}
218

219
} // namespace CAROTENE_NS
220

221
Product

Resources

Company