Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/3rdparty/carotene/src/scharr.cpp
16337 views
1
/*
2
* By downloading, copying, installing or using the software you agree to this license.
3
* If you do not agree to this license, do not download, install,
4
* copy or use the software.
5
*
6
*
7
* License Agreement
8
* For Open Source Computer Vision Library
9
* (3-clause BSD License)
10
*
11
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
12
* Third party copyrights are property of their respective owners.
13
*
14
* Redistribution and use in source and binary forms, with or without modification,
15
* are permitted provided that the following conditions are met:
16
*
17
* * Redistributions of source code must retain the above copyright notice,
18
* this list of conditions and the following disclaimer.
19
*
20
* * Redistributions in binary form must reproduce the above copyright notice,
21
* this list of conditions and the following disclaimer in the documentation
22
* and/or other materials provided with the distribution.
23
*
24
* * Neither the names of the copyright holders nor the names of the contributors
25
* may be used to endorse or promote products derived from this software
26
* without specific prior written permission.
27
*
28
* This software is provided by the copyright holders and contributors "as is" and
29
* any express or implied warranties, including, but not limited to, the implied
30
* warranties of merchantability and fitness for a particular purpose are disclaimed.
31
* In no event shall copyright holders or contributors be liable for any direct,
32
* indirect, incidental, special, exemplary, or consequential damages
33
* (including, but not limited to, procurement of substitute goods or services;
34
* loss of use, data, or profits; or business interruption) however caused
35
* and on any theory of liability, whether in contract, strict liability,
36
* or tort (including negligence or otherwise) arising in any way out of
37
* the use of this software, even if advised of the possibility of such damage.
38
*/
39
40
#include <vector>
41
42
#include "common.hpp"
43
44
namespace CAROTENE_NS {
45
46
bool isScharr3x3Supported(const Size2D &size, BORDER_MODE border, s32 dx, s32 dy, Margin borderMargin)
47
{
48
return (dx == 0 && dy == 1 &&
49
isSeparableFilter3x3Supported(size, border, 3, 1, borderMargin)) ||
50
(dx == 1 && dy == 0 &&
51
isSeparableFilter3x3Supported(size, border, 1, 3, borderMargin));
52
}
53
54
void Scharr3x3(const Size2D &size,
55
const u8 * srcBase, ptrdiff_t srcStride,
56
s16 * dstBase, ptrdiff_t dstStride,
57
s32 dx, s32 dy,
58
BORDER_MODE border, u8 borderValue, Margin borderMargin)
59
{
60
internal::assertSupportedConfiguration(isScharr3x3Supported(size, border, dx, dy, borderMargin));
61
#ifdef CAROTENE_NEON
62
static s16 dw[] = {3, 10, 3};
63
64
if (dy == 1)
65
SeparableFilter3x3(size, srcBase, srcStride, dstBase, dstStride,
66
3, 1, dw, 0,
67
border, borderValue, borderMargin);
68
else
69
SeparableFilter3x3(size, srcBase, srcStride, dstBase, dstStride,
70
1, 3, 0, dw,
71
border, borderValue, borderMargin);
72
#else
73
(void)srcBase;
74
(void)srcStride;
75
(void)dstBase;
76
(void)dstStride;
77
(void)borderValue;
78
#endif
79
}
80
81
void ScharrDeriv(const Size2D &size, s32 cn,
82
const u8 * srcBase, ptrdiff_t srcStride,
83
s16 * dstBase, ptrdiff_t dstStride)
84
{
85
internal::assertSupportedConfiguration();
86
#ifdef CAROTENE_NEON
87
size_t colsn = size.width*cn;
88
size_t roiw8 = colsn > 7 ? colsn - 7 : 0;
89
90
ptrdiff_t delta = (ptrdiff_t)(((size.width + 2)*cn + 15) & -16);//align size
91
std::vector<s16> _tempBuf((delta << 1) + 64);
92
s16 *trow0 = internal::alignPtr(&_tempBuf[cn], 16), *trow1 = internal::alignPtr(trow0 + delta, 16);
93
94
int16x8_t vc3 = vmovq_n_s16(3);
95
int16x8_t vc10 = vmovq_n_s16(10);
96
uint8x8_t v8c10 = vmov_n_u8(10);
97
98
for(size_t y = 0; y < size.height; y++ )
99
{
100
const u8* srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : size.height > 1 ? 1 : 0);
101
const u8* srow1 = internal::getRowPtr(srcBase, srcStride, y);
102
const u8* srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height > 1 ? size.height-2 : 0);
103
s16* drow = internal::getRowPtr(dstBase, dstStride, y);
104
105
// do vertical convolution
106
size_t x = 0;
107
for( ; x < roiw8; x += 8 )
108
{
109
internal::prefetch(srow0 + x);
110
internal::prefetch(srow1 + x);
111
internal::prefetch(srow2 + x);
112
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
113
__asm__ (
114
"vld1.8 {d0}, [%[src0]] \n\t"
115
"vld1.8 {d2}, [%[src2]] \n\t"
116
"vld1.8 {d1}, [%[src1]] \n\t"
117
"vaddl.u8 q2, d2, d0 \n\t"
118
"vmull.u8 q3, d1, %[vc10] \n\t"
119
"vsubl.u8 q4, d2, d0 \n\t"
120
"vmla.s16 q3, q2, %q[vc3] \n\t"
121
"vst1.16 {d8-d9}, [%[out1],:128] \n\t"
122
"vst1.16 {d6-d7}, [%[out0],:128] \n\t"
123
:
124
: [out0] "r" (trow0 + x),
125
[out1] "r" (trow1 + x),
126
[src0] "r" (srow0 + x),
127
[src1] "r" (srow1 + x),
128
[src2] "r" (srow2 + x),
129
[vc10] "w" (v8c10), [vc3] "w" (vc3)
130
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15"
131
);
132
#else
133
uint8x8_t s0 = vld1_u8(srow0 + x);
134
uint8x8_t s1 = vld1_u8(srow1 + x);
135
uint8x8_t s2 = vld1_u8(srow2 + x);
136
137
int16x8_t s1x10 = vreinterpretq_s16_u16(vmull_u8(s1, v8c10));
138
int16x8_t s02 = vreinterpretq_s16_u16(vaddl_u8(s2, s0));
139
int16x8_t t1 = vreinterpretq_s16_u16(vsubl_u8(s2, s0));
140
int16x8_t t0 = vmlaq_s16(s1x10, s02, vc3);
141
142
vst1q_s16(trow1 + x, t1);
143
vst1q_s16(trow0 + x, t0);
144
#endif
145
}
146
for( ; x < colsn; x++ )
147
{
148
trow0[x] = (s16)((srow0[x] + srow2[x])*3 + srow1[x]*10);
149
trow1[x] = (s16)(srow2[x] - srow0[x]);
150
}
151
152
// make border
153
size_t x0 = (size.width > 1 ? cn : 0), x1 = (size.width > 1 ? (size.width-2)*cn : 0);
154
for( s32 k = 0; k < cn; k++ )
155
{
156
trow0[-cn + k] = trow0[x0 + k]; trow0[colsn + k] = trow0[x1 + k];
157
trow1[-cn + k] = trow1[x0 + k]; trow1[colsn + k] = trow1[x1 + k];
158
}
159
160
// do horizontal convolution, interleave the results and store them to dst
161
x = 0;
162
for( ; x < roiw8; x += 8 )
163
{
164
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
165
__asm__ (
166
"vld1.16 {d4-d5}, [%[s2ptr]] \n\t"
167
"vld1.16 {d8-d9}, [%[s4ptr]] \n\t"
168
"vld1.16 {d6-d7}, [%[s3ptr],:128] \n\t"
169
"vld1.16 {d0-d1}, [%[s0ptr]] \n\t"
170
"vld1.16 {d2-d3}, [%[s1ptr]] \n\t"
171
"vadd.i16 q7, q2, q4 \n\t"
172
"vmul.s16 q6, q3, %q[vc10] \n\t"
173
"vsub.s16 q5, q1, q0 \n\t"
174
"vmla.s16 q6, q7, %q[vc3] \n\t"
175
"vst2.16 {d10-d13}, [%[out]] \n\t"
176
:
177
: [out] "r" (drow + x * 2),
178
[s0ptr] "r" (trow0 + x - cn),
179
[s1ptr] "r" (trow0 + x + cn),
180
[s2ptr] "r" (trow1 + x - cn),
181
[s3ptr] "r" (trow1 + x),
182
[s4ptr] "r" (trow1 + x + cn),
183
[vc10] "w" (vc10), [vc3] "w" (vc3)
184
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15"
185
);
186
#else
187
int16x8_t s0 = vld1q_s16(trow0 + x - cn);
188
int16x8_t s1 = vld1q_s16(trow0 + x + cn);
189
int16x8_t s2 = vld1q_s16(trow1 + x - cn);
190
int16x8_t s3 = vld1q_s16(trow1 + x);
191
int16x8_t s4 = vld1q_s16(trow1 + x + cn);
192
193
int16x8_t s3x10 = vmulq_s16(s3, vc10);
194
int16x8_t s24 = vaddq_s16(s2, s4);
195
196
int16x8x2_t vr;
197
vr.val[0] = vsubq_s16(s1, s0);
198
vr.val[1] = vmlaq_s16(s3x10, s24, vc3);
199
200
vst2q_s16(drow + x*2, vr);
201
#endif
202
}
203
for( ; x < colsn; x++ )
204
{
205
drow[x*2] = (s16)(trow0[x+cn] - trow0[x-cn]);
206
drow[x*2+1] = (s16)((trow1[x+cn] + trow1[x-cn])*3 + trow1[x]*10);
207
}
208
}
209
#else
210
(void)size;
211
(void)cn;
212
(void)srcBase;
213
(void)srcStride;
214
(void)dstBase;
215
(void)dstStride;
216
#endif
217
}
218
219
} // namespace CAROTENE_NS
220
221