Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/3rdparty/carotene/src/integral.cpp
16337 views
1
/*
 * By downloading, copying, installing or using the software you agree to this license.
 * If you do not agree to this license, do not download, install,
 * copy or use the software.
 *
 *
 *                           License Agreement
 *                For Open Source Computer Vision Library
 *                        (3-clause BSD License)
 *
 * Copyright (C) 2012-2014, NVIDIA Corporation, all rights reserved.
 * Third party copyrights are property of their respective owners.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the names of the copyright holders nor the names of the contributors
 *     may be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * This software is provided by the copyright holders and contributors "as is" and
 * any express or implied warranties, including, but not limited to, the implied
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall copyright holders or contributors be liable for any direct,
 * indirect, incidental, special, exemplary, or consequential damages
 * (including, but not limited to, procurement of substitute goods or services;
 * loss of use, data, or profits; or business interruption) however caused
 * and on any theory of liability, whether in contract, strict liability,
 * or tort (including negligence or otherwise) arising in any way out of
 * the use of this software, even if advised of the possibility of such damage.
 */
39
40
#include "common.hpp"
41
42
namespace CAROTENE_NS {
43
44
void integral(const Size2D &size,
45
const u8 * srcBase, ptrdiff_t srcStride,
46
u32 * sumBase, ptrdiff_t sumStride)
47
{
48
internal::assertSupportedConfiguration();
49
#ifdef CAROTENE_NEON
50
uint32x4_t v_zero = vmovq_n_u32(0u);
51
52
// the first iteration
53
const u8 * src = internal::getRowPtr(srcBase, srcStride, 0);
54
u32 * sum = internal::getRowPtr(sumBase, sumStride, 0);
55
56
uint32x4_t prev = v_zero;
57
size_t j = 0u;
58
59
for ( ; j + 7 < size.width; j += 8)
60
{
61
internal::prefetch(sum + j);
62
internal::prefetch(src + j);
63
64
uint8x8_t el8shr0 = vld1_u8(src + j);
65
uint8x8_t el8shr1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 8));
66
uint8x8_t el8shr2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 16));
67
uint8x8_t el8shr3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 24));
68
69
uint16x8_t el8shr12 = vaddl_u8(el8shr1, el8shr2);
70
uint16x8_t el8shr03 = vaddl_u8(el8shr0, el8shr3);
71
72
uint16x8_t el8 = vaddq_u16(el8shr12, el8shr03);
73
uint16x4_t el4h = vadd_u16(vget_low_u16(el8), vget_high_u16(el8));
74
75
uint32x4_t vsuml = vaddw_u16(prev, vget_low_u16(el8));
76
uint32x4_t vsumh = vaddw_u16(prev, el4h);
77
78
vst1q_u32(sum + j, vsuml);
79
vst1q_u32(sum + j + 4, vsumh);
80
81
prev = vaddw_u16(prev, vdup_lane_u16(el4h, 3));
82
}
83
84
for (u32 v = vgetq_lane_u32(prev, 3); j < size.width; ++j)
85
sum[j] = (v += src[j]);
86
87
// the others
88
for (size_t i = 1; i < size.height ; ++i)
89
{
90
src = internal::getRowPtr(srcBase, srcStride, i);
91
u32 * prevSum = internal::getRowPtr(sumBase, sumStride, i - 1);
92
sum = internal::getRowPtr(sumBase, sumStride, i);
93
94
prev = v_zero;
95
j = 0u;
96
97
for ( ; j + 7 < size.width; j += 8)
98
{
99
internal::prefetch(sum + j);
100
internal::prefetch(src + j);
101
102
uint32x4_t vsuml = vld1q_u32(prevSum + j);
103
uint32x4_t vsumh = vld1q_u32(prevSum + j + 4);
104
105
uint8x8_t el8shr0 = vld1_u8(src + j);
106
uint8x8_t el8shr1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 8));
107
uint8x8_t el8shr2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 16));
108
uint8x8_t el8shr3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 24));
109
110
vsuml = vaddq_u32(vsuml, prev);
111
vsumh = vaddq_u32(vsumh, prev);
112
113
uint16x8_t el8shr12 = vaddl_u8(el8shr1, el8shr2);
114
uint16x8_t el8shr03 = vaddl_u8(el8shr0, el8shr3);
115
116
uint16x8_t el8 = vaddq_u16(el8shr12, el8shr03);
117
uint16x4_t el4h = vadd_u16(vget_low_u16(el8), vget_high_u16(el8));
118
119
vsuml = vaddw_u16(vsuml, vget_low_u16(el8));
120
vsumh = vaddw_u16(vsumh, el4h);
121
122
vst1q_u32(sum + j, vsuml);
123
vst1q_u32(sum + j + 4, vsumh);
124
125
prev = vaddw_u16(prev, vdup_lane_u16(el4h, 3));
126
}
127
128
for (u32 v = vgetq_lane_u32(prev, 3); j < size.width; ++j)
129
sum[j] = (v += src[j]) + prevSum[j];
130
}
131
#else
132
(void)size;
133
(void)srcBase;
134
(void)srcStride;
135
(void)sumBase;
136
(void)sumStride;
137
#endif
138
}
139
140
void sqrIntegral(const Size2D &size,
141
const u8 * srcBase, ptrdiff_t srcStride,
142
f64 * sqsumBase, ptrdiff_t sqsumStride)
143
{
144
internal::assertSupportedConfiguration();
145
#ifdef CAROTENE_NEON
146
uint16x8_t v_zero8 = vmovq_n_u16(0u);
147
148
// the first iteration
149
const u8 * src = internal::getRowPtr(srcBase, srcStride, 0);
150
f64 * sqsum = internal::getRowPtr(sqsumBase, sqsumStride, 0);
151
152
double prev = 0.;
153
size_t j = 0u;
154
155
for ( ; j + 7 < size.width; j += 8)
156
{
157
internal::prefetch(sqsum + j);
158
internal::prefetch(src + j);
159
160
uint8x8_t vsrc = vld1_u8(src + j);
161
162
uint16x8_t el8shr0 = vmull_u8(vsrc, vsrc);
163
uint16x8_t el8shr1 = vextq_u16(v_zero8, el8shr0, 7);
164
165
uint32x4_t el8shr01l = vaddl_u16(vget_low_u16(el8shr0), vget_low_u16(el8shr1));
166
uint32x4_t el8shr01h = vaddl_u16(vget_high_u16(el8shr0), vget_high_u16(el8shr1));
167
168
uint32x4_t el4h = vaddq_u32(el8shr01l, el8shr01h);
169
170
uint32x2_t el2l = vadd_u32(vget_low_u32(el8shr01l), vget_high_u32(el8shr01l));
171
uint32x2_t el2hl = vadd_u32(vget_low_u32(el4h), vget_high_u32(el8shr01l));
172
uint32x2_t el2hh = vadd_u32(vget_low_u32(el4h), vget_high_u32(el4h));
173
174
u32 buf[8];
175
vst1_u32(buf, vget_low_u32(el8shr01l));
176
vst1_u32(buf+2, el2l);
177
vst1_u32(buf+4, el2hl);
178
vst1_u32(buf+6, el2hh);
179
for(u32 k=0; k < 8; k++)
180
sqsum[j+k] = prev + buf[k];
181
prev += buf[7];
182
}
183
184
for (; j < size.width; ++j)
185
sqsum[j] = (prev += src[j]*src[j]);
186
187
// the others
188
for (size_t i = 1; i < size.height ; ++i)
189
{
190
src = internal::getRowPtr(srcBase, srcStride, i);
191
f64 * prevSqSum = internal::getRowPtr(sqsumBase, sqsumStride, i - 1);
192
sqsum = internal::getRowPtr(sqsumBase, sqsumStride, i);
193
194
prev = 0.;
195
j = 0u;
196
197
for ( ; j + 7 < size.width; j += 8)
198
{
199
internal::prefetch(sqsum + j);
200
internal::prefetch(src + j);
201
202
uint8x8_t vsrc = vld1_u8(src + j);
203
204
uint16x8_t el8shr0 = vmull_u8(vsrc, vsrc);
205
uint16x8_t el8shr1 = vextq_u16(v_zero8, el8shr0, 7);
206
207
uint32x4_t el8shr01l = vaddl_u16(vget_low_u16(el8shr0), vget_low_u16(el8shr1));
208
uint32x4_t el8shr01h = vaddl_u16(vget_high_u16(el8shr0), vget_high_u16(el8shr1));
209
210
uint32x4_t el4h = vaddq_u32(el8shr01l, el8shr01h);
211
212
uint32x2_t el2l = vadd_u32(vget_low_u32(el8shr01l), vget_high_u32(el8shr01l));
213
uint32x2_t el2hl = vadd_u32(vget_low_u32(el4h), vget_high_u32(el8shr01l));
214
uint32x2_t el2hh = vadd_u32(vget_low_u32(el4h), vget_high_u32(el4h));
215
216
u32 buf[8];
217
vst1_u32(buf, vget_low_u32(el8shr01l));
218
vst1_u32(buf+2, el2l);
219
vst1_u32(buf+4, el2hl);
220
vst1_u32(buf+6, el2hh);
221
for(u32 k=0; k < 8; k++)
222
sqsum[j+k] = prev + prevSqSum[j+k] + buf[k];
223
prev += buf[7];
224
}
225
226
for (; j < size.width; ++j)
227
sqsum[j] = (prev += src[j]*src[j]) + prevSqSum[j];
228
}
229
#else
230
(void)size;
231
(void)srcBase;
232
(void)srcStride;
233
(void)sqsumBase;
234
(void)sqsumStride;
235
#endif
236
}
237
238
} // namespace CAROTENE_NS
239
240