Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/3rdparty/carotene/src/count_nonzero.cpp
16337 views
1
/*
2
* By downloading, copying, installing or using the software you agree to this license.
3
* If you do not agree to this license, do not download, install,
4
* copy or use the software.
5
*
6
*
7
* License Agreement
8
* For Open Source Computer Vision Library
9
* (3-clause BSD License)
10
*
11
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
12
* Third party copyrights are property of their respective owners.
13
*
14
* Redistribution and use in source and binary forms, with or without modification,
15
* are permitted provided that the following conditions are met:
16
*
17
* * Redistributions of source code must retain the above copyright notice,
18
* this list of conditions and the following disclaimer.
19
*
20
* * Redistributions in binary form must reproduce the above copyright notice,
21
* this list of conditions and the following disclaimer in the documentation
22
* and/or other materials provided with the distribution.
23
*
24
* * Neither the names of the copyright holders nor the names of the contributors
25
* may be used to endorse or promote products derived from this software
26
* without specific prior written permission.
27
*
28
* This software is provided by the copyright holders and contributors "as is" and
29
* any express or implied warranties, including, but not limited to, the implied
30
* warranties of merchantability and fitness for a particular purpose are disclaimed.
31
* In no event shall copyright holders or contributors be liable for any direct,
32
* indirect, incidental, special, exemplary, or consequential damages
33
* (including, but not limited to, procurement of substitute goods or services;
34
* loss of use, data, or profits; or business interruption) however caused
35
* and on any theory of liability, whether in contract, strict liability,
36
* or tort (including negligence or otherwise) arising in any way out of
37
* the use of this software, even if advised of the possibility of such damage.
38
*/
39
40
#include "common.hpp"
41
42
#include <limits>
43
44
namespace CAROTENE_NS {
45
46
/*
 * Counts the non-zero elements of a 2D buffer of u8 values.
 *
 * _size     - width and height of the region, in elements
 * srcBase   - pointer to the first row
 * srcStride - row step handed to internal::getRowPtr()
 *
 * Returns the number of non-zero elements, clamped to 0x7fFFffFF.
 * When CAROTENE_NEON is not defined the arguments are ignored and 0 is returned.
 */
s32 countNonZero(const Size2D &_size,
                 const u8 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    if (srcStride == (ptrdiff_t)(size.width))
    {
        // Rows follow each other without padding: process the buffer as one long row.
        size.width *= size.height;
        size.height = 1;
    }

    const s32 saturated = 0x7fFFffFF;
    // Every 16-element step adds at most 1 to each u8 lane, so a lane can
    // absorb 255 steps before it would wrap.
    const size_t blockSize = 16 * 255;
    // Mask built in size_t so that no high bits of the width are dropped.
    const size_t roiw16 = size.width & ~(size_t)15;
    const uint8x16_t vc1 = vmovq_n_u8(1);

    // Unsigned running total: it never exceeds 0x7fFFffFF between checks, so the
    // small additions below cannot wrap and no signed overflow is relied upon.
    u32 total = 0;

    for (size_t k = 0; k < size.height; ++k)
    {
        const u8* src = internal::getRowPtr(srcBase, srcStride, k);
        size_t i = 0;

        while (i < roiw16)
        {
            // Last start index of a full 16-element step inside this block.
            const size_t lim = std::min(i + blockSize, size.width) - 16;
            uint8x16_t vs = vmovq_n_u8(0);

            for (; i <= lim; i += 16)
            {
                internal::prefetch(src + i);
                uint8x16_t vln = vld1q_u8(src + i);
                // min(x, 1) is 1 for every non-zero element and 0 otherwise
                vs = vaddq_u8(vs, vminq_u8(vln, vc1));
            }

            // Widen the 16 lane counters to 32 bits and fold them into two values.
            uint32x4_t vs4 = vpaddlq_u16(vpaddlq_u8(vs));
            uint32x2_t vs2 = vadd_u32(vget_low_u32(vs4), vget_high_u32(vs4));

            u32 partial[2];
            vst1_u32(partial, vs2);

            total += partial[0] + partial[1]; // at most 16 * 255 per block
            if (total > (u32)saturated)
            {
                return saturated;
            }
        }

        // Fewer than 16 elements remain in the row.
        for (; i < size.width; i++)
        {
            total += (src[i] != 0) ? 1 : 0;
        }
        if (total > (u32)saturated)
        {
            return saturated;
        }
    }
    return (s32)total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
111
112
/*
 * Counts the non-zero elements of a 2D buffer of u16 values.
 *
 * _size     - width and height of the region, in elements
 * srcBase   - pointer to the first row
 * srcStride - row step handed to internal::getRowPtr()
 *
 * Returns the number of non-zero elements, clamped to 0x7fFFffFF.
 * When CAROTENE_NEON is not defined the arguments are ignored and 0 is returned.
 */
s32 countNonZero(const Size2D &_size,
                 const u16 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // NOTE(review): srcStride is taken to be a byte step (confirm against
    // internal::getRowPtr); a padding-free row is then width * sizeof(u16) bytes.
    if (srcStride == (ptrdiff_t)(size.width * sizeof(u16)))
    {
        // Rows follow each other without padding: process the buffer as one long row.
        size.width *= size.height;
        size.height = 1;
    }

    const s32 saturated = 0x7fFFffFF;
    // Every 8-element step adds at most 1 to each u16 lane, so a lane can
    // absorb 65535 steps before it would wrap.
    const size_t blockSize = 8 * (256 * 256 - 1);
    // Mask built in size_t so that no high bits of the width are dropped.
    const size_t roiw8 = size.width & ~(size_t)7;
    const uint16x8_t vc1 = vmovq_n_u16(1);

    // Unsigned running total: it never exceeds 0x7fFFffFF between checks, so the
    // additions below cannot wrap and no signed overflow is relied upon.
    u32 total = 0;

    for (size_t k = 0; k < size.height; ++k)
    {
        const u16* src = internal::getRowPtr(srcBase, srcStride, k);
        size_t i = 0;

        while (i < roiw8)
        {
            // Last start index of a full 8-element step inside this block.
            const size_t lim = std::min(i + blockSize, size.width) - 8;
            uint16x8_t vs = vmovq_n_u16(0);

            for (; i <= lim; i += 8)
            {
                internal::prefetch(src + i);
                uint16x8_t vln = vld1q_u16(src + i);
                // min(x, 1) is 1 for every non-zero element and 0 otherwise
                vs = vaddq_u16(vs, vminq_u16(vln, vc1));
            }

            // Widen the 8 lane counters to 32 bits and fold them into two values.
            uint32x4_t vs4 = vpaddlq_u16(vs);
            uint32x2_t vs2 = vadd_u32(vget_low_u32(vs4), vget_high_u32(vs4));

            u32 partial[2];
            vst1_u32(partial, vs2);

            total += partial[0] + partial[1]; // at most 8 * 65535 per block
            if (total > (u32)saturated)
            {
                return saturated;
            }
        }

        // Fewer than 8 elements remain in the row.
        for (; i < size.width; i++)
        {
            total += (src[i] != 0) ? 1 : 0;
        }
        if (total > (u32)saturated)
        {
            return saturated;
        }
    }
    return (s32)total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
177
178
/*
 * Counts the non-zero elements of a 2D buffer of s32 values.
 *
 * _size     - width and height of the region, in elements
 * srcBase   - pointer to the first row
 * srcStride - row step handed to internal::getRowPtr()
 *
 * Returns the number of non-zero elements, clamped to 0x7fFFffFF.
 * When CAROTENE_NEON is not defined the arguments are ignored and 0 is returned.
 */
s32 countNonZero(const Size2D &_size,
                 const s32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // NOTE(review): srcStride is taken to be a byte step (confirm against
    // internal::getRowPtr); a padding-free row is then width * sizeof(s32) bytes.
    if (srcStride == (ptrdiff_t)(size.width * sizeof(s32)))
    {
        // Rows follow each other without padding: process the buffer as one long row.
        size.width *= size.height;
        size.height = 1;
    }

    const s32 saturated = 0x7fFFffFF;
    // Mask built in size_t so that no high bits of the width are dropped.
    const size_t roiw4 = size.width & ~(size_t)3;
    const uint32x4_t vc1 = vmovq_n_u32(1);

    // Unsigned running total, kept at or below 0x7fFFffFF between checks.
    u32 total = 0;

    for (size_t k = 0; k < size.height; ++k)
    {
        // The values are only tested against zero, so they are read as unsigned.
        const u32* src = (const u32*)internal::getRowPtr(srcBase, srcStride, k);
        size_t i = 0;

        uint32x4_t vs = vmovq_n_u32(0);

        for (; i < roiw4; i += 4)
        {
            internal::prefetch(src + i);
            uint32x4_t vln = vld1q_u32(src + i);
            // min(x, 1) is 1 for every non-zero element; the add saturates per lane
            vs = vqaddq_u32(vs, vminq_u32(vln, vc1));
        }

        uint32x2_t vs2 = vqadd_u32(vget_low_u32(vs), vget_high_u32(vs));

        u32 partial[2];
        vst1_u32(partial, vs2);

        // Sum in 64 bits: the two partial counts may each be as large as 2^32 - 1.
        const u64 afterVector = (u64)total + partial[0] + partial[1];
        if (afterVector > (u64)saturated)
        {
            return saturated;
        }
        total = (u32)afterVector;

        // Fewer than 4 elements remain in the row.
        for (; i < size.width; i++)
        {
            total += (src[i] != 0) ? 1 : 0;
        }
        if (total > (u32)saturated)
        {
            return saturated;
        }
    }
    return (s32)total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
238
239
/*
 * Counts the non-zero elements of a 2D buffer of f32 values.
 *
 * _size     - width and height of the region, in elements
 * srcBase   - pointer to the first row
 * srcStride - row step handed to internal::getRowPtr()
 *
 * Returns the number of non-zero elements, clamped to 0x7fFFffFF.
 * When CAROTENE_NEON is not defined the arguments are ignored and 0 is returned.
 */
s32 countNonZero(const Size2D &_size,
                 const f32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // NOTE(review): srcStride is taken to be a byte step (confirm against
    // internal::getRowPtr); a padding-free row is then width * sizeof(f32) bytes.
    if (srcStride == (ptrdiff_t)(size.width * sizeof(f32)))
    {
        // Rows follow each other without padding: process the buffer as one long row.
        size.width *= size.height;
        size.height = 1;
    }

    const s32 saturated = 0x7fFFffFF;
    // Mask built in size_t so that no high bits of the width are dropped.
    const size_t roiw4 = size.width & ~(size_t)3;
    const float32x4_t vc0 = vmovq_n_f32(0);

    // Unsigned running total, kept at or below 0x7fFFffFF between checks.
    u32 total = 0;

    for (size_t k = 0; k < size.height; ++k)
    {
        const f32* src = internal::getRowPtr(srcBase, srcStride, k);
        size_t i = 0;

        int32x4_t vs = vmovq_n_s32(0);

        for (; i < roiw4; i += 4)
        {
            internal::prefetch(src + i);
            float32x4_t vln = vld1q_f32(src + i);
            // Inverting the "equal to zero" mask leaves -1 in every lane that
            // did not compare equal to zero, 0 elsewhere.
            int32x4_t vnz = vreinterpretq_s32_u32(vmvnq_u32(vceqq_f32(vln, vc0)));
            vs = vqaddq_s32(vs, vnz);
        }

        // The lanes hold negated counts; the saturating negate makes them non-negative.
        int32x2_t vs2 = vqneg_s32(vqadd_s32(vget_low_s32(vs), vget_high_s32(vs)));

        s32 s[2];
        vst1_s32(s, vs2);

        // Sum in 64 bits: each partial count may be as large as 0x7fFFffFF.
        const u64 afterVector = (u64)total + (u32)s[0] + (u32)s[1];
        if (afterVector > (u64)saturated)
        {
            return saturated;
        }
        total = (u32)afterVector;

        // Fewer than 4 elements remain in the row. Values whose magnitude is below
        // the smallest normal float are counted as zero here.
        // NOTE(review): whether the vector compare above treats such denormals the
        // same way depends on the target's flush-to-zero behaviour - confirm.
        for (; i < size.width; i++)
        {
            total += (src[i] < std::numeric_limits<float>::min() && src[i] > -std::numeric_limits<float>::min()) ? 0 : 1;
        }
        if (total > (u32)saturated)
        {
            return saturated;
        }
    }
    return (s32)total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
296
297
#ifdef CAROTENE_NEON
// Loads two f64 values from p and returns one 32-bit lane per value: -1 when any
// bit selected by vmask is set in that value, 0 otherwise.
static inline int32x2_t nonZeroLanesF64(const f64 * p, uint64x2_t vmask, uint32x4_t vzero)
{
    uint64x2_t vbits = vandq_u64(vld1q_u64((const u64*)p), vmask);
    // All-ones for every 32-bit half that is not zero.
    uint32x4_t vhalves = vmvnq_u32(vceqq_u32(vreinterpretq_u32_u64(vbits), vzero));
    // The pairwise max merges the two halves belonging to each 64-bit value.
    return vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vhalves), vget_high_u32(vhalves)));
}
#endif

/*
 * Counts the non-zero elements of a 2D buffer of f64 values.
 *
 * An element counts as non-zero when any bit other than its sign bit is set,
 * so +0.0 and -0.0 are zero and denormals are non-zero. The vector path and
 * the scalar tail apply the same bit test, so the result does not depend on
 * where in a row an element sits.
 *
 * _size     - width and height of the region, in elements
 * srcBase   - pointer to the first row
 * srcStride - row step handed to internal::getRowPtr()
 *
 * Returns the number of non-zero elements, clamped to 0x7fFFffFF.
 * When CAROTENE_NEON is not defined the arguments are ignored and 0 is returned.
 */
s32 countNonZero(const Size2D &_size,
                 const f64 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // NOTE(review): srcStride is taken to be a byte step (confirm against
    // internal::getRowPtr); a padding-free row is then width * sizeof(f64) bytes.
    if (srcStride == (ptrdiff_t)(size.width * sizeof(f64)))
    {
        // Rows follow each other without padding: process the buffer as one long row.
        size.width *= size.height;
        size.height = 1;
    }

    const s32 saturated = 0x7fFFffFF;
    // Masks built in size_t so that no high bits of the width are dropped.
    const size_t roiw8 = size.width & ~(size_t)7;
    const size_t roiw4 = size.width & ~(size_t)3;
    const size_t roiw2 = size.width & ~(size_t)1;
    // Everything except the sign bit: denormals are treated as non-zero.
    const u64 magnitudeBits = 0x7fFFffFFffFFffFFULL;
    const uint64x2_t vmask1 = vdupq_n_u64(magnitudeBits);
    const uint32x4_t vc0 = vmovq_n_u32(0);

    // Unsigned running total, kept at or below 0x7fFFffFF between checks.
    u32 total = 0;

    for (size_t k = 0; k < size.height; ++k)
    {
        const f64* src = internal::getRowPtr(srcBase, srcStride, k);
        size_t i = 0;

        // Four independent accumulators of negated counts (-1 per non-zero element).
        int32x2_t vs1 = vmov_n_s32(0);
        int32x2_t vs2 = vmov_n_s32(0);
        int32x2_t vs3 = vmov_n_s32(0);
        int32x2_t vs4 = vmov_n_s32(0);

        // Eight elements per step.
        for (; i < roiw8; i += 8)
        {
            internal::prefetch(src + i + 6);
            vs1 = vqadd_s32(vs1, nonZeroLanesF64(src + i,     vmask1, vc0));
            vs2 = vqadd_s32(vs2, nonZeroLanesF64(src + i + 2, vmask1, vc0));
            vs3 = vqadd_s32(vs3, nonZeroLanesF64(src + i + 4, vmask1, vc0));
            vs4 = vqadd_s32(vs4, nonZeroLanesF64(src + i + 6, vmask1, vc0));
        }

        // At most one remaining group of four elements.
        if (i < roiw4)
        {
            internal::prefetch(src + i + 2);
            vs1 = vqadd_s32(vs1, nonZeroLanesF64(src + i,     vmask1, vc0));
            vs2 = vqadd_s32(vs2, nonZeroLanesF64(src + i + 2, vmask1, vc0));
            i += 4;
        }

        // At most one remaining pair of elements.
        if (i < roiw2)
        {
            internal::prefetch(src + i);
            vs1 = vqadd_s32(vs1, nonZeroLanesF64(src + i, vmask1, vc0));
            i += 2;
        }

        vs1 = vqadd_s32(vs1, vs2);
        vs3 = vqadd_s32(vs3, vs4);
        vs1 = vqadd_s32(vs1, vs3);
        // The saturating negate turns the negated counts into non-negative ones.
        int32x2_t vsneg = vqneg_s32(vs1);

        s32 s[2];
        vst1_s32(s, vsneg);

        // Sum in 64 bits: each partial count may be as large as 0x7fFFffFF.
        const u64 afterVector = (u64)total + (u32)s[0] + (u32)s[1];
        if (afterVector > (u64)saturated)
        {
            return saturated;
        }
        total = (u32)afterVector;

        // At most one element remains; it is tested with the same bit mask as the
        // vector path so that both classify every value identically.
        for (; i < size.width; i++)
        {
            total += ((((const u64*)src)[i] & magnitudeBits) != 0) ? 1 : 0;
        }
        if (total > (u32)saturated)
        {
            return saturated;
        }
    }
    return (s32)total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
429
430
} // namespace CAROTENE_NS
431
432