Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/3rdparty/carotene/src/fill_minmaxloc.cpp
16337 views
1
/*
2
* By downloading, copying, installing or using the software you agree to this license.
3
* If you do not agree to this license, do not download, install,
4
* copy or use the software.
5
*
6
*
7
* License Agreement
8
* For Open Source Computer Vision Library
9
* (3-clause BSD License)
10
*
11
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
12
* Third party copyrights are property of their respective owners.
13
*
14
* Redistribution and use in source and binary forms, with or without modification,
15
* are permitted provided that the following conditions are met:
16
*
17
* * Redistributions of source code must retain the above copyright notice,
18
* this list of conditions and the following disclaimer.
19
*
20
* * Redistributions in binary form must reproduce the above copyright notice,
21
* this list of conditions and the following disclaimer in the documentation
22
* and/or other materials provided with the distribution.
23
*
24
* * Neither the names of the copyright holders nor the names of the contributors
25
* may be used to endorse or promote products derived from this software
26
* without specific prior written permission.
27
*
28
* This software is provided by the copyright holders and contributors "as is" and
29
* any express or implied warranties, including, but not limited to, the implied
30
* warranties of merchantability and fitness for a particular purpose are disclaimed.
31
* In no event shall copyright holders or contributors be liable for any direct,
32
* indirect, incidental, special, exemplary, or consequential damages
33
* (including, but not limited to, procurement of substitute goods or services;
34
* loss of use, data, or profits; or business interruption) however caused
35
* and on any theory of liability, whether in contract, strict liability,
36
* or tort (including negligence or otherwise) arising in any way out of
37
* the use of this software, even if advised of the possibility of such damage.
38
*/
39
40
#include "common.hpp"
41
42
namespace CAROTENE_NS {
43
44
#ifdef CAROTENE_NEON
45
46
namespace {
47
48
template <typename T>
49
void process(const T * src, size_t j0, size_t j1, size_t i,
50
T minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
51
T maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
52
{
53
for (size_t j = j0; j < j1; ++j)
54
{
55
T val = src[j];
56
57
if (val == maxVal)
58
{
59
if (maxLocCount < maxLocCapacity)
60
{
61
maxLocPtr[maxLocCount] = j;
62
maxLocPtr[maxLocCount + 1] = i;
63
}
64
maxLocCount += 2;
65
}
66
67
if (val == minVal)
68
{
69
if (minLocCount < minLocCapacity)
70
{
71
minLocPtr[minLocCount] = j;
72
minLocPtr[minLocCount + 1] = i;
73
}
74
minLocCount += 2;
75
}
76
}
77
}
78
79
} // namespace
80
81
#endif
82
83
void fillMinMaxLocs(const Size2D & size,
84
const u8 * srcBase, ptrdiff_t srcStride,
85
u8 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
86
u8 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
87
{
88
internal::assertSupportedConfiguration();
89
#ifdef CAROTENE_NEON
90
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
91
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
92
93
uint8x16_t v_maxval16 = vdupq_n_u8(maxVal), v_minval16 = vdupq_n_u8(minVal);
94
uint8x8_t v_maxval8 = vdup_n_u8(maxVal), v_minval8 = vdup_n_u8(minVal);
95
96
u64 mask[2] = { 0ul };
97
98
minLocCapacity <<= 1;
99
maxLocCapacity <<= 1;
100
101
for (size_t i = 0; i < size.height; ++i)
102
{
103
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
104
size_t j = 0;
105
106
for ( ; j < roiw16; j += 16)
107
{
108
internal::prefetch(src + j);
109
uint8x16_t v_src = vld1q_u8(src + j);
110
111
uint8x16_t v_maxmask = vceqq_u8(v_src, v_maxval16);
112
uint8x16_t v_minmask = vceqq_u8(v_src, v_minval16);
113
uint8x16_t v_mask = vorrq_u8(v_maxmask, v_minmask);
114
115
vst1q_u8((u8 *)&mask[0], v_mask);
116
117
if (mask[0])
118
process(src, j, j + 8, i,
119
minVal, minLocPtr, minLocCount, minLocCapacity,
120
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
121
if (mask[1])
122
process(src, j + 8, j + 16, i,
123
minVal, minLocPtr, minLocCount, minLocCapacity,
124
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
125
}
126
for ( ; j < roiw8; j += 8)
127
{
128
uint8x8_t v_src = vld1_u8(src + j);
129
130
uint8x8_t v_maxmask = vceq_u8(v_src, v_maxval8);
131
uint8x8_t v_minmask = vceq_u8(v_src, v_minval8);
132
uint8x8_t v_mask = vorr_u8(v_maxmask, v_minmask);
133
134
vst1_u8((u8 *)&mask[0], v_mask);
135
136
if (mask[0])
137
process(src, j, j + 8, i,
138
minVal, minLocPtr, minLocCount, minLocCapacity,
139
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
140
}
141
142
process(src, j, size.width, i,
143
minVal, minLocPtr, minLocCount, minLocCapacity,
144
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
145
}
146
147
minLocCount >>= 1;
148
maxLocCount >>= 1;
149
#else
150
(void)size;
151
(void)srcBase;
152
(void)srcStride;
153
(void)minVal;
154
(void)minLocPtr;
155
(void)minLocCount;
156
(void)minLocCapacity;
157
(void)maxVal;
158
(void)maxLocPtr;
159
(void)maxLocCount;
160
(void)maxLocCapacity;
161
#endif
162
}
163
164
void fillMinMaxLocs(const Size2D & size,
165
const u16 * srcBase, ptrdiff_t srcStride,
166
u16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
167
u16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
168
{
169
internal::assertSupportedConfiguration();
170
#ifdef CAROTENE_NEON
171
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
172
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
173
174
uint16x8_t v_maxval8 = vdupq_n_u16(maxVal),
175
v_minval8 = vdupq_n_u16(minVal);
176
u64 mask[2] = { 0ul };
177
178
minLocCapacity <<= 1;
179
maxLocCapacity <<= 1;
180
181
for (size_t i = 0; i < size.height; ++i)
182
{
183
const u16 * src = internal::getRowPtr(srcBase, srcStride, i);
184
size_t j = 0;
185
186
for ( ; j < roiw16; j += 16)
187
{
188
internal::prefetch(src + j);
189
uint16x8_t v_src0 = vld1q_u16(src + j), v_src1 = vld1q_u16(src + j + 8);
190
191
uint16x8_t v_mask0 = vorrq_u16(vceqq_u16(v_src0, v_maxval8), vceqq_u16(v_src0, v_minval8));
192
uint16x8_t v_mask1 = vorrq_u16(vceqq_u16(v_src1, v_maxval8), vceqq_u16(v_src1, v_minval8));
193
194
vst1q_u8((u8 *)&mask[0], vcombine_u8(vmovn_u16(v_mask0), vmovn_u16(v_mask1)));
195
196
if (mask[0])
197
process(src, j, j + 8, i,
198
minVal, minLocPtr, minLocCount, minLocCapacity,
199
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
200
if (mask[1])
201
process(src, j + 8, j + 16, i,
202
minVal, minLocPtr, minLocCount, minLocCapacity,
203
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
204
}
205
for ( ; j < roiw8; j += 8)
206
{
207
internal::prefetch(src + j);
208
uint16x8_t v_src = vld1q_u16(src + j);
209
210
uint16x8_t v_maxmask = vceqq_u16(v_src, v_maxval8);
211
uint16x8_t v_minmask = vceqq_u16(v_src, v_minval8);
212
uint16x8_t v_mask = vorrq_u16(v_maxmask, v_minmask);
213
214
vst1_u8((u8 *)&mask[0], vmovn_u16(v_mask));
215
216
if (mask[0])
217
process(src, j, j + 8, i,
218
minVal, minLocPtr, minLocCount, minLocCapacity,
219
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
220
}
221
222
process(src, j, size.width, i,
223
minVal, minLocPtr, minLocCount, minLocCapacity,
224
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
225
}
226
227
minLocCount >>= 1;
228
maxLocCount >>= 1;
229
#else
230
(void)size;
231
(void)srcBase;
232
(void)srcStride;
233
(void)minVal;
234
(void)minLocPtr;
235
(void)minLocCount;
236
(void)minLocCapacity;
237
(void)maxVal;
238
(void)maxLocPtr;
239
(void)maxLocCount;
240
(void)maxLocCapacity;
241
#endif
242
}
243
244
void fillMinMaxLocs(const Size2D & size,
245
const s16 * srcBase, ptrdiff_t srcStride,
246
s16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
247
s16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
248
{
249
internal::assertSupportedConfiguration();
250
#ifdef CAROTENE_NEON
251
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
252
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
253
254
int16x8_t v_maxval8 = vdupq_n_s16(maxVal),
255
v_minval8 = vdupq_n_s16(minVal);
256
u64 mask[2] = { 0ul };
257
258
minLocCapacity <<= 1;
259
maxLocCapacity <<= 1;
260
261
for (size_t i = 0; i < size.height; ++i)
262
{
263
const s16 * src = internal::getRowPtr(srcBase, srcStride, i);
264
size_t j = 0;
265
266
for ( ; j < roiw16; j += 16)
267
{
268
internal::prefetch(src + j);
269
int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8);
270
271
uint16x8_t v_mask0 = vorrq_u16(vceqq_s16(v_src0, v_maxval8), vceqq_s16(v_src0, v_minval8));
272
uint16x8_t v_mask1 = vorrq_u16(vceqq_s16(v_src1, v_maxval8), vceqq_s16(v_src1, v_minval8));
273
274
vst1q_u8((u8 *)&mask[0], vcombine_u8(vmovn_u16(v_mask0), vmovn_u16(v_mask1)));
275
276
if (mask[0])
277
process(src, j, j + 8, i,
278
minVal, minLocPtr, minLocCount, minLocCapacity,
279
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
280
if (mask[1])
281
process(src, j + 8, j + 16, i,
282
minVal, minLocPtr, minLocCount, minLocCapacity,
283
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
284
}
285
for ( ; j < roiw8; j += 8)
286
{
287
internal::prefetch(src + j);
288
int16x8_t v_src = vld1q_s16(src + j);
289
290
uint16x8_t v_maxmask = vceqq_s16(v_src, v_maxval8);
291
uint16x8_t v_minmask = vceqq_s16(v_src, v_minval8);
292
uint16x8_t v_mask = vorrq_u16(v_maxmask, v_minmask);
293
294
vst1_u8((u8 *)&mask[0], vmovn_u16(v_mask));
295
296
if (mask[0])
297
process(src, j, j + 8, i,
298
minVal, minLocPtr, minLocCount, minLocCapacity,
299
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
300
}
301
302
process(src, j, size.width, i,
303
minVal, minLocPtr, minLocCount, minLocCapacity,
304
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
305
}
306
307
minLocCount >>= 1;
308
maxLocCount >>= 1;
309
#else
310
(void)size;
311
(void)srcBase;
312
(void)srcStride;
313
(void)minVal;
314
(void)minLocPtr;
315
(void)minLocCount;
316
(void)minLocCapacity;
317
(void)maxVal;
318
(void)maxLocPtr;
319
(void)maxLocCount;
320
(void)maxLocCapacity;
321
#endif
322
}
323
324
void fillMinMaxLocs(const Size2D & size,
325
const s32 * srcBase, ptrdiff_t srcStride,
326
s32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
327
s32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
328
{
329
internal::assertSupportedConfiguration();
330
#ifdef CAROTENE_NEON
331
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
332
333
int32x4_t v_maxval4 = vdupq_n_s32(maxVal),
334
v_minval4 = vdupq_n_s32(minVal);
335
u64 mask = 0ul;
336
337
minLocCapacity <<= 1;
338
maxLocCapacity <<= 1;
339
340
for (size_t i = 0; i < size.height; ++i)
341
{
342
const s32 * src = internal::getRowPtr(srcBase, srcStride, i);
343
size_t j = 0;
344
345
for ( ; j < roiw8; j += 8)
346
{
347
internal::prefetch(src + j);
348
int32x4_t v_src0 = vld1q_s32(src + j), v_src1 = vld1q_s32(src + j + 4);
349
350
uint32x4_t v_mask0 = vorrq_u32(vceqq_s32(v_src0, v_maxval4), vceqq_s32(v_src0, v_minval4));
351
uint32x4_t v_mask1 = vorrq_u32(vceqq_s32(v_src1, v_maxval4), vceqq_s32(v_src1, v_minval4));
352
353
vst1_u8((u8 *)&mask, vmovn_u16(vcombine_u16(vmovn_u32(v_mask0), vmovn_u32(v_mask1))));
354
355
if (mask)
356
process(src, j, j + 8, i,
357
minVal, minLocPtr, minLocCount, minLocCapacity,
358
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
359
}
360
361
process(src, j, size.width, i,
362
minVal, minLocPtr, minLocCount, minLocCapacity,
363
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
364
}
365
366
minLocCount >>= 1;
367
maxLocCount >>= 1;
368
#else
369
(void)size;
370
(void)srcBase;
371
(void)srcStride;
372
(void)minVal;
373
(void)minLocPtr;
374
(void)minLocCount;
375
(void)minLocCapacity;
376
(void)maxVal;
377
(void)maxLocPtr;
378
(void)maxLocCount;
379
(void)maxLocCapacity;
380
#endif
381
}
382
383
void fillMinMaxLocs(const Size2D & size,
384
const u32 * srcBase, ptrdiff_t srcStride,
385
u32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
386
u32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
387
{
388
internal::assertSupportedConfiguration();
389
#ifdef CAROTENE_NEON
390
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
391
392
uint32x4_t v_maxval4 = vdupq_n_u32(maxVal),
393
v_minval4 = vdupq_n_u32(minVal);
394
u64 mask = 0ul;
395
396
minLocCapacity <<= 1;
397
maxLocCapacity <<= 1;
398
399
for (size_t i = 0; i < size.height; ++i)
400
{
401
const u32 * src = internal::getRowPtr(srcBase, srcStride, i);
402
size_t j = 0;
403
404
for ( ; j < roiw8; j += 8)
405
{
406
internal::prefetch(src + j);
407
uint32x4_t v_src0 = vld1q_u32(src + j), v_src1 = vld1q_u32(src + j + 4);
408
409
uint32x4_t v_mask0 = vorrq_u32(vceqq_u32(v_src0, v_maxval4), vceqq_u32(v_src0, v_minval4));
410
uint32x4_t v_mask1 = vorrq_u32(vceqq_u32(v_src1, v_maxval4), vceqq_u32(v_src1, v_minval4));
411
412
vst1_u8((u8 *)&mask, vmovn_u16(vcombine_u16(vmovn_u32(v_mask0), vmovn_u32(v_mask1))));
413
414
if (mask)
415
process(src, j, j + 8, i,
416
minVal, minLocPtr, minLocCount, minLocCapacity,
417
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
418
}
419
420
process(src, j, size.width, i,
421
minVal, minLocPtr, minLocCount, minLocCapacity,
422
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
423
}
424
425
minLocCount >>= 1;
426
maxLocCount >>= 1;
427
#else
428
(void)size;
429
(void)srcBase;
430
(void)srcStride;
431
(void)minVal;
432
(void)minLocPtr;
433
(void)minLocCount;
434
(void)minLocCapacity;
435
(void)maxVal;
436
(void)maxLocPtr;
437
(void)maxLocCount;
438
(void)maxLocCapacity;
439
#endif
440
}
441
442
} // namespace CAROTENE_NS
443
444