Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/3rdparty/carotene/src/reduce.cpp
16337 views
1
/*
2
* By downloading, copying, installing or using the software you agree to this license.
3
* If you do not agree to this license, do not download, install,
4
* copy or use the software.
5
*
6
*
7
* License Agreement
8
* For Open Source Computer Vision Library
9
* (3-clause BSD License)
10
*
11
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
12
* Third party copyrights are property of their respective owners.
13
*
14
* Redistribution and use in source and binary forms, with or without modification,
15
* are permitted provided that the following conditions are met:
16
*
17
* * Redistributions of source code must retain the above copyright notice,
18
* this list of conditions and the following disclaimer.
19
*
20
* * Redistributions in binary form must reproduce the above copyright notice,
21
* this list of conditions and the following disclaimer in the documentation
22
* and/or other materials provided with the distribution.
23
*
24
* * Neither the names of the copyright holders nor the names of the contributors
25
* may be used to endorse or promote products derived from this software
26
* without specific prior written permission.
27
*
28
* This software is provided by the copyright holders and contributors "as is" and
29
* any express or implied warranties, including, but not limited to, the implied
30
* warranties of merchantability and fitness for a particular purpose are disclaimed.
31
* In no event shall copyright holders or contributors be liable for any direct,
32
* indirect, incidental, special, exemplary, or consequential damages
33
* (including, but not limited to, procurement of substitute goods or services;
34
* loss of use, data, or profits; or business interruption) however caused
35
* and on any theory of liability, whether in contract, strict liability,
36
* or tort (including negligence or otherwise) arising in any way out of
37
* the use of this software, even if advised of the possibility of such damage.
38
*/
39
40
#include "common.hpp"
41
42
#include <cstring>
43
44
namespace CAROTENE_NS {
45
46
void reduceColSum(const Size2D &size,
47
const u8 * srcBase, ptrdiff_t srcStride,
48
s32 * dstBase)
49
{
50
internal::assertSupportedConfiguration();
51
#ifdef CAROTENE_NEON
52
memset(dstBase, 0, size.width*sizeof(s32));
53
size_t i = 0;
54
for (; i + 16 <= size.width; i += 16)
55
{
56
const u8* src_address = srcBase + i;
57
58
int32x4_t sll = vmovq_n_s32(0);
59
int32x4_t slh = vmovq_n_s32(0);
60
int32x4_t shl = vmovq_n_s32(0);
61
int32x4_t shh = vmovq_n_s32(0);
62
63
for (size_t h = 0; h < size.height; h += 256)
64
{
65
size_t lim = std::min(h + 256, size.height);
66
67
uint16x8_t sl = vmovq_n_u16(0);
68
uint16x8_t sh = vmovq_n_u16(0);
69
70
for (size_t k = h; k < lim; ++k, src_address += srcStride)
71
{
72
internal::prefetch(src_address + srcStride, 0);
73
74
uint8x16_t v = vld1q_u8(src_address);
75
76
sl = vaddw_u8(sl, vget_low_u8(v));
77
sh = vaddw_u8(sh, vget_high_u8(v));
78
}
79
80
int32x4_t vsll = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sl)));
81
int32x4_t vslh = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sl)));
82
int32x4_t vshl = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sh)));
83
int32x4_t vshh = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sh)));
84
85
sll = vqaddq_s32(sll, vsll);
86
slh = vqaddq_s32(slh, vslh);
87
shl = vqaddq_s32(shl, vshl);
88
shh = vqaddq_s32(shh, vshh);
89
}
90
91
vst1q_s32(dstBase + i + 0, sll);
92
vst1q_s32(dstBase + i + 4, slh);
93
vst1q_s32(dstBase + i + 8, shl);
94
vst1q_s32(dstBase + i + 12, shh);
95
}
96
97
for(size_t h = 0; h < size.height; ++h)
98
{
99
for(size_t j = i ; j < size.width; j++ )
100
{
101
if (((u32)(dstBase[j] += srcBase[j + srcStride * h])) > 0x7fFFffFFu)
102
dstBase[j] = 0x7fFFffFF;
103
}
104
}
105
#else
106
(void)size;
107
(void)srcBase;
108
(void)srcStride;
109
(void)dstBase;
110
#endif
111
}
112
113
void reduceColMax(const Size2D &size,
114
const u8 * srcBase, ptrdiff_t srcStride,
115
u8 * dstBase)
116
{
117
internal::assertSupportedConfiguration();
118
#ifdef CAROTENE_NEON
119
memcpy(dstBase, srcBase, size.width);
120
size_t i = 0;
121
for (; i + 16*4 <= size.width; i += 16*4)
122
{
123
const u8* src_address = srcBase + i;
124
125
uint8x16_t s1 = vld1q_u8(src_address + 0);
126
uint8x16_t s2 = vld1q_u8(src_address + 16);
127
uint8x16_t s3 = vld1q_u8(src_address + 32);
128
uint8x16_t s4 = vld1q_u8(src_address + 48);
129
130
src_address += srcStride;
131
132
for(size_t h = 1; h < size.height; ++h, src_address += srcStride)
133
{
134
internal::prefetch(src_address + srcStride, 0);
135
internal::prefetch(src_address + srcStride, 32);
136
137
uint8x16_t v1 = vld1q_u8(src_address + 0);
138
uint8x16_t v2 = vld1q_u8(src_address + 16);
139
uint8x16_t v3 = vld1q_u8(src_address + 32);
140
uint8x16_t v4 = vld1q_u8(src_address + 48);
141
142
s1 = vmaxq_u8(s1, v1);
143
s2 = vmaxq_u8(s2, v2);
144
s3 = vmaxq_u8(s3, v3);
145
s4 = vmaxq_u8(s4, v4);
146
}
147
148
vst1q_u8(dstBase + i + 0, s1);
149
vst1q_u8(dstBase + i + 16, s2);
150
vst1q_u8(dstBase + i + 32, s3);
151
vst1q_u8(dstBase + i + 48, s4);
152
}
153
154
for (; i + 16 <= size.width; i += 16)
155
{
156
const u8* src_address = srcBase + i;
157
uint8x16_t s1 = vld1q_u8(src_address);
158
src_address += srcStride;
159
for(size_t h = 1; h < size.height; ++h, src_address += srcStride)
160
{
161
internal::prefetch(src_address + srcStride, 0);
162
163
uint8x16_t v1 = vld1q_u8(src_address);
164
s1 = vmaxq_u8(s1, v1);
165
}
166
vst1q_u8(dstBase + i, s1);
167
}
168
169
if (i < size.width)
170
for(size_t h = 1; h < size.height; ++h)
171
for(size_t j = i ; j < size.width; j++ )
172
dstBase[j] = std::max(dstBase[j], srcBase[j + srcStride * h]);
173
#else
174
(void)size;
175
(void)srcBase;
176
(void)srcStride;
177
(void)dstBase;
178
#endif
179
}
180
181
void reduceColMin(const Size2D &size,
182
const u8 * srcBase, ptrdiff_t srcStride,
183
u8 * dstBase)
184
{
185
internal::assertSupportedConfiguration();
186
#ifdef CAROTENE_NEON
187
memcpy(dstBase, srcBase, size.width);
188
size_t i = 0;
189
for (; i + 16*4 <= size.width; i += 16*4)
190
{
191
const u8* src_address = srcBase + i;
192
193
uint8x16_t s1 = vld1q_u8(src_address + 0);
194
uint8x16_t s2 = vld1q_u8(src_address + 16);
195
uint8x16_t s3 = vld1q_u8(src_address + 32);
196
uint8x16_t s4 = vld1q_u8(src_address + 48);
197
198
src_address += srcStride;
199
200
for(size_t h = 1; h < size.height; ++h, src_address += srcStride)
201
{
202
internal::prefetch(src_address + srcStride, 0);
203
internal::prefetch(src_address + srcStride, 32);
204
205
uint8x16_t v1 = vld1q_u8(src_address + 0);
206
uint8x16_t v2 = vld1q_u8(src_address + 16);
207
uint8x16_t v3 = vld1q_u8(src_address + 32);
208
uint8x16_t v4 = vld1q_u8(src_address + 48);
209
210
s1 = vminq_u8(s1, v1);
211
s2 = vminq_u8(s2, v2);
212
s3 = vminq_u8(s3, v3);
213
s4 = vminq_u8(s4, v4);
214
}
215
216
vst1q_u8(dstBase + i + 0, s1);
217
vst1q_u8(dstBase + i + 16, s2);
218
vst1q_u8(dstBase + i + 32, s3);
219
vst1q_u8(dstBase + i + 48, s4);
220
}
221
222
for (; i + 16 <= size.width; i += 16)
223
{
224
const u8* src_address = srcBase + i;
225
uint8x16_t s1 = vld1q_u8(src_address);
226
src_address += srcStride;
227
for(size_t h = 1; h < size.height; ++h, src_address += srcStride)
228
{
229
internal::prefetch(src_address + srcStride, 0);
230
231
uint8x16_t v1 = vld1q_u8(src_address);
232
s1 = vminq_u8(s1, v1);
233
}
234
vst1q_u8(dstBase + i, s1);
235
}
236
237
if (i < size.width)
238
for(size_t h = 1; h < size.height; ++h)
239
for(size_t j = i ; j < size.width; j++ )
240
dstBase[j] = std::min(dstBase[j], srcBase[j + srcStride * h]);
241
#else
242
(void)size;
243
(void)srcBase;
244
(void)srcStride;
245
(void)dstBase;
246
#endif
247
}
248
249
void reduceColSum(const Size2D &size,
250
const f32 * srcBase, ptrdiff_t srcStride,
251
f32 * dstBase)
252
{
253
internal::assertSupportedConfiguration();
254
#ifdef CAROTENE_NEON
255
memcpy(dstBase, srcBase, size.width*sizeof(f32));
256
size_t srcstep = srcStride/sizeof(f32);
257
size_t i = 0;
258
for (; i + 16 <= size.width; i += 16)
259
{
260
const f32* src_address = srcBase + i;
261
262
float32x4_t s1 = vld1q_f32(src_address + 0);
263
float32x4_t s2 = vld1q_f32(src_address + 4);
264
float32x4_t s3 = vld1q_f32(src_address + 8);
265
float32x4_t s4 = vld1q_f32(src_address + 12);
266
267
src_address += srcstep;
268
269
for(size_t h = 1; h < size.height; ++h, src_address += srcstep)
270
{
271
internal::prefetch(src_address + srcstep, 0);
272
internal::prefetch(src_address + srcstep, 32);
273
274
float32x4_t v1 = vld1q_f32(src_address + 0);
275
float32x4_t v2 = vld1q_f32(src_address + 4);
276
float32x4_t v3 = vld1q_f32(src_address + 8);
277
float32x4_t v4 = vld1q_f32(src_address + 12);
278
279
s1 = vaddq_f32(s1, v1);
280
s2 = vaddq_f32(s2, v2);
281
s3 = vaddq_f32(s3, v3);
282
s4 = vaddq_f32(s4, v4);
283
}
284
285
vst1q_f32(dstBase + i + 0, s1);
286
vst1q_f32(dstBase + i + 4, s2);
287
vst1q_f32(dstBase + i + 8, s3);
288
vst1q_f32(dstBase + i + 12, s4);
289
}
290
291
for (; i + 4 <= size.width; i += 4)
292
{
293
const f32* src_address = srcBase + i;
294
float32x4_t s1 = vld1q_f32(src_address);
295
src_address += srcstep;
296
for(size_t h = 1; h < size.height; ++h, src_address += srcstep)
297
{
298
internal::prefetch(src_address + srcstep, 0);
299
300
float32x4_t v1 = vld1q_f32(src_address);
301
s1 = vaddq_f32(s1, v1);
302
}
303
vst1q_f32(dstBase + i, s1);
304
}
305
306
if (i < size.width)
307
for(size_t h = 1; h < size.height; ++h)
308
{
309
for(size_t j = i ; j < size.width; j++ )
310
{
311
dstBase[j] += srcBase[j + srcstep * h];
312
}
313
}
314
#else
315
(void)size;
316
(void)srcBase;
317
(void)srcStride;
318
(void)dstBase;
319
#endif
320
}
321
322
void reduceColMax(const Size2D &size,
323
const f32 * srcBase, ptrdiff_t srcStride,
324
f32 * dstBase)
325
{
326
internal::assertSupportedConfiguration();
327
#ifdef CAROTENE_NEON
328
memcpy(dstBase, srcBase, size.width*sizeof(f32));
329
size_t srcstep = srcStride/sizeof(f32);
330
size_t i = 0;
331
for (; i + 16 <= size.width; i += 16)
332
{
333
const f32* src_address = srcBase + i;
334
335
float32x4_t s1 = vld1q_f32(src_address + 0);
336
float32x4_t s2 = vld1q_f32(src_address + 4);
337
float32x4_t s3 = vld1q_f32(src_address + 8);
338
float32x4_t s4 = vld1q_f32(src_address + 12);
339
340
src_address += srcstep;
341
342
for(size_t h = 1; h < size.height; ++h, src_address += srcstep)
343
{
344
internal::prefetch(src_address + srcstep, 0);
345
internal::prefetch(src_address + srcstep, 32);
346
347
float32x4_t v1 = vld1q_f32(src_address + 0);
348
float32x4_t v2 = vld1q_f32(src_address + 4);
349
float32x4_t v3 = vld1q_f32(src_address + 8);
350
float32x4_t v4 = vld1q_f32(src_address + 12);
351
352
s1 = vmaxq_f32(s1, v1);
353
s2 = vmaxq_f32(s2, v2);
354
s3 = vmaxq_f32(s3, v3);
355
s4 = vmaxq_f32(s4, v4);
356
}
357
358
vst1q_f32(dstBase + i + 0, s1);
359
vst1q_f32(dstBase + i + 4, s2);
360
vst1q_f32(dstBase + i + 8, s3);
361
vst1q_f32(dstBase + i + 12, s4);
362
}
363
364
for (; i + 4 <= size.width; i += 4)
365
{
366
const f32* src_address = srcBase + i;
367
float32x4_t s1 = vld1q_f32(src_address);
368
src_address += srcstep;
369
for(size_t h = 1; h < size.height; ++h, src_address += srcstep)
370
{
371
internal::prefetch(src_address + srcstep, 0);
372
373
float32x4_t v1 = vld1q_f32(src_address);
374
s1 = vmaxq_f32(s1, v1);
375
}
376
vst1q_f32(dstBase + i, s1);
377
}
378
379
if (i < size.width)
380
for(size_t h = 1; h < size.height; ++h)
381
for(size_t j = i ; j < size.width; j++ )
382
dstBase[j] = std::max(dstBase[j], srcBase[j + srcstep * h]);
383
#else
384
(void)size;
385
(void)srcBase;
386
(void)srcStride;
387
(void)dstBase;
388
#endif
389
}
390
391
void reduceColMin(const Size2D &size,
392
const f32 * srcBase, ptrdiff_t srcStride,
393
f32 * dstBase)
394
{
395
internal::assertSupportedConfiguration();
396
#ifdef CAROTENE_NEON
397
memcpy(dstBase, srcBase, size.width*sizeof(f32));
398
size_t srcstep = srcStride/sizeof(f32);
399
size_t i = 0;
400
for (; i + 16 <= size.width; i += 16)
401
{
402
const f32* src_address = srcBase + i;
403
404
float32x4_t s1 = vld1q_f32(src_address + 0);
405
float32x4_t s2 = vld1q_f32(src_address + 4);
406
float32x4_t s3 = vld1q_f32(src_address + 8);
407
float32x4_t s4 = vld1q_f32(src_address + 12);
408
409
src_address += srcstep;
410
411
for(size_t h = 1; h < size.height; ++h, src_address += srcstep)
412
{
413
internal::prefetch(src_address + srcstep, 0);
414
internal::prefetch(src_address + srcstep, 32);
415
416
float32x4_t v1 = vld1q_f32(src_address + 0);
417
float32x4_t v2 = vld1q_f32(src_address + 4);
418
float32x4_t v3 = vld1q_f32(src_address + 8);
419
float32x4_t v4 = vld1q_f32(src_address + 12);
420
421
s1 = vminq_f32(s1, v1);
422
s2 = vminq_f32(s2, v2);
423
s3 = vminq_f32(s3, v3);
424
s4 = vminq_f32(s4, v4);
425
}
426
427
vst1q_f32(dstBase + i + 0, s1);
428
vst1q_f32(dstBase + i + 4, s2);
429
vst1q_f32(dstBase + i + 8, s3);
430
vst1q_f32(dstBase + i + 12, s4);
431
}
432
433
for (; i + 4 <= size.width; i += 4)
434
{
435
const f32* src_address = srcBase + i;
436
float32x4_t s1 = vld1q_f32(src_address);
437
src_address += srcstep;
438
for(size_t h = 1; h < size.height; ++h, src_address += srcstep)
439
{
440
internal::prefetch(src_address + srcstep, 0);
441
442
float32x4_t v1 = vld1q_f32(src_address);
443
s1 = vminq_f32(s1, v1);
444
}
445
vst1q_f32(dstBase + i, s1);
446
}
447
448
if (i < size.width)
449
for(size_t h = 1; h < size.height; ++h)
450
for(size_t j = i ; j < size.width; j++ )
451
dstBase[j] = std::min(dstBase[j], srcBase[j + srcstep * h]);
452
#else
453
(void)size;
454
(void)srcBase;
455
(void)srcStride;
456
(void)dstBase;
457
#endif
458
}
459
460
} // namespace CAROTENE_NS
461
462