// Source: GitHub repository Tetragramm/opencv
// Path: blob/master/3rdparty/carotene/src/channel_extract.cpp
/*
 * By downloading, copying, installing or using the software you agree to this license.
 * If you do not agree to this license, do not download, install,
 * copy or use the software.
 *
 *
 *                           License Agreement
 *                For Open Source Computer Vision Library
 *                        (3-clause BSD License)
 *
 * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
 * Third party copyrights are property of their respective owners.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the names of the copyright holders nor the names of the contributors
 *     may be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * This software is provided by the copyright holders and contributors "as is" and
 * any express or implied warranties, including, but not limited to, the implied
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall copyright holders or contributors be liable for any direct,
 * indirect, incidental, special, exemplary, or consequential damages
 * (including, but not limited to, procurement of substitute goods or services;
 * loss of use, data, or profits; or business interruption) however caused
 * and on any theory of liability, whether in contract, strict liability,
 * or tort (including negligence or otherwise) arising in any way out of
 * the use of this software, even if advised of the possibility of such damage.
 */
39
40
#include "common.hpp"
41
#include "vtransform.hpp"
42
43
namespace CAROTENE_NS {
44
45
void extract2(const Size2D &size,
46
const u8 * srcBase, ptrdiff_t srcStride,
47
u8 * dstBase, ptrdiff_t dstStride,
48
u32 coi)
49
{
50
internal::assertSupportedConfiguration();
51
#ifdef CAROTENE_NEON
52
#ifndef __ANDROID__
53
size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
54
#endif
55
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
56
57
for (size_t i = 0u; i < size.height; ++i)
58
{
59
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
60
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
61
size_t sj = 0u, dj = 0u;
62
63
#ifndef __ANDROID__
64
for (; dj < roiw32; sj += 64, dj += 32)
65
{
66
internal::prefetch(src + sj);
67
68
uint8x16x2_t v_src = vld2q_u8(src + sj);
69
vst1q_u8(dst + dj, v_src.val[coi]);
70
71
v_src = vld2q_u8(src + sj + 32);
72
vst1q_u8(dst + dj + 16, v_src.val[coi]);
73
}
74
#endif
75
76
for (; dj < roiw8; sj += 16, dj += 8)
77
{
78
uint8x8x2_t v_src = vld2_u8(src + sj);
79
vst1_u8(dst + dj, v_src.val[coi]);
80
}
81
82
for (; dj < size.width; sj += 2, ++dj)
83
{
84
dst[dj] = src[sj + coi];
85
}
86
}
87
#else
88
(void)size;
89
(void)srcBase;
90
(void)srcStride;
91
(void)dstBase;
92
(void)dstStride;
93
(void)coi;
94
#endif
95
}
96
97
void extract3(const Size2D &size,
98
const u8 * srcBase, ptrdiff_t srcStride,
99
u8 * dstBase, ptrdiff_t dstStride,
100
u32 coi)
101
{
102
internal::assertSupportedConfiguration();
103
#ifdef CAROTENE_NEON
104
#ifndef __ANDROID__
105
size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
106
#endif
107
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
108
109
for (size_t i = 0u; i < size.height; ++i)
110
{
111
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
112
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
113
size_t sj = 0u, dj = 0u;
114
115
#ifndef __ANDROID__
116
for (; dj < roiw32; sj += 96, dj += 32)
117
{
118
internal::prefetch(src + sj);
119
120
uint8x16x3_t v_src = vld3q_u8(src + sj);
121
vst1q_u8(dst + dj, v_src.val[coi]);
122
123
v_src = vld3q_u8(src + sj + 48);
124
vst1q_u8(dst + dj + 16, v_src.val[coi]);
125
}
126
#endif
127
128
for (; dj < roiw8; sj += 24, dj += 8)
129
{
130
uint8x8x3_t v_src = vld3_u8(src + sj);
131
vst1_u8(dst + dj, v_src.val[coi]);
132
}
133
134
for (; dj < size.width; sj += 3, ++dj)
135
{
136
dst[dj] = src[sj + coi];
137
}
138
}
139
#else
140
(void)size;
141
(void)srcBase;
142
(void)srcStride;
143
(void)dstBase;
144
(void)dstStride;
145
(void)coi;
146
#endif
147
}
148
149
void extract4(const Size2D &size,
150
const u8 * srcBase, ptrdiff_t srcStride,
151
u8 * dstBase, ptrdiff_t dstStride,
152
u32 coi)
153
{
154
internal::assertSupportedConfiguration();
155
#ifdef CAROTENE_NEON
156
#ifndef __ANDROID__
157
size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
158
#endif
159
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
160
161
for (size_t i = 0u; i < size.height; ++i)
162
{
163
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
164
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
165
size_t sj = 0u, dj = 0u;
166
167
#ifndef __ANDROID__
168
for (; dj < roiw32; sj += 128, dj += 32)
169
{
170
internal::prefetch(src + sj);
171
172
uint8x16x4_t v_src = vld4q_u8(src + sj);
173
vst1q_u8(dst + dj, v_src.val[coi]);
174
175
v_src = vld4q_u8(src + sj + 64);
176
vst1q_u8(dst + dj + 16, v_src.val[coi]);
177
}
178
#endif
179
180
for (; dj < roiw8; sj += 32, dj += 8)
181
{
182
uint8x8x4_t v_src = vld4_u8(src + sj);
183
vst1_u8(dst + dj, v_src.val[coi]);
184
}
185
186
for (; dj < size.width; sj += 4, ++dj)
187
{
188
dst[dj] = src[sj + coi];
189
}
190
}
191
#else
192
(void)size;
193
(void)srcBase;
194
(void)srcStride;
195
(void)dstBase;
196
(void)dstStride;
197
(void)coi;
198
#endif
199
}
200
201
// FILL_LINESk(op, elem) expands to op##_LINE(elem, 0) ... op##_LINE(elem, k-1),
// i.e. it repeats a per-channel "line" macro once for each of k channels.
// Each level reuses the previous one and appends the next channel index.
#define FILL_LINES2(op, elem)       \
            op##_LINE(elem, 0)      \
            op##_LINE(elem, 1)
#define FILL_LINES3(op, elem)       \
            FILL_LINES2(op, elem)   \
            op##_LINE(elem, 2)
#define FILL_LINES4(op, elem)       \
            FILL_LINES3(op, elem)   \
            op##_LINE(elem, 3)

// Per-channel "line" that appends one destination pointer/stride pair to a
// parameter list (note the leading comma).
#define FARG_LINE(elem, idx) , elem * dst##idx##Base, ptrdiff_t dst##idx##Stride
212
213
#ifdef CAROTENE_NEON
214
215
// Per-channel "line" macros consumed through FILL_LINESk. They rely on names
// declared by the code that expands them: row index `i`, source row `src`,
// source/destination element offsets `sj`/`dj`, and the loaded vector `v_src`.

// Declares dst<idx>, the pointer to row `i` of destination plane <idx>.
#define VROW_LINE(elem, idx) elem * dst##idx = internal::getRowPtr(dst##idx##Base, dst##idx##Stride, i);
// Stores the 128-bit vector of channel <idx> to dst<idx> + dj.
#define VST1Q_LINE(elem, idx) vst1q_##elem(dst##idx + dj, v_src.val[idx]);
// Stores the 64-bit vector of channel <idx> to dst<idx> + dj.
#define VST1_LINE(elem, idx) vst1_##elem(dst##idx + dj, v_src.val[idx]);
// Scalar copy of channel <idx> of the current source pixel.
#define SST_LINE(elem, idx) dst##idx[dj] = src[sj + idx];
219
220
// Multiply by the channel count (2, 3 or 4) using shifts/adds.
// The argument is parenthesized so that expressions containing operators of
// lower precedence than << or + (e.g. `a | b`) are multiplied as a whole.
#define MUL2(val) ((val) << 1)
#define MUL3(val) (MUL2(val) + (val))
#define MUL4(val) ((val) << 2)
223
224
// CONTDSTk is the leading part of a condition: it requires srcStride to equal
// the stride of each of the k destination planes and ends with a dangling
// `&&`, so the user must append one more operand.
// NOTE(review): the users in this file append `dst0Stride == width`; if strides
// cover a whole interleaved source row, source and destination strides could
// never be equal for a non-empty image - confirm the intended contiguity test.
#define CONTDST2 srcStride == dst0Stride && \
                 srcStride == dst1Stride &&
#define CONTDST3 CONTDST2                   \
                 srcStride == dst2Stride &&
#define CONTDST4 CONTDST3                   \
                 srcStride == dst3Stride &&
233
234
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)

// Inline-assembly path, selected only for non-AArch64 GCC 4.x older than 4.7
// (see the condition above).
//
// SPLIT_ASMk de-interleaves k channels with two vldK loads: the first from
// src + sj into the even D registers, the second from
// src + sj + MULk(8)/sizeof(element) into the odd ones. Each resulting D-register
// pair is then stored to the matching destination row at offset dj.
//
// The asm reads and writes memory that is reachable only through pointer
// operands, so "memory" is listed as a clobber to stop the compiler from
// caching or reordering memory accesses across the statement.
#define SPLIT_ASM2(sgn, bits) __asm__ (                                                \
    "vld2." #bits " {d0, d2}, [%[in0]] \n\t"                                           \
    "vld2." #bits " {d1, d3}, [%[in1]] \n\t"                                           \
    "vst1." #bits " {d0-d1}, [%[out0]] \n\t"                                           \
    "vst1." #bits " {d2-d3}, [%[out1]] \n\t"                                           \
    :                                                                                  \
    : [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj),                                  \
      [in0] "r" (src + sj), [in1] "r" (src + sj + MUL2(8)/sizeof(sgn##bits))           \
    : "d0","d1","d2","d3","memory"                                                     \
);
#define SPLIT_ASM3(sgn, bits) __asm__ (                                                \
    "vld3." #bits " {d0, d2, d4}, [%[in0]] \n\t"                                       \
    "vld3." #bits " {d1, d3, d5}, [%[in1]] \n\t"                                       \
    "vst1." #bits " {d0-d1}, [%[out0]] \n\t"                                           \
    "vst1." #bits " {d2-d3}, [%[out1]] \n\t"                                           \
    "vst1." #bits " {d4-d5}, [%[out2]] \n\t"                                           \
    :                                                                                  \
    : [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), [out2] "r" (dst2 + dj),          \
      [in0] "r" (src + sj), [in1] "r" (src + sj + MUL3(8)/sizeof(sgn##bits))           \
    : "d0","d1","d2","d3","d4","d5","memory"                                           \
);
#define SPLIT_ASM4(sgn, bits) __asm__ (                                                \
    "vld4." #bits " {d0, d2, d4, d6}, [%[in0]] \n\t"                                   \
    "vld4." #bits " {d1, d3, d5, d7}, [%[in1]] \n\t"                                   \
    "vst1." #bits " {d0-d1}, [%[out0]] \n\t"                                           \
    "vst1." #bits " {d2-d3}, [%[out1]] \n\t"                                           \
    "vst1." #bits " {d4-d5}, [%[out2]] \n\t"                                           \
    "vst1." #bits " {d6-d7}, [%[out3]] \n\t"                                           \
    :                                                                                  \
    : [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), [out2] "r" (dst2 + dj), [out3] "r" (dst3 + dj), \
      [in0] "r" (src + sj), [in1] "r" (src + sj + MUL4(8)/sizeof(sgn##bits))           \
    : "d0","d1","d2","d3","d4","d5","d6","d7","memory"                                 \
);

// One 128-bit step of split<n>: prefetch the source, then run the asm above.
#define SPLIT_QUAD(sgn, bits, n) {                                                     \
    internal::prefetch(src + sj);                                                      \
    SPLIT_ASM##n(sgn, bits)                                                            \
}

#else

// One 128-bit step of split<n> using intrinsics: prefetch, de-interleaving
// load into v_src, then one 128-bit store per destination plane.
#define SPLIT_QUAD(sgn, bits, n) {                                                     \
    internal::prefetch(src + sj);                                                      \
    vec128 v_src = vld##n##q_##sgn##bits(src + sj);                                    \
    FILL_LINES##n(VST1Q, sgn##bits)                                                    \
}

#endif
284
285
// Defines split<n>() for element type sgn##bits: de-interleaves an n-channel
// source image into n separate destination planes.
//
// When the CONTDST<n> stride test and `dst0Stride == width` both hold, the
// region is treated as a single row of width * height elements.
// NOTE(review): that test compares the source stride with the destination
// strides - confirm it can actually be true for real images.
//
// Each row is processed in 128-bit steps (SPLIT_QUAD), then at most one
// 64-bit step, then element by element. The names i, src, sj, dj, v_src,
// vec128 and dst<k> are referenced by the helper macros and must keep
// these spellings.
#define SPLIT(sgn,bits,n) void split##n(const Size2D &_size,                                  \
                                        const sgn##bits * srcBase, ptrdiff_t srcStride        \
                                        FILL_LINES##n(FARG, sgn##bits) )                      \
{                                                                                             \
    typedef internal::VecTraits<sgn##bits, n>::vec128 vec128;                                 \
    typedef internal::VecTraits<sgn##bits, n>::vec64 vec64;                                   \
    const size_t lanes128 = 16/sizeof(sgn##bits);                                             \
    const size_t lanes64 = 8/sizeof(sgn##bits);                                               \
                                                                                              \
    internal::assertSupportedConfiguration();                                                 \
    Size2D roi(_size);                                                                        \
    if (CONTDST##n                                                                            \
        dst0Stride == (ptrdiff_t)(roi.width))                                                 \
    {                                                                                         \
        roi.width *= roi.height;                                                              \
        roi.height = 1;                                                                       \
    }                                                                                         \
    const size_t quadEnd = roi.width >= (lanes128-1) ? roi.width - (lanes128-1) : 0;          \
    const size_t halfEnd = roi.width >= (lanes64-1) ? roi.width - (lanes64-1) : 0;            \
                                                                                              \
    for (size_t i = 0u; i < roi.height; ++i)                                                  \
    {                                                                                         \
        const sgn##bits * src = internal::getRowPtr(srcBase, srcStride, i);                   \
        FILL_LINES##n(VROW, sgn##bits)                                                        \
        size_t sj = 0u;                                                                       \
        size_t dj = 0u;                                                                       \
                                                                                              \
        while (dj < quadEnd)                                                                  \
        {                                                                                     \
            SPLIT_QUAD(sgn, bits, n)                                                          \
            sj += MUL##n(16)/sizeof(sgn##bits);                                               \
            dj += lanes128;                                                                   \
        }                                                                                     \
                                                                                              \
        if (dj < halfEnd)                                                                     \
        {                                                                                     \
            vec64 v_src = vld##n##_##sgn##bits(src + sj);                                     \
            FILL_LINES##n(VST1, sgn##bits)                                                    \
            sj += MUL##n(8)/sizeof(sgn##bits);                                                \
            dj += lanes64;                                                                    \
        }                                                                                     \
                                                                                              \
        while (dj < roi.width)                                                                \
        {                                                                                     \
            FILL_LINES##n(SST, sgn##bits)                                                     \
            sj += n;                                                                          \
            ++dj;                                                                             \
        }                                                                                     \
    }                                                                                         \
}
325
326
// Defines split<n>() for 64-bit elements (sgn##64): de-interleaves an
// n-channel source image into n destination planes, one pixel per step
// (a vld<n> load followed by one 64-bit store per plane).
//
// When the CONTDST<n> stride test and `dst0Stride == width` both hold, the
// region is treated as a single row of width * height elements.
// The names i, src, sj, dj, v_src and dst<k> are referenced by the helper
// macros and must keep these spellings.
#define SPLIT64(sgn,n) void split##n(const Size2D &_size,                                     \
                                     const sgn##64 * srcBase, ptrdiff_t srcStride             \
                                     FILL_LINES##n(FARG, sgn##64) )                           \
{                                                                                             \
    typedef internal::VecTraits<sgn##64, n>::vec64 vec64;                                     \
                                                                                              \
    internal::assertSupportedConfiguration();                                                 \
    Size2D roi(_size);                                                                        \
    if (CONTDST##n                                                                            \
        dst0Stride == (ptrdiff_t)(roi.width))                                                 \
    {                                                                                         \
        roi.width *= roi.height;                                                              \
        roi.height = 1;                                                                       \
    }                                                                                         \
                                                                                              \
    for (size_t i = 0u; i < roi.height; ++i)                                                  \
    {                                                                                         \
        const sgn##64 * src = internal::getRowPtr(srcBase, srcStride, i);                     \
        FILL_LINES##n(VROW, sgn##64)                                                          \
        size_t sj = 0u;                                                                       \
        size_t dj = 0u;                                                                       \
                                                                                              \
        while (dj < roi.width)                                                                \
        {                                                                                     \
            vec64 v_src = vld##n##_##sgn##64(src + sj);                                       \
            FILL_LINES##n(VST1, sgn##64)                                                      \
            sj += n;                                                                          \
            ++dj;                                                                             \
        }                                                                                     \
    }                                                                                         \
}
353
354
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)

// One 128-bit step of the 4 -> 3 + 1 split, inline-assembly path (non-AArch64
// GCC 4.x older than 4.7, see the condition above).
// Two vld4 loads de-interleave the four channels; channels 0..2 are written
// back interleaved with two vst3 stores to dst3 + d3j, and channel 3 is
// written with one vst1 store to dst1 + d1j.
//
// The asm reads and writes memory that is reachable only through pointer
// operands, so "memory" is listed as a clobber to stop the compiler from
// caching or reordering memory accesses across the statement.
#define ALPHA_QUAD(sgn, bits) {                                                        \
    internal::prefetch(src + sj);                                                      \
    __asm__ (                                                                          \
        "vld4." #bits " {d0, d2, d4, d6}, [%[in0]] \n\t"                               \
        "vld4." #bits " {d1, d3, d5, d7}, [%[in1]] \n\t"                               \
        "vst3." #bits " {d0, d2, d4}, [%[out3_1]] \n\t"                                \
        "vst3." #bits " {d1, d3, d5}, [%[out3_2]] \n\t"                                \
        "vst1." #bits " {d6-d7}, [%[out1]] \n\t"                                       \
        :                                                                              \
        : [out3_1] "r" (dst3 + d3j), [out3_2] "r" (dst3 + d3j + 24/sizeof(sgn##bits)), [out1] "r" (dst1 + d1j), \
          [in0] "r" (src + sj), [in1] "r" (src + sj + 32/sizeof(sgn##bits))            \
        : "d0","d1","d2","d3","d4","d5","d6","d7","memory"                             \
    );                                                                                 \
}

#else

// One 128-bit step of the 4 -> 3 + 1 split using intrinsics.
// The four-vector load result is reinterpreted through a union as a
// three-vector value so that channels 0..2 can be stored with a single vst3q;
// channel 3 is stored separately with vst1q.
#define ALPHA_QUAD(sgn, bits) {                                                        \
    internal::prefetch(src + sj);                                                      \
    union { vec128_4 v4; vec128_3 v3; } vals;                                          \
    vals.v4 = vld4q_##sgn##bits(src + sj);                                             \
    vst3q_##sgn##bits(dst3 + d3j, vals.v3);                                            \
    vst1q_##sgn##bits(dst1 + d1j, vals.v4.val[3]);                                     \
}

#endif
382
383
// Defines split4() for element type sgn##bits in its "3 + 1" form: a
// four-channel interleaved source is split into a three-channel interleaved
// plane (dst3, channels 0..2) and a single-channel plane (dst1, channel 3).
//
// When srcStride equals both destination strides and the width, the region is
// treated as a single row of width * height pixels.
// NOTE(review): confirm that this stride test can actually be true for real
// images, since the three buffers hold a different number of channels.
//
// Each row is processed in 128-bit steps (ALPHA_QUAD), then at most one 64-bit
// step, then pixel by pixel. The 64-bit step uses the stores matching
// sgn##bits, consistent with its load and with ALPHA_QUAD, so the macro stays
// correct for element types other than u8.
// The names src, sj, d3j, d1j, dst3, dst1, vec128_4 and vec128_3 are referenced
// by ALPHA_QUAD and must keep these spellings.
#define SPLIT4ALPHA(sgn,bits) void split4(const Size2D &_size,                                \
                                          const sgn##bits * srcBase, ptrdiff_t srcStride,     \
                                          sgn##bits * dst3Base, ptrdiff_t dst3Stride,         \
                                          sgn##bits * dst1Base, ptrdiff_t dst1Stride)         \
{                                                                                             \
    internal::assertSupportedConfiguration();                                                 \
    Size2D size(_size);                                                                       \
    if (srcStride == dst3Stride &&                                                            \
        srcStride == dst1Stride &&                                                            \
        srcStride == (ptrdiff_t)(size.width))                                                 \
    {                                                                                         \
        size.width *= size.height;                                                            \
        size.height = 1;                                                                      \
    }                                                                                         \
    typedef internal::VecTraits<sgn##bits, 4>::vec128 vec128_4;                               \
    typedef internal::VecTraits<sgn##bits, 3>::vec128 vec128_3;                               \
    size_t roiw16 = size.width >= (16/sizeof(sgn##bits)-1) ? size.width - (16/sizeof(sgn##bits)-1) : 0; \
    typedef internal::VecTraits<sgn##bits, 4>::vec64 vec64_4;                                 \
    typedef internal::VecTraits<sgn##bits, 3>::vec64 vec64_3;                                 \
    size_t roiw8 = size.width >= (8/sizeof(sgn##bits)-1) ? size.width - (8/sizeof(sgn##bits)-1) : 0; \
                                                                                              \
    for (size_t i = 0u; i < size.height; ++i)                                                 \
    {                                                                                         \
        const sgn##bits * src = internal::getRowPtr(srcBase, srcStride, i);                   \
        sgn##bits * dst3 = internal::getRowPtr(dst3Base, dst3Stride, i);                      \
        sgn##bits * dst1 = internal::getRowPtr(dst1Base, dst1Stride, i);                      \
        size_t sj = 0u, d3j = 0u, d1j = 0u;                                                   \
                                                                                              \
        for (; d1j < roiw16; sj += MUL4(16)/sizeof(sgn##bits), d3j += MUL3(16)/sizeof(sgn##bits), \
                             d1j += 16/sizeof(sgn##bits))                                     \
            ALPHA_QUAD(sgn, bits)                                                             \
                                                                                              \
        if (d1j < roiw8)                                                                      \
        {                                                                                     \
            union { vec64_4 v4; vec64_3 v3; } vals;                                           \
            vals.v4 = vld4_##sgn##bits(src + sj);                                             \
            vst3_##sgn##bits(dst3 + d3j, vals.v3);                                            \
            vst1_##sgn##bits(dst1 + d1j, vals.v4.val[3]);                                     \
            sj += MUL4(8)/sizeof(sgn##bits);                                                  \
            d3j += MUL3(8)/sizeof(sgn##bits);                                                 \
            d1j += 8/sizeof(sgn##bits);                                                       \
        }                                                                                     \
                                                                                              \
        for (; d1j < size.width; sj += 4, d3j += 3, ++d1j)                                    \
        {                                                                                     \
            dst3[d3j+0] = src[sj + 0];                                                        \
            dst3[d3j+1] = src[sj + 1];                                                        \
            dst3[d3j+2] = src[sj + 2];                                                        \
            dst1[d1j] = src[sj + 3];                                                          \
        }                                                                                     \
    }                                                                                         \
}
435
436
#else
437
438
// Builds without CAROTENE_NEON: every split function keeps its signature but
// only calls internal::assertSupportedConfiguration() and marks its
// parameters as used.

// Per-channel "line" that silences unused-parameter warnings for one
// destination pointer/stride pair.
#define VOID_LINE(elem, idx) (void)dst##idx##Base; (void)dst##idx##Stride;

#define SPLIT(sgn,bits,n) void split##n(const Size2D &size,                                   \
                                        const sgn##bits * srcBase, ptrdiff_t srcStride        \
                                        FILL_LINES##n(FARG, sgn##bits) )                      \
{                                                                                             \
    internal::assertSupportedConfiguration();                                                 \
    (void)size; (void)srcBase; (void)srcStride;                                               \
    FILL_LINES##n(VOID, sgn##bits)                                                            \
}

// The 64-bit variant needs no special handling here; reuse the generic stub.
#define SPLIT64(sgn,n) SPLIT(sgn,64,n)

#define SPLIT4ALPHA(sgn,bits) void split4(const Size2D &size,                                 \
                                          const sgn##bits * srcBase, ptrdiff_t srcStride,     \
                                          sgn##bits * dst3Base, ptrdiff_t dst3Stride,         \
                                          sgn##bits * dst1Base, ptrdiff_t dst1Stride)         \
{                                                                                             \
    internal::assertSupportedConfiguration();                                                 \
    (void)size; (void)srcBase; (void)srcStride;                                               \
    (void)dst3Base; (void)dst3Stride;                                                         \
    (void)dst1Base; (void)dst1Stride;                                                         \
}
467
468
#endif //CAROTENE_NEON
469
470
// Instantiate split2 / split3 / split4 for each supported element type.

// u8
SPLIT(u, 8,2)
SPLIT(u, 8,3)
SPLIT(u, 8,4)

// u16
SPLIT(u,16,2)
SPLIT(u,16,3)
SPLIT(u,16,4)

// s32
SPLIT(s,32,2)
SPLIT(s,32,3)
SPLIT(s,32,4)

// s64
SPLIT64(s, 2)
SPLIT64(s, 3)
SPLIT64(s, 4)

// u8, four channels split into a three-channel plane plus a one-channel plane
SPLIT4ALPHA(u,8)
485
486
} // namespace CAROTENE_NS
487
488