// channels_combine.cpp — carotene NEON channel-interleaving kernels (OpenCV 3rdparty)
/*
 * By downloading, copying, installing or using the software you agree to this license.
 * If you do not agree to this license, do not download, install,
 * copy or use the software.
 *
 *
 *                           License Agreement
 *                For Open Source Computer Vision Library
 *                        (3-clause BSD License)
 *
 * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
 * Third party copyrights are property of their respective owners.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the names of the copyright holders nor the names of the contributors
 *     may be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * This software is provided by the copyright holders and contributors "as is" and
 * any express or implied warranties, including, but not limited to, the implied
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall copyright holders or contributors be liable for any direct,
 * indirect, incidental, special, exemplary, or consequential damages
 * (including, but not limited to, procurement of substitute goods or services;
 * loss of use, data, or profits; or business interruption) however caused
 * and on any theory of liability, whether in contract, strict liability,
 * or tort (including negligence or otherwise) arising in any way out of
 * the use of this software, even if advised of the possibility of such damage.
 */
#include "common.hpp"
#include "vtransform.hpp"

namespace CAROTENE_NS {
// FILL_LINESn(macro, type): expands macro##_LINE(type, k) once per channel
// index k = 0 .. n-1.  Used to stamp out one statement (or one parameter
// pair) per source plane in the 2-, 3- and 4-channel combine kernels below.
#define FILL_LINES2(macro,type) \
    macro##_LINE(type,0) \
    macro##_LINE(type,1)
#define FILL_LINES3(macro,type) \
    FILL_LINES2(macro,type) \
    macro##_LINE(type,2)
#define FILL_LINES4(macro,type) \
    FILL_LINES3(macro,type) \
    macro##_LINE(type,3)

// One formal-parameter pair (base pointer + row stride) per source plane;
// note the leading comma so it can follow the Size2D parameter.
#define FARG_LINE(type, n) , const type * src##n##Base, ptrdiff_t src##n##Stride
#ifdef CAROTENE_NEON

// Per-channel statement generators (expanded via FILL_LINESn inside COMBINE):
//   VROW : fetch the row pointer of plane n for the current row i
//   PREF : prefetch plane n at the current source column sj
//   VLD1Q: 128-bit vector load from plane n into lane n of a multi-vector
//   PRLD : prefetch + 128-bit load combined
//   VLD1 : 64-bit vector load from plane n
//   SLD  : scalar copy of one element of plane n into the interleaved dst
#define VROW_LINE(type, n) const type * src##n = internal::getRowPtr(src##n##Base, src##n##Stride, i);
#define PREF_LINE(type, n) internal::prefetch(src##n + sj);
#define VLD1Q_LINE(type, n) v_dst.val[n] = vld1q_##type(src##n + sj);
#define PRLD_LINE(type, n) internal::prefetch(src##n + sj); v_dst.val[n] = vld1q_##type(src##n + sj);
#define VLD1_LINE(type, n) v_dst.val[n] = vld1_##type(src##n + sj);
#define SLD_LINE(type, n) dst[dj + n] = src##n[sj];

// Shift-based multiply-by-channel-count helpers: MULn(x) == n * x.
#define MUL2(val) (val << 1)
#define MUL3(val) (MUL2(val) + val)
#define MUL4(val) (val << 2)

// CONTSRCn expands to the first part of a contiguity test (all n source
// strides equal the destination stride); it deliberately ends in "&&" so
// COMBINE can append the final "dstStride == width" clause.
#define CONTSRC2 dstStride == src0Stride && \
                 dstStride == src1Stride &&
#define CONTSRC3 dstStride == src0Stride && \
                 dstStride == src1Stride && \
                 dstStride == src2Stride &&
#define CONTSRC4 dstStride == src0Stride && \
                 dstStride == src1Stride && \
                 dstStride == src2Stride && \
                 dstStride == src3Stride &&

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)

// Workaround for 32-bit GCC < 4.7: emit the interleaving step as inline
// assembly — one vld1 per plane, then vst2/vst3/vst4 interleaving stores of
// the low and high d-register halves.
#define MERGE_ASM2(sgn, bits) __asm__ ( \
    "vld1." #bits " {d0-d1}, [%[in0]] \n\t" \
    "vld1." #bits " {d2-d3}, [%[in1]] \n\t" \
    "vst2." #bits " {d0, d2}, [%[out0]] \n\t" \
    "vst2." #bits " {d1, d3}, [%[out1]] \n\t" \
    : \
    : [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), \
      [out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL2(8)/sizeof(sgn##bits)) \
    : "d0","d1","d2","d3" \
);
#define MERGE_ASM3(sgn, bits) __asm__ ( \
    "vld1." #bits " {d0-d1}, [%[in0]] \n\t" \
    "vld1." #bits " {d2-d3}, [%[in1]] \n\t" \
    "vld1." #bits " {d4-d5}, [%[in2]] \n\t" \
    "vst3." #bits " {d0, d2, d4}, [%[out0]] \n\t" \
    "vst3." #bits " {d1, d3, d5}, [%[out1]] \n\t" \
    : \
    : [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), [in2] "r" (src2 + sj), \
      [out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL3(8)/sizeof(sgn##bits)) \
    : "d0","d1","d2","d3","d4","d5" \
);
#define MERGE_ASM4(sgn, bits) __asm__ ( \
    "vld1." #bits " {d0-d1}, [%[in0]] \n\t" \
    "vld1." #bits " {d2-d3}, [%[in1]] \n\t" \
    "vld1." #bits " {d4-d5}, [%[in2]] \n\t" \
    "vld1." #bits " {d6-d7}, [%[in3]] \n\t" \
    "vst4." #bits " {d0, d2, d4, d6}, [%[out0]] \n\t" \
    "vst4." #bits " {d1, d3, d5, d7}, [%[out1]] \n\t" \
    : \
    : [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), [in2] "r" (src2 + sj), [in3] "r" (src3 + sj), \
      [out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL4(8)/sizeof(sgn##bits)) \
    : "d0","d1","d2","d3","d4","d5","d6","d7" \
);

// 16-byte-per-plane merge step (old-GCC path): prefetch each plane, then
// run the inline-asm interleave.
#define MERGE_QUAD(sgn, bits, n) { \
    FILL_LINES##n(PREF, sgn##bits) \
    MERGE_ASM##n(sgn, bits) \
}

#else

// 16-byte-per-plane merge step (intrinsics path): prefetch + load each plane
// into one lane of a multi-vector, then a single interleaving vstNq store.
#define MERGE_QUAD(sgn, bits, n) { \
    vec128 v_dst; \
    /*FILL_LINES##n(PREF, sgn##bits) \
    FILL_LINES##n(VLD1Q, sgn##bits)*/ \
    FILL_LINES##n(PRLD, sgn##bits) \
    vst##n##q_##sgn##bits(dst + dj, v_dst); \
}

#endif
// COMBINE(sgn, bits, n): defines combine<n>(), which interleaves n planar
// sources of element type sgn##bits into one n-channel destination.  If all
// rows are contiguous (CONTSRCn plus dstStride == width) the image is
// processed as a single long row.  Processing per row: 16-byte vector steps
// (MERGE_QUAD), then at most one 8-byte vector step, then a scalar tail.
#define COMBINE(sgn,bits,n) void combine##n(const Size2D &_size \
                                            FILL_LINES##n(FARG, sgn##bits), \
                                            sgn##bits * dstBase, ptrdiff_t dstStride) \
{ \
    internal::assertSupportedConfiguration(); \
    Size2D size(_size); \
    if (CONTSRC##n \
        dstStride == (ptrdiff_t)(size.width)) \
    { \
        size.width *= size.height; \
        size.height = 1; \
    } \
    typedef internal::VecTraits<sgn##bits, n>::vec128 vec128; \
    size_t roiw16 = size.width >= (16/sizeof(sgn##bits) - 1) ? size.width - (16/sizeof(sgn##bits) - 1) : 0; \
    typedef internal::VecTraits<sgn##bits, n>::vec64 vec64; \
    size_t roiw8 = size.width >= (8/sizeof(sgn##bits) - 1) ? size.width - (8/sizeof(sgn##bits) - 1) : 0; \
    \
    for (size_t i = 0u; i < size.height; ++i) \
    { \
        FILL_LINES##n(VROW, sgn##bits) \
        sgn##bits * dst = internal::getRowPtr(dstBase, dstStride, i); \
        size_t sj = 0u, dj = 0u; \
        \
        for (; sj < roiw16; sj += 16/sizeof(sgn##bits), dj += MUL##n(16)/sizeof(sgn##bits)) \
            MERGE_QUAD(sgn, bits, n) \
        \
        if ( sj < roiw8 ) \
        { \
            vec64 v_dst; \
            FILL_LINES##n(VLD1, sgn##bits) \
            vst##n##_##sgn##bits(dst + dj, v_dst); \
            sj += 8/sizeof(sgn##bits); dj += MUL##n(8)/sizeof(sgn##bits); \
        } \
        \
        for (; sj < size.width; ++sj, dj += n) \
        { \
            FILL_LINES##n(SLD, sgn##bits) \
        } \
    } \
}
// COMBINE64(sgn, n): combine<n>() for 64-bit elements.  One element fills a
// whole 64-bit d-register, so each loop iteration loads one element from
// every plane and issues a single interleaving vstN store — there is no
// 128-bit main loop or scalar tail.
#define COMBINE64(sgn,n) void combine##n(const Size2D &_size \
                                         FILL_LINES##n(FARG, sgn##64), \
                                         sgn##64 * dstBase, ptrdiff_t dstStride) \
{ \
    internal::assertSupportedConfiguration(); \
    Size2D size(_size); \
    if (CONTSRC##n \
        dstStride == (ptrdiff_t)(size.width)) \
    { \
        size.width *= size.height; \
        size.height = 1; \
    } \
    typedef internal::VecTraits<sgn##64, n>::vec64 vec64; \
    \
    for (size_t i = 0u; i < size.height; ++i) \
    { \
        FILL_LINES##n(VROW, sgn##64) \
        sgn##64 * dst = internal::getRowPtr(dstBase, dstStride, i); \
        size_t sj = 0u, dj = 0u; \
        \
        for (; sj < size.width; ++sj, dj += n) \
        { \
            vec64 v_dst; \
            FILL_LINES##n(VLD1, sgn##64) \
            vst##n##_##sgn##64(dst + dj, v_dst); \
            /*FILL_LINES##n(SLD, sgn##64)*/ \
        } \
    } \
}
#else

// Per-channel (void) casts to silence unused-parameter warnings in the stub.
#define VOID_LINE(type, n) (void)src##n##Base; (void)src##n##Stride;

// Non-NEON builds: combine##n becomes a stub that only calls
// assertSupportedConfiguration() and discards its arguments.
#define COMBINE(sgn,bits,n) void combine##n(const Size2D &size \
                                            FILL_LINES##n(FARG, sgn##bits), \
                                            sgn##bits * dstBase, ptrdiff_t dstStride) \
{ \
    internal::assertSupportedConfiguration(); \
    (void)size; \
    FILL_LINES##n(VOID, sgn##bits) \
    (void)dstBase; \
    (void)dstStride; \
}
#define COMBINE64(sgn,n) COMBINE(sgn,64,n)

#endif //CAROTENE_NEON
// Instantiate the interleaving kernels: combine2/3/4 for u8, u16 and s32 via
// the vectorized COMBINE body, and for s64 via the element-wise COMBINE64.
COMBINE(u, 8,2)
COMBINE(u, 8,3)
COMBINE(u, 8,4)
COMBINE(u,16,2)
COMBINE(u,16,3)
COMBINE(u,16,4)
COMBINE(s,32,2)
COMBINE(s,32,3)
COMBINE(s,32,4)
COMBINE64(s, 2)
COMBINE64(s, 3)
COMBINE64(s, 4)
void combineYUYV(const Size2D &size,
236
const u8 * srcyBase, ptrdiff_t srcyStride,
237
const u8 * srcuBase, ptrdiff_t srcuStride,
238
const u8 * srcvBase, ptrdiff_t srcvStride,
239
u8 * dstBase, ptrdiff_t dstStride)
240
{
241
internal::assertSupportedConfiguration();
242
#ifdef CAROTENE_NEON
243
#ifndef __ANDROID__
244
size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
245
#endif
246
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
247
248
for (size_t i = 0u; i < size.height; i += 1)
249
{
250
const u8 * srcy = internal::getRowPtr(srcyBase, srcyStride, i);
251
const u8 * srcu = internal::getRowPtr(srcuBase, srcuStride, i);
252
const u8 * srcv = internal::getRowPtr(srcvBase, srcvStride, i);
253
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
254
size_t syj = 0u, sj = 0u, dj = 0u;
255
256
#ifndef __ANDROID__
257
for (; sj < roiw32; sj += 32, syj += 64, dj += 128)
258
{
259
internal::prefetch(srcy + syj);
260
internal::prefetch(srcu + sj);
261
internal::prefetch(srcv + sj);
262
263
uint8x16x2_t v_y = vld2q_u8(srcy + syj);
264
uint8x16x4_t v_dst;
265
v_dst.val[0] = v_y.val[0];
266
v_dst.val[1] = vld1q_u8(srcu + sj);
267
v_dst.val[2] = v_y.val[1];
268
v_dst.val[3] = vld1q_u8(srcv + sj);
269
vst4q_u8(dst + dj, v_dst);
270
271
v_y = vld2q_u8(srcy + syj + 32);
272
v_dst.val[0] = v_y.val[0];
273
v_dst.val[1] = vld1q_u8(srcu + sj + 16);
274
v_dst.val[2] = v_y.val[1];
275
v_dst.val[3] = vld1q_u8(srcv + sj + 16);
276
vst4q_u8(dst + dj + 64, v_dst);
277
}
278
#endif
279
280
for (; sj < roiw8; sj += 8, syj += 16, dj += 32)
281
{
282
uint8x8x2_t v_y = vld2_u8(srcy + syj);
283
uint8x8x4_t v_dst;
284
v_dst.val[0] = v_y.val[0];
285
v_dst.val[1] = vld1_u8(srcu + sj);
286
v_dst.val[2] = v_y.val[1];
287
v_dst.val[3] = vld1_u8(srcv + sj);
288
vst4_u8(dst + dj, v_dst);
289
}
290
291
for (; sj < size.width; ++sj, syj += 2, dj += 4)
292
{
293
dst[dj] = srcy[syj];
294
dst[dj + 1] = srcu[sj];
295
dst[dj + 2] = srcy[syj + 1];
296
dst[dj + 3] = srcv[sj];
297
}
298
}
299
#else
300
(void)size;
301
(void)srcyBase;
302
(void)srcyStride;
303
(void)srcuBase;
304
(void)srcuStride;
305
(void)srcvBase;
306
(void)srcvStride;
307
(void)dstBase;
308
(void)dstStride;
309
#endif
310
}
311
312
void combineUYVY(const Size2D &size,
313
const u8 * srcyBase, ptrdiff_t srcyStride,
314
const u8 * srcuBase, ptrdiff_t srcuStride,
315
const u8 * srcvBase, ptrdiff_t srcvStride,
316
u8 * dstBase, ptrdiff_t dstStride)
317
{
318
internal::assertSupportedConfiguration();
319
#ifdef CAROTENE_NEON
320
#ifndef __ANDROID__
321
size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
322
#endif
323
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
324
325
for (size_t i = 0u; i < size.height; ++i)
326
{
327
const u8 * srcy = internal::getRowPtr(srcyBase, srcyStride, i);
328
const u8 * srcu = internal::getRowPtr(srcuBase, srcuStride, i);
329
const u8 * srcv = internal::getRowPtr(srcvBase, srcvStride, i);
330
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
331
size_t syj = 0u, sj = 0u, dj = 0u;
332
333
#ifndef __ANDROID__
334
for (; sj < roiw32; sj += 32, syj += 64, dj += 128)
335
{
336
internal::prefetch(srcy + syj);
337
internal::prefetch(srcu + sj);
338
internal::prefetch(srcv + sj);
339
340
uint8x16x2_t v_y = vld2q_u8(srcy + syj);
341
uint8x16x4_t v_dst;
342
v_dst.val[0] = vld1q_u8(srcu + sj);
343
v_dst.val[1] = v_y.val[0];
344
v_dst.val[2] = vld1q_u8(srcv + sj);
345
v_dst.val[3] = v_y.val[1];
346
vst4q_u8(dst + dj, v_dst);
347
348
v_y = vld2q_u8(srcy + syj + 32);
349
v_dst.val[0] = vld1q_u8(srcu + sj + 16);
350
v_dst.val[1] = v_y.val[0];
351
v_dst.val[2] = vld1q_u8(srcv + sj + 16);
352
v_dst.val[3] = v_y.val[1];
353
vst4q_u8(dst + dj + 64, v_dst);
354
}
355
#endif
356
357
for (; sj < roiw8; sj += 8, syj += 16, dj += 32)
358
{
359
uint8x8x2_t v_y = vld2_u8(srcy + syj);
360
uint8x8x4_t v_dst;
361
v_dst.val[0] = vld1_u8(srcu + sj);
362
v_dst.val[1] = v_y.val[0];
363
v_dst.val[2] = vld1_u8(srcv + sj);
364
v_dst.val[3] = v_y.val[1];
365
vst4_u8(dst + dj, v_dst);
366
}
367
368
for (; sj < size.width; ++sj, syj += 2, dj += 4)
369
{
370
dst[dj] = srcu[sj];
371
dst[dj + 1] = srcy[syj];
372
dst[dj + 2] = srcv[sj];
373
dst[dj + 3] = srcy[syj + 1];
374
}
375
}
376
#else
377
(void)size;
378
(void)srcyBase;
379
(void)srcyStride;
380
(void)srcuBase;
381
(void)srcuStride;
382
(void)srcvBase;
383
(void)srcvStride;
384
(void)dstBase;
385
(void)dstStride;
386
#endif
387
}
} // namespace CAROTENE_NS