CoCalc -- channels

GitHub Repository: Tetragramm/opencv
Path: blob/master/3rdparty/carotene/src/channels_combine.cpp
16337 views
/*
 * By downloading, copying, installing or using the software you agree to this license.
 * If you do not agree to this license, do not download, install,
 * copy or use the software.
 *
 *
 *                           License Agreement
 *                For Open Source Computer Vision Library
 *                        (3-clause BSD License)
 *
 * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
 * Third party copyrights are property of their respective owners.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the names of the copyright holders nor the names of the contributors
 *     may be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * This software is provided by the copyright holders and contributors "as is" and
 * any express or implied warranties, including, but not limited to, the implied
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall copyright holders or contributors be liable for any direct,
 * indirect, incidental, special, exemplary, or consequential damages
 * (including, but not limited to, procurement of substitute goods or services;
 * loss of use, data, or profits; or business interruption) however caused
 * and on any theory of liability, whether in contract, strict liability,
 * or tort (including negligence or otherwise) arising in any way out of
 * the use of this software, even if advised of the possibility of such damage.
 */

40
#include "common.hpp"
41
#include "vtransform.hpp"
42

43
namespace CAROTENE_NS {
44

45
/*
 * Per-plane code generators.
 *
 * FILL_LINESn(macro, type) expands to
 *     macro##_LINE(type, 0) ... macro##_LINE(type, n - 1)
 * i.e. it stamps out one "<prefix>_LINE" helper per source plane.
 *
 * Every arity is written out flat instead of forwarding to the previous one.
 * Forwarding passes `macro` on as an ordinary argument, and ordinary arguments
 * are fully macro-expanded before substitution; a prefix that happens to be
 * defined as a macro somewhere (VOID is a likely candidate in platform headers)
 * would then be rewritten before it ever reaches the ## operator.
 */
#define FILL_LINES2(macro,type) \
            macro##_LINE(type,0) \
            macro##_LINE(type,1)
#define FILL_LINES3(macro,type) \
            macro##_LINE(type,0) \
            macro##_LINE(type,1) \
            macro##_LINE(type,2)
#define FILL_LINES4(macro,type) \
            macro##_LINE(type,0) \
            macro##_LINE(type,1) \
            macro##_LINE(type,2) \
            macro##_LINE(type,3)

/*
 * Formal parameters for source plane n: base pointer plus stride.
 * Starts with a comma so it can be appended directly after the previous parameter.
 */
#define  FARG_LINE(type, n) , const type * src##n##Base, ptrdiff_t src##n##Stride
56

57
#ifdef CAROTENE_NEON
58

59
/*
 * Per-plane statement generators, instantiated through FILL_LINESn(<prefix>, type).
 * They rely on names declared at the expansion site: the row index i, the
 * element offsets sj (source) and dj (destination), the destination row pointer
 * dst and the vector accumulator v_dst.
 */

/* Declares src<n>, the pointer to row i of source plane n. */
#define  VROW_LINE(type, n) const type * src##n = internal::getRowPtr(src##n##Base, src##n##Stride, i);
/* Prefetches plane n at the current source offset. */
#define  PREF_LINE(type, n) internal::prefetch(src##n + sj);
/* Loads one 128-bit vector from plane n into lane set n of v_dst. */
#define VLD1Q_LINE(type, n) v_dst.val[n] = vld1q_##type(src##n + sj);
/* Prefetch immediately followed by the 128-bit load for plane n. */
#define  PRLD_LINE(type, n) internal::prefetch(src##n + sj); v_dst.val[n] = vld1q_##type(src##n + sj);
/* Loads one 64-bit vector from plane n into lane set n of v_dst. */
#define  VLD1_LINE(type, n) v_dst.val[n] = vld1_##type(src##n + sj);
/* Scalar copy of a single element of plane n into its interleaved slot. */
#define   SLD_LINE(type, n) dst[dj + n] = src##n[sj];
65

66
/*
 * Multiply by the channel count; selected by token pasting as MUL##n.
 * The argument is parenthesized so that expressions built from operators
 * binding looser than << or + (for example a | b) are scaled as a whole.
 */
#define MUL2(val) ((val) << 1)
#define MUL3(val) (MUL2(val) + (val))
#define MUL4(val) ((val) << 2)
69

70
/*
 * Stride-equality prefixes for the "all planes share dstStride" test.
 * Each expansion deliberately ends in a dangling && so that the caller can
 * append its own final condition: if (CONTSRC##n <last condition>).
 */
#define CONTSRC2 dstStride == src0Stride && \
                 dstStride == src1Stride &&
#define CONTSRC3 dstStride == src0Stride && \
                 dstStride == src1Stride && \
                 dstStride == src2Stride &&
#define CONTSRC4 dstStride == src0Stride && \
                 dstStride == src1Stride && \
                 dstStride == src2Stride && \
                 dstStride == src3Stride &&
79

80
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
81

82
/*
 * Inline-assembly interleave kernels for the legacy toolchain selected by the
 * surrounding #if (not AArch64, GCC 4.x older than 4.7, not clang).
 *
 * MERGE_ASMn loads one 128-bit vector from each of the n source rows at
 * src<k> + sj and stores them interleaved at dst + dj with two vstN
 * instructions: the low D registers go to out0, the high D registers to out1,
 * which starts n * 8 bytes after out0.
 *
 * The statements read and write memory through plain "r" pointer operands, so
 * each one carries a "memory" clobber; without it the compiler may assume that
 * the destination is untouched and keep or reorder surrounding accesses.
 */
#define MERGE_ASM2(sgn, bits) __asm__ ( \
                                          "vld1." #bits " {d0-d1}, [%[in0]]             \n\t" \
                                          "vld1." #bits " {d2-d3}, [%[in1]]             \n\t" \
                                          "vst2." #bits " {d0, d2}, [%[out0]]           \n\t" \
                                          "vst2." #bits " {d1, d3}, [%[out1]]           \n\t" \
                                          : \
                                          : [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), \
                                            [out0]  "r" (dst + dj), [out1]  "r" (dst + dj + MUL2(8)/sizeof(sgn##bits)) \
                                          : "d0","d1","d2","d3","memory" \
                                      );
#define MERGE_ASM3(sgn, bits) __asm__ ( \
                                          "vld1." #bits " {d0-d1}, [%[in0]]             \n\t" \
                                          "vld1." #bits " {d2-d3}, [%[in1]]             \n\t" \
                                          "vld1." #bits " {d4-d5}, [%[in2]]             \n\t" \
                                          "vst3." #bits " {d0, d2, d4}, [%[out0]]       \n\t" \
                                          "vst3." #bits " {d1, d3, d5}, [%[out1]]       \n\t" \
                                          : \
                                          : [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), [in2] "r" (src2 + sj), \
                                            [out0]  "r" (dst + dj), [out1]  "r" (dst + dj + MUL3(8)/sizeof(sgn##bits)) \
                                          : "d0","d1","d2","d3","d4","d5","memory" \
                                      );
#define MERGE_ASM4(sgn, bits) __asm__ ( \
                                          "vld1." #bits " {d0-d1}, [%[in0]]             \n\t" \
                                          "vld1." #bits " {d2-d3}, [%[in1]]             \n\t" \
                                          "vld1." #bits " {d4-d5}, [%[in2]]             \n\t" \
                                          "vld1." #bits " {d6-d7}, [%[in3]]             \n\t" \
                                          "vst4." #bits " {d0, d2, d4, d6}, [%[out0]]   \n\t" \
                                          "vst4." #bits " {d1, d3, d5, d7}, [%[out1]]   \n\t" \
                                          : \
                                          : [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), [in2] "r" (src2 + sj), [in3] "r" (src3 + sj), \
                                            [out0]  "r" (dst + dj), [out1]  "r" (dst + dj + MUL4(8)/sizeof(sgn##bits)) \
                                          : "d0","d1","d2","d3","d4","d5","d6","d7","memory" \
                                      );
115

116
/*
 * 128-bit interleave step, inline-assembly flavour: prefetch every source
 * plane at the current offset, then run the matching MERGE_ASMn kernel.
 * Expands to a braced block so it can serve directly as a loop body.
 */
#define MERGE_QUAD(sgn, bits, n) { \
                                     FILL_LINES##n(PREF, sgn##bits) \
                                     MERGE_ASM##n(sgn, bits) \
                                 }
120

121
#else
122

123
/*
 * 128-bit interleave step, intrinsics flavour: for each source plane issue a
 * prefetch followed by a 128-bit load into v_dst (PRLD), then write all planes
 * interleaved with vstNq. The alternative ordering - all prefetches first, then
 * all loads - can be built from the PREF and VLD1Q helpers instead of PRLD.
 * Expands to a braced block so it can serve directly as a loop body; vec128 is
 * expected to be typedef'ed at the expansion site.
 */
#define MERGE_QUAD(sgn, bits, n) { \
                                     vec128 v_dst; \
                                     FILL_LINES##n(PRLD, sgn##bits) \
                                     vst##n##q_##sgn##bits(dst + dj, v_dst); \
                                 }
130

131
#endif
132

133
/*
 * COMBINE(sgn, bits, n) defines
 *     void combine<n>(const Size2D &, <n source planes>, <sgn><bits> * dstBase, ptrdiff_t dstStride)
 * which interleaves n single-channel planes of element type <sgn><bits> into
 * one n-channel destination.
 *
 * Per row the work is split into three stages:
 *   1. 16 bytes of every plane per iteration through MERGE_QUAD;
 *   2. at most one further 8-byte step with vld1 / vstN;
 *   3. an element-by-element tail.
 * roiw16 / roiw8 are the width minus (vector length - 1), clamped at zero, so
 * "sj < roiwX" guarantees that a full vector is still available at sj.
 *
 * When every stride matches the test below, the image is processed as a single
 * row of width * height elements.
 * NOTE(review): that test requires dstStride to equal each source stride and
 * size.width; for an n-channel destination this looks reachable only for
 * unusual layouts - confirm the intended condition against the stride units
 * used by internal::getRowPtr.
 */
#define COMBINE(sgn,bits,n) void combine##n(const Size2D &_size                                             \
                                        FILL_LINES##n(FARG, sgn##bits),                                     \
                                        sgn##bits * dstBase, ptrdiff_t dstStride)                           \
{                                                                                                           \
    internal::assertSupportedConfiguration();                                                               \
    Size2D size(_size);                                                                                     \
    if (CONTSRC##n                                                                                          \
        dstStride == (ptrdiff_t)(size.width))                                                               \
    {                                                                                                       \
        size.width *= size.height;                                                                          \
        size.height = 1;                                                                                    \
    }                                                                                                       \
    typedef internal::VecTraits<sgn##bits, n>::vec128 vec128;                                               \
    size_t roiw16 = size.width >= (16/sizeof(sgn##bits) - 1) ? size.width - (16/sizeof(sgn##bits) - 1) : 0; \
    typedef internal::VecTraits<sgn##bits, n>::vec64 vec64;                                                 \
    size_t roiw8 = size.width >= (8/sizeof(sgn##bits) - 1) ? size.width - (8/sizeof(sgn##bits) - 1) : 0;    \
                                                                                                            \
    for (size_t i = 0u; i < size.height; ++i)                                                               \
    {                                                                                                       \
        FILL_LINES##n(VROW, sgn##bits)                                                                      \
        sgn##bits * dst = internal::getRowPtr(dstBase, dstStride, i);                                       \
        size_t sj = 0u, dj = 0u;                                                                            \
                                                                                                            \
        for (; sj < roiw16; sj += 16/sizeof(sgn##bits), dj += MUL##n(16)/sizeof(sgn##bits))                 \
            MERGE_QUAD(sgn, bits, n)                                                                        \
                                                                                                            \
        if ( sj < roiw8 )                                                                                   \
        {                                                                                                   \
            vec64 v_dst;                                                                                    \
            FILL_LINES##n(VLD1, sgn##bits)                                                                  \
            vst##n##_##sgn##bits(dst + dj, v_dst);                                                          \
            sj += 8/sizeof(sgn##bits); dj += MUL##n(8)/sizeof(sgn##bits);                                   \
        }                                                                                                   \
                                                                                                            \
        for (; sj < size.width; ++sj, dj += n)                                                              \
        {                                                                                                   \
            FILL_LINES##n(SLD, sgn##bits)                                                                   \
        }                                                                                                   \
    }                                                                                                       \
}
173

174
/*
 * COMBINE64(sgn, n) defines combine<n> for 64-bit elements (<sgn>64).
 * There is no wide or scalar stage here: every element is loaded from each
 * plane with vld1 and written interleaved with vstN, one element per
 * iteration, advancing the destination by n elements.
 *
 * The stride test that collapses the image into a single row is the same one
 * used by COMBINE.
 */
#define COMBINE64(sgn,n) void combine##n(const Size2D &_size                                                \
                                               FILL_LINES##n(FARG, sgn##64),                                \
                                               sgn##64 * dstBase, ptrdiff_t dstStride)                      \
{                                                                                                           \
    internal::assertSupportedConfiguration();                                                               \
    Size2D size(_size);                                                                                     \
    if (CONTSRC##n                                                                                          \
        dstStride == (ptrdiff_t)(size.width))                                                               \
    {                                                                                                       \
        size.width *= size.height;                                                                          \
        size.height = 1;                                                                                    \
    }                                                                                                       \
    typedef internal::VecTraits<sgn##64, n>::vec64 vec64;                                                   \
                                                                                                            \
    for (size_t i = 0u; i < size.height; ++i)                                                               \
    {                                                                                                       \
        FILL_LINES##n(VROW, sgn##64)                                                                        \
        sgn##64 * dst = internal::getRowPtr(dstBase, dstStride, i);                                         \
        size_t sj = 0u, dj = 0u;                                                                            \
                                                                                                            \
        for (; sj < size.width; ++sj, dj += n)                                                              \
        {                                                                                                   \
            vec64 v_dst;                                                                                    \
            FILL_LINES##n(VLD1, sgn##64)                                                                    \
            vst##n##_##sgn##64(dst + dj, v_dst);                                                            \
        }                                                                                                   \
    }                                                                                                       \
}
203

204
#else
205

206
/*
 * Build without CAROTENE_NEON: the combine<n> entry points are still defined,
 * but apart from calling internal::assertSupportedConfiguration() they only
 * mark their arguments as used.
 */

/* Marks the base pointer and stride of source plane n as used. */
#define  VOID_LINE(type, n) (void)src##n##Base; (void)src##n##Stride;

#define COMBINE(sgn,bits,n) void combine##n(const Size2D &size                                              \
                                        FILL_LINES##n(FARG, sgn##bits),                                     \
                                        sgn##bits * dstBase, ptrdiff_t dstStride)                           \
{                                                                                                           \
    internal::assertSupportedConfiguration();                                                               \
    (void)size;                                                                                             \
    FILL_LINES##n(VOID, sgn##bits)                                                                          \
    (void)dstBase;                                                                                          \
    (void)dstStride;                                                                                        \
}
/* The 64-bit variant needs no special handling in the stub build. */
#define COMBINE64(sgn,n) COMBINE(sgn,64,n)
219

220
#endif //CAROTENE_NEON
221

222
/*
 * Emit the combine2 / combine3 / combine4 overloads for every supported
 * element type: u8, u16, s32 and s64.
 */
COMBINE(u, 8,2)
COMBINE(u, 8,3)
COMBINE(u, 8,4)
COMBINE(u,16,2)
COMBINE(u,16,3)
COMBINE(u,16,4)
COMBINE(s,32,2)
COMBINE(s,32,3)
COMBINE(s,32,4)
COMBINE64(s, 2)
COMBINE64(s, 3)
COMBINE64(s, 4)
234

235
void combineYUYV(const Size2D &size,
236
                 const u8 * srcyBase, ptrdiff_t srcyStride,
237
                 const u8 * srcuBase, ptrdiff_t srcuStride,
238
                 const u8 * srcvBase, ptrdiff_t srcvStride,
239
                 u8 * dstBase, ptrdiff_t dstStride)
240
{
241
    internal::assertSupportedConfiguration();
242
#ifdef CAROTENE_NEON
243
#ifndef __ANDROID__
244
    size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
245
#endif
246
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
247

248
    for (size_t i = 0u; i < size.height; i += 1)
249
    {
250
        const u8 * srcy = internal::getRowPtr(srcyBase, srcyStride, i);
251
        const u8 * srcu = internal::getRowPtr(srcuBase, srcuStride, i);
252
        const u8 * srcv = internal::getRowPtr(srcvBase, srcvStride, i);
253
        u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
254
        size_t syj = 0u, sj = 0u, dj = 0u;
255

256
#ifndef __ANDROID__
257
        for (; sj < roiw32; sj += 32, syj += 64, dj += 128)
258
        {
259
            internal::prefetch(srcy + syj);
260
            internal::prefetch(srcu + sj);
261
            internal::prefetch(srcv + sj);
262

263
            uint8x16x2_t v_y = vld2q_u8(srcy + syj);
264
            uint8x16x4_t v_dst;
265
            v_dst.val[0] = v_y.val[0];
266
            v_dst.val[1] = vld1q_u8(srcu + sj);
267
            v_dst.val[2] = v_y.val[1];
268
            v_dst.val[3] = vld1q_u8(srcv + sj);
269
            vst4q_u8(dst + dj, v_dst);
270

271
            v_y = vld2q_u8(srcy + syj + 32);
272
            v_dst.val[0] = v_y.val[0];
273
            v_dst.val[1] = vld1q_u8(srcu + sj + 16);
274
            v_dst.val[2] = v_y.val[1];
275
            v_dst.val[3] = vld1q_u8(srcv + sj + 16);
276
            vst4q_u8(dst + dj + 64, v_dst);
277
        }
278
#endif
279

280
        for (; sj < roiw8; sj += 8, syj += 16, dj += 32)
281
        {
282
            uint8x8x2_t v_y = vld2_u8(srcy + syj);
283
            uint8x8x4_t v_dst;
284
            v_dst.val[0] = v_y.val[0];
285
            v_dst.val[1] = vld1_u8(srcu + sj);
286
            v_dst.val[2] = v_y.val[1];
287
            v_dst.val[3] = vld1_u8(srcv + sj);
288
            vst4_u8(dst + dj, v_dst);
289
        }
290

291
        for (; sj < size.width; ++sj, syj += 2, dj += 4)
292
        {
293
            dst[dj] = srcy[syj];
294
            dst[dj + 1] = srcu[sj];
295
            dst[dj + 2] = srcy[syj + 1];
296
            dst[dj + 3] = srcv[sj];
297
        }
298
    }
299
#else
300
    (void)size;
301
    (void)srcyBase;
302
    (void)srcyStride;
303
    (void)srcuBase;
304
    (void)srcuStride;
305
    (void)srcvBase;
306
    (void)srcvStride;
307
    (void)dstBase;
308
    (void)dstStride;
309
#endif
310
}
311

312
void combineUYVY(const Size2D &size,
313
                 const u8 * srcyBase, ptrdiff_t srcyStride,
314
                 const u8 * srcuBase, ptrdiff_t srcuStride,
315
                 const u8 * srcvBase, ptrdiff_t srcvStride,
316
                 u8 * dstBase, ptrdiff_t dstStride)
317
{
318
    internal::assertSupportedConfiguration();
319
#ifdef CAROTENE_NEON
320
#ifndef __ANDROID__
321
    size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
322
#endif
323
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
324

325
    for (size_t i = 0u; i < size.height; ++i)
326
    {
327
        const u8 * srcy = internal::getRowPtr(srcyBase, srcyStride, i);
328
        const u8 * srcu = internal::getRowPtr(srcuBase, srcuStride, i);
329
        const u8 * srcv = internal::getRowPtr(srcvBase, srcvStride, i);
330
        u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
331
        size_t syj = 0u, sj = 0u, dj = 0u;
332

333
#ifndef __ANDROID__
334
        for (; sj < roiw32; sj += 32, syj += 64, dj += 128)
335
        {
336
            internal::prefetch(srcy + syj);
337
            internal::prefetch(srcu + sj);
338
            internal::prefetch(srcv + sj);
339

340
            uint8x16x2_t v_y = vld2q_u8(srcy + syj);
341
            uint8x16x4_t v_dst;
342
            v_dst.val[0] = vld1q_u8(srcu + sj);
343
            v_dst.val[1] = v_y.val[0];
344
            v_dst.val[2] = vld1q_u8(srcv + sj);
345
            v_dst.val[3] = v_y.val[1];
346
            vst4q_u8(dst + dj, v_dst);
347

348
            v_y = vld2q_u8(srcy + syj + 32);
349
            v_dst.val[0] = vld1q_u8(srcu + sj + 16);
350
            v_dst.val[1] = v_y.val[0];
351
            v_dst.val[2] = vld1q_u8(srcv + sj + 16);
352
            v_dst.val[3] = v_y.val[1];
353
            vst4q_u8(dst + dj + 64, v_dst);
354
        }
355
#endif
356

357
        for (; sj < roiw8; sj += 8, syj += 16, dj += 32)
358
        {
359
            uint8x8x2_t v_y = vld2_u8(srcy + syj);
360
            uint8x8x4_t v_dst;
361
            v_dst.val[0] = vld1_u8(srcu + sj);
362
            v_dst.val[1] = v_y.val[0];
363
            v_dst.val[2] = vld1_u8(srcv + sj);
364
            v_dst.val[3] = v_y.val[1];
365
            vst4_u8(dst + dj, v_dst);
366
        }
367

368
        for (; sj < size.width; ++sj, syj += 2, dj += 4)
369
        {
370
            dst[dj] = srcu[sj];
371
            dst[dj + 1] = srcy[syj];
372
            dst[dj + 2] = srcv[sj];
373
            dst[dj + 3] = srcy[syj + 1];
374
        }
375
    }
376
#else
377
    (void)size;
378
    (void)srcyBase;
379
    (void)srcyStride;
380
    (void)srcuBase;
381
    (void)srcuStride;
382
    (void)srcvBase;
383
    (void)srcvStride;
384
    (void)dstBase;
385
    (void)dstStride;
386
#endif
387
}
388

389
} // namespace CAROTENE_NS
390

391
Product

Resources

Company