// Source: GitHub repository Tetragramm/opencv, path blob/master/3rdparty/carotene/src/colorconvert.cpp
/*
 * By downloading, copying, installing or using the software you agree to this license.
 * If you do not agree to this license, do not download, install,
 * copy or use the software.
 *
 *
 * License Agreement
 * For Open Source Computer Vision Library
 * (3-clause BSD License)
 *
 * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
 * Third party copyrights are property of their respective owners.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * * Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * * Neither the names of the copyright holders nor the names of the contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * This software is provided by the copyright holders and contributors "as is" and
 * any express or implied warranties, including, but not limited to, the implied
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall copyright holders or contributors be liable for any direct,
 * indirect, incidental, special, exemplary, or consequential damages
 * (including, but not limited to, procurement of substitute goods or services;
 * loss of use, data, or profits; or business interruption) however caused
 * and on any theory of liability, whether in contract, strict liability,
 * or tort (including negligence or otherwise) arising in any way out of
 * the use of this software, even if advised of the possibility of such damage.
 */
39
40
#include "common.hpp"
41
42
#include "saturate_cast.hpp"
43
44
namespace CAROTENE_NS {
45
46
#ifdef CAROTENE_NEON
47
48
namespace {
49
50
enum
51
{
52
SHIFT = 14,
53
SHIFT_DELTA = 1 << (SHIFT - 1),
54
55
R2Y_BT601 = 4899,
56
G2Y_BT601 = 9617,
57
B2Y_BT601 = 1868,
58
59
R2Y_BT709 = 3483,
60
G2Y_BT709 = 11718,
61
B2Y_BT709 = 1183,
62
};
63
64
inline uint8x8_t convertToGray(const uint16x8_t & v_r,
65
const uint16x8_t & v_g,
66
const uint16x8_t & v_b,
67
const uint16x4_t & v_r2y,
68
const uint16x4_t & v_g2y,
69
const uint16x4_t & v_b2y)
70
{
71
uint32x4_t v_dst0 = vmull_u16(vget_low_u16(v_g), v_g2y);
72
uint32x4_t v_dst1 = vmull_u16(vget_high_u16(v_g), v_g2y);
73
74
v_dst0 = vmlal_u16(v_dst0, vget_low_u16(v_r), v_r2y);
75
v_dst1 = vmlal_u16(v_dst1, vget_high_u16(v_r), v_r2y);
76
77
v_dst0 = vmlal_u16(v_dst0, vget_low_u16(v_b), v_b2y);
78
v_dst1 = vmlal_u16(v_dst1, vget_high_u16(v_b), v_b2y);
79
80
uint8x8_t v_gray = vqmovn_u16(vcombine_u16(vrshrn_n_u32(v_dst0, SHIFT),
81
vrshrn_n_u32(v_dst1, SHIFT)));
82
83
return v_gray;
84
}
85
86
} // namespace
87
88
#endif
89
90
void rgb2gray(const Size2D &size, COLOR_SPACE color_space,
91
const u8 * srcBase, ptrdiff_t srcStride,
92
u8 * dstBase, ptrdiff_t dstStride)
93
{
94
internal::assertSupportedConfiguration();
95
#ifdef CAROTENE_NEON
96
const u32 R2Y = color_space == COLOR_SPACE_BT601 ? R2Y_BT601 : R2Y_BT709;
97
const u32 G2Y = color_space == COLOR_SPACE_BT601 ? G2Y_BT601 : G2Y_BT709;
98
const u32 B2Y = color_space == COLOR_SPACE_BT601 ? B2Y_BT601 : B2Y_BT709;
99
100
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
101
register int16x4_t v_r2y asm ("d31") = vmov_n_s16(R2Y);
102
register int16x4_t v_g2y asm ("d30") = vmov_n_s16(G2Y);
103
register int16x4_t v_b2y asm ("d29") = vmov_n_s16(B2Y);
104
#else
105
uint16x4_t v_r2y = vdup_n_u16(R2Y),
106
v_g2y = vdup_n_u16(G2Y),
107
v_b2y = vdup_n_u16(B2Y);
108
109
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
110
#endif
111
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
112
113
for (size_t i = 0u; i < size.height; ++i)
114
{
115
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
116
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
117
size_t sj = 0u, dj = 0u;
118
119
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
120
for (; dj < roiw8; sj += 24, dj += 8)
121
{
122
internal::prefetch(src + sj);
123
__asm__ (
124
"vld3.8 {d0-d2}, [%[in]] @RGB \n\t"
125
"vmovl.u8 q2, d0 @R (d4,d5) \n\t"
126
"vmovl.u8 q3, d1 @G (d6,d7) \n\t"
127
"vmovl.u8 q4, d2 @B (d8,d9) \n\t"
128
"vmull.u16 q5, d6, d30 @Y (q5,q6): G \n\t"
129
"vmull.u16 q6, d7, d30 @Y (q5,q6): G \n\t"
130
"vmlal.s16 q5, d8, d29 @Y (q5,q6): GB \n\t"
131
"vmlal.s16 q6, d9, d29 @Y (q5,q6): GB \n\t"
132
"vmlal.s16 q5, d4, d31 @Y (q5,q6): GBR \n\t"
133
"vmlal.s16 q6, d5, d31 @Y (q5,q6): GBR \n\t"
134
"vrshrn.s32 d8, q5, #14 @Y -> q4 \n\t"
135
"vrshrn.s32 d9, q6, #14 @Y -> q4 \n\t"
136
"vqmovn.u16 d4, q4 \n\t"
137
"vst1.8 {d4}, [%[out]] \n\t"
138
: /*no output*/
139
: [out] "r" (dst + dj), [in] "r" (src + sj), "w" (v_r2y), "w" (v_g2y), "w" (v_b2y)
140
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13"
141
);
142
}
143
#else
144
for (; dj < roiw16; sj += 48, dj += 16)
145
{
146
internal::prefetch(src + sj);
147
uint8x16x3_t v_src0 = vld3q_u8(src + sj);
148
// 0
149
uint16x8_t v_r = vmovl_u8(vget_low_u8(v_src0.val[0])),
150
v_g = vmovl_u8(vget_low_u8(v_src0.val[1])),
151
v_b = vmovl_u8(vget_low_u8(v_src0.val[2]));
152
uint8x8_t v_gray0 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);
153
154
v_r = vmovl_u8(vget_high_u8(v_src0.val[0])),
155
v_g = vmovl_u8(vget_high_u8(v_src0.val[1])),
156
v_b = vmovl_u8(vget_high_u8(v_src0.val[2]));
157
uint8x8_t v_gray1 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);
158
159
vst1q_u8(dst + dj, vcombine_u8(v_gray0, v_gray1));
160
}
161
162
if (dj < roiw8)
163
{
164
uint8x8x3_t v_src = vld3_u8(src + sj);
165
uint16x8_t v_r = vmovl_u8(v_src.val[0]),
166
v_g = vmovl_u8(v_src.val[1]),
167
v_b = vmovl_u8(v_src.val[2]);
168
uint8x8_t v_gray = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);
169
170
vst1_u8(dst + dj, v_gray);
171
sj += 24; dj += 8;
172
}
173
#endif
174
175
for (; dj < size.width; sj += 3, dj++)
176
{
177
u32 val = src[sj] * R2Y + src[sj + 1] * G2Y + src[sj + 2] * B2Y;
178
dst[dj] = internal::saturate_cast<u8>((val + SHIFT_DELTA) >> SHIFT);
179
}
180
}
181
#else
182
(void)size;
183
(void)color_space;
184
(void)srcBase;
185
(void)srcStride;
186
(void)dstBase;
187
(void)dstStride;
188
#endif
189
}
190
191
void rgbx2gray(const Size2D &size, COLOR_SPACE color_space,
192
const u8 * srcBase, ptrdiff_t srcStride,
193
u8 * dstBase, ptrdiff_t dstStride)
194
{
195
internal::assertSupportedConfiguration();
196
#ifdef CAROTENE_NEON
197
const u32 R2Y = color_space == COLOR_SPACE_BT601 ? R2Y_BT601 : R2Y_BT709;
198
const u32 G2Y = color_space == COLOR_SPACE_BT601 ? G2Y_BT601 : G2Y_BT709;
199
const u32 B2Y = color_space == COLOR_SPACE_BT601 ? B2Y_BT601 : B2Y_BT709;
200
201
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
202
register int16x4_t v_r2y asm ("d31") = vmov_n_s16(R2Y);
203
register int16x4_t v_g2y asm ("d30") = vmov_n_s16(G2Y);
204
register int16x4_t v_b2y asm ("d29") = vmov_n_s16(B2Y);
205
#else
206
uint16x4_t v_r2y = vdup_n_u16(R2Y),
207
v_g2y = vdup_n_u16(G2Y),
208
v_b2y = vdup_n_u16(B2Y);
209
210
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
211
#endif
212
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
213
214
for (size_t i = 0u; i < size.height; ++i)
215
{
216
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
217
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
218
size_t sj = 0u, dj = 0u;
219
220
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
221
for (; dj < roiw8; sj += 32, dj += 8)
222
{
223
internal::prefetch(src + sj);
224
__asm__ (
225
"vld4.8 {d0-d3}, [%[in]] @RGBA \n\t"
226
"vmovl.u8 q2, d0 @R (d4,d5) \n\t"
227
"vmovl.u8 q3, d1 @G (d6,d7) \n\t"
228
"vmovl.u8 q4, d2 @B (d8,d9) \n\t"
229
"vmull.u16 q5, d6, d30 @Y (q5,q6): G \n\t"
230
"vmull.u16 q6, d7, d30 @Y (q5,q6): G \n\t"
231
"vmlal.s16 q5, d8, d29 @Y (q5,q6): GB \n\t"
232
"vmlal.s16 q6, d9, d29 @Y (q5,q6): GB \n\t"
233
"vmlal.s16 q5, d4, d31 @Y (q5,q6): GBR \n\t"
234
"vmlal.s16 q6, d5, d31 @Y (q5,q6): GBR \n\t"
235
"vrshrn.s32 d8, q5, #14 @Y -> q4 \n\t"
236
"vrshrn.s32 d9, q6, #14 @Y -> q4 \n\t"
237
"vqmovn.u16 d4, q4 \n\t"
238
"vst1.8 {d4}, [%[out]] \n\t"
239
: /*no output*/
240
: [out] "r" (dst + dj), [in] "r" (src + sj), "w" (v_r2y), "w" (v_g2y), "w" (v_b2y)
241
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13"
242
);
243
}
244
#else
245
for (; dj < roiw16; sj += 64, dj += 16)
246
{
247
internal::prefetch(src + sj);
248
uint8x16x4_t v_src0 = vld4q_u8(src + sj);
249
250
// 0
251
uint16x8_t v_r = vmovl_u8(vget_low_u8(v_src0.val[0])),
252
v_g = vmovl_u8(vget_low_u8(v_src0.val[1])),
253
v_b = vmovl_u8(vget_low_u8(v_src0.val[2]));
254
uint8x8_t v_gray0 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);
255
256
v_r = vmovl_u8(vget_high_u8(v_src0.val[0])),
257
v_g = vmovl_u8(vget_high_u8(v_src0.val[1])),
258
v_b = vmovl_u8(vget_high_u8(v_src0.val[2]));
259
uint8x8_t v_gray1 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);
260
261
vst1q_u8(dst + dj, vcombine_u8(v_gray0, v_gray1));
262
}
263
264
if (dj < roiw8)
265
{
266
uint8x8x4_t v_src = vld4_u8(src + sj);
267
uint16x8_t v_r = vmovl_u8(v_src.val[0]),
268
v_g = vmovl_u8(v_src.val[1]),
269
v_b = vmovl_u8(v_src.val[2]);
270
uint8x8_t v_gray = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);
271
272
vst1_u8(dst + dj, v_gray);
273
sj += 32; dj += 8;
274
}
275
#endif
276
277
for (; dj < size.width; sj += 4, dj++)
278
{
279
u32 val = src[sj] * R2Y + src[sj + 1] * G2Y + src[sj + 2] * B2Y;
280
dst[dj] = internal::saturate_cast<u8>((val + SHIFT_DELTA) >> SHIFT);
281
}
282
}
283
#else
284
(void)size;
285
(void)color_space;
286
(void)srcBase;
287
(void)srcStride;
288
(void)dstBase;
289
(void)dstStride;
290
#endif
291
}
292
293
void bgr2gray(const Size2D &size, COLOR_SPACE color_space,
294
const u8 * srcBase, ptrdiff_t srcStride,
295
u8 * dstBase, ptrdiff_t dstStride)
296
{
297
internal::assertSupportedConfiguration();
298
#ifdef CAROTENE_NEON
299
const u32 R2Y = color_space == COLOR_SPACE_BT601 ? R2Y_BT601 : R2Y_BT709;
300
const u32 G2Y = color_space == COLOR_SPACE_BT601 ? G2Y_BT601 : G2Y_BT709;
301
const u32 B2Y = color_space == COLOR_SPACE_BT601 ? B2Y_BT601 : B2Y_BT709;
302
303
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
304
register int16x4_t v_r2y asm ("d31") = vmov_n_s16(R2Y);
305
register int16x4_t v_g2y asm ("d30") = vmov_n_s16(G2Y);
306
register int16x4_t v_b2y asm ("d29") = vmov_n_s16(B2Y);
307
#else
308
uint16x4_t v_r2y = vdup_n_u16(R2Y),
309
v_g2y = vdup_n_u16(G2Y),
310
v_b2y = vdup_n_u16(B2Y);
311
312
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
313
#endif
314
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
315
316
for (size_t i = 0u; i < size.height; ++i)
317
{
318
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
319
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
320
size_t sj = 0u, dj = 0u;
321
322
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
323
for (; dj < roiw8; sj += 24, dj += 8)
324
{
325
internal::prefetch(src + sj);
326
__asm__ (
327
"vld3.8 {d0-d2}, [%[in]] @BGR \n\t"
328
"vmovl.u8 q2, d2 @R (d4,d5) \n\t"
329
"vmovl.u8 q3, d1 @G (d6,d7) \n\t"
330
"vmovl.u8 q4, d0 @B (d8,d9) \n\t"
331
"vmull.u16 q5, d6, d30 @Y (q5,q6): G \n\t"
332
"vmull.u16 q6, d7, d30 @Y (q5,q6): G \n\t"
333
"vmlal.s16 q5, d8, d29 @Y (q5,q6): GB \n\t"
334
"vmlal.s16 q6, d9, d29 @Y (q5,q6): GB \n\t"
335
"vmlal.s16 q5, d4, d31 @Y (q5,q6): GBR \n\t"
336
"vmlal.s16 q6, d5, d31 @Y (q5,q6): GBR \n\t"
337
"vrshrn.s32 d8, q5, #14 @Y -> q4 \n\t"
338
"vrshrn.s32 d9, q6, #14 @Y -> q4 \n\t"
339
"vqmovn.u16 d4, q4 \n\t"
340
"vst1.8 {d4}, [%[out]] \n\t"
341
: /*no output*/
342
: [out] "r" (dst + dj), [in] "r" (src + sj), "w" (v_r2y), "w" (v_g2y), "w" (v_b2y)
343
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13"
344
);
345
}
346
#else
347
for (; dj < roiw16; sj += 48, dj += 16)
348
{
349
internal::prefetch(src + sj);
350
uint8x16x3_t v_src0 = vld3q_u8(src + sj);
351
352
// 0
353
uint16x8_t v_b = vmovl_u8(vget_low_u8(v_src0.val[0])),
354
v_g = vmovl_u8(vget_low_u8(v_src0.val[1])),
355
v_r = vmovl_u8(vget_low_u8(v_src0.val[2]));
356
uint8x8_t v_gray0 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);
357
358
v_b = vmovl_u8(vget_high_u8(v_src0.val[0])),
359
v_g = vmovl_u8(vget_high_u8(v_src0.val[1])),
360
v_r = vmovl_u8(vget_high_u8(v_src0.val[2]));
361
uint8x8_t v_gray1 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);
362
363
vst1q_u8(dst + dj, vcombine_u8(v_gray0, v_gray1));
364
}
365
366
if (dj < roiw8)
367
{
368
uint8x8x3_t v_src = vld3_u8(src + sj);
369
uint16x8_t v_b = vmovl_u8(v_src.val[0]),
370
v_g = vmovl_u8(v_src.val[1]),
371
v_r = vmovl_u8(v_src.val[2]);
372
uint8x8_t v_gray = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);
373
374
vst1_u8(dst + dj, v_gray);
375
sj += 24; dj += 8;
376
}
377
#endif
378
379
for (; dj < size.width; sj += 3, dj++)
380
{
381
u32 val = src[sj] * B2Y + src[sj + 1] * G2Y + src[sj + 2] * R2Y;
382
dst[dj] = internal::saturate_cast<u8>((val + SHIFT_DELTA) >> SHIFT);
383
}
384
}
385
#else
386
(void)size;
387
(void)color_space;
388
(void)srcBase;
389
(void)srcStride;
390
(void)dstBase;
391
(void)dstStride;
392
#endif
393
}
394
395
void bgrx2gray(const Size2D &size, COLOR_SPACE color_space,
396
const u8 * srcBase, ptrdiff_t srcStride,
397
u8 * dstBase, ptrdiff_t dstStride)
398
{
399
internal::assertSupportedConfiguration();
400
#ifdef CAROTENE_NEON
401
const u32 R2Y = color_space == COLOR_SPACE_BT601 ? R2Y_BT601 : R2Y_BT709;
402
const u32 G2Y = color_space == COLOR_SPACE_BT601 ? G2Y_BT601 : G2Y_BT709;
403
const u32 B2Y = color_space == COLOR_SPACE_BT601 ? B2Y_BT601 : B2Y_BT709;
404
405
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
406
register int16x4_t v_r2y asm ("d31") = vmov_n_s16(R2Y);
407
register int16x4_t v_g2y asm ("d30") = vmov_n_s16(G2Y);
408
register int16x4_t v_b2y asm ("d29") = vmov_n_s16(B2Y);
409
#else
410
uint16x4_t v_r2y = vdup_n_u16(R2Y),
411
v_g2y = vdup_n_u16(G2Y),
412
v_b2y = vdup_n_u16(B2Y);
413
414
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
415
#endif
416
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
417
418
for (size_t i = 0u; i < size.height; ++i)
419
{
420
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
421
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
422
size_t sj = 0u, dj = 0u;
423
424
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
425
for (; dj < roiw8; sj += 32, dj += 8)
426
{
427
internal::prefetch(src + sj);
428
__asm__ (
429
"vld4.8 {d0-d3}, [%[in]] @BGRA \n\t"
430
"vmovl.u8 q2, d2 @R (d4,d5) \n\t"
431
"vmovl.u8 q3, d1 @G (d6,d7) \n\t"
432
"vmovl.u8 q4, d0 @B (d8,d9) \n\t"
433
"vmull.u16 q5, d6, d30 @Y (q5,q6): G \n\t"
434
"vmull.u16 q6, d7, d30 @Y (q5,q6): G \n\t"
435
"vmlal.s16 q5, d8, d29 @Y (q5,q6): GB \n\t"
436
"vmlal.s16 q6, d9, d29 @Y (q5,q6): GB \n\t"
437
"vmlal.s16 q5, d4, d31 @Y (q5,q6): GBR \n\t"
438
"vmlal.s16 q6, d5, d31 @Y (q5,q6): GBR \n\t"
439
"vrshrn.s32 d8, q5, #14 @Y -> q4 \n\t"
440
"vrshrn.s32 d9, q6, #14 @Y -> q4 \n\t"
441
"vqmovn.u16 d4, q4 \n\t"
442
"vst1.8 {d4}, [%[out]] \n\t"
443
: /*no output*/
444
: [out] "r" (dst + dj), [in] "r" (src + sj), "w" (v_r2y), "w" (v_g2y), "w" (v_b2y)
445
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13"
446
);
447
}
448
#else
449
for (; dj < roiw16; sj += 64, dj += 16)
450
{
451
internal::prefetch(src + sj);
452
uint8x16x4_t v_src0 = vld4q_u8(src + sj);
453
454
// 0
455
uint16x8_t v_b = vmovl_u8(vget_low_u8(v_src0.val[0])),
456
v_g = vmovl_u8(vget_low_u8(v_src0.val[1])),
457
v_r = vmovl_u8(vget_low_u8(v_src0.val[2]));
458
uint8x8_t v_gray0 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);
459
460
v_b = vmovl_u8(vget_high_u8(v_src0.val[0])),
461
v_g = vmovl_u8(vget_high_u8(v_src0.val[1])),
462
v_r = vmovl_u8(vget_high_u8(v_src0.val[2]));
463
uint8x8_t v_gray1 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);
464
465
vst1q_u8(dst + dj, vcombine_u8(v_gray0, v_gray1));
466
}
467
468
if (dj < roiw8)
469
{
470
uint8x8x4_t v_src = vld4_u8(src + sj);
471
uint16x8_t v_b = vmovl_u8(v_src.val[0]),
472
v_g = vmovl_u8(v_src.val[1]),
473
v_r = vmovl_u8(v_src.val[2]);
474
uint8x8_t v_gray = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);
475
476
vst1_u8(dst + dj, v_gray);
477
sj += 32; dj += 8;
478
}
479
#endif
480
481
for (; dj < size.width; sj += 4, dj++)
482
{
483
u32 val = src[sj] * B2Y + src[sj + 1] * G2Y + src[sj + 2] * R2Y;
484
dst[dj] = internal::saturate_cast<u8>((val + SHIFT_DELTA) >> SHIFT);
485
}
486
}
487
#else
488
(void)size;
489
(void)color_space;
490
(void)srcBase;
491
(void)srcStride;
492
(void)dstBase;
493
(void)dstStride;
494
#endif
495
}
496
497
void gray2rgb(const Size2D &size,
498
const u8 * srcBase, ptrdiff_t srcStride,
499
u8 * dstBase, ptrdiff_t dstStride)
500
{
501
internal::assertSupportedConfiguration();
502
#ifdef CAROTENE_NEON
503
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
504
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
505
506
for (size_t i = 0u; i < size.height; ++i)
507
{
508
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
509
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
510
size_t sj = 0u, dj = 0u;
511
512
for (; sj < roiw16; sj += 16, dj += 48)
513
{
514
internal::prefetch(src + sj);
515
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
516
__asm__ (
517
"vld1.8 {d0-d1}, [%[in0]] \n\t"
518
"vmov.8 q1, q0 \n\t"
519
"vmov.8 q2, q0 \n\t"
520
"vmov.8 q3, q1 \n\t"
521
"vst3.8 {d2, d4, d6}, [%[out0]] \n\t"
522
"vst3.8 {d3, d5, d7}, [%[out1]] \n\t"
523
: /*no output*/
524
: [out0] "r" (dst + dj), [out1] "r" (dst + dj + 24),
525
[in0] "r" (src + sj)
526
: "d0","d1","d2","d3","d4","d5","d6","d7"
527
);
528
#else
529
uint8x16x3_t vRgb1;
530
vRgb1.val[0] = vld1q_u8(src + sj);
531
532
vRgb1.val[1] = vRgb1.val[0];
533
vRgb1.val[2] = vRgb1.val[0];
534
535
vst3q_u8(dst + dj, vRgb1);
536
#endif
537
}
538
539
if (sj < roiw8)
540
{
541
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
542
__asm__ (
543
"vld1.8 {d0}, [%[in]] \n\t"
544
"vmov.8 d1, d0 \n\t"
545
"vmov.8 d2, d0 \n\t"
546
"vst3.8 {d0-d2}, [%[out]] \n\t"
547
: /*no output*/
548
: [out] "r" (dst + dj), [in] "r" (src + sj)
549
: "d0","d1","d2"
550
);
551
#else
552
uint8x8x3_t vRgb2;
553
vRgb2.val[0] = vld1_u8(src + sj);
554
vRgb2.val[1] = vRgb2.val[0];
555
vRgb2.val[2] = vRgb2.val[0];
556
557
vst3_u8(dst + dj, vRgb2);
558
#endif
559
sj += 8; dj += 24;
560
}
561
562
for (; sj < size.width; sj++, dj += 3)
563
{
564
dst[dj+0] = src[sj];
565
dst[dj+1] = src[sj];
566
dst[dj+2] = src[sj];
567
}
568
}
569
#else
570
(void)size;
571
(void)srcBase;
572
(void)srcStride;
573
(void)dstBase;
574
(void)dstStride;
575
#endif
576
}
577
578
void gray2rgbx(const Size2D &size,
579
const u8 * srcBase, ptrdiff_t srcStride,
580
u8 * dstBase, ptrdiff_t dstStride)
581
{
582
internal::assertSupportedConfiguration();
583
#ifdef CAROTENE_NEON
584
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
585
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
586
587
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
588
register uint8x16_t vc255 asm ("q4") = vmovq_n_u8(255);
589
#else
590
uint8x16x4_t vRgba;
591
uint8x8x4_t vRgba2;
592
vRgba.val[3] = vmovq_n_u8(255);
593
vRgba2.val[3] = vget_low_u8(vRgba.val[3]);
594
#endif
595
596
for (size_t i = 0u; i < size.height; ++i)
597
{
598
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
599
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
600
size_t sj = 0u, dj = 0u;
601
602
for (; sj < roiw16; sj += 16, dj += 64)
603
{
604
internal::prefetch(src + sj);
605
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
606
__asm__ (
607
"vld1.8 {d0-d1}, [%[in0]] \n\t"
608
"vmov.8 q1, q0 \n\t"
609
"vmov.8 q2, q0 \n\t"
610
"vmov.8 q3, q1 \n\t"
611
"vst4.8 {d2, d4, d6, d8}, [%[out0]] \n\t"
612
"vst4.8 {d3, d5, d7, d9}, [%[out1]] \n\t"
613
: /*no output*/
614
: [out0] "r" (dst + dj), [out1] "r" (dst + dj + 32),
615
[in0] "r" (src + sj),
616
"w" (vc255)
617
: "d0","d1","d2","d3","d4","d5","d6","d7"
618
);
619
#else
620
vRgba.val[0] = vld1q_u8(src + sj);
621
622
vRgba.val[1] = vRgba.val[0];
623
vRgba.val[2] = vRgba.val[0];
624
625
vst4q_u8(dst + dj, vRgba);
626
#endif
627
}
628
629
if (sj < roiw8)
630
{
631
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
632
__asm__ (
633
"vld1.8 {d5}, [%[in]] \n\t"
634
"vmov.8 d6, d5 \n\t"
635
"vmov.8 d7, d5 \n\t"
636
"vst4.8 {d5-d8}, [%[out]] \n\t"
637
: /*no output*/
638
: [out] "r" (dst + dj), [in] "r" (src + sj), "w" (vc255)
639
: "d5","d6","d7"
640
);
641
#else
642
vRgba2.val[0] = vld1_u8(src + sj);
643
vRgba2.val[1] = vRgba2.val[0];
644
vRgba2.val[2] = vRgba2.val[0];
645
646
vst4_u8(dst + dj, vRgba2);
647
#endif
648
sj += 8; dj += 32;
649
}
650
651
for (; sj < size.width; sj++, dj += 4)
652
{
653
dst[dj+0] = src[sj];
654
dst[dj+1] = src[sj];
655
dst[dj+2] = src[sj];
656
dst[dj+3] = 255;
657
}
658
}
659
#else
660
(void)size;
661
(void)srcBase;
662
(void)srcStride;
663
(void)dstBase;
664
(void)dstStride;
665
#endif
666
}
667
668
void rgb2rgbx(const Size2D &size,
669
const u8 * srcBase, ptrdiff_t srcStride,
670
u8 * dstBase, ptrdiff_t dstStride)
671
{
672
internal::assertSupportedConfiguration();
673
#ifdef CAROTENE_NEON
674
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
675
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
676
register uint8x8_t vc255_0 asm ("d3") = vmov_n_u8(255);
677
#else
678
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
679
union { uint8x16x4_t v4; uint8x16x3_t v3; } v_dst0;
680
v_dst0.v4.val[3] = vdupq_n_u8(255);
681
union { uint8x8x4_t v4; uint8x8x3_t v3; } v_dst;
682
v_dst.v4.val[3] = vdup_n_u8(255);
683
#endif
684
685
for (size_t i = 0u; i < size.height; ++i)
686
{
687
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
688
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
689
size_t sj = 0u, dj = 0u, j = 0u;
690
691
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
692
for (; j < roiw8; sj += 24, dj += 32, j += 8)
693
{
694
internal::prefetch(src + sj);
695
__asm__ (
696
"vld3.8 {d0, d1, d2}, [%[in0]] \n\t"
697
"vst4.8 {d0, d1, d2, d3}, [%[out0]] \n\t"
698
: /*no output*/
699
: [out0] "r" (dst + dj),
700
[in0] "r" (src + sj),
701
"w" (vc255_0)
702
: "d0","d1","d2"
703
);
704
}
705
#else
706
for (; j < roiw16; sj += 48, dj += 64, j += 16)
707
{
708
internal::prefetch(src + sj);
709
v_dst0.v3 = vld3q_u8(src + sj);
710
vst4q_u8(dst + dj, v_dst0.v4);
711
}
712
713
if (j < roiw8)
714
{
715
v_dst.v3 = vld3_u8(src + sj);
716
vst4_u8(dst + dj, v_dst.v4);
717
sj += 24; dj += 32; j += 8;
718
}
719
#endif
720
721
for (; j < size.width; ++j, sj += 3, dj += 4)
722
{
723
dst[dj] = src[sj];
724
dst[dj + 1] = src[sj + 1];
725
dst[dj + 2] = src[sj + 2];
726
dst[dj + 3] = 255;
727
}
728
}
729
#else
730
(void)size;
731
(void)srcBase;
732
(void)srcStride;
733
(void)dstBase;
734
(void)dstStride;
735
#endif
736
}
737
738
void rgbx2rgb(const Size2D &size,
739
const u8 * srcBase, ptrdiff_t srcStride,
740
u8 * dstBase, ptrdiff_t dstStride)
741
{
742
internal::assertSupportedConfiguration();
743
#ifdef CAROTENE_NEON
744
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
745
#if !(!defined(__aarch64__) && defined(__GNUC__) && defined(__arm__))
746
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
747
union { uint8x16x4_t v4; uint8x16x3_t v3; } v_dst0;
748
union { uint8x8x4_t v4; uint8x8x3_t v3; } v_dst;
749
#endif
750
751
for (size_t i = 0u; i < size.height; ++i)
752
{
753
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
754
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
755
size_t sj = 0u, dj = 0u, j = 0u;
756
757
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
758
for (; j < roiw8; sj += 32, dj += 24, j += 8)
759
{
760
internal::prefetch(src + sj);
761
__asm__ (
762
"vld4.8 {d0, d1, d2, d3}, [%[in0]] \n\t"
763
"vst3.8 {d0, d1, d2}, [%[out0]] \n\t"
764
: /*no output*/
765
: [out0] "r" (dst + dj),
766
[in0] "r" (src + sj)
767
: "d0","d1","d2","d3"
768
);
769
}
770
#else
771
for (; j < roiw16; sj += 64, dj += 48, j += 16)
772
{
773
internal::prefetch(src + sj);
774
v_dst0.v4 = vld4q_u8(src + sj);
775
vst3q_u8(dst + dj, v_dst0.v3);
776
}
777
778
if (j < roiw8)
779
{
780
v_dst.v4 = vld4_u8(src + sj);
781
vst3_u8(dst + dj, v_dst.v3);
782
sj += 32; dj += 24; j += 8;
783
}
784
#endif
785
786
for (; j < size.width; ++j, sj += 4, dj += 3)
787
{
788
dst[dj] = src[sj];
789
dst[dj + 1] = src[sj + 1];
790
dst[dj + 2] = src[sj + 2];
791
}
792
}
793
#else
794
(void)size;
795
(void)srcBase;
796
(void)srcStride;
797
(void)dstBase;
798
(void)dstStride;
799
#endif
800
}
801
802
void rgb2bgr(const Size2D &size,
803
const u8 * srcBase, ptrdiff_t srcStride,
804
u8 * dstBase, ptrdiff_t dstStride)
805
{
806
internal::assertSupportedConfiguration();
807
#ifdef CAROTENE_NEON
808
#if !(!defined(__aarch64__) && defined(__GNUC__) && defined(__arm__))
809
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
810
#endif
811
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
812
813
for (size_t i = 0u; i < size.height; ++i)
814
{
815
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
816
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
817
size_t sj = 0u, dj = 0u, j = 0u;
818
819
820
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
821
for (; j < roiw8; sj += 24, dj += 24, j += 8)
822
{
823
internal::prefetch(src + sj);
824
__asm__ (
825
"vld3.8 {d0, d1, d2}, [%[in0]] \n\t"
826
"vswp d0, d2 \n\t"
827
"vst3.8 {d0, d1, d2}, [%[out0]] \n\t"
828
: /*no output*/
829
: [out0] "r" (dst + dj),
830
[in0] "r" (src + sj)
831
: "d0","d1","d2"
832
);
833
}
834
#else
835
for (; j < roiw16; sj += 48, dj += 48, j += 16)
836
{
837
internal::prefetch(src + sj);
838
uint8x16x3_t vals0 = vld3q_u8(src + sj);
839
840
std::swap(vals0.val[0], vals0.val[2]);
841
842
vst3q_u8(dst + dj, vals0);
843
}
844
845
if (j < roiw8)
846
{
847
uint8x8x3_t vals = vld3_u8(src + sj);
848
std::swap(vals.val[0], vals.val[2]);
849
vst3_u8(dst + dj, vals);
850
sj += 24; dj += 24; j += 8;
851
}
852
#endif
853
854
for (; j < size.width; ++j, sj += 3, dj += 3)
855
{
856
u8 b = src[sj + 2];//Handle src == dst case
857
dst[dj + 2] = src[sj ];
858
dst[dj + 1] = src[sj + 1];
859
dst[dj ] = b;
860
}
861
}
862
#else
863
(void)size;
864
(void)srcBase;
865
(void)srcStride;
866
(void)dstBase;
867
(void)dstStride;
868
#endif
869
}
870
871
void rgbx2bgrx(const Size2D &size,
872
const u8 * srcBase, ptrdiff_t srcStride,
873
u8 * dstBase, ptrdiff_t dstStride)
874
{
875
internal::assertSupportedConfiguration();
876
#ifdef CAROTENE_NEON
877
#if !(!defined(__aarch64__) && defined(__GNUC__) && defined(__arm__))
878
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
879
#endif
880
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
881
882
for (size_t i = 0u; i < size.height; ++i)
883
{
884
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
885
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
886
size_t sj = 0u, dj = 0u, j = 0u;
887
888
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
889
for (; j < roiw8; sj += 32, dj += 32, j += 8)
890
{
891
internal::prefetch(src + sj);
892
__asm__ (
893
"vld4.8 {d0, d1, d2, d3}, [%[in0]] \n\t"
894
"vswp d0, d2 \n\t"
895
"vst4.8 {d0, d1, d2, d3}, [%[out0]] \n\t"
896
: /*no output*/
897
: [out0] "r" (dst + dj),
898
[in0] "r" (src + sj)
899
: "d0","d1","d2","d3"
900
);
901
}
902
#else
903
for (; j < roiw16; sj += 64, dj += 64, j += 16)
904
{
905
internal::prefetch(src + sj);
906
uint8x16x4_t vals0 = vld4q_u8(src + sj);
907
908
std::swap(vals0.val[0], vals0.val[2]);
909
910
vst4q_u8(dst + dj, vals0);
911
}
912
913
if (j < roiw8)
914
{
915
uint8x8x4_t vals = vld4_u8(src + sj);
916
std::swap(vals.val[0], vals.val[2]);
917
vst4_u8(dst + dj, vals);
918
sj += 32; dj += 32; j += 8;
919
}
920
#endif
921
922
for (; j < size.width; ++j, sj += 4, dj += 4)
923
{
924
u8 b = src[sj + 2];//Handle src == dst case
925
dst[dj + 2] = src[sj ];
926
dst[dj + 1] = src[sj + 1];
927
dst[dj ] = b;
928
dst[dj + 3] = src[sj + 3];
929
}
930
}
931
#else
932
(void)size;
933
(void)srcBase;
934
(void)srcStride;
935
(void)dstBase;
936
(void)dstStride;
937
#endif
938
}
939
940
void rgbx2bgr(const Size2D &size,
941
const u8 * srcBase, ptrdiff_t srcStride,
942
u8 * dstBase, ptrdiff_t dstStride)
943
{
944
internal::assertSupportedConfiguration();
945
#ifdef CAROTENE_NEON
946
#if !(!defined(__aarch64__) && defined(__GNUC__) && defined(__arm__))
947
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
948
#endif
949
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
950
951
for (size_t i = 0u; i < size.height; ++i)
952
{
953
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
954
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
955
size_t sj = 0u, dj = 0u, j = 0u;
956
957
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
958
for (; j < roiw8; sj += 32, dj += 24, j += 8)
959
{
960
internal::prefetch(src + sj);
961
__asm__ (
962
"vld4.8 {d0, d1, d2, d3}, [%[in0]] \n\t"
963
"vswp d0, d2 \n\t"
964
"vst3.8 {d0, d1, d2}, [%[out0]] \n\t"
965
: /*no output*/
966
: [out0] "r" (dst + dj),
967
[in0] "r" (src + sj)
968
: "d0","d1","d2","d3"
969
);
970
}
971
#else
972
for (; j < roiw16; sj += 64, dj += 48, j += 16)
973
{
974
internal::prefetch(src + sj);
975
union { uint8x16x4_t v4; uint8x16x3_t v3; } vals0;
976
vals0.v4 = vld4q_u8(src + sj);
977
std::swap(vals0.v3.val[0], vals0.v3.val[2]);
978
vst3q_u8(dst + dj, vals0.v3);
979
}
980
981
if (j < roiw8)
982
{
983
union { uint8x8x4_t v4; uint8x8x3_t v3; } vals;
984
vals.v4 = vld4_u8(src + sj);
985
std::swap(vals.v3.val[0], vals.v3.val[2]);
986
vst3_u8(dst + dj, vals.v3);
987
sj += 32; dj += 24; j += 8;
988
}
989
#endif
990
991
for (; j < size.width; ++j, sj += 4, dj += 3)
992
{
993
dst[dj + 2] = src[sj ];
994
dst[dj + 1] = src[sj + 1];
995
dst[dj ] = src[sj + 2];
996
}
997
}
998
#else
999
(void)size;
1000
(void)srcBase;
1001
(void)srcStride;
1002
(void)dstBase;
1003
(void)dstStride;
1004
#endif
1005
}
1006
1007
void rgb2bgrx(const Size2D &size,
1008
const u8 * srcBase, ptrdiff_t srcStride,
1009
u8 * dstBase, ptrdiff_t dstStride)
1010
{
1011
internal::assertSupportedConfiguration();
1012
#ifdef CAROTENE_NEON
1013
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
1014
register uint8x8_t vc255 asm ("d3") = vmov_n_u8(255);
1015
#else
1016
union { uint8x16x4_t v4; uint8x16x3_t v3; } vals0;
1017
vals0.v4.val[3] = vmovq_n_u8(255);
1018
union { uint8x8x4_t v4; uint8x8x3_t v3; } vals8;
1019
vals8.v4.val[3] = vmov_n_u8(255);
1020
#endif
1021
1022
#if !(!defined(__aarch64__) && defined(__GNUC__) && defined(__arm__))
1023
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
1024
#endif
1025
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
1026
1027
for (size_t i = 0u; i < size.height; ++i)
1028
{
1029
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
1030
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
1031
size_t sj = 0u, dj = 0u, j = 0u;
1032
1033
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
1034
for (; j < roiw8; sj += 24, dj += 32, j += 8)
1035
{
1036
internal::prefetch(src + sj);
1037
__asm__ (
1038
"vld3.8 {d0, d1, d2}, [%[in0]] \n\t"
1039
"vswp d0, d2 \n\t"
1040
"vst4.8 {d0, d1, d2, d3}, [%[out0]] \n\t"
1041
: /*no output*/
1042
: [out0] "r" (dst + dj),
1043
[in0] "r" (src + sj),
1044
"w" (vc255)
1045
: "d0","d1","d2"
1046
);
1047
}
1048
#else
1049
for (; j < roiw16; sj += 48, dj += 64, j += 16)
1050
{
1051
internal::prefetch(src + sj);
1052
vals0.v3 = vld3q_u8(src + sj);
1053
std::swap(vals0.v4.val[0], vals0.v4.val[2]);
1054
vst4q_u8(dst + dj, vals0.v4);
1055
}
1056
1057
if (j < roiw8)
1058
{
1059
vals8.v3 = vld3_u8(src + sj);
1060
std::swap(vals8.v4.val[0], vals8.v4.val[2]);
1061
vst4_u8(dst + dj, vals8.v4);
1062
sj += 24; dj += 32; j += 8;
1063
}
1064
#endif
1065
1066
for (; j < size.width; ++j, sj += 3, dj += 4)
1067
{
1068
dst[dj + 3] = 255;
1069
dst[dj + 2] = src[sj ];
1070
dst[dj + 1] = src[sj + 1];
1071
dst[dj ] = src[sj + 2];
1072
}
1073
}
1074
#else
1075
(void)size;
1076
(void)srcBase;
1077
(void)srcStride;
1078
(void)dstBase;
1079
(void)dstStride;
1080
#endif
1081
}
1082
1083
namespace {
1084
1085
#ifdef CAROTENE_NEON
1086
/*
 * Converts eight pixels, given as separate R, G and B lanes, to HSV.
 *
 * vR, vG, vB   one 8-bit channel value per lane
 * hrange       hue range; also the value added to negative hue results
 *
 * Returns val[0] = H, val[1] = S, val[2] = V (the per-lane channel maximum).
 *
 * The divisions by V and by 6*diff are done in float using a reciprocal
 * estimate refined by one Newton-Raphson step (vrecpe + vrecps), scaled to
 * fixed point with hsv_shift (12) fractional bits. The statement order is
 * interleaved on purpose and several temporaries are reused for a different
 * meaning later on; the comments name the value each one holds at that point.
 */
inline uint8x8x3_t convertToHSV(const uint8x8_t vR, const uint8x8_t vG, const uint8x8_t vB,
                                const s32 hrange )
{
    const s32 hsv_shift = 12;
    const f32 vsdiv_table = f32(255 << hsv_shift);
    const f32 vhdiv_table = f32(hrange << hsv_shift);
    const s32 vhrange = hrange;
    const s32 v0 = s32(0);
    const s32 vshift = s32(1 << (hsv_shift-1)); // rounding term for the final >> hsv_shift
    const s32 v6 = s32(6);

    uint8x8_t vMin = vmin_u8(vR, vG);
    uint8x8_t vMax = vmax_u8(vR, vG);

    uint16x8_t vR_u16 = vmovl_u8(vR);
    uint16x8_t vG_u16 = vmovl_u8(vG);

    vMax = vmax_u8(vMax, vB);                  // V = max(R, G, B)
    vMin = vmin_u8(vMin, vB);                  // min(R, G, B)
    uint16x8_t vB_u16 = vmovl_u8(vB);

    uint16x8_t vDiff = vsubl_u8(vMax, vMin);   // diff = V - min

    uint16x8_t vV = vmovl_u8(vMax);
    uint16x8_t vDiffx2 = vaddq_u16(vDiff, vDiff);
    uint32x4_t vDiffL = vmovl_u16(vget_low_u16(vDiff));
    uint32x4_t vDiffH = vmovl_u16(vget_high_u16(vDiff));

    uint16x8_t vVEqR = vceqq_u16(vR_u16, vV);  // lane mask: V == R
    uint16x8_t vVEqG = vceqq_u16(vG_u16, vV);  // lane mask: V == G

    int16x8_t vG_B = vsubq_s16(vreinterpretq_s16_u16(vG_u16), vreinterpretq_s16_u16(vB_u16));
    uint16x8_t vInvR = vmvnq_u16(vVEqR);       // ~(V == R)
    int16x8_t vB_R = vsubq_s16(vreinterpretq_s16_u16(vB_u16), vreinterpretq_s16_u16(vR_u16));
    int16x8_t vR_G = vsubq_s16(vreinterpretq_s16_u16(vR_u16), vreinterpretq_s16_u16(vG_u16));

    uint16x8_t vMask2 = vandq_u16(vVEqG, vInvR);                       // V == G and V != R
    vR_u16 = vandq_u16(vreinterpretq_u16_s16(vG_B), vVEqR);            // (G - B) where V == R
    int16x8_t vH2 = vaddq_s16(vB_R, vreinterpretq_s16_u16(vDiffx2));   // B - R + 2*diff

    vVEqR = vmvnq_u16(vVEqG);                                          // now ~(V == G)
    vB_R = vaddq_s16(vreinterpretq_s16_u16(vDiffx2), vreinterpretq_s16_u16(vDiffx2)); // now 4*diff
    vG_B = vandq_s16(vreinterpretq_s16_u16(vInvR), vreinterpretq_s16_u16(vVEqR));     // now mask: V != R and V != G
    vInvR = vandq_u16(vreinterpretq_u16_s16(vH2), vMask2);             // now (B - R + 2*diff) under vMask2
    vR_G = vaddq_s16(vR_G, vB_R);                                      // R - G + 4*diff
    int16x8_t vH = vaddq_s16(vreinterpretq_s16_u16(vR_u16), vreinterpretq_s16_u16(vInvR));

    uint32x4_t vV_L = vmovl_u16(vget_low_u16(vV));
    vR_G = vandq_s16(vR_G, vG_B);                                      // (R - G + 4*diff) under the third mask
    uint32x4_t vV_H = vmovl_u16(vget_high_u16(vV));
    int16x8_t vDiff4 = vaddq_s16(vH, vR_G);                            // unscaled hue numerator

    int32x4_t vc6 = vdupq_n_s32(v6);
    uint32x4_t vLine1 = vmulq_u32(vDiffL, vreinterpretq_u32_s32(vc6)); // 6*diff
    uint32x4_t vLine2 = vmulq_u32(vDiffH, vreinterpretq_u32_s32(vc6));

    float32x4_t vF1 = vcvtq_f32_u32(vV_L);
    float32x4_t vF2 = vcvtq_f32_u32(vV_H);
    float32x4_t vHF1 = vcvtq_f32_u32(vLine1);
    float32x4_t vHF2 = vcvtq_f32_u32(vLine2);

    // Reciprocal estimates of V and 6*diff ...
    float32x4_t vXInv1 = vrecpeq_f32(vF1);
    float32x4_t vXInv2 = vrecpeq_f32(vF2);
    float32x4_t vXInv3 = vrecpeq_f32(vHF1);
    float32x4_t vXInv4 = vrecpeq_f32(vHF2);

    // ... refined by one Newton-Raphson step.
    float32x4_t vSt1 = vrecpsq_f32(vXInv1, vF1);
    float32x4_t vSt2 = vrecpsq_f32(vXInv2, vF2);
    float32x4_t vSt3 = vrecpsq_f32(vXInv3, vHF1);
    float32x4_t vSt4 = vrecpsq_f32(vXInv4, vHF2);

    vF1 = vmulq_f32(vXInv1, vSt1);
    vF2 = vmulq_f32(vXInv2, vSt2);
    vHF1 = vmulq_f32(vXInv3, vSt3);
    vHF2 = vmulq_f32(vXInv4, vSt4);

    float32x4_t vDivTab = vdupq_n_f32(vsdiv_table);
    vSt1 = vmulq_f32(vF1, vDivTab);            // (255 << hsv_shift) / V
    vSt2 = vmulq_f32(vF2, vDivTab);
    vDivTab = vdupq_n_f32(vhdiv_table);
    vSt3 = vmulq_f32(vHF1, vDivTab);           // (hrange << hsv_shift) / (6*diff)
    vSt4 = vmulq_f32(vHF2, vDivTab);

    float32x4_t bias = vdupq_n_f32(0.5f);

    vSt1 = vaddq_f32(vSt1, bias);
    vSt2 = vaddq_f32(vSt2, bias);
    vSt3 = vaddq_f32(vSt3, bias);
    vSt4 = vaddq_f32(vSt4, bias);

    uint32x4_t vRes1 = vcvtq_u32_f32(vSt1);
    uint32x4_t vRes2 = vcvtq_u32_f32(vSt2);
    uint32x4_t vRes3 = vcvtq_u32_f32(vSt3);
    uint32x4_t vRes4 = vcvtq_u32_f32(vSt4);

    int32x4_t vH_L = vmovl_s16(vget_low_s16(vDiff4));
    int32x4_t vH_H = vmovl_s16(vget_high_s16(vDiff4));

    uint32x4_t vDiff_Res1 = vmulq_u32(vDiffL, vRes1);
    uint32x4_t vDiff_Res2 = vmulq_u32(vDiffH, vRes2);
    uint32x4_t vDiff_Res3 = vmulq_u32(vreinterpretq_u32_s32(vH_L), vRes3);
    uint32x4_t vDiff_Res4 = vmulq_u32(vreinterpretq_u32_s32(vH_H), vRes4);

    int32x4_t vShift = vdupq_n_s32(vshift);
    uint32x4_t vAddRes1 = vaddq_u32(vDiff_Res1, vreinterpretq_u32_s32(vShift));
    uint32x4_t vAddRes2 = vaddq_u32(vDiff_Res2, vreinterpretq_u32_s32(vShift));
    uint32x4_t vAddRes3 = vaddq_u32(vDiff_Res3, vreinterpretq_u32_s32(vShift));
    uint32x4_t vAddRes4 = vaddq_u32(vDiff_Res4, vreinterpretq_u32_s32(vShift));
    // The >> hsv_shift (12) is split into a narrowing >> 8 here and a
    // narrowing >> 4 below.
    int16x4_t vShrRes1 = vshrn_n_s32(vreinterpretq_s32_u32(vAddRes1), 8);
    int16x4_t vShrRes2 = vshrn_n_s32(vreinterpretq_s32_u32(vAddRes2), 8);
    int16x4_t vShrRes3 = vshrn_n_s32(vreinterpretq_s32_u32(vAddRes3), 8);
    int16x4_t vShrRes4 = vshrn_n_s32(vreinterpretq_s32_u32(vAddRes4), 8);

    int16x8_t vc0 = vdupq_n_s16((s16)v0);
    int8x8_t vShrRes1_s8 = vshrn_n_s16(vcombine_s16(vShrRes1, vShrRes2), 4);           // S
    uint16x8_t vCltRes_u16 = vcltq_s16(vcombine_s16(vShrRes3, vShrRes4), vc0);         // lane mask: H < 0
    int8x8_t vShrRes2_s8 = vshrn_n_s16(vcombine_s16(vShrRes3, vShrRes4), 4);           // H before wrap

    int8x8_t vCltRes_s8 = vmovn_s16(vreinterpretq_s16_u16(vCltRes_u16));
    int8x8_t vcHRange = vdup_n_s8((s8)vhrange);
    // Add hrange only in lanes where the hue came out negative.
    uint8x8_t vHResAdd = vand_u8(vreinterpret_u8_s8(vCltRes_s8), vreinterpret_u8_s8(vcHRange));
    int8x8_t vHRes = vadd_s8(vShrRes2_s8, vreinterpret_s8_u8(vHResAdd));

    uint8x8x3_t vHsv;
    vHsv.val[0] = vreinterpret_u8_s8(vHRes);
    vHsv.val[1] = vreinterpret_u8_s8(vShrRes1_s8);
    vHsv.val[2] = vMax;

    return vHsv;
}
1216
1217
/*
 * Saturation lookup table with 769 entries:
 *     fastSaturate8u[x + 256] == min(max(x, 0), 255)   for x in [-256, 512].
 * Layout: 256 zeros, then 0..255, then 257 entries of 255.
 */
const u8 fastSaturate8u[] =
{
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
     16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
     32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
     48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
     64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
     80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
     96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
    112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
    128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
    144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
    160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
    176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
    192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
    208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
    224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
    240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255
};
1269
1270
inline void convertToHSV(const s32 r, const s32 g, const s32 b,
1271
const s32 &hrange, const s32 &hsv_shift,
1272
u8* dst)
1273
{
1274
s32 h, s, v = b;
1275
s32 vmin = b, diff;
1276
s32 vr, vg;
1277
1278
v += fastSaturate8u[g-v+256];
1279
v += fastSaturate8u[r-v+256];
1280
vmin -= fastSaturate8u[vmin-g+256];
1281
vmin -= fastSaturate8u[vmin-r+256];
1282
1283
diff = v - vmin;
1284
vr = v == r ? -1 : 0;
1285
vg = v == g ? -1 : 0;
1286
1287
s = (s32(diff * (255 << hsv_shift) * (1.0f/(f32)v)) + (1 << (hsv_shift-1))) >> hsv_shift;
1288
h = (vr & (g - b)) + (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff))));
1289
h = ((h * s32((hrange << hsv_shift)/(6.f*diff) + 0.5)) + (1 << (hsv_shift-1))) >> hsv_shift;
1290
h += h < 0 ? hrange : 0;
1291
1292
dst[0] = internal::saturate_cast<u8>(h);
1293
dst[1] = (u8)s;
1294
dst[2] = (u8)v;
1295
}
1296
1297
/*
 * ARMv7 inline-assembly conversion of eight pixels to HSV.
 *
 * loadop   load instruction including its register list, e.g. a vld3.8 or
 *          vld4.8 that fills d0..d2 (or d0..d3); it is stringified and
 *          completed with ", [%[in]]"
 * rreg     d-register that holds R after the load
 * breg     d-register that holds B after the load
 * (G is always taken from d1.)
 *
 * The macro captures these names from the invoking scope:
 *     src, sj, dst, dj                      input / output addresses
 *     vsdiv_table, vhdiv_table, bias        float constants passed in core registers
 *     vshift, v6, v0, vhrange               integer constants
 * Eight H,S,V triplets are stored interleaved at dst + dj with vst3.8.
 * All of d0-d31 are clobbered. The "memory" clobber is required because the
 * asm reads and writes through the pointer operands.
 */
#define CONVERT_TO_HSV_ASM(loadop, rreg, breg) \
    __asm__ ( \
        #loadop ", [%[in]] @RGB \n\t" \
        "vmin.u8 d3, d0, d1 @VMin (d3) \n\t" \
        "vmax.u8 d6, d0, d1 @V (d6) \n\t" \
        "vmovl.u8 q2, " #rreg " @V16_R (d4,d5) \n\t" \
        "vmovl.u8 q4, d1 @V16_G (d8,d9) \n\t" \
        "vmax.u8 d6, d6, d2 \n\t" \
        "vmin.u8 d3, d3, d2 \n\t" \
        "vmovl.u8 q0, " #breg " @V16_B (d0,d1) \n\t" \
        "vsubl.u8 q8, d6, d3 @V16_Diff (d16,d17) \n\t" \
        \
        "vmovl.u8 q5, d6 @V16_V (d10,d11) \n\t" \
        "vadd.s16 q10, q8, q8 @V16_Diff_2 (d20,d21) \n\t" \
        "vmovl.u16 q9, d16 @V32_Diff_L (d18,d19) \n\t" \
        "vmovl.u16 q11, d17 @V32_Diff_H (d22,d23) \n\t" \
        "vceq.u16 q12, q2, q5 @V==R(d24,d25) \n\t" \
        "vceq.u16 q13, q4, q5 @V==G(d26,d27) \n\t" \
        \
        "vsub.s16 q8, q4, q0 @V16_G-B (d16,d17) \n\t" \
        "vmvn.u16 q15, q12 @V16~R \n\t" \
        "vsub.s16 q6, q0, q2 @V16_B-R (d12,d13) \n\t" \
        "vsub.s16 q7, q2, q4 @V16_R-G (d14,d15) \n\t" \
        "vand.u16 q1, q13, q15 @VMask2 \n\t" \
        "vand.u16 q2, q8, q12 @V16_H(d4,d5) \n\t" \
        "vadd.s16 q4, q6, q10 @V16_H2 \n\t" \
        "vmvn.u16 q12, q13 @V16~G \n\t" \
        "vadd.s16 q6, q10, q10 @VDiff16_4 (d12,d13) \n\t" \
        "vand.u16 q8, q15, q12 @VMask3 \n\t" \
        "vand.u16 q15, q4, q1 @vH2(d30,d31) \n\t" \
        "vadd.s16 q7, q7, q6 @V16_H3 (d14,d15) \n\t" \
        "vadd.s16 q14, q2, q15 @vH16 \n\t" \
        "vmovl.u16 q12, d10 @V32_V_L \n\t" \
        "vand.s16 q7, q7, q8 @vH16 \n\t" \
        "vmovl.u16 q13, d11 @V32_V_H \n\t" \
        "vadd.s16 q2, q14, q7 @V16_Diff_4 \n\t" \
        \
        "vdup.32 q4, %[v6] \n\t" \
        "vmul.u32 q14, q9, q4 \n\t" \
        "vmul.u32 q15, q11, q4 \n\t" \
        "vcvt.f32.u32 q4, q12 @VF1 (d8,d9) \n\t" \
        "vcvt.f32.u32 q8, q13 @VF2 \n\t" \
        "vcvt.f32.u32 q0, q14 @HF1 \n\t" \
        "vcvt.f32.u32 q1, q15 @HF2 \n\t" \
        "vrecpe.f32 q12, q4 @Vxinv \n\t" \
        "vrecpe.f32 q13, q8 @Vxinv \n\t" \
        "vrecpe.f32 q5, q0 @Vxinv \n\t" \
        "vrecpe.f32 q7, q1 @Vxinv \n\t" \
        "vrecps.f32 q14, q12, q4 @Vst1 \n\t" \
        "vrecps.f32 q15, q13, q8 @Vst1 \n\t" \
        "vrecps.f32 q10, q5, q0 @Vst1 \n\t" \
        "vrecps.f32 q6, q7, q1 @Vst1 \n\t" \
        "vmul.f32 q4, q12, q14 \n\t" \
        "vmul.f32 q8, q13, q15 \n\t" \
        "vmul.f32 q0, q5, q10 \n\t" \
        "vmul.f32 q1, q7, q6 \n\t" \
        "vdup.32 q12, %[vsdiv_table] \n\t" \
        "vmul.f32 q14, q4, q12 \n\t" \
        "vmul.f32 q15, q8, q12 \n\t" \
        "vdup.32 q12, %[vhdiv_table] \n\t" \
        "vmul.f32 q10, q0, q12 \n\t" \
        "vmul.f32 q6, q1, q12 \n\t" \
        \
        "vdup.32 q12, %[bias] \n\t" \
        \
        "vadd.f32 q7, q14, q12 \n\t" \
        "vadd.f32 q13, q15, q12 \n\t" \
        "vcvt.u32.f32 q4, q7 \n\t" \
        "vcvt.u32.f32 q8, q13 \n\t" \
        \
        "vadd.f32 q14, q10, q12 \n\t" \
        "vadd.f32 q7, q6, q12 \n\t" \
        "vcvt.u32.f32 q0, q14 \n\t" \
        "vcvt.u32.f32 q1, q7 @Vres \n\t" \
        \
        "vmovl.s16 q7, d4 @V32_H_L (d14,d15) \n\t" \
        "vmovl.s16 q5, d5 @V32_H_H (d10,d11) \n\t" \
        "vmul.u32 q14, q9, q4 \n\t" \
        "vmul.u32 q15, q11, q8 \n\t" \
        "vmul.u32 q10, q7, q0 \n\t" \
        "vmul.u32 q6, q5, q1 \n\t" \
        \
        "vdup.32 q12, %[vshift] \n\t" \
        "vadd.u32 q13, q14, q12 \n\t" \
        "vadd.u32 q8, q15, q12 \n\t" \
        "vadd.u32 q0, q10, q12 \n\t" \
        "vadd.u32 q1, q6, q12 \n\t" \
        "vshrn.s32 d8, q13, #8 \n\t" \
        "vshrn.s32 d9, q8, #8 \n\t" \
        "vshrn.s32 d10, q0, #8 \n\t" \
        "vshrn.s32 d11, q1, #8 \n\t" \
        \
        "vdup.16 q8, %[v0] \n\t" \
        "vshrn.s16 d5, q4, #4 \n\t" \
        "vclt.s16 q9, q5, q8 \n\t" \
        "vshrn.s16 d4, q5, #4 \n\t" \
        \
        "vmovn.s16 d9, q9 \n\t" \
        "vdup.8 d7, %[vhrange] \n\t" \
        "vand.u8 d10, d9, d7 \n\t" \
        "vadd.s8 d4, d4, d10 \n\t" \
        "vst3.8 {d4-d6}, [%[out]] @HSV \n\t" \
        : /*no output*/ \
        : [out] "r" (dst + dj), [in] "r" (src + sj), \
          [vsdiv_table] "r" (vsdiv_table), \
          [vshift] "r" (vshift), \
          [vhdiv_table] "r" (vhdiv_table), \
          [v6] "r" (v6), [vhrange] "r" (vhrange), \
          [v0] "r" (v0), [bias] "r" (bias) \
        : "d0","d1","d2","d3","d4","d5","d6","d7", \
          "d8","d9","d10","d11","d12","d13","d14","d15", \
          "d16","d17","d18","d19","d20","d21","d22","d23", \
          "d24","d25","d26","d27","d28","d29","d30","d31", \
          "memory" \
    );
1411
1412
/*
 * YCRCB_CONSTS declares the fixed-point RGB -> YCrCb coefficients in the
 * invoking scope. The conversion code in this file applies them with a
 * 14-bit right shift.
 *
 *   4899, 9617, 1868   Y  weights for R, G, B
 *   6860, 1332         Cr weights for G, B
 *   2765, 5427         Cb weights for R, G
 */
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)

/* Inline-assembly build: each coefficient vector is pinned to the fixed
 * d-register (d25..d31) that the CONVERTTOYCRCB assembly refers to by name. */
#define YCRCB_CONSTS \
    register int16x4_t vcYR asm ("d31") = vmov_n_s16(4899); \
    register int16x4_t vcYG asm ("d30") = vmov_n_s16(9617); \
    register int16x4_t vcYB asm ("d29") = vmov_n_s16(1868); \
    register int16x4_t vcCrG asm ("d28") = vmov_n_s16(6860); \
    register int16x4_t vcCrB asm ("d27") = vmov_n_s16(1332); \
    register int16x4_t vcCbR asm ("d26") = vmov_n_s16(2765); \
    register int16x4_t vcCbG asm ("d25") = vmov_n_s16(5427);

#else

/* Intrinsics build: the coefficients are loaded from one array, four copies
 * of each value, and paired so that the low/high halves of a q-register hold
 * two related coefficients. */
#define YCRCB_CONSTS \
    const s16 convertCoeffs[] = { 4899, 4899, 4899, 4899, \
                                  9617, 9617, 9617, 9617, \
                                  1868, 1868, 1868, 1868, \
                                  6860, 6860, 6860, 6860, \
                                  1332, 1332, 1332, 1332, \
                                  2765, 2765, 2765, 2765, \
                                  5427, 5427, 5427, 5427 }; \
    const int16x8_t vcYRG = vld1q_s16(convertCoeffs); /*YR and YG*/ \
    const int16x4_t vcYB = vld1_s16(convertCoeffs + 8); /*YB*/ \
    const int16x8_t vcCrGB = vld1q_s16(convertCoeffs + 12); /*CrG and CrB*/ \
    const int16x8_t vcCbRG = vld1q_s16(convertCoeffs + 20); /*CbR and CbG*/

#endif
1439
1440
/*
 * ARMv7 inline-assembly conversion of eight pixels to YCrCb.
 *
 * loadcmd            load instruction including its register list; it is
 *                    stringified and completed with ", [%[in]]"
 * rreg, greg, breg   d-registers that hold R, G and B after the load
 *
 * The macro captures src, sj, dst, dj and the coefficient variables
 * vcYR, vcYG, vcYB, vcCrB, vcCrG, vcCbG, vcCbR from the invoking scope. The
 * assembly addresses the coefficients through fixed registers d25..d31, so
 * they must be the register-pinned variables declared by the
 * register variant of YCRCB_CONSTS.
 *
 * Eight Y,Cr,Cb triplets are stored interleaved at dst + dj with vst3.8.
 * The "memory" clobber is required because the asm reads and writes through
 * the pointer operands.
 */
#define CONVERTTOYCRCB(loadcmd, rreg, greg, breg) \
    __asm__ ( \
        #loadcmd ", [%[in]] @RGB \n\t" \
        "vmovl.u8 q2, " #rreg " @R (d4,d5) \n\t" \
        "vmovl.u8 q3, " #greg " @G (d6,d7) \n\t" \
        "vmovl.u8 q4, " #breg " @B (d8,d9) \n\t" \
        \
        "vshll.u16 q7, d4, #13 @Cr(q7,q8): R \n\t" \
        "vmull.u16 q5, d6, d30 @Y (q5,q6): G \n\t" \
        "vshll.u16 q9, d8, #13 @Cb(q9,q10): B \n\t" \
        "vshll.u16 q8, d5, #13 @Cr(q7,q8): R \n\t" \
        "vmull.u16 q6, d7, d30 @Y (q5,q6): G \n\t" \
        "vshll.u16 q10, d9, #13 @Cb(q9,q10): B \n\t" \
        \
        "vmlsl.s16 q7, d6, d28 @Cr(q7,q8): RG \n\t" \
        "vmlal.s16 q5, d8, d29 @Y (q5,q6): GB \n\t" \
        "vmlsl.s16 q9, d4, d26 @Cb(q9,q10): BR \n\t" \
        "vmlsl.s16 q8, d7, d28 @Cr(q7,q8): RG \n\t" \
        "vmlal.s16 q6, d9, d29 @Y (q5,q6): GB \n\t" \
        "vmlsl.s16 q10, d5, d26 @Cb(q9,q10): BR \n\t" \
        \
        "vmlsl.s16 q7, d8, d27 @Cr(q7,q8): RGB \n\t" \
        "vmlal.s16 q5, d4, d31 @Y (q5,q6): GBR \n\t" \
        "vmlsl.s16 q9, d6, d25 @Cb(q9,q10): BRG \n\t" \
        "vmlsl.s16 q8, d9, d27 @Cr(q7,q8): RGB \n\t" \
        "vmlal.s16 q6, d5, d31 @Y (q5,q6): GBR \n\t" \
        "vmlsl.s16 q10, d7, d25 @Cb(q9,q10): BRG \n\t" \
        \
        "vrshrn.s32 d4, q7, #14 @Cr -> q2 \n\t" \
        "vrshrn.s32 d8, q5, #14 @Y -> q4 \n\t" \
        "vrshrn.s32 d6, q9, #14 @Cb -> q3 \n\t" \
        "vrshrn.s32 d5, q8, #14 @Cr -> q2 \n\t" \
        "vrshrn.s32 d9, q6, #14 @Y -> q4 \n\t" \
        "vrshrn.s32 d7, q10, #14 @Cb -> q3 \n\t" \
        \
        "vmov.s16 q5, #128 \n\t" \
        "vmov.s16 q6, #128 \n\t" \
        "vadd.i16 q5, q2 @Cr -> q5 \n\t" \
        "vadd.i16 q6, q3 @Cb -> q6 \n\t" \
        \
        "vqmovn.u16 d4, q4 \n\t" \
        "vqmovun.s16 d5, q5 \n\t" \
        "vqmovun.s16 d6, q6 \n\t" \
        \
        "vst3.8 {d4-d6}, [%[out]] \n\t" \
        : /*no output*/ \
        : [out] "r" (dst + dj), [in] "r" (src + sj), \
          "w" (vcYR), "w" (vcYG), "w" (vcYB), \
          "w" (vcCrB), "w" (vcCrG), "w" (vcCbG), "w" (vcCbR) \
        : "d0","d1","d2","d3","d4","d5","d6","d7", \
          "d8","d9","d10","d11","d12","d13","d14","d15", \
          "d16","d17","d18","d19","d20","d21", \
          "memory" \
    );
1493
1494
1495
/*
 * Converts eight pixels, given as 16-bit R, G and B lanes, to YCrCb.
 *
 * vR, vG, vB   channel values widened to 16 bits, one pixel per lane
 * vcYRG        low half: Y weight of R, high half: Y weight of G
 * vcYB         Y weight of B
 * vcCrGB       low half: Cr weight of G, high half: Cr weight of B
 * vcCbRG       low half: Cb weight of R, high half: Cb weight of G
 *
 * Per lane, in 32-bit fixed point with a rounding shift by 14:
 *     Y  =       (wYR*R + wYG*G + wYB*B)      >> 14
 *     Cr = 128 + ((R << 13) - wCrG*G - wCrB*B >> 14)
 *     Cb = 128 + ((B << 13) - wCbR*R - wCbG*G >> 14)
 *
 * Returns val[0] = Y, val[1] = Cr, val[2] = Cb, each saturated to 8 bits.
 */
inline uint8x8x3_t convertToYCrCb( const int16x8_t& vR, const int16x8_t& vG, const int16x8_t& vB,
                                   const int16x8_t& vcYRG, const int16x4_t& vcYB,
                                   const int16x8_t& vcCrGB, const int16x8_t& vcCbRG )
{
    // First term of each accumulator; L/H are the low/high four lanes.
    int32x4_t vCrL = vshll_n_s16(vget_low_s16(vR), 13);                  // R
    int32x4_t vCrH = vshll_n_s16(vget_high_s16(vR), 13);                 // R
    int32x4_t vYL  = vmull_s16(vget_low_s16(vG), vget_high_s16(vcYRG));  // G
    int32x4_t vYH  = vmull_s16(vget_high_s16(vG), vget_high_s16(vcYRG)); // G
    int32x4_t vCbL = vshll_n_s16(vget_low_s16(vB), 13);                  // B
    int32x4_t vCbH = vshll_n_s16(vget_high_s16(vB), 13);                 // B

    // Second term.
    vCrL = vmlsl_s16(vCrL, vget_low_s16(vG), vget_low_s16(vcCrGB));      // RG
    vCrH = vmlsl_s16(vCrH, vget_high_s16(vG), vget_low_s16(vcCrGB));     // RG
    vYL  = vmlal_s16(vYL, vget_low_s16(vB), vcYB);                       // GB
    vYH  = vmlal_s16(vYH, vget_high_s16(vB), vcYB);                      // GB
    vCbL = vmlsl_s16(vCbL, vget_low_s16(vR), vget_low_s16(vcCbRG));      // BR
    vCbH = vmlsl_s16(vCbH, vget_high_s16(vR), vget_low_s16(vcCbRG));     // BR

    // Third term.
    vCrL = vmlsl_s16(vCrL, vget_low_s16(vB), vget_high_s16(vcCrGB));     // RGB
    vCrH = vmlsl_s16(vCrH, vget_high_s16(vB), vget_high_s16(vcCrGB));    // RGB
    vYL  = vmlal_s16(vYL, vget_low_s16(vR), vget_low_s16(vcYRG));        // GBR
    vYH  = vmlal_s16(vYH, vget_high_s16(vR), vget_low_s16(vcYRG));       // GBR
    vCbL = vmlsl_s16(vCbL, vget_low_s16(vG), vget_high_s16(vcCbRG));     // BRG
    vCbH = vmlsl_s16(vCbH, vget_high_s16(vG), vget_high_s16(vcCbRG));    // BRG

    // Rounding shift back to integer range, narrowing to 16 bits.
    int16x4_t vCrL_ = vrshrn_n_s32(vCrL, 14);
    int16x4_t vCrH_ = vrshrn_n_s32(vCrH, 14);
    int16x4_t vYL_  = vrshrn_n_s32(vYL, 14);
    int16x4_t vYH_  = vrshrn_n_s32(vYH, 14);
    int16x4_t vCbL_ = vrshrn_n_s32(vCbL, 14);
    int16x4_t vCbH_ = vrshrn_n_s32(vCbH, 14);

    // Chroma is offset by 128.
    int16x8_t vCr = vmovq_n_s16(128);
    int16x8_t vCb = vmovq_n_s16(128);

    vCr = vaddq_s16(vCr, vcombine_s16(vCrL_, vCrH_));
    vCb = vaddq_s16(vCb, vcombine_s16(vCbL_, vCbH_));

    uint8x8x3_t vYCrCb;
    vYCrCb.val[0] = vqmovn_u16(vreinterpretq_u16_s16(vcombine_s16(vYL_, vYH_)));
    vYCrCb.val[1] = vqmovun_s16(vCr);
    vYCrCb.val[2] = vqmovun_s16(vCb);

    return vYCrCb;
}
1540
1541
/*
 * Scalar RGB -> YCrCb conversion of one pixel.
 *
 * Declares s32 Y, Cr, Cb in the invoking scope (so it can be expanded only
 * once per scope) and stores the saturated results to dst[dj + 0..2]; `dst`
 * and `dj` are captured from that scope. Uses the same coefficients and
 * 14-bit rounding shift as the vector code; 8192 is 1 << 13.
 * The arguments are parenthesised so that expression arguments expand
 * correctly.
 */
#define S_CONVERTTOYCRCB(R, G, B) \
    s32 Y = ((R) * 4899 + (G) * 9617 + (B) * 1868 + (1 << 13)) >> 14; \
    s32 Cr = 128 + (((R) * 8192 - (G) * 6860 - (B) * 1332 + (1 << 13)) >> 14); \
    s32 Cb = 128 + (((R) * (-2765) - (G) * 5427 + (B) * 8192 + (1 << 13)) >> 14); \
    dst[dj + 0] = internal::saturate_cast<u8>(Y); \
    dst[dj + 1] = internal::saturate_cast<u8>(Cr); \
    dst[dj + 2] = internal::saturate_cast<u8>(Cb);
1548
1549
/*
 * Integer coefficients of the YUV420 -> RGB conversion in this file.
 *
 *   COEFF_Y              multiplier of the (clamped) luma sample
 *   COEFF_BU, COEFF_RV   multipliers of U in the blue term and V in the red term
 *   COEFF_GU, COEFF_GV   multipliers of U and V that are subtracted in the green term
 *   COEFF_R/G/B          constant offsets of the red, green and blue terms
 */
#define COEFF_Y  (   149)
#define COEFF_BU (   129)
#define COEFF_RV (   102)
#define COEFF_GU (    25)
#define COEFF_GV (    52)
#define COEFF_R  (-14248)
#define COEFF_G  (  8663)
#define COEFF_B  (-17705)
1557
1558
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
1559
/*
 * Channel-count dependent pieces of the YUV420 inline-assembly converter.
 * The suffix 3 / 4 is the number of output channels and is selected by
 * token pasting with the `cn` macro argument.
 *
 *   YUV420ALPHAn_CONST     extra declaration: for 4 channels a vector of
 *                          255 pinned to q13 (d26, d27), the registers the
 *                          4-channel store commands write as last channel
 *   YUV420ALPHAn_CONVERT   extra asm input operand for that vector
 *   YUV420STOREkCMDn       vst3.8 / vst4.8 with the register list of the
 *                          first (k = 1) and second (k = 2) group of pixels
 */
#define YUV420ALPHA3_CONST
#define YUV420ALPHA4_CONST register uint8x16_t c255 asm ("q13") = vmovq_n_u8(255);
#define YUV420ALPHA3_CONVERT
#define YUV420ALPHA4_CONVERT , "w" (c255)
#define YUV420STORE1CMD3 "vst3.8 {d20, d22, d24}"
#define YUV420STORE2CMD3 "vst3.8 {d21, d23, d25}"
#define YUV420STORE1CMD4 "vst4.8 {d20, d22, d24, d26}"
#define YUV420STORE2CMD4 "vst4.8 {d21, d23, d25, d27}"
1567
1568
/*
 * Inline-assembly build: declares the constants used by CONVERTYUV420TORGB
 * in the invoking scope.
 *
 * cR, cG, cB are scalar offsets passed to the asm in core registers. The
 * vector constants are pinned to the fixed registers that the assembly
 * refers to by name:
 *     q15  16 (lower clamp for luma)     q3   COEFF_Y
 *     d14  COEFF_GU    d15  COEFF_GV     d16  COEFF_RV    d17  COEFF_BU
 * For cn == 4 an additional vector of 255 is declared through
 * YUV420ALPHA4_CONST. bIdx and vIdx are unused by this variant.
 */
#define YUV420_CONSTS(cn, bIdx, vIdx) \
    register const s32 cR = s16(COEFF_R); \
    register const s32 cG = s16(COEFF_G); \
    register const s32 cB = s16(COEFF_B); \
    \
    register uint8x16_t vc16 asm ("q15") = vmovq_n_u8(16); \
    register uint8x8_t cGU asm ("d14") = vmov_n_u8(COEFF_GU); \
    register uint8x8_t cGV asm ("d15") = vmov_n_u8(COEFF_GV); \
    register uint8x8_t cRV asm ("d16") = vmov_n_u8(COEFF_RV); \
    register uint8x8_t cBU asm ("d17") = vmov_n_u8(COEFF_BU); \
    register uint8x16_t cRGBY asm ("q3") = vmovq_n_u8(COEFF_Y); \
    YUV420ALPHA##cn##_CONST
1580
1581
/*
 * ARMv7 inline-assembly conversion of a 16x2 block of YUV420 samples.
 *
 * cn           number of output channels (3 or 4); selects the store
 *              commands and the optional alpha operand by token pasting
 * ureg, vreg   d-registers (d0 / d1) that hold U and V after the
 *              de-interleaving vld2.8 of the chroma row
 * rreg, breg   q-registers used as red and blue accumulators; the
 *              half-adds below read them as q5 and q6, so they are expected
 *              to be q5/q6 in either order (which one is red decides the
 *              output channel order)
 *
 * The macro captures y1, y2, uv, j, dst1, dst2, dj and the constants
 * declared by the register variant of YUV420_CONSTS (cR, cG, cB, vc16, cGU,
 * cGV, cBU, cRV, cRGBY and, for cn == 4, c255) from the invoking scope.
 * Sixteen luma samples are read from each of y1 + j and y2 + j together with
 * eight U,V pairs from uv + j; sixteen pixels are written to each of
 * dst1 + dj and dst2 + dj, as two stores of eight pixels.
 * The "memory" clobber is required because the asm reads and writes through
 * the pointer operands.
 */
#define CONVERTYUV420TORGB(cn, ureg, vreg, rreg, breg) \
    __asm__ ( \
        "vld2.8 {d0-d1}, [%[inUV]] @UV \n\t" \
        "vdup.16 q4, %[cG] @cG \n\t" \
        "vld2.8 {d2-d3}, [%[inY1]] @YY \n\t" \
        "vdup.16 "#rreg", %[cR] @cR \n\t" \
        "vld2.8 {d4-d5}, [%[inY2]] @YY \n\t" \
        "vdup.16 "#breg", %[cB] @cB \n\t" \
        "vmlsl.u8 q4, "#ureg", d14 @cG-25u \n\t" \
        "vmax.u8 q1, q15 @max(Y,16) \n\t" \
        "vmlal.u8 "#rreg", "#vreg", d16 @cR+102*v \n\t" \
        "vmlal.u8 "#breg", "#ureg", d17 @cB+129*u \n\t" \
        "vmax.u8 q2, q15 @max(Y,16) \n\t" \
        "vmlsl.u8 q4, "#vreg", d15 @cG-25u-52v \n\t" \
        /*q10,q11,q12,q13 - for output*/ \
        "vmull.u8 q9, d3, d6 @h 149*y \n\t" \
        "vmull.u8 q10, d2, d7 @l 149*y \n\t" \
        "vshr.u16 q9, #1 @h (149*y)/2 \n\t" \
        "vshr.u16 q10, #1 @l (149*y)/2 \n\t" \
        \
        "vhadd.s16 q0, q9, q4 @hG ((149*y)/2 + cG - 25*u - 52*v)/2 \n\t" \
        "vhadd.s16 q12, q10, q6 @lB ((149*y)/2 + cB + 129*u)/2 \n\t" \
        "vhadd.s16 q1, q9, q5 @hR ((149*y)/2 + cR + 102*v)/2 \n\t" \
        "vhadd.s16 q11, q10, q4 @lG ((149*y)/2 + cG - 25*u - 52*v)/2 \n\t" \
        "vhadd.s16 q9, q6 @hB ((149*y)/2 + cB + 129*u)/2 \n\t" \
        "vhadd.s16 q10, q5 @lR ((149*y)/2 + cR + 102*v)/2 \n\t" \
        \
        "vqrshrun.s16 d24, q12, #5 @lB ((149*y)/2 + cB + 129*u)/2/32 \n\t" \
        "vqrshrun.s16 d22, q11, #5 @lG ((149*y)/2 + cG - 25*u - 52*v)/2/32 \n\t" \
        "vqrshrun.s16 d20, q10, #5 @lR ((149*y)/2 + cR + 102*v)/2/32 \n\t" \
        "vqrshrun.s16 d23, q0, #5 @hG ((149*y)/2 + cG - 25*u - 52*v)/2/32 \n\t" \
        "vqrshrun.s16 d21, q1, #5 @hR ((149*y)/2 + cR + 102*v)/2/32 \n\t" \
        "vqrshrun.s16 d25, q9, #5 @hB ((149*y)/2 + cB + 129*u)/2/32 \n\t" \
        \
        "vzip.8 d22, d23 @G \n\t" \
        "vzip.8 d20, d21 @R \n\t" \
        "vzip.8 d24, d25 @B \n\t" \
        \
        YUV420STORE1CMD##cn", [%[out1]] \n\t" \
        YUV420STORE2CMD##cn", [%[out1x]] \n\t" \
        \
        "vmull.u8 q9, d5, d6 @h 149*y \n\t" \
        "vmull.u8 q10, d4, d7 @l 149*y \n\t" \
        "vshr.u16 q9, #1 @h (149*y)/2 \n\t" \
        "vshr.u16 q10, #1 @l (149*y)/2 \n\t" \
        \
        "vhadd.s16 q0, q9, q4 @hG ((149*y)/2 + cG - 25*u - 52*v)/2 \n\t" \
        "vhadd.s16 q12, q10, q6 @lB ((149*y)/2 + cB + 129*u)/2 \n\t" \
        "vhadd.s16 q1, q9, q5 @hR ((149*y)/2 + cR + 102*v)/2 \n\t" \
        "vhadd.s16 q11, q10, q4 @lG ((149*y)/2 + cG - 25*u - 52*v)/2 \n\t" \
        "vhadd.s16 q9, q6 @hB ((149*y)/2 + cB + 129*u)/2 \n\t" \
        "vhadd.s16 q10, q5 @lR ((149*y)/2 + cR + 102*v)/2 \n\t" \
        \
        "vqrshrun.s16 d24, q12, #5 @lB ((149*y)/2 + cB + 129*u)/2/32 \n\t" \
        "vqrshrun.s16 d22, q11, #5 @lG ((149*y)/2 + cG - 25*u - 52*v)/2/32 \n\t" \
        "vqrshrun.s16 d20, q10, #5 @lR ((149*y)/2 + cR + 102*v)/2/32 \n\t" \
        "vqrshrun.s16 d23, q0, #5 @hG ((149*y)/2 + cG - 25*u - 52*v)/2/32 \n\t" \
        "vqrshrun.s16 d21, q1, #5 @hR ((149*y)/2 + cR + 102*v)/2/32 \n\t" \
        "vqrshrun.s16 d25, q9, #5 @hB ((149*y)/2 + cB + 129*u)/2/32 \n\t" \
        \
        "vzip.8 d22, d23 @G \n\t" \
        "vzip.8 d20, d21 @R \n\t" \
        "vzip.8 d24, d25 @B \n\t" \
        \
        YUV420STORE1CMD##cn", [%[out2]] \n\t" \
        YUV420STORE2CMD##cn", [%[out2x]] \n\t" \
        \
        : /*no output*/ \
        : [out1] "r" (dst1 + dj), [out2] "r" (dst2 + dj), \
          [out1x] "r" (dst1 + dj+cn*8), [out2x] "r" (dst2 + dj+cn*8), \
          [inUV] "r" (uv+j), [inY1] "r" (y1+j), [inY2] "r" (y2+j), \
          [cR] "r" (cR), [cG] "r" (cG), [cB] "r" (cB), \
          "w" (vc16), "w" (cGU), "w" (cGV), "w" (cBU), "w" (cRV), "w" (cRGBY) YUV420ALPHA##cn##_CONVERT \
        : "d0","d1","d2","d3","d4","d5","d8","d9","d10","d11","d12", \
          "d13","d18","d19","d20","d21","d22","d23","d24","d25", \
          "memory" \
    );
1657
1658
#else
1659
1660
template<int bIdx>
1661
struct _convertYUV420Internals
1662
{
1663
uint16x8_t vc14216;
1664
uint16x8_t vc17672;
1665
uint16x8_t vc8696;
1666
uint8x8_t vc102;
1667
uint8x8_t vc25;
1668
uint8x8_t vc129;
1669
uint8x8_t vc52;
1670
uint16x8_t vc_1;
1671
uint8x8_t vc149;
1672
uint8x8_t vc16;
1673
_convertYUV420Internals()
1674
{
1675
vc14216 = vdupq_n_u16(-COEFF_R);
1676
vc17672 = vdupq_n_u16(-COEFF_B);
1677
vc8696 = vdupq_n_u16(COEFF_G);
1678
vc102 = vdup_n_u8(COEFF_RV);
1679
vc25 = vdup_n_u8(COEFF_GU);
1680
vc129 = vdup_n_u8(COEFF_BU);
1681
vc52 = vdup_n_u8(COEFF_GV);
1682
vc_1 = vdupq_n_u16((uint16_t)-1);
1683
vc149 = vdup_n_u8(COEFF_Y);
1684
vc16 = vdup_n_u8(16);
1685
}
1686
1687
inline void UVrgbToRGB( const int16x8_t &ruv, const int16x8_t &guv, const int16x8_t &buv,
1688
const u8 *y, uint8x16x3_t &rgbl )
1689
{
1690
//y get line
1691
uint8x8x2_t yl = vld2_u8(y);
1692
yl.val[0] = vmax_u8(yl.val[0], vc16);
1693
yl.val[1] = vmax_u8(yl.val[1], vc16);
1694
1695
//y part line
1696
uint16x8_t yodd1 = vmlal_u8(vc_1, yl.val[0], vc149); //(-1+149*y)
1697
uint16x8_t yevn1 = vmlal_u8(vc_1, yl.val[1], vc149); //(-1+149*y)
1698
int16x8_t yodd1h = (int16x8_t)vshrq_n_u16(yodd1, 1); //(-1+149*y)/2
1699
int16x8_t yevn1h = (int16x8_t)vshrq_n_u16(yevn1, 1); //(-1+149*y)/2
1700
1701
//y line calc rgb
1702
int16x8_t rodd1w = vhsubq_s16(yodd1h, ruv); //((-1+149*y)/2 - (14216-102*v))/2
1703
int16x8_t gevn1w = vhaddq_s16(yevn1h, guv); //((-1+149*y)/2 + ((8696-25*u)-52*v))/2
1704
int16x8_t bodd1w = vhsubq_s16(yodd1h, buv); //((-1+149*y)/2 - (17672-129*u))/2
1705
int16x8_t revn1w = vhsubq_s16(yevn1h, ruv); //((-1+149*y)/2 - (14216-102*v))/2
1706
int16x8_t godd1w = vhaddq_s16(yodd1h, guv); //((-1+149*y)/2 + ((8696-25*u)-52*v))/2
1707
int16x8_t bevn1w = vhsubq_s16(yevn1h, buv); //((-1+149*y)/2 - (17672-129*u))/2
1708
1709
//y line clamp + narrow
1710
uint8x8_t rodd1n = vqshrun_n_s16(rodd1w, 5);
1711
uint8x8_t revn1n = vqshrun_n_s16(revn1w, 5);
1712
uint8x8_t godd1n = vqshrun_n_s16(godd1w, 5);
1713
uint8x8x2_t r1 = vzip_u8 (rodd1n, revn1n);
1714
uint8x8_t gevn1n = vqshrun_n_s16(gevn1w, 5);
1715
uint8x8_t bodd1n = vqshrun_n_s16(bodd1w, 5);
1716
uint8x8x2_t g1 = vzip_u8 (godd1n, gevn1n);
1717
uint8x8_t bevn1n = vqshrun_n_s16(bevn1w, 5);
1718
uint8x8x2_t b1 = vzip_u8 (bodd1n, bevn1n);
1719
rgbl.val[2 - bIdx] = vcombine_u8(r1.val[0], r1.val[1]);
1720
rgbl.val[1] = vcombine_u8(g1.val[0], g1.val[1]);
1721
rgbl.val[0 + bIdx] = vcombine_u8(b1.val[0], b1.val[1]);
1722
}
1723
};
1724
1725
template<int cn, int bIdx, int vIdx>
1726
struct _convertYUV420
1727
{
1728
_convertYUV420Internals<bIdx> convertYUV420Internals;
1729
1730
inline void ToRGB( const u8 *y1, const u8 *y2, const u8 *uv,
1731
u8 *dst1, u8 *dst2 )
1732
{
1733
uint8x8x2_t raw_uv = vld2_u8(uv);
1734
uint16x8_t gu = vmlsl_u8(convertYUV420Internals.vc8696, raw_uv.val[1-vIdx], convertYUV420Internals.vc25); //(8696-25*u)
1735
int16x8_t ruv = (int16x8_t)vmlsl_u8(convertYUV420Internals.vc14216, raw_uv.val[vIdx], convertYUV420Internals.vc102); //(14216-102*v)
1736
1737
int16x8_t buv = (int16x8_t)vmlsl_u8(convertYUV420Internals.vc17672, raw_uv.val[1-vIdx], convertYUV420Internals.vc129); //(17672-129*u)
1738
int16x8_t guv = (int16x8_t)vmlsl_u8(gu, raw_uv.val[vIdx], convertYUV420Internals.vc52); //((8696-25*u)-52*v))
1739
1740
uint8x16x3_t rgbl;
1741
//y line1
1742
convertYUV420Internals.UVrgbToRGB(ruv, guv, buv, y1, rgbl);
1743
vst3q_u8(dst1, rgbl);
1744
//y line2
1745
convertYUV420Internals.UVrgbToRGB(ruv, guv, buv, y2, rgbl);
1746
vst3q_u8(dst2, rgbl);
1747
}
1748
};
1749
1750
template<int bIdx, int vIdx>
1751
struct _convertYUV420<4, bIdx, vIdx>
1752
{
1753
_convertYUV420Internals<bIdx> convertYUV420Internals;
1754
1755
inline void ToRGB( const u8 *y1, const u8 *y2, const u8 *uv,
1756
u8 *dst1, u8 *dst2 )
1757
{
1758
uint8x8x2_t raw_uv = vld2_u8(uv);
1759
uint16x8_t gu = vmlsl_u8(convertYUV420Internals.vc8696, raw_uv.val[1-vIdx], convertYUV420Internals.vc25); //(8696-25*u)
1760
int16x8_t ruv = (int16x8_t)vmlsl_u8(convertYUV420Internals.vc14216, raw_uv.val[vIdx], convertYUV420Internals.vc102); //(14216-102*v)
1761
1762
int16x8_t buv = (int16x8_t)vmlsl_u8(convertYUV420Internals.vc17672, raw_uv.val[1-vIdx], convertYUV420Internals.vc129); //(17672-129*u)
1763
int16x8_t guv = (int16x8_t)vmlsl_u8(gu, raw_uv.val[vIdx], convertYUV420Internals.vc52); //((8696-25*u)-52*v))
1764
1765
union { uint8x16x4_t v4; uint8x16x3_t v3; } rgbl;
1766
rgbl.v4.val[3] = vdupq_n_u8(0xff);
1767
//y line1
1768
convertYUV420Internals.UVrgbToRGB(ruv, guv, buv, y1, rgbl.v3);
1769
vst4q_u8(dst1, rgbl.v4);
1770
//y line2
1771
convertYUV420Internals.UVrgbToRGB(ruv, guv, buv, y2, rgbl.v3);
1772
vst4q_u8(dst2, rgbl.v4);
1773
}
1774
};
1775
1776
/* Intrinsics build: YUV420_CONSTS declares a converter object named
 * `convertYUV420` in the invoking scope; its constants are set up by the
 * _convertYUV420Internals constructor. */
#define YUV420_CONSTS(cn, bIdx, vIdx) _convertYUV420<cn, bIdx, vIdx> convertYUV420;
1777
1778
#endif
1779
1780
template <int cn> inline void fillAlpha(u8 *, u8 *){}
1781
template <> inline void fillAlpha<4>(u8 *dst1, u8 *dst2)
1782
{
1783
dst1[3] = 255;
1784
dst1[7] = 255;
1785
dst2[3] = 255;
1786
dst2[7] = 255;
1787
}
1788
template <int cn, int bIdx, int vIdx>
1789
inline void convertYUV420ToRGB(const u8 *y1, const u8 *y2, const u8 *uv, u8* dst1, u8 *dst2)
1790
{
1791
int Y11 = y1[0];
1792
int Y12 = y1[1];
1793
int Y21 = y2[0];
1794
int Y22 = y2[1];
1795
1796
int U = uv[1 - vIdx];
1797
int V = uv[vIdx];
1798
1799
int y11 = (COEFF_Y * std::max(16, Y11)) >> 1;
1800
int y12 = (COEFF_Y * std::max(16, Y12)) >> 1;
1801
int y21 = (COEFF_Y * std::max(16, Y21)) >> 1;
1802
int y22 = (COEFF_Y * std::max(16, Y22)) >> 1;
1803
1804
int uvR = COEFF_R + COEFF_RV * V;
1805
int uvG = COEFF_G - COEFF_GU * U - COEFF_GV * V;
1806
int uvB = COEFF_B + COEFF_BU * U;
1807
1808
dst1[2-bIdx] = internal::saturate_cast<u8>((((y11 + uvR) >> 1) + (1 << 4)) >> 5);
1809
dst1[1] = internal::saturate_cast<u8>((((y11 + uvG) >> 1) + (1 << 4)) >> 5);
1810
dst1[bIdx] = internal::saturate_cast<u8>((((y11 + uvB) >> 1) + (1 << 4)) >> 5);
1811
1812
dst1[cn+2-bIdx] = internal::saturate_cast<u8>((((y12 + uvR) >> 1) + (1 << 4)) >> 5);
1813
dst1[cn+1] = internal::saturate_cast<u8>((((y12 + uvG) >> 1) + (1 << 4)) >> 5);
1814
dst1[cn+bIdx] = internal::saturate_cast<u8>((((y12 + uvB) >> 1) + (1 << 4)) >> 5);
1815
1816
dst2[2-bIdx] = internal::saturate_cast<u8>((((y21 + uvR) >> 1) + (1 << 4)) >> 5);
1817
dst2[1] = internal::saturate_cast<u8>((((y21 + uvG) >> 1) + (1 << 4)) >> 5);
1818
dst2[bIdx] = internal::saturate_cast<u8>((((y21 + uvB) >> 1) + (1 << 4)) >> 5);
1819
1820
dst2[cn+2-bIdx] = internal::saturate_cast<u8>((((y22 + uvR) >> 1) + (1 << 4)) >> 5);
1821
dst2[cn+1] = internal::saturate_cast<u8>((((y22 + uvG) >> 1) + (1 << 4)) >> 5);
1822
dst2[cn+bIdx] = internal::saturate_cast<u8>((((y22 + uvB) >> 1) + (1 << 4)) >> 5);
1823
1824
fillAlpha<cn>(dst1, dst2);
1825
}
1826
1827
// converts R, G, B (B, G, R) pixels to RGB(BGR)565 format respectively
1828
inline uint8x16x2_t convertTo565( const uint8x16_t& vR, const uint8x16_t& vG, const uint8x16_t& vB )
1829
{
1830
uint8x16x2_t vRgb565; // rrrrRRRR ggggGGGG bbbbBBBB
1831
1832
vRgb565.val[1] = vsriq_n_u8(vB, vG, 5); // xxxxxxxx bbbbBggg
1833
vRgb565.val[0] = vshlq_n_u8(vG, 3); // gGGGG000 bbbbBggg
1834
vRgb565.val[0] = vsriq_n_u8(vRgb565.val[0], vR, 3); // gGGrrrrR bbbbBggg
1835
1836
return vRgb565;
1837
}
1838
inline void convertTo565( const u16 R, const u16 G, const u16 B, u8 * dst )
1839
{
1840
*((u16*)dst) = (R >> 3)|((G&~3) << 3)|((B&~7) << 8);
1841
}
1842
#endif
1843
1844
} //namespace
1845
1846
void rgb2hsv(const Size2D &size,
1847
const u8 * srcBase, ptrdiff_t srcStride,
1848
u8 * dstBase, ptrdiff_t dstStride,
1849
s32 hrange)
1850
{
1851
internal::assertSupportedConfiguration();
1852
#ifdef CAROTENE_NEON
1853
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
1854
const s32 hsv_shift = 12;
1855
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
1856
register const f32 vsdiv_table = f32(255 << hsv_shift);
1857
register f32 vhdiv_table = f32(hrange << hsv_shift);
1858
register const s32 vhrange = hrange;
1859
register const s32 v0 = s32(0);
1860
register const s32 vshift = s32(1 << (hsv_shift-1));
1861
register const s32 v6 = s32(6);
1862
register const f32 bias = 0.5f;
1863
#endif
1864
1865
for (size_t i = 0u; i < size.height; ++i)
1866
{
1867
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
1868
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
1869
size_t sj = 0u, dj = 0u, j = 0u;
1870
1871
for (; j < roiw8; sj += 24, dj += 24, j += 8)
1872
{
1873
internal::prefetch(src + sj);
1874
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
1875
CONVERT_TO_HSV_ASM(vld3.8 {d0-d2}, d0, d2)
1876
#else
1877
uint8x8x3_t vRgb = vld3_u8(src + sj);
1878
uint8x8x3_t vHsv = convertToHSV(vRgb.val[0], vRgb.val[1], vRgb.val[2], hrange);
1879
vst3_u8(dst + dj, vHsv);
1880
#endif
1881
}
1882
1883
for (; j < size.width; ++j, sj += 3, dj += 3)
1884
{
1885
convertToHSV(src[sj], src[sj+1], src[sj+2], hrange, hsv_shift, dst+dj);
1886
}
1887
}
1888
#else
1889
(void)size;
1890
(void)srcBase;
1891
(void)srcStride;
1892
(void)dstBase;
1893
(void)dstStride;
1894
(void)hrange;
1895
#endif
1896
}
1897
1898
void rgbx2hsv(const Size2D &size,
1899
const u8 * srcBase, ptrdiff_t srcStride,
1900
u8 * dstBase, ptrdiff_t dstStride,
1901
s32 hrange)
1902
{
1903
internal::assertSupportedConfiguration();
1904
#ifdef CAROTENE_NEON
1905
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
1906
const s32 hsv_shift = 12;
1907
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
1908
register const f32 vsdiv_table = f32(255 << hsv_shift);
1909
register f32 vhdiv_table = f32(hrange << hsv_shift);
1910
register const s32 vhrange = hrange;
1911
register const s32 v0 = s32(0);
1912
register const s32 vshift = s32(1 << (hsv_shift-1));
1913
register const s32 v6 = s32(6);
1914
register const f32 bias = 0.5f;
1915
#endif
1916
1917
for (size_t i = 0u; i < size.height; ++i)
1918
{
1919
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
1920
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
1921
size_t sj = 0u, dj = 0u, j = 0u;
1922
1923
for (; j < roiw8; sj += 32, dj += 24, j += 8)
1924
{
1925
internal::prefetch(src + sj);
1926
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
1927
CONVERT_TO_HSV_ASM(vld4.8 {d0-d3}, d0, d2)
1928
#else
1929
uint8x8x4_t vRgb = vld4_u8(src + sj);
1930
uint8x8x3_t vHsv = convertToHSV(vRgb.val[0], vRgb.val[1], vRgb.val[2], hrange);
1931
vst3_u8(dst + dj, vHsv);
1932
#endif
1933
}
1934
1935
for (; j < size.width; ++j, sj += 4, dj += 3)
1936
{
1937
convertToHSV(src[sj], src[sj+1], src[sj+2], hrange, hsv_shift, dst+dj);
1938
}
1939
}
1940
#else
1941
(void)size;
1942
(void)srcBase;
1943
(void)srcStride;
1944
(void)dstBase;
1945
(void)dstStride;
1946
(void)hrange;
1947
#endif
1948
}
1949
1950
void bgr2hsv(const Size2D &size,
1951
const u8 * srcBase, ptrdiff_t srcStride,
1952
u8 * dstBase, ptrdiff_t dstStride,
1953
s32 hrange)
1954
{
1955
internal::assertSupportedConfiguration();
1956
#ifdef CAROTENE_NEON
1957
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
1958
const s32 hsv_shift = 12;
1959
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
1960
register const f32 vsdiv_table = f32(255 << hsv_shift);
1961
register f32 vhdiv_table = f32(hrange << hsv_shift);
1962
register const s32 vhrange = hrange;
1963
register const s32 v0 = s32(0);
1964
register const s32 vshift = s32(1 << (hsv_shift-1));
1965
register const s32 v6 = s32(6);
1966
register const f32 bias = 0.5f;
1967
#endif
1968
1969
for (size_t i = 0u; i < size.height; ++i)
1970
{
1971
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
1972
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
1973
size_t sj = 0u, dj = 0u, j = 0u;
1974
1975
for (; j < roiw8; sj += 24, dj += 24, j += 8)
1976
{
1977
internal::prefetch(src + sj);
1978
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
1979
CONVERT_TO_HSV_ASM(vld3.8 {d0-d2}, d2, d0)
1980
#else
1981
uint8x8x3_t vRgb = vld3_u8(src + sj);
1982
uint8x8x3_t vHsv = convertToHSV(vRgb.val[2], vRgb.val[1], vRgb.val[0], hrange);
1983
vst3_u8(dst + dj, vHsv);
1984
#endif
1985
}
1986
1987
for (; j < size.width; ++j, sj += 3, dj += 3)
1988
{
1989
convertToHSV(src[sj+2], src[sj+1], src[sj], hrange, hsv_shift, dst+dj);
1990
}
1991
}
1992
#else
1993
(void)size;
1994
(void)srcBase;
1995
(void)srcStride;
1996
(void)dstBase;
1997
(void)dstStride;
1998
(void)hrange;
1999
#endif
2000
}
2001
2002
void bgrx2hsv(const Size2D &size,
2003
const u8 * srcBase, ptrdiff_t srcStride,
2004
u8 * dstBase, ptrdiff_t dstStride,
2005
s32 hrange)
2006
{
2007
internal::assertSupportedConfiguration();
2008
#ifdef CAROTENE_NEON
2009
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
2010
const s32 hsv_shift = 12;
2011
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
2012
register const f32 vsdiv_table = f32(255 << hsv_shift);
2013
register f32 vhdiv_table = f32(hrange << hsv_shift);
2014
register const s32 vhrange = hrange;
2015
register const s32 v0 = s32(0);
2016
register const s32 vshift = s32(1 << (hsv_shift-1));
2017
register const s32 v6 = s32(6);
2018
register const f32 bias = 0.5f;
2019
#endif
2020
2021
for (size_t i = 0u; i < size.height; ++i)
2022
{
2023
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
2024
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
2025
size_t sj = 0u, dj = 0u, j = 0u;
2026
2027
for (; j < roiw8; sj += 32, dj += 24, j += 8)
2028
{
2029
internal::prefetch(src + sj);
2030
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
2031
CONVERT_TO_HSV_ASM(vld4.8 {d0-d3}, d2, d0)
2032
#else
2033
uint8x8x4_t vRgb = vld4_u8(src + sj);
2034
uint8x8x3_t vHsv = convertToHSV(vRgb.val[2], vRgb.val[1], vRgb.val[0], hrange);
2035
vst3_u8(dst + dj, vHsv);
2036
#endif
2037
}
2038
2039
for (; j < size.width; ++j, sj += 4, dj += 3)
2040
{
2041
convertToHSV(src[sj+2], src[sj+1], src[sj], hrange, hsv_shift, dst+dj);
2042
}
2043
}
2044
#else
2045
(void)size;
2046
(void)srcBase;
2047
(void)srcStride;
2048
(void)dstBase;
2049
(void)dstStride;
2050
(void)hrange;
2051
#endif
2052
}
2053
2054
void rgbx2bgr565(const Size2D &size,
2055
const u8 * srcBase, ptrdiff_t srcStride,
2056
u8 * dstBase, ptrdiff_t dstStride)
2057
{
2058
internal::assertSupportedConfiguration();
2059
#ifdef CAROTENE_NEON
2060
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
2061
2062
for (size_t i = 0u; i < size.height; ++i)
2063
{
2064
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
2065
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
2066
size_t sj = 0u, dj = 0u, j = 0u;
2067
2068
for (; j < roiw16; sj += 64, dj += 32, j += 16)
2069
{
2070
internal::prefetch(src + sj);
2071
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
2072
__asm__ (
2073
"vld4.8 {d2, d4, d6, d8}, [%[in0]] @ q0 q1 q2 q3 q4 \n\t"
2074
"vld4.8 {d3, d5, d7, d9}, [%[in1]] @ xxxxxxxx rrrrRRRR ggggGGGG bbbbBBBB xxxxxxxx \n\t"
2075
"vsri.8 q1, q2, #5 @ xxxxxxxx rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t"
2076
"vshl.u8 q0, q2, #3 @ gGGGG000 rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t"
2077
"vsri.8 q0, q3, #3 @ gGGbbbbB rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t"
2078
"vst2.8 {d0, d2}, [%[out0]] \n\t"
2079
"vst2.8 {d1, d3}, [%[out1]] \n\t"
2080
: /*no output*/
2081
: [out0] "r" (dst + dj),
2082
[out1] "r" (dst + dj + 16),
2083
[in0] "r" (src + sj),
2084
[in1] "r" (src + sj + 32)
2085
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9"
2086
);
2087
#else
2088
uint8x16x4_t vRgba = vld4q_u8(src + sj);
2089
uint8x16x2_t vVal565 = convertTo565(vRgba.val[2], vRgba.val[1], vRgba.val[0]);
2090
vst2q_u8(dst + dj, vVal565);
2091
#endif
2092
}
2093
2094
for (; j < size.width; ++j, sj += 4, dj += 2)
2095
{
2096
convertTo565(src[sj + 2], src[sj + 1], src[sj], dst + dj);
2097
}
2098
}
2099
#else
2100
(void)size;
2101
(void)srcBase;
2102
(void)srcStride;
2103
(void)dstBase;
2104
(void)dstStride;
2105
#endif
2106
}
2107
2108
void rgb2bgr565(const Size2D &size,
2109
const u8 * srcBase, ptrdiff_t srcStride,
2110
u8 * dstBase, ptrdiff_t dstStride)
2111
{
2112
internal::assertSupportedConfiguration();
2113
#ifdef CAROTENE_NEON
2114
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
2115
2116
for (size_t i = 0u; i < size.height; ++i)
2117
{
2118
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
2119
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
2120
size_t sj = 0u, dj = 0u, j = 0u;
2121
2122
for (; j < roiw16; sj += 48, dj += 32, j += 16)
2123
{
2124
internal::prefetch(src + sj);
2125
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
2126
__asm__ (
2127
"vld3.8 {d2, d4, d6}, [%[in0]] @ q0 q1 q2 q3 q4 \n\t"
2128
"vld3.8 {d3, d5, d7}, [%[in1]] @ xxxxxxxx rrrrRRRR ggggGGGG bbbbBBBB xxxxxxxx \n\t"
2129
"vsri.8 q1, q2, #5 @ xxxxxxxx rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t"
2130
"vshl.u8 q0, q2, #3 @ gGGGG000 rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t"
2131
"vsri.8 q0, q3, #3 @ gGGbbbbB rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t"
2132
"vst2.8 {d0, d2}, [%[out0]] \n\t"
2133
"vst2.8 {d1, d3}, [%[out1]] \n\t"
2134
: /*no output*/
2135
: [out0] "r" (dst + dj),
2136
[out1] "r" (dst + dj + 16),
2137
[in0] "r" (src + sj),
2138
[in1] "r" (src + sj + 24)
2139
: "d0","d1","d2","d3","d4","d5","d6","d7"
2140
);
2141
#else
2142
uint8x16x3_t vRgba = vld3q_u8(src + sj);
2143
uint8x16x2_t vVal565 = convertTo565(vRgba.val[2], vRgba.val[1], vRgba.val[0]);
2144
vst2q_u8(dst + dj, vVal565);
2145
#endif
2146
}
2147
2148
for (; j < size.width; ++j, sj += 3, dj += 2)
2149
{
2150
convertTo565(src[sj + 2], src[sj + 1], src[sj], dst + dj);
2151
}
2152
}
2153
#else
2154
(void)size;
2155
(void)srcBase;
2156
(void)srcStride;
2157
(void)dstBase;
2158
(void)dstStride;
2159
#endif
2160
}
2161
2162
void rgbx2rgb565(const Size2D &size,
2163
const u8 * srcBase, ptrdiff_t srcStride,
2164
u8 * dstBase, ptrdiff_t dstStride)
2165
{
2166
internal::assertSupportedConfiguration();
2167
#ifdef CAROTENE_NEON
2168
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
2169
2170
for (size_t i = 0u; i < size.height; ++i)
2171
{
2172
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
2173
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
2174
size_t sj = 0u, dj = 0u, j = 0u;
2175
2176
for (; j < roiw16; sj += 64, dj += 32, j += 16)
2177
{
2178
internal::prefetch(src + sj);
2179
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
2180
__asm__ (
2181
"vld4.8 {d0, d2, d4, d6}, [%[in0]] @ q0 q1 q2 q3 \n\t"
2182
"vld4.8 {d1, d3, d5, d7}, [%[in1]] @ rrrrRRRR ggggGGGG bbbbBBBB aaaaAAAA \n\t"
2183
"vsri.8 q2, q1, #5 @ rrrrRRRR ggggGGGG bbbbBggg aaaaAAAA \n\t"
2184
"vshl.u8 q1, #3 @ rrrrRRRR gGGGG000 bbbbBggg aaaaAAAA \n\t"
2185
"vsri.8 q1, q0, #3 @ rrrrRRRR gGGrrrrR bbbbBggg aaaaAAAA \n\t"
2186
"vst2.8 {d2, d4}, [%[out0]] \n\t"
2187
"vst2.8 {d3, d5}, [%[out1]] \n\t"
2188
: /*no output*/
2189
: [out0] "r" (dst + dj),
2190
[out1] "r" (dst + dj + 16),
2191
[in0] "r" (src + sj),
2192
[in1] "r" (src + sj + 32)
2193
: "d0","d1","d2","d3","d4","d5","d6","d7"
2194
);
2195
#else
2196
uint8x16x4_t vRgba = vld4q_u8(src + sj);
2197
uint8x16x2_t vVal565 = convertTo565(vRgba.val[0], vRgba.val[1], vRgba.val[2]);
2198
vst2q_u8(dst + dj, vVal565);
2199
#endif
2200
}
2201
2202
for (; j < size.width; ++j, sj += 4, dj += 2)
2203
{
2204
convertTo565(src[sj], src[sj + 1], src[sj + 2], dst + dj);
2205
}
2206
}
2207
#else
2208
(void)size;
2209
(void)srcBase;
2210
(void)srcStride;
2211
(void)dstBase;
2212
(void)dstStride;
2213
#endif
2214
}
2215
2216
void rgb2rgb565(const Size2D &size,
2217
const u8 * srcBase, ptrdiff_t srcStride,
2218
u8 * dstBase, ptrdiff_t dstStride)
2219
{
2220
internal::assertSupportedConfiguration();
2221
#ifdef CAROTENE_NEON
2222
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
2223
2224
for (size_t i = 0u; i < size.height; ++i)
2225
{
2226
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
2227
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
2228
size_t sj = 0u, dj = 0u, j = 0u;
2229
2230
for (; j < roiw16; sj += 48, dj += 32, j += 16)
2231
{
2232
internal::prefetch(src + sj);
2233
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
2234
__asm__ (
2235
"vld3.8 {d0, d2, d4}, [%[in0]] @ q0 q1 q2 q3 \n\t"
2236
"vld3.8 {d1, d3, d5}, [%[in1]] @ rrrrRRRR ggggGGGG bbbbBBBB xxxxxxxx \n\t"
2237
"vsri.8 q2, q1, #5 @ rrrrRRRR ggggGGGG bbbbBggg xxxxxxxx \n\t"
2238
"vshl.u8 q1, #3 @ rrrrRRRR gGGGG000 bbbbBggg xxxxxxxx \n\t"
2239
"vsri.8 q1, q0, #3 @ rrrrRRRR gGGrrrrR bbbbBggg xxxxxxxx \n\t"
2240
"vst2.8 {d2, d4}, [%[out0]] \n\t"
2241
"vst2.8 {d3, d5}, [%[out1]] \n\t"
2242
: /*no output*/
2243
: [out0] "r" (dst + dj),
2244
[out1] "r" (dst + dj + 16),
2245
[in0] "r" (src + sj),
2246
[in1] "r" (src + sj + 24)
2247
: "d0","d1","d2","d3","d4","d5"
2248
);
2249
#else
2250
uint8x16x3_t vRgba = vld3q_u8(src + sj);
2251
uint8x16x2_t vVal565 = convertTo565(vRgba.val[0], vRgba.val[1], vRgba.val[2]);
2252
vst2q_u8(dst + dj, vVal565);
2253
#endif
2254
}
2255
2256
for (; j < size.width; ++j, sj += 3, dj += 2)
2257
{
2258
convertTo565(src[sj], src[sj + 1], src[sj + 2], dst + dj);
2259
}
2260
}
2261
#else
2262
(void)size;
2263
(void)srcBase;
2264
(void)srcStride;
2265
(void)dstBase;
2266
(void)dstStride;
2267
#endif
2268
}
2269
2270
void rgb2ycrcb(const Size2D &size,
2271
const u8 * srcBase, ptrdiff_t srcStride,
2272
u8 * dstBase, ptrdiff_t dstStride)
2273
{
2274
internal::assertSupportedConfiguration();
2275
#ifdef CAROTENE_NEON
2276
YCRCB_CONSTS
2277
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
2278
2279
for (size_t i = 0u; i < size.height; ++i)
2280
{
2281
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
2282
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
2283
size_t sj = 0u, dj = 0u, j = 0u;
2284
2285
for (; j < roiw8; sj += 24, dj += 24, j += 8)
2286
{
2287
internal::prefetch(src + sj);
2288
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
2289
CONVERTTOYCRCB(vld3.8 {d0-d2}, d0, d1, d2)
2290
#else
2291
uint8x8x3_t vRgb = vld3_u8(src + sj);
2292
int16x8_t vR = vreinterpretq_s16_u16(vmovl_u8(vRgb.val[0]));
2293
int16x8_t vG = vreinterpretq_s16_u16(vmovl_u8(vRgb.val[1]));
2294
int16x8_t vB = vreinterpretq_s16_u16(vmovl_u8(vRgb.val[2]));
2295
uint8x8x3_t vYCrCb = convertToYCrCb(vR, vG, vB, vcYRG, vcYB, vcCrGB, vcCbRG);
2296
vst3_u8(dst + dj, vYCrCb);
2297
#endif
2298
}
2299
2300
for (; j < size.width; ++j, sj += 3, dj += 3)
2301
{
2302
S_CONVERTTOYCRCB(src[sj], src[sj + 1], src[sj + 2]);
2303
}
2304
}
2305
#else
2306
(void)size;
2307
(void)srcBase;
2308
(void)srcStride;
2309
(void)dstBase;
2310
(void)dstStride;
2311
#endif
2312
}
2313
2314
void rgbx2ycrcb(const Size2D &size,
2315
const u8 * srcBase, ptrdiff_t srcStride,
2316
u8 * dstBase, ptrdiff_t dstStride)
2317
{
2318
internal::assertSupportedConfiguration();
2319
#ifdef CAROTENE_NEON
2320
YCRCB_CONSTS
2321
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
2322
2323
for (size_t i = 0u; i < size.height; ++i)
2324
{
2325
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
2326
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
2327
size_t sj = 0u, dj = 0u, j = 0u;
2328
2329
for (; j < roiw8; sj += 32, dj += 24, j += 8)
2330
{
2331
internal::prefetch(src + sj);
2332
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
2333
CONVERTTOYCRCB(vld4.8 {d0-d3}, d0, d1, d2)
2334
#else
2335
uint8x8x4_t vRgba = vld4_u8(src + sj);
2336
int16x8_t vR = vreinterpretq_s16_u16(vmovl_u8(vRgba.val[0]));
2337
int16x8_t vG = vreinterpretq_s16_u16(vmovl_u8(vRgba.val[1]));
2338
int16x8_t vB = vreinterpretq_s16_u16(vmovl_u8(vRgba.val[2]));
2339
uint8x8x3_t vYCrCb = convertToYCrCb(vR, vG, vB, vcYRG, vcYB, vcCrGB, vcCbRG);
2340
vst3_u8(dst + dj, vYCrCb);
2341
#endif
2342
}
2343
2344
for (; j < size.width; ++j, sj += 4, dj += 3)
2345
{
2346
S_CONVERTTOYCRCB(src[sj], src[sj + 1], src[sj + 2]);
2347
}
2348
}
2349
#else
2350
(void)size;
2351
(void)srcBase;
2352
(void)srcStride;
2353
(void)dstBase;
2354
(void)dstStride;
2355
#endif
2356
}
2357
2358
void bgr2ycrcb(const Size2D &size,
2359
const u8 * srcBase, ptrdiff_t srcStride,
2360
u8 * dstBase, ptrdiff_t dstStride)
2361
{
2362
internal::assertSupportedConfiguration();
2363
#ifdef CAROTENE_NEON
2364
YCRCB_CONSTS
2365
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
2366
2367
for (size_t i = 0u; i < size.height; ++i)
2368
{
2369
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
2370
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
2371
size_t sj = 0u, dj = 0u, j = 0u;
2372
2373
for (; j < roiw8; sj += 24, dj += 24, j += 8)
2374
{
2375
internal::prefetch(src + sj);
2376
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
2377
CONVERTTOYCRCB(vld3.8 {d0-d2}, d2, d1, d0)
2378
#else
2379
uint8x8x3_t vBgr = vld3_u8(src + sj);
2380
int16x8_t vB = vreinterpretq_s16_u16(vmovl_u8(vBgr.val[0]));
2381
int16x8_t vG = vreinterpretq_s16_u16(vmovl_u8(vBgr.val[1]));
2382
int16x8_t vR = vreinterpretq_s16_u16(vmovl_u8(vBgr.val[2]));
2383
uint8x8x3_t vYCrCb = convertToYCrCb(vR, vG, vB, vcYRG, vcYB, vcCrGB, vcCbRG);
2384
vst3_u8(dst + dj, vYCrCb);
2385
#endif
2386
}
2387
2388
for (; j < size.width; ++j, sj += 3, dj += 3)
2389
{
2390
S_CONVERTTOYCRCB(src[sj + 2], src[sj + 1], src[sj]);
2391
}
2392
}
2393
#else
2394
(void)size;
2395
(void)srcBase;
2396
(void)srcStride;
2397
(void)dstBase;
2398
(void)dstStride;
2399
#endif
2400
}
2401
2402
void bgrx2ycrcb(const Size2D &size,
2403
const u8 * srcBase, ptrdiff_t srcStride,
2404
u8 * dstBase, ptrdiff_t dstStride)
2405
{
2406
internal::assertSupportedConfiguration();
2407
#ifdef CAROTENE_NEON
2408
YCRCB_CONSTS
2409
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
2410
2411
for (size_t i = 0u; i < size.height; ++i)
2412
{
2413
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
2414
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
2415
size_t sj = 0u, dj = 0u, j = 0u;
2416
2417
for (; j < roiw8; sj += 32, dj += 24, j += 8)
2418
{
2419
internal::prefetch(src + sj);
2420
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
2421
CONVERTTOYCRCB(vld4.8 {d0-d3}, d2, d1, d0)
2422
#else
2423
uint8x8x4_t vBgra = vld4_u8(src + sj);
2424
int16x8_t vB = vreinterpretq_s16_u16(vmovl_u8(vBgra.val[0]));
2425
int16x8_t vG = vreinterpretq_s16_u16(vmovl_u8(vBgra.val[1]));
2426
int16x8_t vR = vreinterpretq_s16_u16(vmovl_u8(vBgra.val[2]));
2427
uint8x8x3_t vYCrCb = convertToYCrCb(vR, vG, vB, vcYRG, vcYB, vcCrGB, vcCbRG);
2428
vst3_u8(dst + dj, vYCrCb);
2429
#endif
2430
}
2431
2432
for (; j < size.width; ++j, sj += 4, dj += 3)
2433
{
2434
S_CONVERTTOYCRCB(src[sj + 2], src[sj + 1], src[sj]);
2435
}
2436
}
2437
#else
2438
(void)size;
2439
(void)srcBase;
2440
(void)srcStride;
2441
(void)dstBase;
2442
(void)dstStride;
2443
#endif
2444
}
2445
2446
void yuv420sp2rgb(const Size2D &size,
2447
const u8 * yBase, ptrdiff_t yStride,
2448
const u8 * uvBase, ptrdiff_t uvStride,
2449
u8 * dstBase, ptrdiff_t dstStride)
2450
{
2451
// input data:
2452
////////////// Y matrix:
2453
// {y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15, y16}
2454
// {Y1, Y2, Y3, Y4, Y5, Y6, Y7, Y8, Y9, Y10, Y11, Y12, Y13, Y14, Y15, Y16}
2455
////////////// UV matrix:
2456
// {v12, u12, v34, u34, v56, u56, v78, u78, v90 u90, V12, U12, V34, U34, V56, U56}
2457
2458
// fp version
2459
// R = 1.164(Y - 16) + 1.596(V - 128)
2460
// G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128)
2461
// B = 1.164(Y - 16) + 2.018(U - 128)
2462
2463
// integer version
2464
// R = [((149*y)/2 + (-14248+102*v) )/2]/32
2465
// G = [((149*y)/2 + ((8663- 25*u)-52*v))/2]/32
2466
// B = [((149*y)/2 + (-17705+129*u) )/2]/32
2467
2468
// error estimation:
2469
//Rerr = 0.0000625 * y - 0.00225 * v - 0.287
2470
//Gerr = 0.0000625 * y + 0.0005 * v + 0.000375 * u + 0.128625
2471
//Berr = 0.0000625 * y - 0.002375 * u - 0.287375
2472
2473
//real error test:
2474
//=================
2475
//R: 1 less: 520960 == 3.11% of full space
2476
//G: 1 less: 251425 == 1.50% of full space
2477
//B: 1 less: 455424 == 2.71% of full space
2478
//=================
2479
//R: 1 more: 642048 == 3.83% of full space
2480
//G: 1 more: 192458 == 1.15% of full space
2481
//B: 1 more: 445184 == 2.65% of full space
2482
2483
internal::assertSupportedConfiguration();
2484
#ifdef CAROTENE_NEON
2485
YUV420_CONSTS(3, 2, 0)
2486
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
2487
2488
for (size_t i = 0u; i < size.height; i+=2)
2489
{
2490
const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1);
2491
const u8 * y1 = internal::getRowPtr(yBase, yStride, i);
2492
const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1);
2493
u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i);
2494
u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1);
2495
2496
size_t dj = 0u, j = 0u;
2497
for (; j < roiw16; dj += 48, j += 16)
2498
{
2499
internal::prefetch(uv + j);
2500
internal::prefetch(y1 + j);
2501
internal::prefetch(y2 + j);
2502
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
2503
CONVERTYUV420TORGB(3, d1, d0, q5, q6)
2504
#else
2505
convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj);
2506
#endif
2507
}
2508
for (; j + 2 <= size.width; j+=2, dj += 6)
2509
{
2510
convertYUV420ToRGB<3, 2, 0>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj);
2511
}
2512
}
2513
#else
2514
(void)size;
2515
(void)yBase;
2516
(void)yStride;
2517
(void)uvBase;
2518
(void)uvStride;
2519
(void)dstBase;
2520
(void)dstStride;
2521
#endif
2522
}
2523
2524
void yuv420sp2rgbx(const Size2D &size,
2525
const u8 * yBase, ptrdiff_t yStride,
2526
const u8 * uvBase, ptrdiff_t uvStride,
2527
u8 * dstBase, ptrdiff_t dstStride)
2528
{
2529
internal::assertSupportedConfiguration();
2530
#ifdef CAROTENE_NEON
2531
YUV420_CONSTS(4, 2, 0)
2532
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
2533
2534
for (size_t i = 0u; i < size.height; i+=2)
2535
{
2536
const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1);
2537
const u8 * y1 = internal::getRowPtr(yBase, yStride, i);
2538
const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1);
2539
u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i);
2540
u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1);
2541
2542
size_t dj = 0u, j = 0u;
2543
for (; j < roiw16; dj += 64, j += 16)
2544
{
2545
internal::prefetch(uv + j);
2546
internal::prefetch(y1 + j);
2547
internal::prefetch(y2 + j);
2548
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
2549
CONVERTYUV420TORGB(4, d1, d0, q5, q6)
2550
#else
2551
convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj);
2552
#endif
2553
}
2554
for (; j + 2 <= size.width; j+=2, dj += 8)
2555
{
2556
convertYUV420ToRGB<4, 2, 0>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj);
2557
}
2558
}
2559
#else
2560
(void)size;
2561
(void)yBase;
2562
(void)yStride;
2563
(void)uvBase;
2564
(void)uvStride;
2565
(void)dstBase;
2566
(void)dstStride;
2567
#endif
2568
}
2569
2570
void yuv420i2rgb(const Size2D &size,
2571
const u8 * yBase, ptrdiff_t yStride,
2572
const u8 * uvBase, ptrdiff_t uvStride,
2573
u8 * dstBase, ptrdiff_t dstStride)
2574
{
2575
internal::assertSupportedConfiguration();
2576
#ifdef CAROTENE_NEON
2577
YUV420_CONSTS(3, 2, 1)
2578
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
2579
2580
for (size_t i = 0u; i < size.height; i+=2)
2581
{
2582
const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1);
2583
const u8 * y1 = internal::getRowPtr(yBase, yStride, i);
2584
const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1);
2585
u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i);
2586
u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1);
2587
2588
size_t dj = 0u, j = 0u;
2589
for (; j < roiw16; dj += 48, j += 16)
2590
{
2591
internal::prefetch(uv + j);
2592
internal::prefetch(y1 + j);
2593
internal::prefetch(y2 + j);
2594
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
2595
CONVERTYUV420TORGB(3, d0, d1, q5, q6)
2596
#else
2597
convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj);
2598
#endif
2599
}
2600
for (; j + 2 <= size.width; j+=2, dj += 6)
2601
{
2602
convertYUV420ToRGB<3, 2, 1>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj);
2603
}
2604
}
2605
#else
2606
(void)size;
2607
(void)yBase;
2608
(void)yStride;
2609
(void)uvBase;
2610
(void)uvStride;
2611
(void)dstBase;
2612
(void)dstStride;
2613
#endif
2614
}
2615
2616
void yuv420i2rgbx(const Size2D &size,
2617
const u8 * yBase, ptrdiff_t yStride,
2618
const u8 * uvBase, ptrdiff_t uvStride,
2619
u8 * dstBase, ptrdiff_t dstStride)
2620
{
2621
internal::assertSupportedConfiguration();
2622
#ifdef CAROTENE_NEON
2623
YUV420_CONSTS(4, 2, 1)
2624
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
2625
2626
for (size_t i = 0u; i < size.height; i+=2)
2627
{
2628
const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1);
2629
const u8 * y1 = internal::getRowPtr(yBase, yStride, i);
2630
const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1);
2631
u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i);
2632
u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1);
2633
2634
size_t dj = 0u, j = 0u;
2635
for (; j < roiw16; dj += 64, j += 16)
2636
{
2637
internal::prefetch(uv + j);
2638
internal::prefetch(y1 + j);
2639
internal::prefetch(y2 + j);
2640
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
2641
CONVERTYUV420TORGB(4, d0, d1, q5, q6)
2642
#else
2643
convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj);
2644
#endif
2645
}
2646
for (; j + 2 <= size.width; j+=2, dj += 8)
2647
{
2648
convertYUV420ToRGB<4, 2, 1>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj);
2649
}
2650
}
2651
#else
2652
(void)size;
2653
(void)yBase;
2654
(void)yStride;
2655
(void)uvBase;
2656
(void)uvStride;
2657
(void)dstBase;
2658
(void)dstStride;
2659
#endif
2660
}
2661
2662
void yuv420sp2bgr(const Size2D &size,
2663
const u8 * yBase, ptrdiff_t yStride,
2664
const u8 * uvBase, ptrdiff_t uvStride,
2665
u8 * dstBase, ptrdiff_t dstStride)
2666
{
2667
internal::assertSupportedConfiguration();
2668
#ifdef CAROTENE_NEON
2669
YUV420_CONSTS(3, 0, 0)
2670
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
2671
2672
for (size_t i = 0u; i < size.height; i+=2)
2673
{
2674
const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1);
2675
const u8 * y1 = internal::getRowPtr(yBase, yStride, i);
2676
const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1);
2677
u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i);
2678
u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1);
2679
2680
size_t dj = 0u, j = 0u;
2681
for (; j < roiw16; dj += 48, j += 16)
2682
{
2683
internal::prefetch(uv + j);
2684
internal::prefetch(y1 + j);
2685
internal::prefetch(y2 + j);
2686
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
2687
CONVERTYUV420TORGB(3, d1, d0, q6, q5)
2688
#else
2689
convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj);
2690
#endif
2691
}
2692
for (; j + 2 <= size.width; j+=2, dj += 6)
2693
{
2694
convertYUV420ToRGB<3, 0, 0>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj);
2695
}
2696
}
2697
#else
2698
(void)size;
2699
(void)yBase;
2700
(void)yStride;
2701
(void)uvBase;
2702
(void)uvStride;
2703
(void)dstBase;
2704
(void)dstStride;
2705
#endif
2706
}
2707
2708
void yuv420sp2bgrx(const Size2D &size,
2709
const u8 * yBase, ptrdiff_t yStride,
2710
const u8 * uvBase, ptrdiff_t uvStride,
2711
u8 * dstBase, ptrdiff_t dstStride)
2712
{
2713
internal::assertSupportedConfiguration();
2714
#ifdef CAROTENE_NEON
2715
YUV420_CONSTS(4, 0, 0)
2716
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
2717
2718
for (size_t i = 0u; i < size.height; i+=2)
2719
{
2720
const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1);
2721
const u8 * y1 = internal::getRowPtr(yBase, yStride, i);
2722
const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1);
2723
u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i);
2724
u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1);
2725
2726
size_t dj = 0u, j = 0u;
2727
for (; j < roiw16; dj += 64, j += 16)
2728
{
2729
internal::prefetch(uv + j);
2730
internal::prefetch(y1 + j);
2731
internal::prefetch(y2 + j);
2732
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
2733
CONVERTYUV420TORGB(4, d1, d0, q6, q5)
2734
#else
2735
convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj);
2736
#endif
2737
}
2738
for (; j + 2 <= size.width; j+=2, dj += 8)
2739
{
2740
convertYUV420ToRGB<4, 0, 0>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj);
2741
}
2742
}
2743
#else
2744
(void)size;
2745
(void)yBase;
2746
(void)yStride;
2747
(void)uvBase;
2748
(void)uvStride;
2749
(void)dstBase;
2750
(void)dstStride;
2751
#endif
2752
}
2753
2754
void yuv420i2bgr(const Size2D &size,
2755
const u8 * yBase, ptrdiff_t yStride,
2756
const u8 * uvBase, ptrdiff_t uvStride,
2757
u8 * dstBase, ptrdiff_t dstStride)
2758
{
2759
internal::assertSupportedConfiguration();
2760
#ifdef CAROTENE_NEON
2761
YUV420_CONSTS(3, 0, 1)
2762
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
2763
2764
for (size_t i = 0u; i < size.height; i+=2)
2765
{
2766
const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1);
2767
const u8 * y1 = internal::getRowPtr(yBase, yStride, i);
2768
const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1);
2769
u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i);
2770
u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1);
2771
2772
size_t dj = 0u, j = 0u;
2773
for (; j < roiw16; dj += 48, j += 16)
2774
{
2775
internal::prefetch(uv + j);
2776
internal::prefetch(y1 + j);
2777
internal::prefetch(y2 + j);
2778
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
2779
CONVERTYUV420TORGB(3, d0, d1, q6, q5)
2780
#else
2781
convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj);
2782
#endif
2783
}
2784
for (; j + 2 <= size.width; j+=2, dj += 6)
2785
{
2786
convertYUV420ToRGB<3, 0, 1>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj);
2787
}
2788
}
2789
#else
2790
(void)size;
2791
(void)yBase;
2792
(void)yStride;
2793
(void)uvBase;
2794
(void)uvStride;
2795
(void)dstBase;
2796
(void)dstStride;
2797
#endif
2798
}
2799
2800
void yuv420i2bgrx(const Size2D &size,
2801
const u8 * yBase, ptrdiff_t yStride,
2802
const u8 * uvBase, ptrdiff_t uvStride,
2803
u8 * dstBase, ptrdiff_t dstStride)
2804
{
2805
internal::assertSupportedConfiguration();
2806
#ifdef CAROTENE_NEON
2807
YUV420_CONSTS(4, 0, 1)
2808
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
2809
2810
for (size_t i = 0u; i < size.height; i+=2)
2811
{
2812
const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1);
2813
const u8 * y1 = internal::getRowPtr(yBase, yStride, i);
2814
const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1);
2815
u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i);
2816
u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1);
2817
2818
size_t dj = 0u, j = 0u;
2819
for (; j < roiw16; dj += 64, j += 16)
2820
{
2821
internal::prefetch(uv + j);
2822
internal::prefetch(y1 + j);
2823
internal::prefetch(y2 + j);
2824
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
2825
CONVERTYUV420TORGB(4, d0, d1, q6, q5)
2826
#else
2827
convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj);
2828
#endif
2829
}
2830
for (; j + 2 <= size.width; j+=2, dj += 8)
2831
{
2832
convertYUV420ToRGB<4, 0, 1>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj);
2833
}
2834
}
2835
#else
2836
(void)size;
2837
(void)yBase;
2838
(void)yStride;
2839
(void)uvBase;
2840
(void)uvStride;
2841
(void)dstBase;
2842
(void)dstStride;
2843
#endif
2844
}
2845
2846
} // namespace CAROTENE_NS
2847
2848