Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/3rdparty/carotene/src/convert.cpp
16337 views
1
/*
 * By downloading, copying, installing or using the software you agree to this license.
 * If you do not agree to this license, do not download, install,
 * copy or use the software.
 *
 *
 *                           License Agreement
 *                For Open Source Computer Vision Library
 *                        (3-clause BSD License)
 *
 * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
 * Third party copyrights are property of their respective owners.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the names of the copyright holders nor the names of the contributors
 *     may be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * This software is provided by the copyright holders and contributors "as is" and
 * any express or implied warranties, including, but not limited to, the implied
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall copyright holders or contributors be liable for any direct,
 * indirect, incidental, special, exemplary, or consequential damages
 * (including, but not limited to, procurement of substitute goods or services;
 * loss of use, data, or profits; or business interruption) however caused
 * and on any theory of liability, whether in contract, strict liability,
 * or tort (including negligence or otherwise) arising in any way out of
 * the use of this software, even if advised of the possibility of such damage.
 */
39
40
#include "common.hpp"
41
42
namespace CAROTENE_NS {
43
44
#ifdef CAROTENE_NEON

/*
 * CVT_FUNC expands to one `convert()` overload that converts a 2D image of
 * element type T1 to element type T2 with saturate_cast semantics.
 *
 * T1, T2     - source / destination element types (u8, s8, u16, s16, s32, f32)
 * SIMD_SIZE  - number of elements processed per vector iteration (power of two)
 * CVTINIT    - statements run once before the row loop (constant registers etc.)
 * CVTROW     - the vectorized inner loop; sees `_src`, `_dst` and `w`, the
 *              width rounded down to a multiple of SIMD_SIZE
 *
 * The scalar tail loop at the bottom covers the remaining
 * (size.width - w) columns of every row using internal::saturate_cast.
 */
#define CVT_FUNC(T1, T2, SIMD_SIZE, CVTINIT, CVTROW)                                \
    void convert(const Size2D &_size,                                               \
                 const T1 * srcBase, ptrdiff_t srcStride,                           \
                 T2 * dstBase, ptrdiff_t dstStride)                                 \
    {                                                                               \
        internal::assertSupportedConfiguration();                                   \
        Size2D size(_size);                                                         \
        /* Collapse the image to a single long row when both strides equal the   */ \
        /* row width.  NOTE(review): strides are in bytes and width in elements, */ \
        /* so this only fires for 1-byte types — conservative but safe; confirm  */ \
        /* against callers before widening the condition.                        */ \
        if (srcStride == dstStride &&                                               \
            srcStride == (ptrdiff_t)(size.width))                                   \
        {                                                                           \
            size.width *= size.height;                                              \
            size.height = 1;                                                        \
        }                                                                           \
        const ptrdiff_t sstep = srcStride / sizeof(T1);                             \
        const ptrdiff_t dstep = dstStride / sizeof(T2);                             \
        const size_t w = size.width & ~(SIMD_SIZE-1);                               \
        if (size.width >= SIMD_SIZE)                                                \
        {                                                                           \
            const T1* _src = srcBase;                                               \
            T2* _dst = dstBase;                                                     \
            CVTINIT                                                                 \
            for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep )     \
                CVTROW                                                              \
        }                                                                           \
        if(w < size.width)                                                          \
        {                                                                           \
            const T1* _src = srcBase;                                               \
            T2* _dst = dstBase;                                                     \
            for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep )     \
                for(size_t i = w; i < size.width; i++ )                             \
                    _dst[i] = internal::saturate_cast<T2>(_src[i]);                 \
        }                                                                           \
    }

#else

/* Without NEON support the conversion entry points only assert; callers are
 * expected to check isSupportedConfiguration() first. */
#define CVT_FUNC(T1, T2, SIMD_SIZE, CVTINIT, CVTROW)                                \
    void convert(const Size2D &,                                                    \
                 const T1 *, ptrdiff_t,                                             \
                 T2 *, ptrdiff_t)                                                   \
    {                                                                               \
        internal::assertSupportedConfiguration();                                   \
    }

#endif

CVT_FUNC(u8, s8, 16,
93
uint8x16_t v127 = vdupq_n_u8(127);,
94
{
95
for (size_t i = 0; i < w; i += 16)
96
{
97
internal::prefetch(_src + i);
98
uint8x16_t vu8 = vld1q_u8(_src + i);
99
int8x16_t vu1 = vreinterpretq_s8_u8(vminq_u8(vu8, v127));
100
vst1q_s8(_dst + i, vu1);
101
}
102
})
103
104
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
105
CVT_FUNC(u8, u16, 16,
106
register uint8x16_t zero0 asm ("q1") = vmovq_n_u8(0);,
107
{
108
for (size_t i = 0; i < w; i += 16)
109
{
110
internal::prefetch(_src + i);
111
__asm__ (
112
"vld1.8 {d0-d1}, [%[src]] \n\t"
113
"vst2.8 {d0,d2}, [%[dst1]] \n\t"
114
"vst2.8 {d1,d3}, [%[dst2]] \n\t"
115
: /*no output*/
116
: [src] "r" (_src + i),
117
[dst1] "r" (_dst + i + 0),
118
[dst2] "r" (_dst + i + 8),
119
"w" (zero0)
120
: "d0","d1"
121
);
122
}
123
})
124
#else
125
CVT_FUNC(u8, u16, 16,
126
uint8x16x2_t vline;
127
vline.val[1] = vmovq_n_u8(0);,
128
{
129
for (size_t i = 0; i < w; i += 16)
130
{
131
internal::prefetch(_src + i);
132
vline.val[0] = vld1q_u8(_src + i);
133
vst2q_u8((uint8_t*)(_dst + i), vline);
134
}
135
})
136
#endif
137
138
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
139
CVT_FUNC(u8, s32, 16,
140
register uint8x16_t zero0 asm ("q1") = vmovq_n_u8(0);
141
register uint8x16_t zero1 asm ("q2") = vmovq_n_u8(0);
142
register uint8x16_t zero2 asm ("q3") = vmovq_n_u8(0);,
143
{
144
for (size_t i = 0; i < w; i += 16)
145
{
146
internal::prefetch(_src + i);
147
__asm__ (
148
"vld1.8 {d0-d1}, [%[src]] \n\t"
149
"vst4.8 {d0,d2,d4,d6}, [%[dst1]] \n\t"
150
"vst4.8 {d1,d3,d5,d7}, [%[dst2]] \n\t"
151
: /*no output*/
152
: [src] "r" (_src + i),
153
[dst1] "r" (_dst + i + 0),
154
[dst2] "r" (_dst + i + 8),
155
"w" (zero0), "w" (zero1), "w" (zero2)
156
: "d0","d1"
157
);
158
}
159
})
160
#else
161
CVT_FUNC(u8, s32, 16,
162
uint8x16x4_t vline;
163
vline.val[1] = vmovq_n_u8(0);
164
vline.val[2] = vmovq_n_u8(0);
165
vline.val[3] = vmovq_n_u8(0);,
166
{
167
for (size_t i = 0; i < w; i += 16)
168
{
169
internal::prefetch(_src + i);
170
vline.val[0] = vld1q_u8(_src + i);
171
vst4q_u8((uint8_t*)(_dst + i), vline);
172
}
173
})
174
#endif
175
176
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
177
CVT_FUNC(u8, f32, 16,
178
,
179
{
180
for (size_t i = 0; i < w; i += 16)
181
{
182
internal::prefetch(_src + i);
183
__asm__ (
184
"vld1.8 {d0-d1}, [%[src]] \n\t"
185
"vmovl.u8 q1, d0 \n\t"
186
"vmovl.u8 q2, d1 \n\t"
187
"vmovl.u16 q3, d2 \n\t"
188
"vmovl.u16 q4, d3 \n\t"
189
"vmovl.u16 q5, d4 \n\t"
190
"vmovl.u16 q6, d5 \n\t"
191
"vcvt.f32.u32 q7, q3 \n\t"
192
"vcvt.f32.u32 q8, q4 \n\t"
193
"vcvt.f32.u32 q9, q5 \n\t"
194
"vcvt.f32.u32 q10, q6 \n\t"
195
"vst1.32 {d14-d15}, [%[dst1]] \n\t"
196
"vst1.32 {d16-d17}, [%[dst2]] \n\t"
197
"vst1.32 {d18-d19}, [%[dst3]] \n\t"
198
"vst1.32 {d20-d21}, [%[dst4]] \n\t"
199
: /*no output*/
200
: [src] "r" (_src + i),
201
[dst1] "r" (_dst + i + 0),
202
[dst2] "r" (_dst + i + 4),
203
[dst3] "r" (_dst + i + 8),
204
[dst4] "r" (_dst + i + 12)
205
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21"
206
);
207
}
208
})
209
#else
210
CVT_FUNC(u8, f32, 16,
211
,
212
{
213
for (size_t i = 0; i < w; i += 16)
214
{
215
internal::prefetch(_src + i);
216
uint8x16_t vline_u8 = vld1q_u8(_src + i);
217
218
uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8(vline_u8));
219
uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline_u8));
220
221
uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16(vline1_u16));
222
uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));
223
uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16(vline2_u16));
224
uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));
225
226
float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
227
float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
228
float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);
229
float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);
230
231
vst1q_f32(_dst + i, vline1_f32);
232
vst1q_f32(_dst + i + 4, vline2_f32);
233
vst1q_f32(_dst + i + 8, vline3_f32);
234
vst1q_f32(_dst + i + 12, vline4_f32);
235
}
236
})
237
#endif
238
239
CVT_FUNC(s8, u8, 16,
240
int8x16_t vZero = vdupq_n_s8(0);,
241
{
242
for (size_t i = 0; i < w; i += 16)
243
{
244
internal::prefetch(_src + i);
245
int8x16_t vu8 = vld1q_s8(_src + i);
246
uint8x16_t vu1 = vreinterpretq_u8_s8(vmaxq_s8(vu8, vZero));
247
vst1q_u8(_dst + i, vu1);
248
}
249
})
250
251
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
252
CVT_FUNC(s8, u16, 16,
253
register uint8x16_t zero0 asm ("q1") = vmovq_n_u8(0);,
254
{
255
for (size_t i = 0; i < w; i += 16)
256
{
257
internal::prefetch(_src + i);
258
__asm__ (
259
"vld1.8 {d0-d1}, [%[src]] \n\t"
260
"vmax.s8 q0, q1 \n\t"
261
"vst2.8 {d0,d2}, [%[dst1]] \n\t"
262
"vst2.8 {d1,d3}, [%[dst2]] \n\t"
263
: /*no output*/
264
: [src] "r" (_src + i),
265
[dst1] "r" (_dst + i + 0),
266
[dst2] "r" (_dst + i + 8),
267
"w" (zero0)
268
: "d0","d1"
269
);
270
}
271
})
272
#else
273
CVT_FUNC(s8, u16, 16,
274
int8x16x2_t vline_s8;
275
vline_s8.val[1] = vmovq_n_s8(0);,
276
{
277
for (size_t i = 0; i < w; i += 16)
278
{
279
internal::prefetch(_src + i);
280
vline_s8.val[0] = vld1q_s8(_src + i);
281
vline_s8.val[0] = vmaxq_s8(vline_s8.val[0], vline_s8.val[1]);
282
vst2q_s8((int8_t*)(_dst + i), vline_s8);
283
}
284
})
285
#endif
286
287
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
288
CVT_FUNC(s8, s16, 16,
289
,
290
{
291
for (size_t i = 0; i < w; i += 16)
292
{
293
internal::prefetch(_src + i);
294
__asm__ (
295
"vld1.8 {d0-d1}, [%[src]] \n\t"
296
"vmovl.s8 q1, d0 \n\t"
297
"vmovl.s8 q2, d1 \n\t"
298
"vst1.16 {d2-d3}, [%[dst1]] \n\t"
299
"vst1.16 {d4-d5}, [%[dst2]] \n\t"
300
: /*no output*/
301
: [src] "r" (_src + i),
302
[dst1] "r" (_dst + i + 0),
303
[dst2] "r" (_dst + i + 8)
304
: "d0","d1","d2","d3","d4","d5"
305
);
306
}
307
})
308
#else
309
CVT_FUNC(s8, s16, 16,
310
,
311
{
312
for (size_t i = 0; i < w; i += 16)
313
{
314
internal::prefetch(_src + i);
315
int8x16_t vline_s8 = vld1q_s8(_src + i);
316
317
int16x8_t vline1_s16 = vmovl_s8(vget_low_s8(vline_s8));
318
int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline_s8));
319
320
vst1q_s16(_dst + i, vline1_s16);
321
vst1q_s16(_dst + i + 8, vline2_s16);
322
}
323
})
324
#endif
325
326
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
327
CVT_FUNC(s8, s32, 16,
328
,
329
{
330
for (size_t i = 0; i < w; i += 16)
331
{
332
internal::prefetch(_src + i);
333
__asm__ (
334
"vld1.8 {d0-d1}, [%[src]] \n\t"
335
"vmovl.s8 q1, d0 \n\t"
336
"vmovl.s8 q2, d1 \n\t"
337
"vmovl.s16 q3, d2 \n\t"
338
"vmovl.s16 q4, d3 \n\t"
339
"vmovl.s16 q5, d4 \n\t"
340
"vmovl.s16 q6, d5 \n\t"
341
"vst1.32 {d6-d7}, [%[dst1]] \n\t"
342
"vst1.32 {d8-d9}, [%[dst2]] \n\t"
343
"vst1.32 {d10-d11}, [%[dst3]] \n\t"
344
"vst1.32 {d12-d13}, [%[dst4]] \n\t"
345
: /*no output*/
346
: [src] "r" (_src + i),
347
[dst1] "r" (_dst + i + 0),
348
[dst2] "r" (_dst + i + 4),
349
[dst3] "r" (_dst + i + 8),
350
[dst4] "r" (_dst + i + 12)
351
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13"
352
);
353
}
354
})
355
#else
356
CVT_FUNC(s8, s32, 16,
357
,
358
{
359
for (size_t i = 0; i < w; i += 16)
360
{
361
internal::prefetch(_src + i);
362
int8x16_t vline_s8 = vld1q_s8(_src + i);
363
364
int16x8_t vline1_s16 = vmovl_s8(vget_low_s8(vline_s8));
365
int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline_s8));
366
367
int32x4_t vline1_s32 = vmovl_s16(vget_low_s16(vline1_s16));
368
int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
369
int32x4_t vline3_s32 = vmovl_s16(vget_low_s16(vline2_s16));
370
int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
371
372
vst1q_s32(_dst + i, vline1_s32);
373
vst1q_s32(_dst + i + 4, vline2_s32);
374
vst1q_s32(_dst + i + 8, vline3_s32);
375
vst1q_s32(_dst + i + 12, vline4_s32);
376
}
377
})
378
#endif
379
380
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
381
CVT_FUNC(s8, f32, 16,
382
,
383
{
384
for (size_t i = 0; i < w; i += 16)
385
{
386
internal::prefetch(_src + i);
387
__asm__ (
388
"vld1.8 {d0-d1}, [%[src]] \n\t"
389
"vmovl.s8 q1, d0 \n\t"
390
"vmovl.s8 q2, d1 \n\t"
391
"vmovl.s16 q3, d2 \n\t"
392
"vmovl.s16 q4, d3 \n\t"
393
"vmovl.s16 q5, d4 \n\t"
394
"vmovl.s16 q6, d5 \n\t"
395
"vcvt.f32.s32 q7, q3 \n\t"
396
"vcvt.f32.s32 q8, q4 \n\t"
397
"vcvt.f32.s32 q9, q5 \n\t"
398
"vcvt.f32.s32 q10, q6 \n\t"
399
"vst1.32 {d14-d15}, [%[dst1]] \n\t"
400
"vst1.32 {d16-d17}, [%[dst2]] \n\t"
401
"vst1.32 {d18-d19}, [%[dst3]] \n\t"
402
"vst1.32 {d20-d21}, [%[dst4]] \n\t"
403
: /*no output*/
404
: [src] "r" (_src + i),
405
[dst1] "r" (_dst + i + 0),
406
[dst2] "r" (_dst + i + 4),
407
[dst3] "r" (_dst + i + 8),
408
[dst4] "r" (_dst + i + 12)
409
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21"
410
);
411
}
412
})
413
#else
414
CVT_FUNC(s8, f32, 16,
415
,
416
{
417
for (size_t i = 0; i < w; i += 16)
418
{
419
internal::prefetch(_src + i);
420
int8x16_t vline_s8 = vld1q_s8(_src + i);
421
422
int16x8_t vline1_s16 = vmovl_s8(vget_low_s8(vline_s8));
423
int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline_s8));
424
425
int32x4_t vline1_s32 = vmovl_s16(vget_low_s16(vline1_s16));
426
int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
427
int32x4_t vline3_s32 = vmovl_s16(vget_low_s16(vline2_s16));
428
int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
429
430
float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
431
float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
432
float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
433
float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
434
435
vst1q_f32(_dst + i, vline1_f32);
436
vst1q_f32(_dst + i + 4, vline2_f32);
437
vst1q_f32(_dst + i + 8, vline3_f32);
438
vst1q_f32(_dst + i + 12, vline4_f32);
439
}
440
})
441
#endif
442
443
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
444
CVT_FUNC(u16, u8, 16,
445
,
446
{
447
for (size_t i = 0; i < w; i += 16)
448
{
449
internal::prefetch(_src + i);
450
__asm__ (
451
"vld1.8 {d0-d1}, [%[src1]] \n\t"
452
"vqmovn.u16 d4, q0 \n\t"
453
"vld1.8 {d2-d3}, [%[src2]] \n\t"
454
"vqmovn.u16 d5, q1 \n\t"
455
"vst1.8 {d4-d5}, [%[dst]] \n\t"
456
: /*no output*/
457
: [src1] "r" (_src + i),
458
[src2] "r" (_src + i + 8),
459
[dst] "r" (_dst + i + 0)
460
: "d0","d1","d2","d3","d4","d5"
461
);
462
}
463
})
464
#else
465
CVT_FUNC(u16, u8, 16,
466
,
467
{
468
for (size_t i = 0; i < w; i += 16)
469
{
470
internal::prefetch(_src + i);
471
uint16x8_t vline1_u16 = vld1q_u16(_src + i);
472
uint16x8_t vline2_u16 = vld1q_u16(_src + i + 8);
473
474
uint8x8_t vline1_u8 = vqmovn_u16(vline1_u16);
475
uint8x8_t vline2_u8 = vqmovn_u16(vline2_u16);
476
477
vst1q_u8(_dst + i, vcombine_u8(vline1_u8, vline2_u8));
478
}
479
})
480
#endif
481
482
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
483
CVT_FUNC(u16, s8, 16,
484
register uint8x16_t v127 asm ("q4") = vmovq_n_u8(127);,
485
{
486
for (size_t i = 0; i < w; i += 16)
487
{
488
internal::prefetch(_src + i);
489
__asm__ (
490
"vld1.8 {d0-d1}, [%[src1]] \n\t"
491
"vqmovn.u16 d4, q0 \n\t"
492
"vld1.8 {d2-d3}, [%[src2]] \n\t"
493
"vqmovn.u16 d5, q1 \n\t"
494
"vmin.u8 q3, q2, q4 \n\t"
495
"vst1.8 {d6-d7}, [%[dst]] \n\t"
496
: /*no output*/
497
: [src1] "r" (_src + i),
498
[src2] "r" (_src + i + 8),
499
[dst] "r" (_dst + i + 0),
500
"w" (v127)
501
: "d0","d1","d2","d3","d4","d5","d6","d7"
502
);
503
}
504
})
505
#else
506
CVT_FUNC(u16, s8, 16,
507
uint8x8_t v127 = vmov_n_u8(127);,
508
{
509
for (size_t i = 0; i < w; i += 16)
510
{
511
internal::prefetch(_src + i);
512
uint16x8_t vline1_u16 = vld1q_u16(_src + i);
513
uint16x8_t vline2_u16 = vld1q_u16(_src + i + 8);
514
515
uint8x8_t vline1_u8 = vqmovn_u16(vline1_u16);
516
uint8x8_t vline2_u8 = vqmovn_u16(vline2_u16);
517
vline1_u8 = vmin_u8(vline1_u8, v127);
518
vline2_u8 = vmin_u8(vline2_u8, v127);
519
520
vst1q_s8(_dst + i, vcombine_s8(vreinterpret_s8_u8(vline1_u8), vreinterpret_s8_u8(vline2_u8)));
521
}
522
})
523
#endif
524
525
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
526
CVT_FUNC(u16, s16, 8,
527
register uint16x8_t v32767 asm ("q4") = vmovq_n_u16(0x7FFF);,
528
{
529
for (size_t i = 0; i < w; i += 8)
530
{
531
internal::prefetch(_src + i);
532
__asm__ (
533
"vld1.16 {d0-d1}, [%[src]] \n\t"
534
"vmin.u16 q1, q0, q4 \n\t"
535
"vst1.16 {d2-d3}, [%[dst]] \n\t"
536
: /*no output*/
537
: [src] "r" (_src + i),
538
[dst] "r" (_dst + i + 0),
539
"w" (v32767)
540
: "d0","d1","d2","d3"
541
);
542
}
543
})
544
#else
545
CVT_FUNC(u16, s16, 8,
546
uint16x8_t v32767 = vmovq_n_u16(0x7FFF);,
547
{
548
for (size_t i = 0; i < w; i += 8)
549
{
550
internal::prefetch(_src + i);
551
uint16x8_t vline_u16 = vld1q_u16(_src + i);
552
vline_u16 = vminq_u16(vline_u16, v32767);
553
vst1q_s16((_dst + i), vreinterpretq_s16_u16(vline_u16));
554
}
555
})
556
#endif
557
558
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
559
CVT_FUNC(u16, s32, 8,
560
register uint16x8_t zero0 asm ("q1") = vmovq_n_u16(0);,
561
{
562
for (size_t i = 0; i < w; i += 8)
563
{
564
internal::prefetch(_src + i);
565
__asm__ (
566
"vld1.16 {d0-d1}, [%[src]] \n\t"
567
"vst2.16 {d0,d2}, [%[dst1]] \n\t"
568
"vst2.16 {d1,d3}, [%[dst2]] \n\t"
569
: /*no output*/
570
: [src] "r" (_src + i),
571
[dst1] "r" (_dst + i),
572
[dst2] "r" (_dst + i + 4),
573
"w" (zero0)
574
: "d0","d1"//,"d2","d3"//,"d4","d5","d6","d7"
575
);
576
}
577
})
578
#else
579
CVT_FUNC(u16, s32, 8,
580
uint16x8x2_t vline;
581
vline.val[1] = vmovq_n_u16(0);,
582
{
583
for (size_t i = 0; i < w; i += 8)
584
{
585
internal::prefetch(_src + i);
586
vline.val[0] = vld1q_u16(_src + i);
587
vst2q_u16((uint16_t*)(_dst + i), vline);
588
}
589
})
590
#endif
591
592
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
593
CVT_FUNC(u16, f32, 8,
594
,
595
{
596
for (size_t i = 0; i < w; i += 8)
597
{
598
internal::prefetch(_src + i);
599
__asm__ (
600
"vld1.16 {d0-d1}, [%[src]] \n\t"
601
"vmovl.u16 q1, d0 \n\t"
602
"vmovl.u16 q2, d1 \n\t"
603
"vcvt.f32.u32 q3, q1 \n\t"
604
"vcvt.f32.u32 q4, q2 \n\t"
605
"vst1.32 {d6-d7}, [%[dst1]] \n\t"
606
"vst1.32 {d8-d9}, [%[dst2]] \n\t"
607
: /*no output*/
608
: [src] "r" (_src + i),
609
[dst1] "r" (_dst + i + 0),
610
[dst2] "r" (_dst + i + 4)
611
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9"
612
);
613
}
614
})
615
#else
616
CVT_FUNC(u16, f32, 8,
617
,
618
{
619
for (size_t i = 0; i < w; i += 8)
620
{
621
internal::prefetch(_src + i);
622
uint16x8_t vline_u16 = vld1q_u16(_src + i);
623
624
uint32x4_t vline_u32_lo = vmovl_u16(vget_low_u16(vline_u16));
625
uint32x4_t vline_u32_hi = vmovl_u16(vget_high_u16(vline_u16));
626
627
float32x4_t vline_f32_lo = vcvtq_f32_u32(vline_u32_lo);
628
float32x4_t vline_f32_hi = vcvtq_f32_u32(vline_u32_hi);
629
630
vst1q_f32(_dst + i, vline_f32_lo);
631
vst1q_f32(_dst + i + 4, vline_f32_hi);
632
}
633
})
634
#endif
635
636
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
637
CVT_FUNC(s16, u8, 16,
638
,
639
{
640
for (size_t i = 0; i < w; i += 16)
641
{
642
internal::prefetch(_src + i);
643
__asm__ (
644
"vld1.8 {d0-d1}, [%[src1]] \n\t"
645
"vld1.8 {d2-d3}, [%[src2]] \n\t"
646
"vqmovun.s16 d4, q0 \n\t"
647
"vqmovun.s16 d5, q1 \n\t"
648
"vst1.8 {d4-d5}, [%[dst]] \n\t"
649
: /*no output*/
650
: [src1] "r" (_src + i),
651
[src2] "r" (_src + i + 8),
652
[dst] "r" (_dst + i + 0)
653
: "d0","d1","d2","d3","d4","d5"
654
);
655
}
656
})
657
#else
658
CVT_FUNC(s16, u8, 16,
659
,
660
{
661
for (size_t i = 0; i < w; i += 16)
662
{
663
internal::prefetch(_src + i);
664
int16x8_t vline1_s16 = vld1q_s16(_src + i);
665
int16x8_t vline2_s16 = vld1q_s16(_src + i + 8);
666
667
uint8x8_t vline1_u8 = vqmovun_s16(vline1_s16);
668
uint8x8_t vline2_u8 = vqmovun_s16(vline2_s16);
669
670
vst1q_u8(_dst + i, vcombine_u8(vline1_u8, vline2_u8));
671
}
672
})
673
#endif
674
675
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
676
CVT_FUNC(s16, s8, 16,
677
,
678
{
679
for (size_t i = 0; i < w; i += 16)
680
{
681
internal::prefetch(_src + i);
682
__asm__ (
683
"vld1.8 {d0-d1}, [%[src1]] \n\t"
684
"vld1.8 {d2-d3}, [%[src2]] \n\t"
685
"vqmovn.s16 d4, q0 \n\t"
686
"vqmovn.s16 d5, q1 \n\t"
687
"vst1.8 {d4-d5}, [%[dst]] \n\t"
688
: /*no output*/
689
: [src1] "r" (_src + i),
690
[src2] "r" (_src + i + 8),
691
[dst] "r" (_dst + i + 0)
692
: "d0","d1","d2","d3","d4","d5"
693
);
694
}
695
})
696
#else
697
CVT_FUNC(s16, s8, 16,
698
,
699
{
700
for (size_t i = 0; i < w; i += 16)
701
{
702
internal::prefetch(_src + i);
703
int16x8_t vline1_s16 = vld1q_s16(_src + i);
704
int16x8_t vline2_s16 = vld1q_s16(_src + i + 8);
705
706
int8x8_t vline1_s8 = vqmovn_s16(vline1_s16);
707
int8x8_t vline2_s8 = vqmovn_s16(vline2_s16);
708
709
vst1q_s8(_dst + i, vcombine_s8(vline1_s8, vline2_s8));
710
}
711
})
712
#endif
713
714
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
715
CVT_FUNC(s16, u16, 8,
716
register int16x8_t vZero asm ("q4") = vmovq_n_s16(0);,
717
{
718
for (size_t i = 0; i < w; i += 8)
719
{
720
internal::prefetch(_src + i);
721
__asm__ (
722
"vld1.16 {d0-d1}, [%[src]] \n\t"
723
"vmax.s16 q1, q0, q4 \n\t"
724
"vst1.16 {d2-d3}, [%[dst]] \n\t"
725
: /*no output*/
726
: [src] "r" (_src + i),
727
[dst] "r" (_dst + i + 0),
728
"w" (vZero)
729
: "d0","d1","d2","d3"
730
);
731
}
732
})
733
#else
734
CVT_FUNC(s16, u16, 8,
735
int16x4_t vZero = vmov_n_s16(0);,
736
{
737
for (size_t i = 0; i < w; i += 8)
738
{
739
internal::prefetch(_src + i);
740
int16x8_t vline_s16 = vld1q_s16(_src + i);
741
742
int16x4_t vline_s16_lo = vmax_s16(vget_low_s16(vline_s16), vZero);
743
int16x4_t vline_s16_hi = vmax_s16(vget_high_s16(vline_s16), vZero);
744
745
vst1q_u16(_dst + i, vcombine_u16(vreinterpret_u16_s16(vline_s16_lo), vreinterpret_u16_s16(vline_s16_hi)));
746
}
747
})
748
#endif
749
750
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
751
CVT_FUNC(s16, s32, 8,
752
,
753
{
754
for (size_t i = 0; i < w; i += 8)
755
{
756
internal::prefetch(_src + i);
757
__asm__ (
758
"vld1.16 {d0-d1}, [%[src]] \n\t"
759
"vmovl.s16 q1, d0 \n\t"
760
"vmovl.s16 q2, d1 \n\t"
761
"vst1.32 {d2-d3}, [%[dst1]] \n\t"
762
"vst1.32 {d4-d5}, [%[dst2]] \n\t"
763
: /*no output*/
764
: [src] "r" (_src + i),
765
[dst1] "r" (_dst + i + 0),
766
[dst2] "r" (_dst + i + 4)
767
: "d0","d1","d2","d3","d4","d5"
768
);
769
}
770
})
771
#else
772
CVT_FUNC(s16, s32, 8,
773
,
774
{
775
for (size_t i = 0; i < w; i += 8)
776
{
777
internal::prefetch(_src + i);
778
int16x8_t vline_s16 = vld1q_s16(_src + i);
779
780
int32x4_t vline_s32_lo = vmovl_s16(vget_low_s16(vline_s16));
781
int32x4_t vline_s32_hi = vmovl_s16(vget_high_s16(vline_s16));
782
783
vst1q_s32(_dst + i, vline_s32_lo);
784
vst1q_s32(_dst + i + 4, vline_s32_hi);
785
}
786
})
787
#endif
788
789
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
790
CVT_FUNC(s16, f32, 8,
791
,
792
{
793
for (size_t i = 0; i < w; i += 8)
794
{
795
internal::prefetch(_src + i);
796
__asm__ (
797
"vld1.16 {d0-d1}, [%[src]] \n\t"
798
"vmovl.s16 q1, d0 \n\t"
799
"vmovl.s16 q2, d1 \n\t"
800
"vcvt.f32.s32 q3, q1 \n\t"
801
"vcvt.f32.s32 q4, q2 \n\t"
802
"vst1.32 {d6-d7}, [%[dst1]] \n\t"
803
"vst1.32 {d8-d9}, [%[dst2]] \n\t"
804
: /*no output*/
805
: [src] "r" (_src + i),
806
[dst1] "r" (_dst + i + 0),
807
[dst2] "r" (_dst + i + 4)
808
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9"
809
);
810
}
811
})
812
#else
813
CVT_FUNC(s16, f32, 8,
814
,
815
{
816
for (size_t i = 0; i < w; i += 8)
817
{
818
internal::prefetch(_src + i);
819
int16x8_t vline_s16 = vld1q_s16(_src + i);
820
821
int32x4_t vline_s32_lo = vmovl_s16(vget_low_s16(vline_s16));
822
int32x4_t vline_s32_hi = vmovl_s16(vget_high_s16(vline_s16));
823
float32x4_t vline_f32_lo = vcvtq_f32_s32(vline_s32_lo);
824
float32x4_t vline_f32_hi = vcvtq_f32_s32(vline_s32_hi);
825
826
vst1q_f32(_dst + i, vline_f32_lo);
827
vst1q_f32(_dst + i + 4, vline_f32_hi);
828
}
829
})
830
#endif
831
832
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
833
CVT_FUNC(s32, u8, 8,
834
,
835
{
836
for (size_t i = 0; i < w; i += 8)
837
{
838
internal::prefetch(_src + i);
839
__asm__ (
840
"vld1.32 {d0-d1}, [%[src1]] \n\t"
841
"vld1.32 {d2-d3}, [%[src2]] \n\t"
842
"vqmovun.s32 d4, q0 \n\t"
843
"vqmovun.s32 d5, q1 \n\t"
844
"vqmovn.u16 d6, q2 \n\t"
845
"vst1.8 {d6}, [%[dst]] \n\t"
846
: /*no output*/
847
: [src1] "r" (_src + i + 0),
848
[src2] "r" (_src + i + 4),
849
[dst] "r" (_dst + i)
850
: "d0","d1","d2","d3","d4","d5","d6"
851
);
852
}
853
})
854
#else
855
CVT_FUNC(s32, u8, 8,
856
,
857
{
858
for (size_t i = 0; i < w; i += 8)
859
{
860
internal::prefetch(_src + i);
861
int32x4_t vline1_s32 = vld1q_s32(_src + i);
862
int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);
863
864
uint16x4_t vline1_u16 = vqmovun_s32(vline1_s32);
865
uint16x4_t vline2_u16 = vqmovun_s32(vline2_s32);
866
uint8x8_t vline_u8 = vqmovn_u16(vcombine_u16(vline1_u16, vline2_u16));
867
868
vst1_u8(_dst + i, vline_u8);
869
}
870
})
871
#endif
872
873
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
874
CVT_FUNC(s32, s8, 8,
875
,
876
{
877
for (size_t i = 0; i < w; i += 8)
878
{
879
internal::prefetch(_src + i);
880
__asm__ (
881
"vld1.32 {d0-d1}, [%[src1]] \n\t"
882
"vld1.32 {d2-d3}, [%[src2]] \n\t"
883
"vqmovn.s32 d4, q0 \n\t"
884
"vqmovn.s32 d5, q1 \n\t"
885
"vqmovn.s16 d6, q2 \n\t"
886
"vst1.8 {d6}, [%[dst]] \n\t"
887
: /*no output*/
888
: [src1] "r" (_src + i + 0),
889
[src2] "r" (_src + i + 4),
890
[dst] "r" (_dst + i)
891
: "d0","d1","d2","d3","d4","d5","d6"
892
);
893
}
894
})
895
#else
896
CVT_FUNC(s32, s8, 8,
897
,
898
{
899
for (size_t i = 0; i < w; i += 8)
900
{
901
internal::prefetch(_src + i);
902
int32x4_t vline1_s32 = vld1q_s32(_src + i);
903
int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);
904
905
int16x4_t vline1_s16 = vqmovn_s32(vline1_s32);
906
int16x4_t vline2_s16 = vqmovn_s32(vline2_s32);
907
int8x8_t vline_s8 = vqmovn_s16(vcombine_s16(vline1_s16, vline2_s16));
908
909
vst1_s8(_dst + i, vline_s8);
910
}
911
})
912
#endif
913
914
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
915
CVT_FUNC(s32, u16, 8,
916
,
917
{
918
for (size_t i = 0; i < w; i += 8)
919
{
920
internal::prefetch(_src + i);
921
__asm__ (
922
"vld1.32 {d0-d1}, [%[src1]] \n\t"
923
"vld1.32 {d2-d3}, [%[src2]] \n\t"
924
"vqmovun.s32 d4, q0 \n\t"
925
"vqmovun.s32 d5, q1 \n\t"
926
"vst1.16 {d4-d5}, [%[dst]] \n\t"
927
: /*no output*/
928
: [src1] "r" (_src + i + 0),
929
[src2] "r" (_src + i + 4),
930
[dst] "r" (_dst + i)
931
: "d0","d1","d2","d3","d4","d5"
932
);
933
}
934
})
935
#else
936
CVT_FUNC(s32, u16, 8,
937
,
938
{
939
for (size_t i = 0; i < w; i += 8)
940
{
941
internal::prefetch(_src + i);
942
int32x4_t vline1_s32 = vld1q_s32(_src + i);
943
int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);
944
945
uint16x4_t vline1_u16 = vqmovun_s32(vline1_s32);
946
uint16x4_t vline2_u16 = vqmovun_s32(vline2_s32);
947
948
vst1q_u16(_dst + i, vcombine_u16(vline1_u16, vline2_u16));
949
}
950
})
951
#endif
952
953
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
954
CVT_FUNC(s32, s16, 8,
955
,
956
{
957
for (size_t i = 0; i < w; i += 8)
958
{
959
internal::prefetch(_src + i);
960
__asm__ (
961
"vld1.32 {d0-d1}, [%[src1]] \n\t"
962
"vld1.32 {d2-d3}, [%[src2]] \n\t"
963
"vqmovn.s32 d4, q0 \n\t"
964
"vqmovn.s32 d5, q1 \n\t"
965
"vst1.8 {d4-d5}, [%[dst]] \n\t"
966
: /*no output*/
967
: [src1] "r" (_src + i + 0),
968
[src2] "r" (_src + i + 4),
969
[dst] "r" (_dst + i)
970
: "d0","d1","d2","d3","d4","d5"
971
);
972
}
973
})
974
#else
975
CVT_FUNC(s32, s16, 8,
976
,
977
{
978
for (size_t i = 0; i < w; i += 8)
979
{
980
internal::prefetch(_src + i);
981
int32x4_t vline1_s32 = vld1q_s32(_src + i);
982
int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);
983
984
int16x4_t vline1_s16 = vqmovn_s32(vline1_s32);
985
int16x4_t vline2_s16 = vqmovn_s32(vline2_s32);
986
987
vst1q_s16(_dst + i, vcombine_s16(vline1_s16, vline2_s16));
988
}
989
})
990
#endif
991
992
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
993
CVT_FUNC(s32, f32, 8,
994
,
995
{
996
for (size_t i = 0; i < w; i += 8)
997
{
998
internal::prefetch(_src + i);
999
__asm__ (
1000
"vld1.32 {d0-d1}, [%[src]] \n\t"
1001
"vcvt.f32.s32 q1, q0 \n\t"
1002
"vst1.32 {d2-d3}, [%[dst]] \n\t"
1003
: /*no output*/
1004
: [src] "r" (_src + i),
1005
[dst] "r" (_dst + i)
1006
: "d0","d1","d2","d3"//,"d4","d5"
1007
);
1008
__asm__ (
1009
"vld1.32 {d0-d1}, [%[src]] \n\t"
1010
"vcvt.f32.s32 q1, q0 \n\t"
1011
"vst1.32 {d2-d3}, [%[dst]] \n\t"
1012
: /*no output*/
1013
: [src] "r" (_src + i + 4),
1014
[dst] "r" (_dst + i + 4)
1015
: "d0","d1","d2","d3"//,"d4","d5"
1016
);
1017
}
1018
})
1019
#else
1020
CVT_FUNC(s32, f32, 8,
1021
,
1022
{
1023
for (size_t i = 0; i < w; i += 8)
1024
{
1025
internal::prefetch(_src + i);
1026
int32x4_t vline_s32 = vld1q_s32(_src + i);
1027
float32x4_t vline_f32 = vcvtq_f32_s32(vline_s32);
1028
vst1q_f32(_dst + i, vline_f32);
1029
1030
vline_s32 = vld1q_s32(_src + i + 4);
1031
vline_f32 = vcvtq_f32_s32(vline_s32);
1032
vst1q_f32(_dst + i + 4, vline_f32);
1033
}
1034
})
1035
#endif
1036
1037
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
1038
CVT_FUNC(f32, u8, 8,
1039
register float32x4_t vmult asm ("q0") = vdupq_n_f32((float)(1 << 16));
1040
register uint32x4_t vmask asm ("q1") = vdupq_n_u32(1<<16);,
1041
{
1042
for (size_t i = 0; i < w; i += 8)
1043
{
1044
internal::prefetch(_src + i);
1045
__asm__ (
1046
"vld1.32 {d4-d5}, [%[src1]] \n\t"
1047
"vld1.32 {d6-d7}, [%[src2]] \n\t"
1048
"vmul.f32 q4, q2, q0 \n\t"
1049
"vmul.f32 q5, q3, q0 \n\t"
1050
"vcvt.u32.f32 q6, q4 \n\t"
1051
"vcvt.u32.f32 q7, q5 \n\t"
1052
"vbic q8, q1, q6 \n\t"
1053
"vbic q9, q1, q7 \n\t"
1054
"vshr.u32 q10, q8, #16 \n\t"
1055
"vshr.u32 q11, q9, #16 \n\t"
1056
"vqsub.u32 q12, q6, q10 \n\t"
1057
"vqsub.u32 q13, q7, q11 \n\t"
1058
"vqrshrn.u32 d28, q12, #16 \n\t"
1059
"vqrshrn.u32 d29, q13, #16 \n\t"
1060
"vqmovn.u16 d30, q14 \n\t"
1061
"vst1.8 {d30}, [%[dst]] \n\t"
1062
: /*no output*/
1063
: [src1] "r" (_src + i + 0),
1064
[src2] "r" (_src + i + 4),
1065
[dst] "r" (_dst + i),
1066
"w" (vmult), "w" (vmask)
1067
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30"
1068
);
1069
}
1070
})
1071
#else
1072
CVT_FUNC(f32, u8, 8,
1073
float32x4_t vmult = vdupq_n_f32((float)(1 << 16));
1074
uint32x4_t vmask = vdupq_n_u32(1<<16);,
1075
{
1076
for (size_t i = 0; i < w; i += 8)
1077
{
1078
internal::prefetch(_src + i);
1079
float32x4_t vline1_f32 = vld1q_f32(_src + i);
1080
float32x4_t vline2_f32 = vld1q_f32(_src + i + 4);
1081
1082
float32x4_t vline1w_f32 = vmulq_f32(vline1_f32, vmult);
1083
float32x4_t vline2w_f32 = vmulq_f32(vline2_f32, vmult);
1084
1085
uint32x4_t vline1_u32 = vcvtq_u32_f32(vline1w_f32);
1086
uint32x4_t vline2_u32 = vcvtq_u32_f32(vline2w_f32);
1087
1088
uint32x4_t vl1_masked = vbicq_u32(vmask, vline1_u32);
1089
uint32x4_t vl2_masked = vbicq_u32(vmask, vline2_u32);
1090
uint32x4_t vl1_masked2 = vshrq_n_u32(vl1_masked, 16);
1091
uint32x4_t vl2_masked2 = vshrq_n_u32(vl2_masked, 16);
1092
uint32x4_t vline1r_u32 = vqsubq_u32(vline1_u32, vl1_masked2);
1093
uint32x4_t vline2r_u32 = vqsubq_u32(vline2_u32, vl2_masked2);
1094
1095
uint16x4_t vline1_u16 = vqrshrn_n_u32(vline1r_u32, 16);
1096
uint16x4_t vline2_u16 = vqrshrn_n_u32(vline2r_u32, 16);
1097
1098
uint8x8_t vline_u8 = vqmovn_u16(vcombine_u16(vline1_u16, vline2_u16));
1099
vst1_u8(_dst + i, vline_u8);
1100
}
1101
})
1102
#endif
1103
1104
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
1105
CVT_FUNC(f32, s8, 8,
    // Rounding bias pinned to q0 so the inline asm below can reference
    // it by register name (workaround for old GCC NEON codegen).
    register float32x4_t vhalf asm ("q0") = vdupq_n_f32(0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // 8 floats per step: add 0.5, convert f32->s32 (vcvt truncates
        // toward zero), then saturating narrow s32->s16->s8 and store
        // 8 bytes.
        __asm__ (
            "vld1.32 {d2-d3}, [%[src1]] \n\t"
            "vld1.32 {d4-d5}, [%[src2]] \n\t"
            "vadd.f32 q3, q1, q0 \n\t"
            "vadd.f32 q4, q2, q0 \n\t"
            "vcvt.s32.f32 q5, q3 \n\t"
            "vcvt.s32.f32 q6, q4 \n\t"
            "vqmovn.s32 d14, q5 \n\t"
            "vqmovn.s32 d15, q6 \n\t"
            "vqmovn.s16 d16, q7 \n\t"
            "vst1.8 {d16}, [%[dst]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vhalf)
            : "d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17"
        );
    }
})
1131
#else
1132
CVT_FUNC(f32, s8, 8,
1133
float32x4_t vhalf = vdupq_n_f32(0.5f);,
1134
{
1135
for (size_t i = 0; i < w; i += 8)
1136
{
1137
internal::prefetch(_src + i);
1138
float32x4_t vline1_f32 = vld1q_f32(_src + i);
1139
float32x4_t vline2_f32 = vld1q_f32(_src + i + 4);
1140
1141
vline1_f32 = vaddq_f32(vline1_f32, vhalf);
1142
vline2_f32 = vaddq_f32(vline2_f32, vhalf);
1143
1144
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
1145
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
1146
int16x4_t vline1_s16 = vqmovn_s32(vline1_s32);
1147
int16x4_t vline2_s16 = vqmovn_s32(vline2_s32);
1148
1149
int8x8_t vline_s8 = vqmovn_s16(vcombine_s16(vline1_s16, vline2_s16));
1150
1151
vst1_s8(_dst + i, vline_s8);
1152
}
1153
})
1154
#endif
1155
1156
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
1157
CVT_FUNC(f32, u16, 8,
    // Rounding bias pinned to q0 so the inline asm below can reference
    // it by register name (workaround for old GCC NEON codegen).
    register float32x4_t vhalf asm ("q0") = vdupq_n_f32(0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // First 4 lanes: add 0.5, convert f32->u32 (vcvt truncates),
        // saturating narrow u32->u16, store 4 halfwords.
        __asm__ (
            "vld1.32 {d2-d3}, [%[src]] \n\t"
            "vadd.f32 q2, q1, q0 \n\t"
            "vcvt.u32.f32 q3, q2 \n\t"
            "vqmovn.u32 d8, q3 \n\t"
            "vst1.16 {d8}, [%[dst]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst] "r" (_dst + i),
              "w" (vhalf)
            : "d2","d3","d4","d5","d6","d7","d8"
        );
        // Second 4 lanes: identical sequence at offset +4.
        __asm__ (
            "vld1.32 {d2-d3}, [%[src]] \n\t"
            "vadd.f32 q2, q1, q0 \n\t"
            "vcvt.u32.f32 q3, q2 \n\t"
            "vqmovn.u32 d8, q3 \n\t"
            "vst1.16 {d8}, [%[dst]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i + 4),
              [dst] "r" (_dst + i + 4),
              "w" (vhalf)
            : "d2","d3","d4","d5","d6","d7","d8"
        );
    }
})
1189
#else
1190
CVT_FUNC(f32, u16, 8,
1191
float32x4_t vhalf = vdupq_n_f32(0.5f);,
1192
{
1193
for (size_t i = 0; i < w; i += 8)
1194
{
1195
internal::prefetch(_src + i);
1196
float32x4_t vline_f32 = vld1q_f32(_src + i);
1197
1198
vline_f32 = vaddq_f32(vline_f32, vhalf);
1199
uint32x4_t vline_u32 = vcvtq_u32_f32(vline_f32);
1200
uint16x4_t vline_u16 = vqmovn_u32(vline_u32);
1201
1202
vst1_u16(_dst + i, vline_u16);
1203
1204
vline_f32 = vld1q_f32(_src + i + 4);
1205
1206
vline_f32 = vaddq_f32(vline_f32, vhalf);
1207
vline_u32 = vcvtq_u32_f32(vline_f32);
1208
vline_u16 = vqmovn_u32(vline_u32);
1209
1210
vst1_u16(_dst + i + 4, vline_u16);
1211
}
1212
})
1213
#endif
1214
1215
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
1216
CVT_FUNC(f32, s16, 8,
    // Rounding bias pinned to q0 so the inline asm below can reference
    // it by register name (workaround for old GCC NEON codegen).
    register float32x4_t vhalf asm ("q0") = vdupq_n_f32(0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // First 4 lanes: add 0.5, convert f32->s32 (vcvt truncates
        // toward zero), saturating narrow s32->s16, store 4 halfwords.
        __asm__ (
            "vld1.32 {d2-d3}, [%[src]] \n\t"
            "vadd.f32 q2, q1, q0 \n\t"
            "vcvt.s32.f32 q3, q2 \n\t"
            "vqmovn.s32 d8, q3 \n\t"
            "vst1.16 {d8}, [%[dst]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst] "r" (_dst + i),
              "w" (vhalf)
            : "d2","d3","d4","d5","d6","d7","d8"
        );
        // Second 4 lanes: identical sequence at offset +4.
        __asm__ (
            "vld1.32 {d2-d3}, [%[src]] \n\t"
            "vadd.f32 q2, q1, q0 \n\t"
            "vcvt.s32.f32 q3, q2 \n\t"
            "vqmovn.s32 d8, q3 \n\t"
            "vst1.16 {d8}, [%[dst]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i + 4),
              [dst] "r" (_dst + i + 4),
              "w" (vhalf)
            : "d2","d3","d4","d5","d6","d7","d8"
        );
    }
})
1248
#else
1249
CVT_FUNC(f32, s16, 8,
1250
float32x4_t vhalf = vdupq_n_f32(0.5f);,
1251
{
1252
for (size_t i = 0; i < w; i += 8)
1253
{
1254
internal::prefetch(_src + i);
1255
float32x4_t vline_f32 = vld1q_f32(_src + i);
1256
1257
vline_f32 = vaddq_f32(vline_f32, vhalf);
1258
int32x4_t vline_s32 = vcvtq_s32_f32(vline_f32);
1259
int16x4_t vline_s16 = vqmovn_s32(vline_s32);
1260
1261
vst1_s16(_dst + i, vline_s16);
1262
1263
vline_f32 = vld1q_f32(_src + i + 4);
1264
1265
vline_f32 = vaddq_f32(vline_f32, vhalf);
1266
vline_s32 = vcvtq_s32_f32(vline_f32);
1267
vline_s16 = vqmovn_s32(vline_s32);
1268
1269
vst1_s16(_dst + i + 4, vline_s16);
1270
}
1271
})
1272
#endif
1273
1274
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
1275
CVT_FUNC(f32, s32, 8,
    // Rounding bias pinned to q0 so the inline asm below can reference
    // it by register name (workaround for old GCC NEON codegen).
    register float32x4_t vhalf asm ("q0") = vdupq_n_f32(0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // 8 floats per step: add 0.5 and convert f32->s32 (vcvt
        // truncates toward zero); no narrowing needed for s32 output.
        __asm__ (
            "vld1.32 {d2-d3}, [%[src1]] \n\t"
            "vld1.32 {d4-d5}, [%[src2]] \n\t"
            "vadd.f32 q3, q1, q0 \n\t"
            "vadd.f32 q4, q2, q0 \n\t"
            "vcvt.s32.f32 q5, q3 \n\t"
            "vcvt.s32.f32 q6, q4 \n\t"
            "vst1.32 {q5}, [%[dst1]] \n\t"
            "vst1.32 {q6}, [%[dst2]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i),
              [src2] "r" (_src + i + 4),
              [dst1] "r" (_dst + i),
              [dst2] "r" (_dst + i + 4),
              "w" (vhalf)
            : "d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13"
        );
    }
})
1300
#else
1301
CVT_FUNC(f32, s32, 8,
1302
float32x4_t vhalf = vdupq_n_f32(0.5f);,
1303
{
1304
for (size_t i = 0; i < w; i += 8)
1305
{
1306
internal::prefetch(_src + i);
1307
float32x4_t vline_f32 = vld1q_f32(_src + i);
1308
1309
vline_f32 = vaddq_f32(vline_f32, vhalf);
1310
int32x4_t vline_s32 = vcvtq_s32_f32(vline_f32);
1311
1312
vst1q_s32(_dst + i, vline_s32);
1313
1314
vline_f32 = vld1q_f32(_src + i + 4);
1315
1316
vline_f32 = vaddq_f32(vline_f32, vhalf);
1317
vline_s32 = vcvtq_s32_f32(vline_f32);
1318
1319
vst1q_s32(_dst + i + 4, vline_s32);
1320
}
1321
})
1322
#endif
1323
1324
void convert(const Size2D &_size,
             const u8 * srcBase, ptrdiff_t srcStride,
             s16 * dstBase, ptrdiff_t dstStride)
{
    // u8 values (0..255) have identical bit patterns in u16 and s16,
    // so the u8 -> s16 conversion simply reuses the u8 -> u16 kernel
    // on the reinterpreted destination buffer.
    convert(_size, srcBase, srcStride, (u16*)dstBase, dstStride);
}
1330
1331
} // namespace CAROTENE_NS
1332
1333