Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/3rdparty/carotene/src/convert.cpp
16337 views
1
/*
 * By downloading, copying, installing or using the software you agree to this license.
 * If you do not agree to this license, do not download, install,
 * copy or use the software.
 *
 *
 *                           License Agreement
 *                For Open Source Computer Vision Library
 *                        (3-clause BSD License)
 *
 * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
 * Third party copyrights are property of their respective owners.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the names of the copyright holders nor the names of the contributors
 *     may be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * This software is provided by the copyright holders and contributors "as is" and
 * any express or implied warranties, including, but not limited to, the implied
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall copyright holders or contributors be liable for any direct,
 * indirect, incidental, special, exemplary, or consequential damages
 * (including, but not limited to, procurement of substitute goods or services;
 * loss of use, data, or profits; or business interruption) however caused
 * and on any theory of liability, whether in contract, strict liability,
 * or tort (including negligence or otherwise) arising in any way out of
 * the use of this software, even if advised of the possibility of such damage.
 */
39
40
#include "common.hpp"
41
42
namespace CAROTENE_NS {
43
44
#ifdef CAROTENE_NEON

/*
 * CVT_FUNC expands to one `convert()` overload that converts a 2D image of
 * element type T1 to element type T2 with saturate_cast semantics.
 *
 * T1, T2     - source / destination element types (u8, s8, u16, s16, s32, f32)
 * SIMD_SIZE  - number of elements processed per vector iteration (power of two)
 * CVTINIT    - statements run once before the row loop (constant registers etc.)
 * CVTROW     - the vectorized inner loop; sees `_src`, `_dst` and `w`, the
 *              width rounded down to a multiple of SIMD_SIZE
 *
 * The scalar tail loop at the bottom covers the remaining
 * (size.width - w) columns of every row using internal::saturate_cast.
 */
#define CVT_FUNC(T1, T2, SIMD_SIZE, CVTINIT, CVTROW)                                \
    void convert(const Size2D &_size,                                               \
                 const T1 * srcBase, ptrdiff_t srcStride,                           \
                 T2 * dstBase, ptrdiff_t dstStride)                                 \
    {                                                                               \
        internal::assertSupportedConfiguration();                                   \
        Size2D size(_size);                                                         \
        /* Collapse the image to a single long row when both strides equal the   */ \
        /* row width.  NOTE(review): strides are in bytes and width in elements, */ \
        /* so this only fires for 1-byte types — conservative but safe; confirm  */ \
        /* against callers before widening the condition.                        */ \
        if (srcStride == dstStride &&                                               \
            srcStride == (ptrdiff_t)(size.width))                                   \
        {                                                                           \
            size.width *= size.height;                                              \
            size.height = 1;                                                        \
        }                                                                           \
        const ptrdiff_t sstep = srcStride / sizeof(T1);                             \
        const ptrdiff_t dstep = dstStride / sizeof(T2);                             \
        const size_t w = size.width & ~(SIMD_SIZE-1);                               \
        if (size.width >= SIMD_SIZE)                                                \
        {                                                                           \
            const T1* _src = srcBase;                                               \
            T2* _dst = dstBase;                                                     \
            CVTINIT                                                                 \
            for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep )     \
                CVTROW                                                              \
        }                                                                           \
        if(w < size.width)                                                          \
        {                                                                           \
            const T1* _src = srcBase;                                               \
            T2* _dst = dstBase;                                                     \
            for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep )     \
                for(size_t i = w; i < size.width; i++ )                             \
                    _dst[i] = internal::saturate_cast<T2>(_src[i]);                 \
        }                                                                           \
    }

#else

/* Without NEON support the conversion entry points only assert; callers are
 * expected to check isSupportedConfiguration() first. */
#define CVT_FUNC(T1, T2, SIMD_SIZE, CVTINIT, CVTROW)                                \
    void convert(const Size2D &,                                                    \
                 const T1 *, ptrdiff_t,                                             \
                 T2 *, ptrdiff_t)                                                   \
    {                                                                               \
        internal::assertSupportedConfiguration();                                   \
    }

#endif

CVT_FUNC(u8, s8, 16,
93
uint8x16_t v127 = vdupq_n_u8(127);,
94
{
95
for (size_t i = 0; i < w; i += 16)
96
{
97
internal::prefetch(_src + i);
98
uint8x16_t vu8 = vld1q_u8(_src + i);
99
int8x16_t vu1 = vreinterpretq_s8_u8(vminq_u8(vu8, v127));
100
vst1q_s8(_dst + i, vu1);
101
}
102
})
103
104
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
105
CVT_FUNC(u8, u16, 16,
106
register uint8x16_t zero0 asm ("q1") = vmovq_n_u8(0);,
107
{
108
for (size_t i = 0; i < w; i += 16)
109
{
110
internal::prefetch(_src + i);
111
__asm__ (
112
"vld1.8 {d0-d1}, [%[src]] \n\t"
113
"vst2.8 {d0,d2}, [%[dst1]] \n\t"
114
"vst2.8 {d1,d3}, [%[dst2]] \n\t"
115
: /*no output*/
116
: [src] "r" (_src + i),
117
[dst1] "r" (_dst + i + 0),
118
[dst2] "r" (_dst + i + 8),
119
"w" (zero0)
120
: "d0","d1"
121
);
122
}
123
})
124
#else
125
CVT_FUNC(u8, u16, 16,
126
uint8x16x2_t vline;
127
vline.val[1] = vmovq_n_u8(0);,
128
{
129
for (size_t i = 0; i < w; i += 16)
130
{
131
internal::prefetch(_src + i);
132
vline.val[0] = vld1q_u8(_src + i);
133
vst2q_u8((uint8_t*)(_dst + i), vline);
134
}
135
})
136
#endif
137
138
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
139
CVT_FUNC(u8, s32, 16,
140
register uint8x16_t zero0 asm ("q1") = vmovq_n_u8(0);
141
register uint8x16_t zero1 asm ("q2") = vmovq_n_u8(0);
142
register uint8x16_t zero2 asm ("q3") = vmovq_n_u8(0);,
143
{
144
for (size_t i = 0; i < w; i += 16)
145
{
146
internal::prefetch(_src + i);
147
__asm__ (
148
"vld1.8 {d0-d1}, [%[src]] \n\t"
149
"vst4.8 {d0,d2,d4,d6}, [%[dst1]] \n\t"
150
"vst4.8 {d1,d3,d5,d7}, [%[dst2]] \n\t"
151
: /*no output*/
152
: [src] "r" (_src + i),
153
[dst1] "r" (_dst + i + 0),
154
[dst2] "r" (_dst + i + 8),
155
"w" (zero0), "w" (zero1), "w" (zero2)
156
: "d0","d1"
157
);
158
}
159
})
160
#else
161
CVT_FUNC(u8, s32, 16,
162
uint8x16x4_t vline;
163
vline.val[1] = vmovq_n_u8(0);
164
vline.val[2] = vmovq_n_u8(0);
165
vline.val[3] = vmovq_n_u8(0);,
166
{
167
for (size_t i = 0; i < w; i += 16)
168
{
169
internal::prefetch(_src + i);
170
vline.val[0] = vld1q_u8(_src + i);
171
vst4q_u8((uint8_t*)(_dst + i), vline);
172
}
173
})
174
#endif
175
176
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
177
CVT_FUNC(u8, f32, 16,
178
,
179
{
180
for (size_t i = 0; i < w; i += 16)
181
{
182
internal::prefetch(_src + i);
183
__asm__ (
184
"vld1.8 {d0-d1}, [%[src]] \n\t"
185
"vmovl.u8 q1, d0 \n\t"
186
"vmovl.u8 q2, d1 \n\t"
187
"vmovl.u16 q3, d2 \n\t"
188
"vmovl.u16 q4, d3 \n\t"
189
"vmovl.u16 q5, d4 \n\t"
190
"vmovl.u16 q6, d5 \n\t"
191
"vcvt.f32.u32 q7, q3 \n\t"
192
"vcvt.f32.u32 q8, q4 \n\t"
193
"vcvt.f32.u32 q9, q5 \n\t"
194
"vcvt.f32.u32 q10, q6 \n\t"
195
"vst1.32 {d14-d15}, [%[dst1]] \n\t"
196
"vst1.32 {d16-d17}, [%[dst2]] \n\t"
197
"vst1.32 {d18-d19}, [%[dst3]] \n\t"
198
"vst1.32 {d20-d21}, [%[dst4]] \n\t"
199
: /*no output*/
200
: [src] "r" (_src + i),
201
[dst1] "r" (_dst + i + 0),
202
[dst2] "r" (_dst + i + 4),
203
[dst3] "r" (_dst + i + 8),
204
[dst4] "r" (_dst + i + 12)
205
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21"
206
);
207
}
208
})
209
#else
210
CVT_FUNC(u8, f32, 16,
211
,
212
{
213
for (size_t i = 0; i < w; i += 16)
214
{
215
internal::prefetch(_src + i);
216
uint8x16_t vline_u8 = vld1q_u8(_src + i);
217
218
uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8(vline_u8));
219
uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline_u8));
220
221
uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16(vline1_u16));
222
uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));
223
uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16(vline2_u16));
224
uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));
225
226
float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
227
float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
228
float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);
229
float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);
230
231
vst1q_f32(_dst + i, vline1_f32);
232
vst1q_f32(_dst + i + 4, vline2_f32);
233
vst1q_f32(_dst + i + 8, vline3_f32);
234
vst1q_f32(_dst + i + 12, vline4_f32);
235
}
236
})
237
#endif
238
239
CVT_FUNC(s8, u8, 16,
240
int8x16_t vZero = vdupq_n_s8(0);,
241
{
242
for (size_t i = 0; i < w; i += 16)
243
{
244
internal::prefetch(_src + i);
245
int8x16_t vu8 = vld1q_s8(_src + i);
246
uint8x16_t vu1 = vreinterpretq_u8_s8(vmaxq_s8(vu8, vZero));
247
vst1q_u8(_dst + i, vu1);
248
}
249
})
250
251
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
252
CVT_FUNC(s8, u16, 16,
253
register uint8x16_t zero0 asm ("q1") = vmovq_n_u8(0);,
254
{
255
for (size_t i = 0; i < w; i += 16)
256
{
257
internal::prefetch(_src + i);
258
__asm__ (
259
"vld1.8 {d0-d1}, [%[src]] \n\t"
260
"vmax.s8 q0, q1 \n\t"
261
"vst2.8 {d0,d2}, [%[dst1]] \n\t"
262
"vst2.8 {d1,d3}, [%[dst2]] \n\t"
263
: /*no output*/
264
: [src] "r" (_src + i),
265
[dst1] "r" (_dst + i + 0),
266
[dst2] "r" (_dst + i + 8),
267
"w" (zero0)
268
: "d0","d1"
269
);
270
}
271
})
272
#else
273
CVT_FUNC(s8, u16, 16,
274
int8x16x2_t vline_s8;
275
vline_s8.val[1] = vmovq_n_s8(0);,
276
{
277
for (size_t i = 0; i < w; i += 16)
278
{
279
internal::prefetch(_src + i);
280
vline_s8.val[0] = vld1q_s8(_src + i);
281
vline_s8.val[0] = vmaxq_s8(vline_s8.val[0], vline_s8.val[1]);
282
vst2q_s8((int8_t*)(_dst + i), vline_s8);
283
}
284
})
285
#endif
286
287
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
288
CVT_FUNC(s8, s16, 16,
289
,
290
{
291
for (size_t i = 0; i < w; i += 16)
292
{
293
internal::prefetch(_src + i);
294
__asm__ (
295
"vld1.8 {d0-d1}, [%[src]] \n\t"
296
"vmovl.s8 q1, d0 \n\t"
297
"vmovl.s8 q2, d1 \n\t"
298
"vst1.16 {d2-d3}, [%[dst1]] \n\t"
299
"vst1.16 {d4-d5}, [%[dst2]] \n\t"
300
: /*no output*/
301
: [src] "r" (_src + i),
302
[dst1] "r" (_dst + i + 0),
303
[dst2] "r" (_dst + i + 8)
304
: "d0","d1","d2","d3","d4","d5"
305
);
306
}
307
})
308
#else
309
CVT_FUNC(s8, s16, 16,
310
,
311
{
312
for (size_t i = 0; i < w; i += 16)
313
{
314
internal::prefetch(_src + i);
315
int8x16_t vline_s8 = vld1q_s8(_src + i);
316
317
int16x8_t vline1_s16 = vmovl_s8(vget_low_s8(vline_s8));
318
int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline_s8));
319
320
vst1q_s16(_dst + i, vline1_s16);
321
vst1q_s16(_dst + i + 8, vline2_s16);
322
}
323
})
324
#endif
325
326
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
327
CVT_FUNC(s8, s32, 16,
328
,
329
{
330
for (size_t i = 0; i < w; i += 16)
331
{
332
internal::prefetch(_src + i);
333
__asm__ (
334
"vld1.8 {d0-d1}, [%[src]] \n\t"
335
"vmovl.s8 q1, d0 \n\t"
336
"vmovl.s8 q2, d1 \n\t"
337
"vmovl.s16 q3, d2 \n\t"
338
"vmovl.s16 q4, d3 \n\t"
339
"vmovl.s16 q5, d4 \n\t"
340
"vmovl.s16 q6, d5 \n\t"
341
"vst1.32 {d6-d7}, [%[dst1]] \n\t"
342
"vst1.32 {d8-d9}, [%[dst2]] \n\t"
343
"vst1.32 {d10-d11}, [%[dst3]] \n\t"
344
"vst1.32 {d12-d13}, [%[dst4]] \n\t"
345
: /*no output*/
346
: [src] "r" (_src + i),
347
[dst1] "r" (_dst + i + 0),
348
[dst2] "r" (_dst + i + 4),
349
[dst3] "r" (_dst + i + 8),
350
[dst4] "r" (_dst + i + 12)
351
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13"
352
);
353
}
354
})
355
#else
356
CVT_FUNC(s8, s32, 16,
357
,
358
{
359
for (size_t i = 0; i < w; i += 16)
360
{
361
internal::prefetch(_src + i);
362
int8x16_t vline_s8 = vld1q_s8(_src + i);
363
364
int16x8_t vline1_s16 = vmovl_s8(vget_low_s8(vline_s8));
365
int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline_s8));
366
367
int32x4_t vline1_s32 = vmovl_s16(vget_low_s16(vline1_s16));
368
int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
369
int32x4_t vline3_s32 = vmovl_s16(vget_low_s16(vline2_s16));
370
int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
371
372
vst1q_s32(_dst + i, vline1_s32);
373
vst1q_s32(_dst + i + 4, vline2_s32);
374
vst1q_s32(_dst + i + 8, vline3_s32);
375
vst1q_s32(_dst + i + 12, vline4_s32);
376
}
377
})
378
#endif
379
380
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
381
CVT_FUNC(s8, f32, 16,
382
,
383
{
384
for (size_t i = 0; i < w; i += 16)
385
{
386
internal::prefetch(_src + i);
387
__asm__ (
388
"vld1.8 {d0-d1}, [%[src]] \n\t"
389
"vmovl.s8 q1, d0 \n\t"
390
"vmovl.s8 q2, d1 \n\t"
391
"vmovl.s16 q3, d2 \n\t"
392
"vmovl.s16 q4, d3 \n\t"
393
"vmovl.s16 q5, d4 \n\t"
394
"vmovl.s16 q6, d5 \n\t"
395
"vcvt.f32.s32 q7, q3 \n\t"
396
"vcvt.f32.s32 q8, q4 \n\t"
397
"vcvt.f32.s32 q9, q5 \n\t"
398
"vcvt.f32.s32 q10, q6 \n\t"
399
"vst1.32 {d14-d15}, [%[dst1]] \n\t"
400
"vst1.32 {d16-d17}, [%[dst2]] \n\t"
401
"vst1.32 {d18-d19}, [%[dst3]] \n\t"
402
"vst1.32 {d20-d21}, [%[dst4]] \n\t"
403
: /*no output*/
404
: [src] "r" (_src + i),
405
[dst1] "r" (_dst + i + 0),
406
[dst2] "r" (_dst + i + 4),
407
[dst3] "r" (_dst + i + 8),
408
[dst4] "r" (_dst + i + 12)
409
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21"
410
);
411
}
412
})
413
#else
414
CVT_FUNC(s8, f32, 16,
415
,
416
{
417
for (size_t i = 0; i < w; i += 16)
418
{
419
internal::prefetch(_src + i);
420
int8x16_t vline_s8 = vld1q_s8(_src + i);
421
422
int16x8_t vline1_s16 = vmovl_s8(vget_low_s8(vline_s8));
423
int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline_s8));
424
425
int32x4_t vline1_s32 = vmovl_s16(vget_low_s16(vline1_s16));
426
int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
427
int32x4_t vline3_s32 = vmovl_s16(vget_low_s16(vline2_s16));
428
int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
429
430
float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
431
float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
432
float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
433
float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
434
435
vst1q_f32(_dst + i, vline1_f32);
436
vst1q_f32(_dst + i + 4, vline2_f32);
437
vst1q_f32(_dst + i + 8, vline3_f32);
438
vst1q_f32(_dst + i + 12, vline4_f32);
439
}
440
})
441
#endif
442
443
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
444
CVT_FUNC(u16, u8, 16,
445
,
446
{
447
for (size_t i = 0; i < w; i += 16)
448
{
449
internal::prefetch(_src + i);
450
__asm__ (
451
"vld1.8 {d0-d1}, [%[src1]] \n\t"
452
"vqmovn.u16 d4, q0 \n\t"
453
"vld1.8 {d2-d3}, [%[src2]] \n\t"
454
"vqmovn.u16 d5, q1 \n\t"
455
"vst1.8 {d4-d5}, [%[dst]] \n\t"
456
: /*no output*/
457
: [src1] "r" (_src + i),
458
[src2] "r" (_src + i + 8),
459
[dst] "r" (_dst + i + 0)
460
: "d0","d1","d2","d3","d4","d5"
461
);
462
}
463
})
464
#else
465
CVT_FUNC(u16, u8, 16,
466
,
467
{
468
for (size_t i = 0; i < w; i += 16)
469
{
470
internal::prefetch(_src + i);
471
uint16x8_t vline1_u16 = vld1q_u16(_src + i);
472
uint16x8_t vline2_u16 = vld1q_u16(_src + i + 8);
473
474
uint8x8_t vline1_u8 = vqmovn_u16(vline1_u16);
475
uint8x8_t vline2_u8 = vqmovn_u16(vline2_u16);
476
477
vst1q_u8(_dst + i, vcombine_u8(vline1_u8, vline2_u8));
478
}
479
})
480
#endif
481
482
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
483
CVT_FUNC(u16, s8, 16,
484
register uint8x16_t v127 asm ("q4") = vmovq_n_u8(127);,
485
{
486
for (size_t i = 0; i < w; i += 16)
487
{
488
internal::prefetch(_src + i);
489
__asm__ (
490
"vld1.8 {d0-d1}, [%[src1]] \n\t"
491
"vqmovn.u16 d4, q0 \n\t"
492
"vld1.8 {d2-d3}, [%[src2]] \n\t"
493
"vqmovn.u16 d5, q1 \n\t"
494
"vmin.u8 q3, q2, q4 \n\t"
495
"vst1.8 {d6-d7}, [%[dst]] \n\t"
496
: /*no output*/
497
: [src1] "r" (_src + i),
498
[src2] "r" (_src + i + 8),
499
[dst] "r" (_dst + i + 0),
500
"w" (v127)
501
: "d0","d1","d2","d3","d4","d5","d6","d7"
502
);
503
}
504
})
505
#else
506
CVT_FUNC(u16, s8, 16,
507
uint8x8_t v127 = vmov_n_u8(127);,
508
{
509
for (size_t i = 0; i < w; i += 16)
510
{
511
internal::prefetch(_src + i);
512
uint16x8_t vline1_u16 = vld1q_u16(_src + i);
513
uint16x8_t vline2_u16 = vld1q_u16(_src + i + 8);
514
515
uint8x8_t vline1_u8 = vqmovn_u16(vline1_u16);
516
uint8x8_t vline2_u8 = vqmovn_u16(vline2_u16);
517
vline1_u8 = vmin_u8(vline1_u8, v127);
518
vline2_u8 = vmin_u8(vline2_u8, v127);
519
520
vst1q_s8(_dst + i, vcombine_s8(vreinterpret_s8_u8(vline1_u8), vreinterpret_s8_u8(vline2_u8)));
521
}
522
})
523
#endif
524
525
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
526
CVT_FUNC(u16, s16, 8,
527
register uint16x8_t v32767 asm ("q4") = vmovq_n_u16(0x7FFF);,
528
{
529
for (size_t i = 0; i < w; i += 8)
530
{
531
internal::prefetch(_src + i);
532
__asm__ (
533
"vld1.16 {d0-d1}, [%[src]] \n\t"
534
"vmin.u16 q1, q0, q4 \n\t"
535
"vst1.16 {d2-d3}, [%[dst]] \n\t"
536
: /*no output*/
537
: [src] "r" (_src + i),
538
[dst] "r" (_dst + i + 0),
539
"w" (v32767)
540
: "d0","d1","d2","d3"
541
);
542
}
543
})
544
#else
545
CVT_FUNC(u16, s16, 8,
546
uint16x8_t v32767 = vmovq_n_u16(0x7FFF);,
547
{
548
for (size_t i = 0; i < w; i += 8)
549
{
550
internal::prefetch(_src + i);
551
uint16x8_t vline_u16 = vld1q_u16(_src + i);
552
vline_u16 = vminq_u16(vline_u16, v32767);
553
vst1q_s16((_dst + i), vreinterpretq_s16_u16(vline_u16));
554
}
555
})
556
#endif
557
558
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
559
CVT_FUNC(u16, s32, 8,
560
register uint16x8_t zero0 asm ("q1") = vmovq_n_u16(0);,
561
{
562
for (size_t i = 0; i < w; i += 8)
563
{
564
internal::prefetch(_src + i);
565
__asm__ (
566
"vld1.16 {d0-d1}, [%[src]] \n\t"
567
"vst2.16 {d0,d2}, [%[dst1]] \n\t"
568
"vst2.16 {d1,d3}, [%[dst2]] \n\t"
569
: /*no output*/
570
: [src] "r" (_src + i),
571
[dst1] "r" (_dst + i),
572
[dst2] "r" (_dst + i + 4),
573
"w" (zero0)
574
: "d0","d1"//,"d2","d3"//,"d4","d5","d6","d7"
575
);
576
}
577
})
578
#else
579
CVT_FUNC(u16, s32, 8,
580
uint16x8x2_t vline;
581
vline.val[1] = vmovq_n_u16(0);,
582
{
583
for (size_t i = 0; i < w; i += 8)
584
{
585
internal::prefetch(_src + i);
586
vline.val[0] = vld1q_u16(_src + i);
587
vst2q_u16((uint16_t*)(_dst + i), vline);
588
}
589
})
590
#endif
591
592
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
593
CVT_FUNC(u16, f32, 8,
594
,
595
{
596
for (size_t i = 0; i < w; i += 8)
597
{
598
internal::prefetch(_src + i);
599
__asm__ (
600
"vld1.16 {d0-d1}, [%[src]] \n\t"
601
"vmovl.u16 q1, d0 \n\t"
602
"vmovl.u16 q2, d1 \n\t"
603
"vcvt.f32.u32 q3, q1 \n\t"
604
"vcvt.f32.u32 q4, q2 \n\t"
605
"vst1.32 {d6-d7}, [%[dst1]] \n\t"
606
"vst1.32 {d8-d9}, [%[dst2]] \n\t"
607
: /*no output*/
608
: [src] "r" (_src + i),
609
[dst1] "r" (_dst + i + 0),
610
[dst2] "r" (_dst + i + 4)
611
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9"
612
);
613
}
614
})
615
#else
616
CVT_FUNC(u16, f32, 8,
617
,
618
{
619
for (size_t i = 0; i < w; i += 8)
620
{
621
internal::prefetch(_src + i);
622
uint16x8_t vline_u16 = vld1q_u16(_src + i);
623
624
uint32x4_t vline_u32_lo = vmovl_u16(vget_low_u16(vline_u16));
625
uint32x4_t vline_u32_hi = vmovl_u16(vget_high_u16(vline_u16));
626
627
float32x4_t vline_f32_lo = vcvtq_f32_u32(vline_u32_lo);
628
float32x4_t vline_f32_hi = vcvtq_f32_u32(vline_u32_hi);
629
630
vst1q_f32(_dst + i, vline_f32_lo);
631
vst1q_f32(_dst + i + 4, vline_f32_hi);
632
}
633
})
634
#endif
635
636
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
637
CVT_FUNC(s16, u8, 16,
638
,
639
{
640
for (size_t i = 0; i < w; i += 16)
641
{
642
internal::prefetch(_src + i);
643
__asm__ (
644
"vld1.8 {d0-d1}, [%[src1]] \n\t"
645
"vld1.8 {d2-d3}, [%[src2]] \n\t"
646
"vqmovun.s16 d4, q0 \n\t"
647
"vqmovun.s16 d5, q1 \n\t"
648
"vst1.8 {d4-d5}, [%[dst]] \n\t"
649
: /*no output*/
650
: [src1] "r" (_src + i),
651
[src2] "r" (_src + i + 8),
652
[dst] "r" (_dst + i + 0)
653
: "d0","d1","d2","d3","d4","d5"
654
);
655
}
656
})
657
#else
658
CVT_FUNC(s16, u8, 16,
659
,
660
{
661
for (size_t i = 0; i < w; i += 16)
662
{
663
internal::prefetch(_src + i);
664
int16x8_t vline1_s16 = vld1q_s16(_src + i);
665
int16x8_t vline2_s16 = vld1q_s16(_src + i + 8);
666
667
uint8x8_t vline1_u8 = vqmovun_s16(vline1_s16);
668
uint8x8_t vline2_u8 = vqmovun_s16(vline2_s16);
669
670
vst1q_u8(_dst + i, vcombine_u8(vline1_u8, vline2_u8));
671
}
672
})
673
#endif
674
675
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
676
CVT_FUNC(s16, s8, 16,
677
,
678
{
679
for (size_t i = 0; i < w; i += 16)
680
{
681
internal::prefetch(_src + i);
682
__asm__ (
683
"vld1.8 {d0-d1}, [%[src1]] \n\t"
684
"vld1.8 {d2-d3}, [%[src2]] \n\t"
685
"vqmovn.s16 d4, q0 \n\t"
686
"vqmovn.s16 d5, q1 \n\t"
687
"vst1.8 {d4-d5}, [%[dst]] \n\t"
688
: /*no output*/
689
: [src1] "r" (_src + i),
690
[src2] "r" (_src + i + 8),
691
[dst] "r" (_dst + i + 0)
692
: "d0","d1","d2","d3","d4","d5"
693
);
694
}
695
})
696
#else
697
CVT_FUNC(s16, s8, 16,
698
,
699
{
700
for (size_t i = 0; i < w; i += 16)
701
{
702
internal::prefetch(_src + i);
703
int16x8_t vline1_s16 = vld1q_s16(_src + i);
704
int16x8_t vline2_s16 = vld1q_s16(_src + i + 8);
705
706
int8x8_t vline1_s8 = vqmovn_s16(vline1_s16);
707
int8x8_t vline2_s8 = vqmovn_s16(vline2_s16);
708
709
vst1q_s8(_dst + i, vcombine_s8(vline1_s8, vline2_s8));
710
}
711
})
712
#endif
713
714
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
715
CVT_FUNC(s16, u16, 8,
716
register int16x8_t vZero asm ("q4") = vmovq_n_s16(0);,
717
{
718
for (size_t i = 0; i < w; i += 8)
719
{
720
internal::prefetch(_src + i);
721
__asm__ (
722
"vld1.16 {d0-d1}, [%[src]] \n\t"
723
"vmax.s16 q1, q0, q4 \n\t"
724
"vst1.16 {d2-d3}, [%[dst]] \n\t"
725
: /*no output*/
726
: [src] "r" (_src + i),
727
[dst] "r" (_dst + i + 0),
728
"w" (vZero)
729
: "d0","d1","d2","d3"
730
);
731
}
732
})
733
#else
734
CVT_FUNC(s16, u16, 8,
735
int16x4_t vZero = vmov_n_s16(0);,
736
{
737
for (size_t i = 0; i < w; i += 8)
738
{
739
internal::prefetch(_src + i);
740
int16x8_t vline_s16 = vld1q_s16(_src + i);
741
742
int16x4_t vline_s16_lo = vmax_s16(vget_low_s16(vline_s16), vZero);
743
int16x4_t vline_s16_hi = vmax_s16(vget_high_s16(vline_s16), vZero);
744
745
vst1q_u16(_dst + i, vcombine_u16(vreinterpret_u16_s16(vline_s16_lo), vreinterpret_u16_s16(vline_s16_hi)));
746
}
747
})
748
#endif
749
750
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
751
CVT_FUNC(s16, s32, 8,
752
,
753
{
754
for (size_t i = 0; i < w; i += 8)
755
{
756
internal::prefetch(_src + i);
757
__asm__ (
758
"vld1.16 {d0-d1}, [%[src]] \n\t"
759
"vmovl.s16 q1, d0 \n\t"
760
"vmovl.s16 q2, d1 \n\t"
761
"vst1.32 {d2-d3}, [%[dst1]] \n\t"
762
"vst1.32 {d4-d5}, [%[dst2]] \n\t"
763
: /*no output*/
764
: [src] "r" (_src + i),
765
[dst1] "r" (_dst + i + 0),
766
[dst2] "r" (_dst + i + 4)
767
: "d0","d1","d2","d3","d4","d5"
768
);
769
}
770
})
771
#else
772
CVT_FUNC(s16, s32, 8,
773
,
774
{
775
for (size_t i = 0; i < w; i += 8)
776
{
777
internal::prefetch(_src + i);
778
int16x8_t vline_s16 = vld1q_s16(_src + i);
779
780
int32x4_t vline_s32_lo = vmovl_s16(vget_low_s16(vline_s16));
781
int32x4_t vline_s32_hi = vmovl_s16(vget_high_s16(vline_s16));
782
783
vst1q_s32(_dst + i, vline_s32_lo);
784
vst1q_s32(_dst + i + 4, vline_s32_hi);
785
}
786
})
787
#endif
788
789
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
790
CVT_FUNC(s16, f32, 8,
791
,
792
{
793
for (size_t i = 0; i < w; i += 8)
794
{
795
internal::prefetch(_src + i);
796
__asm__ (
797
"vld1.16 {d0-d1}, [%[src]] \n\t"
798
"vmovl.s16 q1, d0 \n\t"
799
"vmovl.s16 q2, d1 \n\t"
800
"vcvt.f32.s32 q3, q1 \n\t"
801
"vcvt.f32.s32 q4, q2 \n\t"
802
"vst1.32 {d6-d7}, [%[dst1]] \n\t"
803
"vst1.32 {d8-d9}, [%[dst2]] \n\t"
804
: /*no output*/
805
: [src] "r" (_src + i),
806
[dst1] "r" (_dst + i + 0),
807
[dst2] "r" (_dst + i + 4)
808
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9"
809
);
810
}
811
})
812
#else
813
CVT_FUNC(s16, f32, 8,
814
,
815
{
816
for (size_t i = 0; i < w; i += 8)
817
{
818
internal::prefetch(_src + i);
819
int16x8_t vline_s16 = vld1q_s16(_src + i);
820
821
int32x4_t vline_s32_lo = vmovl_s16(vget_low_s16(vline_s16));
822
int32x4_t vline_s32_hi = vmovl_s16(vget_high_s16(vline_s16));
823
float32x4_t vline_f32_lo = vcvtq_f32_s32(vline_s32_lo);
824
float32x4_t vline_f32_hi = vcvtq_f32_s32(vline_s32_hi);
825
826
vst1q_f32(_dst + i, vline_f32_lo);
827
vst1q_f32(_dst + i + 4, vline_f32_hi);
828
}
829
})
830
#endif
831
832
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
833
CVT_FUNC(s32, u8, 8,
834
,
835
{
836
for (size_t i = 0; i < w; i += 8)
837
{
838
internal::prefetch(_src + i);
839
__asm__ (
840
"vld1.32 {d0-d1}, [%[src1]] \n\t"
841
"vld1.32 {d2-d3}, [%[src2]] \n\t"
842
"vqmovun.s32 d4, q0 \n\t"
843
"vqmovun.s32 d5, q1 \n\t"
844
"vqmovn.u16 d6, q2 \n\t"
845
"vst1.8 {d6}, [%[dst]] \n\t"
846
: /*no output*/
847
: [src1] "r" (_src + i + 0),
848
[src2] "r" (_src + i + 4),
849
[dst] "r" (_dst + i)
850
: "d0","d1","d2","d3","d4","d5","d6"
851
);
852
}
853
})
854
#else
855
CVT_FUNC(s32, u8, 8,
856
,
857
{
858
for (size_t i = 0; i < w; i += 8)
859
{
860
internal::prefetch(_src + i);
861
int32x4_t vline1_s32 = vld1q_s32(_src + i);
862
int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);
863
864
uint16x4_t vline1_u16 = vqmovun_s32(vline1_s32);
865
uint16x4_t vline2_u16 = vqmovun_s32(vline2_s32);
866
uint8x8_t vline_u8 = vqmovn_u16(vcombine_u16(vline1_u16, vline2_u16));
867
868
vst1_u8(_dst + i, vline_u8);
869
}
870
})
871
#endif
872
873
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
874
CVT_FUNC(s32, s8, 8,
875
,
876
{
877
for (size_t i = 0; i < w; i += 8)
878
{
879
internal::prefetch(_src + i);
880
__asm__ (
881
"vld1.32 {d0-d1}, [%[src1]] \n\t"
882
"vld1.32 {d2-d3}, [%[src2]] \n\t"
883
"vqmovn.s32 d4, q0 \n\t"
884
"vqmovn.s32 d5, q1 \n\t"
885
"vqmovn.s16 d6, q2 \n\t"
886
"vst1.8 {d6}, [%[dst]] \n\t"
887
: /*no output*/
888
: [src1] "r" (_src + i + 0),
889
[src2] "r" (_src + i + 4),
890
[dst] "r" (_dst + i)
891
: "d0","d1","d2","d3","d4","d5","d6"
892
);
893
}
894
})
895
#else
896
CVT_FUNC(s32, s8, 8,
897
,
898
{
899
for (size_t i = 0; i < w; i += 8)
900
{
901
internal::prefetch(_src + i);
902
int32x4_t vline1_s32 = vld1q_s32(_src + i);
903
int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);
904
905
int16x4_t vline1_s16 = vqmovn_s32(vline1_s32);
906
int16x4_t vline2_s16 = vqmovn_s32(vline2_s32);
907
int8x8_t vline_s8 = vqmovn_s16(vcombine_s16(vline1_s16, vline2_s16));
908
909
vst1_s8(_dst + i, vline_s8);
910
}
911
})
912
#endif
913
914
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
915
CVT_FUNC(s32, u16, 8,
916
,
917
{
918
for (size_t i = 0; i < w; i += 8)
919
{
920
internal::prefetch(_src + i);
921
__asm__ (
922
"vld1.32 {d0-d1}, [%[src1]] \n\t"
923
"vld1.32 {d2-d3}, [%[src2]] \n\t"
924
"vqmovun.s32 d4, q0 \n\t"
925
"vqmovun.s32 d5, q1 \n\t"
926
"vst1.16 {d4-d5}, [%[dst]] \n\t"
927
: /*no output*/
928
: [src1] "r" (_src + i + 0),
929
[src2] "r" (_src + i + 4),
930
[dst] "r" (_dst + i)
931
: "d0","d1","d2","d3","d4","d5"
932
);
933
}
934
})
935
#else
936
CVT_FUNC(s32, u16, 8,
937
,
938
{
939
for (size_t i = 0; i < w; i += 8)
940
{
941
internal::prefetch(_src + i);
942
int32x4_t vline1_s32 = vld1q_s32(_src + i);
943
int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);
944
945
uint16x4_t vline1_u16 = vqmovun_s32(vline1_s32);
946
uint16x4_t vline2_u16 = vqmovun_s32(vline2_s32);
947
948
vst1q_u16(_dst + i, vcombine_u16(vline1_u16, vline2_u16));
949
}
950
})
951
#endif
952
953
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
954
CVT_FUNC(s32, s16, 8,
955
,
956
{
957
for (size_t i = 0; i < w; i += 8)
958
{
959
internal::prefetch(_src + i);
960
__asm__ (
961
"vld1.32 {d0-d1}, [%[src1]] \n\t"
962
"vld1.32 {d2-d3}, [%[src2]] \n\t"
963
"vqmovn.s32 d4, q0 \n\t"
964
"vqmovn.s32 d5, q1 \n\t"
965
"vst1.8 {d4-d5}, [%[dst]] \n\t"
966
: /*no output*/
967
: [src1] "r" (_src + i + 0),
968
[src2] "r" (_src + i + 4),
969
[dst] "r" (_dst + i)
970
: "d0","d1","d2","d3","d4","d5"
971
);
972
}
973
})
974
#else
975
CVT_FUNC(s32, s16, 8,
976
,
977
{
978
for (size_t i = 0; i < w; i += 8)
979
{
980
internal::prefetch(_src + i);
981
int32x4_t vline1_s32 = vld1q_s32(_src + i);
982
int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);
983
984
int16x4_t vline1_s16 = vqmovn_s32(vline1_s32);
985
int16x4_t vline2_s16 = vqmovn_s32(vline2_s32);
986
987
vst1q_s16(_dst + i, vcombine_s16(vline1_s16, vline2_s16));
988
}
989
})
990
#endif
991
992
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
993
CVT_FUNC(s32, f32, 8,
994
,
995
{
996
for (size_t i = 0; i < w; i += 8)
997
{
998
internal::prefetch(_src + i);
999
__asm__ (
1000
"vld1.32 {d0-d1}, [%[src]] \n\t"
1001
"vcvt.f32.s32 q1, q0 \n\t"
1002
"vst1.32 {d2-d3}, [%[dst]] \n\t"
1003
: /*no output*/
1004
: [src] "r" (_src + i),
1005
[dst] "r" (_dst + i)
1006
: "d0","d1","d2","d3"//,"d4","d5"
1007
);
1008
__asm__ (
1009
"vld1.32 {d0-d1}, [%[src]] \n\t"
1010
"vcvt.f32.s32 q1, q0 \n\t"
1011
"vst1.32 {d2-d3}, [%[dst]] \n\t"
1012
: /*no output*/
1013
: [src] "r" (_src + i + 4),
1014
[dst] "r" (_dst + i + 4)
1015
: "d0","d1","d2","d3"//,"d4","d5"
1016
);
1017
}
1018
})
1019
#else
1020
CVT_FUNC(s32, f32, 8,
1021
,
1022
{
1023
for (size_t i = 0; i < w; i += 8)
1024
{
1025
internal::prefetch(_src + i);
1026
int32x4_t vline_s32 = vld1q_s32(_src + i);
1027
float32x4_t vline_f32 = vcvtq_f32_s32(vline_s32);
1028
vst1q_f32(_dst + i, vline_f32);
1029
1030
vline_s32 = vld1q_s32(_src + i + 4);
1031
vline_f32 = vcvtq_f32_s32(vline_s32);
1032
vst1q_f32(_dst + i + 4, vline_f32);
1033
}
1034
})
1035
#endif
1036
1037
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
1038
CVT_FUNC(f32, u8, 8,
1039
register float32x4_t vmult asm ("q0") = vdupq_n_f32((float)(1 << 16));
1040
register uint32x4_t vmask asm ("q1") = vdupq_n_u32(1<<16);,
1041
{
1042
for (size_t i = 0; i < w; i += 8)
1043
{
1044
internal::prefetch(_src + i);
1045
__asm__ (
1046
"vld1.32 {d4-d5}, [%[src1]] \n\t"
1047
"vld1.32 {d6-d7}, [%[src2]] \n\t"
1048
"vmul.f32 q4, q2, q0 \n\t"
1049
"vmul.f32 q5, q3, q0 \n\t"
1050
"vcvt.u32.f32 q6, q4 \n\t"
1051
"vcvt.u32.f32 q7, q5 \n\t"
1052
"vbic q8, q1, q6 \n\t"
1053
"vbic q9, q1, q7 \n\t"
1054
"vshr.u32 q10, q8, #16 \n\t"
1055
"vshr.u32 q11, q9, #16 \n\t"
1056
"vqsub.u32 q12, q6, q10 \n\t"
1057
"vqsub.u32 q13, q7, q11 \n\t"
1058
"vqrshrn.u32 d28, q12, #16 \n\t"
1059
"vqrshrn.u32 d29, q13, #16 \n\t"
1060
"vqmovn.u16 d30, q14 \n\t"
1061
"vst1.8 {d30}, [%[dst]] \n\t"
1062
: /*no output*/
1063
: [src1] "r" (_src + i + 0),
1064
[src2] "r" (_src + i + 4),
1065
[dst] "r" (_dst + i),
1066
"w" (vmult), "w" (vmask)
1067
: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30"
1068
);
1069
}
1070
})
1071
#else
1072
CVT_FUNC(f32, u8, 8,
1073
float32x4_t vmult = vdupq_n_f32((float)(1 << 16));
1074
uint32x4_t vmask = vdupq_n_u32(1<<16);,
1075
{
1076
for (size_t i = 0; i < w; i += 8)
1077
{
1078
internal::prefetch(_src + i);
1079
float32x4_t vline1_f32 = vld1q_f32(_src + i);
1080
float32x4_t vline2_f32 = vld1q_f32(_src + i + 4);
1081
1082
float32x4_t vline1w_f32 = vmulq_f32(vline1_f32, vmult);
1083
float32x4_t vline2w_f32 = vmulq_f32(vline2_f32, vmult);
1084
1085
uint32x4_t vline1_u32 = vcvtq_u32_f32(vline1w_f32);
1086
uint32x4_t vline2_u32 = vcvtq_u32_f32(vline2w_f32);
1087
1088
uint32x4_t vl1_masked = vbicq_u32(vmask, vline1_u32);
1089
uint32x4_t vl2_masked = vbicq_u32(vmask, vline2_u32);
1090
uint32x4_t vl1_masked2 = vshrq_n_u32(vl1_masked, 16);
1091
uint32x4_t vl2_masked2 = vshrq_n_u32(vl2_masked, 16);
1092
uint32x4_t vline1r_u32 = vqsubq_u32(vline1_u32, vl1_masked2);
1093
uint32x4_t vline2r_u32 = vqsubq_u32(vline2_u32, vl2_masked2);
1094
1095
uint16x4_t vline1_u16 = vqrshrn_n_u32(vline1r_u32, 16);
1096
uint16x4_t vline2_u16 = vqrshrn_n_u32(vline2r_u32, 16);
1097
1098
uint8x8_t vline_u8 = vqmovn_u16(vcombine_u16(vline1_u16, vline2_u16));
1099
vst1_u8(_dst + i, vline_u8);
1100
}
1101
})
1102
#endif
1103
1104
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
1105
CVT_FUNC(f32, s8, 8,
    // Rounding bias pinned to q0 so the inline asm below can reference
    // it by register name (workaround for old GCC NEON codegen).
    register float32x4_t vhalf asm ("q0") = vdupq_n_f32(0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // 8 floats per step: add 0.5, convert f32->s32 (vcvt truncates
        // toward zero), then saturating narrow s32->s16->s8 and store
        // 8 bytes.
        __asm__ (
            "vld1.32 {d2-d3}, [%[src1]] \n\t"
            "vld1.32 {d4-d5}, [%[src2]] \n\t"
            "vadd.f32 q3, q1, q0 \n\t"
            "vadd.f32 q4, q2, q0 \n\t"
            "vcvt.s32.f32 q5, q3 \n\t"
            "vcvt.s32.f32 q6, q4 \n\t"
            "vqmovn.s32 d14, q5 \n\t"
            "vqmovn.s32 d15, q6 \n\t"
            "vqmovn.s16 d16, q7 \n\t"
            "vst1.8 {d16}, [%[dst]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vhalf)
            : "d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17"
        );
    }
})
1131
#else
1132
CVT_FUNC(f32, s8, 8,
1133
float32x4_t vhalf = vdupq_n_f32(0.5f);,
1134
{
1135
for (size_t i = 0; i < w; i += 8)
1136
{
1137
internal::prefetch(_src + i);
1138
float32x4_t vline1_f32 = vld1q_f32(_src + i);
1139
float32x4_t vline2_f32 = vld1q_f32(_src + i + 4);
1140
1141
vline1_f32 = vaddq_f32(vline1_f32, vhalf);
1142
vline2_f32 = vaddq_f32(vline2_f32, vhalf);
1143
1144
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
1145
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
1146
int16x4_t vline1_s16 = vqmovn_s32(vline1_s32);
1147
int16x4_t vline2_s16 = vqmovn_s32(vline2_s32);
1148
1149
int8x8_t vline_s8 = vqmovn_s16(vcombine_s16(vline1_s16, vline2_s16));
1150
1151
vst1_s8(_dst + i, vline_s8);
1152
}
1153
})
1154
#endif
1155
1156
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
1157
CVT_FUNC(f32, u16, 8,
    // Rounding bias pinned to q0 so the inline asm below can reference
    // it by register name (workaround for old GCC NEON codegen).
    register float32x4_t vhalf asm ("q0") = vdupq_n_f32(0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // First 4 lanes: add 0.5, convert f32->u32 (vcvt truncates),
        // saturating narrow u32->u16, store 4 halfwords.
        __asm__ (
            "vld1.32 {d2-d3}, [%[src]] \n\t"
            "vadd.f32 q2, q1, q0 \n\t"
            "vcvt.u32.f32 q3, q2 \n\t"
            "vqmovn.u32 d8, q3 \n\t"
            "vst1.16 {d8}, [%[dst]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst] "r" (_dst + i),
              "w" (vhalf)
            : "d2","d3","d4","d5","d6","d7","d8"
        );
        // Second 4 lanes: identical sequence at offset +4.
        __asm__ (
            "vld1.32 {d2-d3}, [%[src]] \n\t"
            "vadd.f32 q2, q1, q0 \n\t"
            "vcvt.u32.f32 q3, q2 \n\t"
            "vqmovn.u32 d8, q3 \n\t"
            "vst1.16 {d8}, [%[dst]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i + 4),
              [dst] "r" (_dst + i + 4),
              "w" (vhalf)
            : "d2","d3","d4","d5","d6","d7","d8"
        );
    }
})
1189
#else
1190
CVT_FUNC(f32, u16, 8,
1191
float32x4_t vhalf = vdupq_n_f32(0.5f);,
1192
{
1193
for (size_t i = 0; i < w; i += 8)
1194
{
1195
internal::prefetch(_src + i);
1196
float32x4_t vline_f32 = vld1q_f32(_src + i);
1197
1198
vline_f32 = vaddq_f32(vline_f32, vhalf);
1199
uint32x4_t vline_u32 = vcvtq_u32_f32(vline_f32);
1200
uint16x4_t vline_u16 = vqmovn_u32(vline_u32);
1201
1202
vst1_u16(_dst + i, vline_u16);
1203
1204
vline_f32 = vld1q_f32(_src + i + 4);
1205
1206
vline_f32 = vaddq_f32(vline_f32, vhalf);
1207
vline_u32 = vcvtq_u32_f32(vline_f32);
1208
vline_u16 = vqmovn_u32(vline_u32);
1209
1210
vst1_u16(_dst + i + 4, vline_u16);
1211
}
1212
})
1213
#endif
1214
1215
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
1216
CVT_FUNC(f32, s16, 8,
    // Rounding bias pinned to q0 so the inline asm below can reference
    // it by register name (workaround for old GCC NEON codegen).
    register float32x4_t vhalf asm ("q0") = vdupq_n_f32(0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // First 4 lanes: add 0.5, convert f32->s32 (vcvt truncates
        // toward zero), saturating narrow s32->s16, store 4 halfwords.
        __asm__ (
            "vld1.32 {d2-d3}, [%[src]] \n\t"
            "vadd.f32 q2, q1, q0 \n\t"
            "vcvt.s32.f32 q3, q2 \n\t"
            "vqmovn.s32 d8, q3 \n\t"
            "vst1.16 {d8}, [%[dst]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst] "r" (_dst + i),
              "w" (vhalf)
            : "d2","d3","d4","d5","d6","d7","d8"
        );
        // Second 4 lanes: identical sequence at offset +4.
        __asm__ (
            "vld1.32 {d2-d3}, [%[src]] \n\t"
            "vadd.f32 q2, q1, q0 \n\t"
            "vcvt.s32.f32 q3, q2 \n\t"
            "vqmovn.s32 d8, q3 \n\t"
            "vst1.16 {d8}, [%[dst]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i + 4),
              [dst] "r" (_dst + i + 4),
              "w" (vhalf)
            : "d2","d3","d4","d5","d6","d7","d8"
        );
    }
})
1248
#else
1249
CVT_FUNC(f32, s16, 8,
1250
float32x4_t vhalf = vdupq_n_f32(0.5f);,
1251
{
1252
for (size_t i = 0; i < w; i += 8)
1253
{
1254
internal::prefetch(_src + i);
1255
float32x4_t vline_f32 = vld1q_f32(_src + i);
1256
1257
vline_f32 = vaddq_f32(vline_f32, vhalf);
1258
int32x4_t vline_s32 = vcvtq_s32_f32(vline_f32);
1259
int16x4_t vline_s16 = vqmovn_s32(vline_s32);
1260
1261
vst1_s16(_dst + i, vline_s16);
1262
1263
vline_f32 = vld1q_f32(_src + i + 4);
1264
1265
vline_f32 = vaddq_f32(vline_f32, vhalf);
1266
vline_s32 = vcvtq_s32_f32(vline_f32);
1267
vline_s16 = vqmovn_s32(vline_s32);
1268
1269
vst1_s16(_dst + i + 4, vline_s16);
1270
}
1271
})
1272
#endif
1273
1274
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
1275
CVT_FUNC(f32, s32, 8,
    // Rounding bias pinned to q0 so the inline asm below can reference
    // it by register name (workaround for old GCC NEON codegen).
    register float32x4_t vhalf asm ("q0") = vdupq_n_f32(0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // 8 floats per step: add 0.5 and convert f32->s32 (vcvt
        // truncates toward zero); no narrowing needed for s32 output.
        __asm__ (
            "vld1.32 {d2-d3}, [%[src1]] \n\t"
            "vld1.32 {d4-d5}, [%[src2]] \n\t"
            "vadd.f32 q3, q1, q0 \n\t"
            "vadd.f32 q4, q2, q0 \n\t"
            "vcvt.s32.f32 q5, q3 \n\t"
            "vcvt.s32.f32 q6, q4 \n\t"
            "vst1.32 {q5}, [%[dst1]] \n\t"
            "vst1.32 {q6}, [%[dst2]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i),
              [src2] "r" (_src + i + 4),
              [dst1] "r" (_dst + i),
              [dst2] "r" (_dst + i + 4),
              "w" (vhalf)
            : "d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13"
        );
    }
})
1300
#else
1301
CVT_FUNC(f32, s32, 8,
1302
float32x4_t vhalf = vdupq_n_f32(0.5f);,
1303
{
1304
for (size_t i = 0; i < w; i += 8)
1305
{
1306
internal::prefetch(_src + i);
1307
float32x4_t vline_f32 = vld1q_f32(_src + i);
1308
1309
vline_f32 = vaddq_f32(vline_f32, vhalf);
1310
int32x4_t vline_s32 = vcvtq_s32_f32(vline_f32);
1311
1312
vst1q_s32(_dst + i, vline_s32);
1313
1314
vline_f32 = vld1q_f32(_src + i + 4);
1315
1316
vline_f32 = vaddq_f32(vline_f32, vhalf);
1317
vline_s32 = vcvtq_s32_f32(vline_f32);
1318
1319
vst1q_s32(_dst + i + 4, vline_s32);
1320
}
1321
})
1322
#endif
1323
1324
void convert(const Size2D &_size,
             const u8 * srcBase, ptrdiff_t srcStride,
             s16 * dstBase, ptrdiff_t dstStride)
{
    // u8 values (0..255) have identical bit patterns in u16 and s16,
    // so the u8 -> s16 conversion simply reuses the u8 -> u16 kernel
    // on the reinterpreted destination buffer.
    convert(_size, srcBase, srcStride, (u16*)dstBase, dstStride);
}
1330
1331
} // namespace CAROTENE_NS
1332
1333