Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/3rdparty/carotene/src/convert_scale.cpp
16337 views
1
/*
2
* By downloading, copying, installing or using the software you agree to this license.
3
* If you do not agree to this license, do not download, install,
4
* copy or use the software.
5
*
6
*
7
* License Agreement
8
* For Open Source Computer Vision Library
9
* (3-clause BSD License)
10
*
11
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
12
* Third party copyrights are property of their respective owners.
13
*
14
* Redistribution and use in source and binary forms, with or without modification,
15
* are permitted provided that the following conditions are met:
16
*
17
* * Redistributions of source code must retain the above copyright notice,
18
* this list of conditions and the following disclaimer.
19
*
20
* * Redistributions in binary form must reproduce the above copyright notice,
21
* this list of conditions and the following disclaimer in the documentation
22
* and/or other materials provided with the distribution.
23
*
24
* * Neither the names of the copyright holders nor the names of the contributors
25
* may be used to endorse or promote products derived from this software
26
* without specific prior written permission.
27
*
28
* This software is provided by the copyright holders and contributors "as is" and
29
* any express or implied warranties, including, but not limited to, the implied
30
* warranties of merchantability and fitness for a particular purpose are disclaimed.
31
* In no event shall copyright holders or contributors be liable for any direct,
32
* indirect, incidental, special, exemplary, or consequential damages
33
* (including, but not limited to, procurement of substitute goods or services;
34
* loss of use, data, or profits; or business interruption) however caused
35
* and on any theory of liability, whether in contract, strict liability,
36
* or tort (including negligence or otherwise) arising in any way out of
37
* the use of this software, even if advised of the possibility of such damage.
38
*/
39
40
#include "common.hpp"
41
42
namespace CAROTENE_NS {
43
44
#ifdef CAROTENE_NEON

/* Generator for the convertScale() overload converting T1 -> T2 as
 *     dst[i] = saturate_cast<T2>(src[i] * alpha + beta)
 *
 * CVTINIT expands to per-call setup (typically vector constants built from
 * alpha/beta); CVTROW expands to the SIMD loop over one row.  Both macro
 * arguments deliberately reference the names declared here (_src, _dst, w),
 * so those identifiers are part of the macro's contract and must not be
 * renamed.  Note the argument separator idiom at call sites: CVTINIT ends
 * with ";," before the CVTROW brace block.
 *
 * Layout handling:
 *  - When both images are dense (stride equals row width) the whole image
 *    is collapsed into one long row so a single loop covers it.
 *    NOTE(review): the density test compares the byte stride against
 *    size.width (element count); it only collapses correctly for 1-byte
 *    element types and simply never triggers for wider ones -- confirm
 *    this is intentional upstream behavior.
 *  - w = width rounded down to a multiple of SIMD_SIZE; CVTROW handles
 *    [0, w) and a scalar tail loop finishes columns [w, width).
 *    NOTE(review): the SIMD path uses f32 constants (see call sites'
 *    CVTINIT casting alpha/beta to f32) while the tail computes in f64,
 *    so the last partial chunk may round slightly differently. */
#define CVTS_FUNC(T1, T2, SIMD_SIZE, CVTINIT, CVTROW)                             \
    void convertScale(const Size2D &_size,                                        \
                      const T1 * srcBase, ptrdiff_t srcStride,                    \
                      T2 * dstBase, ptrdiff_t dstStride,                          \
                      f64 alpha, f64 beta)                                        \
    {                                                                             \
        internal::assertSupportedConfiguration();                                 \
        Size2D size(_size);                                                       \
        if (srcStride == dstStride &&                                             \
            srcStride == (ptrdiff_t)(size.width))                                 \
        {                                                                         \
            size.width *= size.height;                                            \
            size.height = 1;                                                      \
        }                                                                         \
        const ptrdiff_t sstep = srcStride / sizeof(T1);                           \
        const ptrdiff_t dstep = dstStride / sizeof(T2);                           \
        const size_t w = size.width & ~(SIMD_SIZE-1);                             \
        if (size.width >= SIMD_SIZE)                                              \
        {                                                                         \
            const T1* _src = srcBase;                                             \
            T2* _dst = dstBase;                                                   \
            CVTINIT                                                               \
            for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep )   \
                CVTROW                                                            \
        }                                                                         \
        if(w < size.width)                                                        \
        {                                                                         \
            const T1* _src = srcBase;                                             \
            T2* _dst = dstBase;                                                   \
            for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep )   \
                for(size_t i = w; i < size.width; i++ )                           \
                    _dst[i] = internal::saturate_cast<T2>(_src[i]*alpha + beta);  \
        }                                                                         \
    }

/* Same generator for the in-type == out-type case (T1 -> T1).
 * Identical structure to CVTS_FUNC with T2 = T1; kept separate so call
 * sites read naturally and the two overload sets stay distinct. */
#define CVTS_FUNC1(T1, SIMD_SIZE, CVTSINIT, CVTSROW)                              \
    void convertScale(const Size2D &_size,                                        \
                      const T1 * srcBase, ptrdiff_t srcStride,                    \
                      T1 * dstBase, ptrdiff_t dstStride,                          \
                      f64 alpha, f64 beta)                                        \
    {                                                                             \
        internal::assertSupportedConfiguration();                                 \
        Size2D size(_size);                                                       \
        if (srcStride == dstStride &&                                             \
            srcStride == (ptrdiff_t)(size.width))                                 \
        {                                                                         \
            size.width *= size.height;                                            \
            size.height = 1;                                                      \
        }                                                                         \
        const ptrdiff_t sstep = srcStride / sizeof(T1);                           \
        const ptrdiff_t dstep = dstStride / sizeof(T1);                           \
        const size_t w = size.width & ~(SIMD_SIZE-1);                             \
        if (size.width >= SIMD_SIZE)                                              \
        {                                                                         \
            const T1* _src = srcBase;                                             \
            T1* _dst = dstBase;                                                   \
            CVTSINIT                                                              \
            for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep )   \
                CVTSROW                                                           \
        }                                                                         \
        if(w < size.width)                                                        \
        {                                                                         \
            const T1* _src = srcBase;                                             \
            T1* _dst = dstBase;                                                   \
            for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep )   \
                for(size_t i = w; i < size.width; i++ )                           \
                    _dst[i] = internal::saturate_cast<T1>(_src[i]*alpha + beta);  \
        }                                                                         \
    }

#else

/* Non-NEON builds: every specialization degenerates to a stub that only
 * asserts (convertScale is unsupported without NEON).  The macro arguments
 * are accepted and ignored so the call sites below compile unchanged. */
#define CVTS_FUNC(T1, T2, SIMD_SIZE, CVTINIT, CVTROW)                             \
    void convertScale(const Size2D &,                                             \
                      const T1 *, ptrdiff_t,                                      \
                      T2 *, ptrdiff_t,                                            \
                      f64, f64)                                                   \
    {                                                                             \
        internal::assertSupportedConfiguration();                                 \
    }

/* Non-NEON stub for the T1 -> T1 generator; see CVTS_FUNC above. */
#define CVTS_FUNC1(T1, SIMD_SIZE, CVTSINIT, CVTSROW)                              \
    void convertScale(const Size2D &,                                             \
                      const T1 *, ptrdiff_t,                                      \
                      T1 *, ptrdiff_t,                                            \
                      f64, f64)                                                   \
    {                                                                             \
        internal::assertSupportedConfiguration();                                 \
    }

#endif
137
138
/* u8 -> u8 convertScale, 16 pixels per iteration.
 * 32-bit ARM GNU-compatible compilers get a hand-scheduled inline-asm body
 * pinning vscale/vshift to q0/q1; everything else (AArch64, MSVC, ...) uses
 * the equivalent NEON intrinsics.  beta is biased by +0.5f because the
 * float->int conversion (vcvt.s32.f32) truncates toward zero, which yields
 * round-to-nearest for the non-negative results this path produces.
 * Pipeline: widen u8->u16->u32, convert to f32, x*scale+shift, convert to
 * s32, then saturating-narrow s32->u16->u8. */
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
CVTS_FUNC1(u8, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        /* Clobbers d4-d31 (q2-q15); q0/q1 hold the constants via the "w"
         * input constraints above. */
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]] \n\t"
            "vmovl.u8 q3, d4 \n\t"
            "vmovl.u8 q4, d5 \n\t"
            "vmovl.u16 q5, d6 \n\t"
            "vmovl.u16 q6, d7 \n\t"
            "vmovl.u16 q7, d8 \n\t"
            "vmovl.u16 q8, d9 \n\t"
            "vcvt.f32.u32 q9, q5 \n\t"
            "vcvt.f32.u32 q10, q6 \n\t"
            "vcvt.f32.u32 q11, q7 \n\t"
            "vcvt.f32.u32 q12, q8 \n\t"
            "vmul.f32 q13, q9, q0 \n\t"
            "vmul.f32 q14, q10, q0 \n\t"
            "vmul.f32 q15, q11, q0 \n\t"
            "vmul.f32 q2, q12, q0 \n\t"
            "vadd.f32 q3, q13, q1 \n\t"
            "vadd.f32 q4, q14, q1 \n\t"
            "vadd.f32 q5, q15, q1 \n\t"
            "vadd.f32 q6, q2, q1 \n\t"
            "vcvt.s32.f32 q7, q3 \n\t"
            "vcvt.s32.f32 q8, q4 \n\t"
            "vcvt.s32.f32 q9, q5 \n\t"
            "vcvt.s32.f32 q10, q6 \n\t"
            "vqmovun.s32 d22, q7 \n\t"
            "vqmovun.s32 d23, q8 \n\t"
            "vqmovun.s32 d24, q9 \n\t"
            "vqmovun.s32 d25, q10 \n\t"
            "vqmovn.u16 d26, q11 \n\t"
            "vqmovn.u16 d27, q12 \n\t"
            "vst1.8 {d26-d27}, [%[dst1]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
#else
CVTS_FUNC1(u8, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        // Widen 16 x u8 to 4 x uint32x4.
        uint8x16_t vline = vld1q_u8(_src + i);
        uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline));
        uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));
        uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16));
        uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));
        uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16));
        uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));
        // x * alpha + (beta + 0.5) in f32.
        float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
        float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
        float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);
        float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        // Truncate to s32, then saturating-narrow back to u8.
        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
        int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
        uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
        uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
        vst1q_u8(_dst + i, vcombine_u8(vqmovn_u16(vRes1_u16), vqmovn_u16(vRes2_u16)));
    }
})
#endif
222
223
/* u8 -> s8 convertScale, 16 pixels per iteration.
 * Same structure as the u8 -> u8 kernel above, but the final narrowing is
 * signed-saturating (vqmovn.s32 / vqmovn.s16) so results clamp to
 * [-128, 127].  +0.5f bias compensates for the truncating vcvt. */
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
CVTS_FUNC(u8, s8, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        /* Clobbers d4-d31 (q2-q15). */
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]] \n\t"
            "vmovl.u8 q3, d4 \n\t"
            "vmovl.u8 q4, d5 \n\t"
            "vmovl.u16 q5, d6 \n\t"
            "vmovl.u16 q6, d7 \n\t"
            "vmovl.u16 q7, d8 \n\t"
            "vmovl.u16 q8, d9 \n\t"
            "vcvt.f32.u32 q9, q5 \n\t"
            "vcvt.f32.u32 q10, q6 \n\t"
            "vcvt.f32.u32 q11, q7 \n\t"
            "vcvt.f32.u32 q12, q8 \n\t"
            "vmul.f32 q13, q9, q0 \n\t"
            "vmul.f32 q14, q10, q0 \n\t"
            "vmul.f32 q15, q11, q0 \n\t"
            "vmul.f32 q2, q12, q0 \n\t"
            "vadd.f32 q3, q13, q1 \n\t"
            "vadd.f32 q4, q14, q1 \n\t"
            "vadd.f32 q5, q15, q1 \n\t"
            "vadd.f32 q6, q2, q1 \n\t"
            "vcvt.s32.f32 q7, q3 \n\t"
            "vcvt.s32.f32 q8, q4 \n\t"
            "vcvt.s32.f32 q9, q5 \n\t"
            "vcvt.s32.f32 q10, q6 \n\t"
            "vqmovn.s32 d22, q7 \n\t"
            "vqmovn.s32 d23, q8 \n\t"
            "vqmovn.s32 d24, q9 \n\t"
            "vqmovn.s32 d25, q10 \n\t"
            "vqmovn.s16 d26, q11 \n\t"
            "vqmovn.s16 d27, q12 \n\t"
            "vst1.8 {d26-d27}, [%[dst1]] \n\t"
            : //no output
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
#else
CVTS_FUNC(u8, s8, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        // Widen u8 -> u32.
        uint8x16_t vline = vld1q_u8(_src + i);
        uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline));
        uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));
        uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16));
        uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));
        uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16));
        uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));
        // Scale and shift in f32.
        float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
        float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
        float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);
        float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        // Truncate, then signed saturating narrow s32 -> s16 -> s8.
        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
        int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
        // NOTE(review): the "_u16" suffix on these int16x8_t temporaries is a
        // misnomer kept from the original; they hold signed 16-bit lanes.
        int16x8_t vRes1_u16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
        int16x8_t vRes2_u16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
        vst1q_s8(_dst + i, vcombine_s8(vqmovn_s16(vRes1_u16), vqmovn_s16(vRes2_u16)));
    }
})
#endif
307
308
/* u8 -> u16 convertScale, 16 pixels per iteration.
 * Narrowing stops at 16 bits (vqmovun.s32, unsigned-saturating), so two
 * 8-element stores are issued per chunk.  +0.5f bias compensates for the
 * truncating float->int conversion. */
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
CVTS_FUNC(u8, u16, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        /* Clobbers d4-d31 (q2-q15); results go out through dst1/dst2. */
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]] \n\t"
            "vmovl.u8 q3, d4 \n\t"
            "vmovl.u8 q4, d5 \n\t"
            "vmovl.u16 q5, d6 \n\t"
            "vmovl.u16 q6, d7 \n\t"
            "vmovl.u16 q7, d8 \n\t"
            "vmovl.u16 q8, d9 \n\t"
            "vcvt.f32.u32 q9, q5 \n\t"
            "vcvt.f32.u32 q10, q6 \n\t"
            "vcvt.f32.u32 q11, q7 \n\t"
            "vcvt.f32.u32 q12, q8 \n\t"
            "vmul.f32 q13, q9, q0 \n\t"
            "vmul.f32 q14, q10, q0 \n\t"
            "vmul.f32 q15, q11, q0 \n\t"
            "vmul.f32 q2, q12, q0 \n\t"
            "vadd.f32 q3, q13, q1 \n\t"
            "vadd.f32 q4, q14, q1 \n\t"
            "vadd.f32 q5, q15, q1 \n\t"
            "vadd.f32 q6, q2, q1 \n\t"
            "vcvt.s32.f32 q7, q3 \n\t"
            "vcvt.s32.f32 q8, q4 \n\t"
            "vcvt.s32.f32 q9, q5 \n\t"
            "vcvt.s32.f32 q10, q6 \n\t"
            "vqmovun.s32 d22, q7 \n\t"
            "vqmovun.s32 d23, q8 \n\t"
            "vqmovun.s32 d24, q9 \n\t"
            "vqmovun.s32 d25, q10 \n\t"
            "vst1.16 {d22-d23}, [%[dst1]] \n\t"
            "vst1.16 {d24-d25}, [%[dst2]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 8),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
#else
CVTS_FUNC(u8, u16, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        // Widen u8 -> u32.
        uint8x16_t vline = vld1q_u8(_src + i);
        uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline));
        uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));
        uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16));
        uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));
        uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16));
        uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));
        // Scale and shift in f32.
        float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
        float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
        float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);
        float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        // Truncate, then unsigned saturating narrow s32 -> u16.
        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
        int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
        vst1q_u16(_dst + i + 0, vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32)));
        vst1q_u16(_dst + i + 8, vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32)));
    }
})
#endif
391
392
/* u8 -> s16 convertScale, 16 pixels per iteration.
 * As u8 -> u16 above, but the 32->16-bit narrowing is signed-saturating
 * (vqmovn.s32), clamping to [-32768, 32767]. */
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
CVTS_FUNC(u8, s16, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        /* Clobbers d4-d31 (q2-q15); results go out through dst1/dst2. */
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]] \n\t"
            "vmovl.u8 q3, d4 \n\t"
            "vmovl.u8 q4, d5 \n\t"
            "vmovl.u16 q5, d6 \n\t"
            "vmovl.u16 q6, d7 \n\t"
            "vmovl.u16 q7, d8 \n\t"
            "vmovl.u16 q8, d9 \n\t"
            "vcvt.f32.u32 q9, q5 \n\t"
            "vcvt.f32.u32 q10, q6 \n\t"
            "vcvt.f32.u32 q11, q7 \n\t"
            "vcvt.f32.u32 q12, q8 \n\t"
            "vmul.f32 q13, q9, q0 \n\t"
            "vmul.f32 q14, q10, q0 \n\t"
            "vmul.f32 q15, q11, q0 \n\t"
            "vmul.f32 q2, q12, q0 \n\t"
            "vadd.f32 q3, q13, q1 \n\t"
            "vadd.f32 q4, q14, q1 \n\t"
            "vadd.f32 q5, q15, q1 \n\t"
            "vadd.f32 q6, q2, q1 \n\t"
            "vcvt.s32.f32 q7, q3 \n\t"
            "vcvt.s32.f32 q8, q4 \n\t"
            "vcvt.s32.f32 q9, q5 \n\t"
            "vcvt.s32.f32 q10, q6 \n\t"
            "vqmovn.s32 d22, q7 \n\t"
            "vqmovn.s32 d23, q8 \n\t"
            "vqmovn.s32 d24, q9 \n\t"
            "vqmovn.s32 d25, q10 \n\t"
            "vst1.16 {d22-d23}, [%[dst1]] \n\t"
            "vst1.16 {d24-d25}, [%[dst2]] \n\t"
            : //no output
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 8),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
#else
CVTS_FUNC(u8, s16, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        // Widen u8 -> u32.
        uint8x16_t vline = vld1q_u8(_src + i);
        uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline));
        uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));
        uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16));
        uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));
        uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16));
        uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));
        // Scale and shift in f32.
        float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
        float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
        float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);
        float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        // Truncate, then signed saturating narrow s32 -> s16.
        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
        int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
        vst1q_s16(_dst + i + 0, vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32)));
        vst1q_s16(_dst + i + 8, vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32)));
    }
})
#endif
475
476
/* u8 -> s32 convertScale, 16 pixels per iteration; results are stored as
 * four 4-element s32 vectors with no narrowing.
 * NOTE(review): unlike the kernels above, the asm path here is gated to
 * 32-bit GCC 4.x (< 4.7) only -- presumably newer compilers generate
 * good-enough code from the intrinsics; confirm against upstream history. */
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(u8, s32, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        /* Clobbers d4-d31 (q2-q15); four 128-bit stores per chunk. */
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]] \n\t"
            "vmovl.u8 q3, d4 \n\t"
            "vmovl.u8 q4, d5 \n\t"
            "vmovl.u16 q5, d6 \n\t"
            "vmovl.u16 q6, d7 \n\t"
            "vmovl.u16 q7, d8 \n\t"
            "vmovl.u16 q8, d9 \n\t"
            "vcvt.f32.u32 q9, q5 \n\t"
            "vcvt.f32.u32 q10, q6 \n\t"
            "vcvt.f32.u32 q11, q7 \n\t"
            "vcvt.f32.u32 q12, q8 \n\t"
            "vmul.f32 q13, q9, q0 \n\t"
            "vmul.f32 q14, q10, q0 \n\t"
            "vmul.f32 q15, q11, q0 \n\t"
            "vmul.f32 q2, q12, q0 \n\t"
            "vadd.f32 q3, q13, q1 \n\t"
            "vadd.f32 q4, q14, q1 \n\t"
            "vadd.f32 q5, q15, q1 \n\t"
            "vadd.f32 q6, q2, q1 \n\t"
            "vcvt.s32.f32 q7, q3 \n\t"
            "vcvt.s32.f32 q8, q4 \n\t"
            "vcvt.s32.f32 q9, q5 \n\t"
            "vcvt.s32.f32 q10, q6 \n\t"
            "vst1.32 {d14-d15}, [%[dst1]] \n\t"
            "vst1.32 {d16-d17}, [%[dst2]] \n\t"
            "vst1.32 {d18-d19}, [%[dst3]] \n\t"
            "vst1.32 {d20-d21}, [%[dst4]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              [dst3] "r" (_dst + i + 8),
              [dst4] "r" (_dst + i + 12),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10",
              "d11","d12","d13","d14","d15","d16","d17",
              "d18","d19","d20","d21","d22","d23","d24",
              "d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
#else
CVTS_FUNC(u8, s32, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        // Widen u8 -> u32.
        uint8x16_t vline = vld1q_u8(_src + i);
        uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline));
        uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));
        uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16));
        uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));
        uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16));
        uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));
        // Scale and shift in f32.
        float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
        float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
        float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);
        float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        // Truncate to s32 and store directly -- no narrowing needed.
        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
        int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
        vst1q_s32(_dst + i + 0, vline1_s32);
        vst1q_s32(_dst + i + 4, vline2_s32);
        vst1q_s32(_dst + i + 8, vline3_s32);
        vst1q_s32(_dst + i + 12, vline4_s32);
    }
})
#endif
564
565
/* u8 -> f32 convertScale, 16 pixels per iteration.
 * No +0.5f rounding bias here: the output stays in floating point, so
 * vshift is exactly beta.  Same GCC-4.x-only gate as the s32 kernel. */
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(u8, f32, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        /* Clobbers d4-d31 (q2-q15); four 128-bit f32 stores per chunk. */
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]] \n\t"
            "vmovl.u8 q3, d4 \n\t"
            "vmovl.u8 q4, d5 \n\t"
            "vmovl.u16 q5, d6 \n\t"
            "vmovl.u16 q6, d7 \n\t"
            "vmovl.u16 q7, d8 \n\t"
            "vmovl.u16 q8, d9 \n\t"
            "vcvt.f32.u32 q9, q5 \n\t"
            "vcvt.f32.u32 q10, q6 \n\t"
            "vcvt.f32.u32 q11, q7 \n\t"
            "vcvt.f32.u32 q12, q8 \n\t"
            "vmul.f32 q13, q9, q0 \n\t"
            "vmul.f32 q14, q10, q0 \n\t"
            "vmul.f32 q15, q11, q0 \n\t"
            "vmul.f32 q2, q12, q0 \n\t"
            "vadd.f32 q3, q13, q1 \n\t"
            "vadd.f32 q4, q14, q1 \n\t"
            "vadd.f32 q5, q15, q1 \n\t"
            "vadd.f32 q6, q2, q1 \n\t"
            "vst1.32 {d6-d7}, [%[dst1]] \n\t"
            "vst1.32 {d8-d9}, [%[dst2]] \n\t"
            "vst1.32 {d10-d11}, [%[dst3]] \n\t"
            "vst1.32 {d12-d13}, [%[dst4]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              [dst3] "r" (_dst + i + 8),
              [dst4] "r" (_dst + i + 12),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10",
              "d11","d12","d13","d14","d15","d16","d17",
              "d18","d19","d20","d21","d22","d23","d24",
              "d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
#else
CVTS_FUNC(u8, f32, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        // Widen u8 -> u32.
        uint8x16_t vline = vld1q_u8(_src + i);
        uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline));
        uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));
        uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16));
        uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));
        uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16));
        uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));
        // Convert to f32, then x * alpha + beta; store as-is.
        float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
        float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
        float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);
        float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        vst1q_f32(_dst + i + 0, vline1_f32);
        vst1q_f32(_dst + i + 4, vline2_f32);
        vst1q_f32(_dst + i + 8, vline3_f32);
        vst1q_f32(_dst + i + 12, vline4_f32);
    }
})
#endif
645
646
/* s8 -> u8 convertScale, 16 pixels per iteration.
 * Signed widening (vmovl.s8/.s16, vcvt.f32.s32) on the way in; the final
 * narrowing is unsigned-saturating (vqmovun.s32 / vqmovn.u16), so negative
 * results clamp to 0.  +0.5f compensates for the truncating vcvt;
 * NOTE(review): truncation of negative intermediate values rounds toward
 * zero rather than to nearest -- matches the original behavior. */
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
CVTS_FUNC(s8, u8, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        /* Clobbers d4-d31 (q2-q15). */
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]] \n\t"
            "vmovl.s8 q3, d4 \n\t"
            "vmovl.s8 q4, d5 \n\t"
            "vmovl.s16 q5, d6 \n\t"
            "vmovl.s16 q6, d7 \n\t"
            "vmovl.s16 q7, d8 \n\t"
            "vmovl.s16 q8, d9 \n\t"
            "vcvt.f32.s32 q9, q5 \n\t"
            "vcvt.f32.s32 q10, q6 \n\t"
            "vcvt.f32.s32 q11, q7 \n\t"
            "vcvt.f32.s32 q12, q8 \n\t"
            "vmul.f32 q13, q9, q0 \n\t"
            "vmul.f32 q14, q10, q0 \n\t"
            "vmul.f32 q15, q11, q0 \n\t"
            "vmul.f32 q2, q12, q0 \n\t"
            "vadd.f32 q3, q13, q1 \n\t"
            "vadd.f32 q4, q14, q1 \n\t"
            "vadd.f32 q5, q15, q1 \n\t"
            "vadd.f32 q6, q2, q1 \n\t"
            "vcvt.s32.f32 q7, q3 \n\t"
            "vcvt.s32.f32 q8, q4 \n\t"
            "vcvt.s32.f32 q9, q5 \n\t"
            "vcvt.s32.f32 q10, q6 \n\t"
            "vqmovun.s32 d22, q7 \n\t"
            "vqmovun.s32 d23, q8 \n\t"
            "vqmovun.s32 d24, q9 \n\t"
            "vqmovun.s32 d25, q10 \n\t"
            "vqmovn.u16 d26, q11 \n\t"
            "vqmovn.u16 d27, q12 \n\t"
            "vst1.8 {d26-d27}, [%[dst1]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
#else
CVTS_FUNC(s8, u8, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        // Signed widen s8 -> s32.
        int8x16_t vline = vld1q_s8(_src + i);
        int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline));
        int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
        int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16));
        int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
        // Scale and shift in f32.
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
        float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        // Truncate back into the s32 vectors (reused), then unsigned
        // saturating narrow s32 -> u16 -> u8.
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        vline3_s32 = vcvtq_s32_f32(vline3_f32);
        vline4_s32 = vcvtq_s32_f32(vline4_f32);
        uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
        uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
        vst1q_u8(_dst + i, vcombine_u8(vqmovn_u16(vRes1_u16), vqmovn_u16(vRes2_u16)));
    }
})
#endif
730
731
/* s8 -> s8 convertScale, 16 pixels per iteration.
 * Fully signed pipeline: widen s8 -> s32, scale/shift in f32, truncate,
 * then signed saturating narrow s32 -> s16 -> s8 (clamp to [-128, 127]). */
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
CVTS_FUNC1(s8, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        /* Clobbers d4-d31 (q2-q15). */
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]] \n\t"
            "vmovl.s8 q3, d4 \n\t"
            "vmovl.s8 q4, d5 \n\t"
            "vmovl.s16 q5, d6 \n\t"
            "vmovl.s16 q6, d7 \n\t"
            "vmovl.s16 q7, d8 \n\t"
            "vmovl.s16 q8, d9 \n\t"
            "vcvt.f32.s32 q9, q5 \n\t"
            "vcvt.f32.s32 q10, q6 \n\t"
            "vcvt.f32.s32 q11, q7 \n\t"
            "vcvt.f32.s32 q12, q8 \n\t"
            "vmul.f32 q13, q9, q0 \n\t"
            "vmul.f32 q14, q10, q0 \n\t"
            "vmul.f32 q15, q11, q0 \n\t"
            "vmul.f32 q2, q12, q0 \n\t"
            "vadd.f32 q3, q13, q1 \n\t"
            "vadd.f32 q4, q14, q1 \n\t"
            "vadd.f32 q5, q15, q1 \n\t"
            "vadd.f32 q6, q2, q1 \n\t"
            "vcvt.s32.f32 q7, q3 \n\t"
            "vcvt.s32.f32 q8, q4 \n\t"
            "vcvt.s32.f32 q9, q5 \n\t"
            "vcvt.s32.f32 q10, q6 \n\t"
            "vqmovn.s32 d22, q7 \n\t"
            "vqmovn.s32 d23, q8 \n\t"
            "vqmovn.s32 d24, q9 \n\t"
            "vqmovn.s32 d25, q10 \n\t"
            "vqmovn.s16 d26, q11 \n\t"
            "vqmovn.s16 d27, q12 \n\t"
            "vst1.8 {d26-d27}, [%[dst1]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
#else
CVTS_FUNC1(s8, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        // Signed widen s8 -> s32.
        int8x16_t vline = vld1q_s8(_src + i);
        int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline));
        int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
        int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16));
        int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
        // Scale and shift in f32.
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
        float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        // Truncate back into the reused s32 vectors, then signed
        // saturating narrow s32 -> s16 -> s8.
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        vline3_s32 = vcvtq_s32_f32(vline3_f32);
        vline4_s32 = vcvtq_s32_f32(vline4_f32);
        int16x8_t vRes1_s16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
        int16x8_t vRes2_s16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
        vst1q_s8(_dst + i, vcombine_s8(vqmovn_s16(vRes1_s16), vqmovn_s16(vRes2_s16)));
    }
})
#endif
815
816
/* s8 -> u16 convertScale, 16 pixels per iteration.
 * Signed widen in, unsigned-saturating narrow out (vqmovun.s32): negative
 * results clamp to 0, and narrowing stops at 16 bits so two 8-element
 * stores are issued per chunk. */
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
CVTS_FUNC(s8, u16, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        /* Clobbers d4-d31 (q2-q15); results go out through dst1/dst2. */
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]] \n\t"
            "vmovl.s8 q3, d4 \n\t"
            "vmovl.s8 q4, d5 \n\t"
            "vmovl.s16 q5, d6 \n\t"
            "vmovl.s16 q6, d7 \n\t"
            "vmovl.s16 q7, d8 \n\t"
            "vmovl.s16 q8, d9 \n\t"
            "vcvt.f32.s32 q9, q5 \n\t"
            "vcvt.f32.s32 q10, q6 \n\t"
            "vcvt.f32.s32 q11, q7 \n\t"
            "vcvt.f32.s32 q12, q8 \n\t"
            "vmul.f32 q13, q9, q0 \n\t"
            "vmul.f32 q14, q10, q0 \n\t"
            "vmul.f32 q15, q11, q0 \n\t"
            "vmul.f32 q2, q12, q0 \n\t"
            "vadd.f32 q3, q13, q1 \n\t"
            "vadd.f32 q4, q14, q1 \n\t"
            "vadd.f32 q5, q15, q1 \n\t"
            "vadd.f32 q6, q2, q1 \n\t"
            "vcvt.s32.f32 q7, q3 \n\t"
            "vcvt.s32.f32 q8, q4 \n\t"
            "vcvt.s32.f32 q9, q5 \n\t"
            "vcvt.s32.f32 q10, q6 \n\t"
            "vqmovun.s32 d22, q7 \n\t"
            "vqmovun.s32 d23, q8 \n\t"
            "vqmovun.s32 d24, q9 \n\t"
            "vqmovun.s32 d25, q10 \n\t"
            "vst1.16 {d22-d23}, [%[dst1]] \n\t"
            "vst1.16 {d24-d25}, [%[dst2]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 8),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
#else
CVTS_FUNC(s8, u16, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        // Signed widen s8 -> s32.
        int8x16_t vline = vld1q_s8(_src + i);
        int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline));
        int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
        int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16));
        int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
        // Scale and shift in f32.
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
        float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        // Truncate back into the reused s32 vectors, then unsigned
        // saturating narrow s32 -> u16.
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        vline3_s32 = vcvtq_s32_f32(vline3_f32);
        vline4_s32 = vcvtq_s32_f32(vline4_f32);
        uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
        uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
        vst1q_u16(_dst + i + 0, vRes1_u16);
        vst1q_u16(_dst + i + 8, vRes2_u16);
    }
})
#endif
901
902
// convertScale s8 -> s16: dst = saturate_cast<s16>(src * alpha + beta).
// +0.5f folded into the shift for round-to-nearest on the f32->s32 truncation.
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
CVTS_FUNC(s8, s16, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        // Widen 16 x s8 to four s32 quads, scale/shift in f32, then
        // vqmovn saturates each s32 quad back down to s16.
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]] \n\t"
            "vmovl.s8 q3, d4 \n\t"
            "vmovl.s8 q4, d5 \n\t"
            "vmovl.s16 q5, d6 \n\t"
            "vmovl.s16 q6, d7 \n\t"
            "vmovl.s16 q7, d8 \n\t"
            "vmovl.s16 q8, d9 \n\t"
            "vcvt.f32.s32 q9, q5 \n\t"
            "vcvt.f32.s32 q10, q6 \n\t"
            "vcvt.f32.s32 q11, q7 \n\t"
            "vcvt.f32.s32 q12, q8 \n\t"
            "vmul.f32 q13, q9, q0 \n\t"
            "vmul.f32 q14, q10, q0 \n\t"
            "vmul.f32 q15, q11, q0 \n\t"
            "vmul.f32 q2, q12, q0 \n\t"
            "vadd.f32 q3, q13, q1 \n\t"
            "vadd.f32 q4, q14, q1 \n\t"
            "vadd.f32 q5, q15, q1 \n\t"
            "vadd.f32 q6, q2, q1 \n\t"
            "vcvt.s32.f32 q7, q3 \n\t"
            "vcvt.s32.f32 q8, q4 \n\t"
            "vcvt.s32.f32 q9, q5 \n\t"
            "vcvt.s32.f32 q10, q6 \n\t"
            "vqmovn.s32 d22, q7 \n\t"
            "vqmovn.s32 d23, q8 \n\t"
            "vqmovn.s32 d24, q9 \n\t"
            "vqmovn.s32 d25, q10 \n\t"
            "vst1.16 {d22-d23}, [%[dst1]] \n\t"
            "vst1.16 {d24-d25}, [%[dst2]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 8),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
#else
CVTS_FUNC(s8, s16, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        // Intrinsics mirror of the asm path: widen, f32 scale+shift,
        // saturating narrow back to s16.
        int8x16_t vline = vld1q_s8(_src + i);
        int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline));
        int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
        int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16));
        int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
        float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        vline3_s32 = vcvtq_s32_f32(vline3_f32);
        vline4_s32 = vcvtq_s32_f32(vline4_f32);
        int16x8_t vRes1_s16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
        int16x8_t vRes2_s16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
        vst1q_s16(_dst + i + 0, vRes1_s16);
        vst1q_s16(_dst + i + 8, vRes2_s16);
    }
})
#endif
987
988
// convertScale s8 -> s32: dst = (s32)(src * alpha + beta), rounded to nearest
// via the +0.5f folded into the shift.  No narrowing stage is needed since
// every scaled s8 value fits in s32; the s32 results are stored directly.
// The asm path is only for ancient 32-bit ARM GCC (< 4.7).
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(s8, s32, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]] \n\t"
            "vmovl.s8 q3, d4 \n\t"
            "vmovl.s8 q4, d5 \n\t"
            "vmovl.s16 q5, d6 \n\t"
            "vmovl.s16 q6, d7 \n\t"
            "vmovl.s16 q7, d8 \n\t"
            "vmovl.s16 q8, d9 \n\t"
            "vcvt.f32.s32 q9, q5 \n\t"
            "vcvt.f32.s32 q10, q6 \n\t"
            "vcvt.f32.s32 q11, q7 \n\t"
            "vcvt.f32.s32 q12, q8 \n\t"
            "vmul.f32 q13, q9, q0 \n\t"
            "vmul.f32 q14, q10, q0 \n\t"
            "vmul.f32 q15, q11, q0 \n\t"
            "vmul.f32 q2, q12, q0 \n\t"
            "vadd.f32 q3, q13, q1 \n\t"
            "vadd.f32 q4, q14, q1 \n\t"
            "vadd.f32 q5, q15, q1 \n\t"
            "vadd.f32 q6, q2, q1 \n\t"
            "vcvt.s32.f32 q7, q3 \n\t"
            "vcvt.s32.f32 q8, q4 \n\t"
            "vcvt.s32.f32 q9, q5 \n\t"
            "vcvt.s32.f32 q10, q6 \n\t"
            "vst1.32 {d14-d15}, [%[dst1]] \n\t"
            "vst1.32 {d16-d17}, [%[dst2]] \n\t"
            "vst1.32 {d18-d19}, [%[dst3]] \n\t"
            "vst1.32 {d20-d21}, [%[dst4]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              [dst3] "r" (_dst + i + 8),
              [dst4] "r" (_dst + i + 12),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10",
              "d11","d12","d13","d14","d15","d16","d17",
              "d18","d19","d20","d21","d22","d23","d24",
              "d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
#else
CVTS_FUNC(s8, s32, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        // Widen 16 x s8 to four s32 quads, scale/shift in f32, convert back
        // to s32 and store all four quads.
        int8x16_t vline = vld1q_s8(_src + i);
        int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline));
        int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
        int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16));
        int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
        float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        vline3_s32 = vcvtq_s32_f32(vline3_f32);
        vline4_s32 = vcvtq_s32_f32(vline4_f32);
        vst1q_s32(_dst + i + 0, vline1_s32);
        vst1q_s32(_dst + i + 4, vline2_s32);
        vst1q_s32(_dst + i + 8, vline3_s32);
        vst1q_s32(_dst + i + 12, vline4_s32);
    }
})
#endif
1076
1077
// convertScale s8 -> f32: dst = src * alpha + beta.
// Float destination, so no +0.5f rounding bias and no narrowing stage.
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(s8, f32, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]] \n\t"
            "vmovl.s8 q3, d4 \n\t"
            "vmovl.s8 q4, d5 \n\t"
            "vmovl.s16 q5, d6 \n\t"
            "vmovl.s16 q6, d7 \n\t"
            "vmovl.s16 q7, d8 \n\t"
            "vmovl.s16 q8, d9 \n\t"
            "vcvt.f32.s32 q9, q5 \n\t"
            "vcvt.f32.s32 q10, q6 \n\t"
            "vcvt.f32.s32 q11, q7 \n\t"
            "vcvt.f32.s32 q12, q8 \n\t"
            "vmul.f32 q13, q9, q0 \n\t"
            "vmul.f32 q14, q10, q0 \n\t"
            "vmul.f32 q15, q11, q0 \n\t"
            "vmul.f32 q2, q12, q0 \n\t"
            "vadd.f32 q3, q13, q1 \n\t"
            "vadd.f32 q4, q14, q1 \n\t"
            "vadd.f32 q5, q15, q1 \n\t"
            "vadd.f32 q6, q2, q1 \n\t"
            "vst1.32 {d6-d7}, [%[dst1]] \n\t"
            "vst1.32 {d8-d9}, [%[dst2]] \n\t"
            "vst1.32 {d10-d11}, [%[dst3]] \n\t"
            "vst1.32 {d12-d13}, [%[dst4]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              [dst3] "r" (_dst + i + 8),
              [dst4] "r" (_dst + i + 12),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10",
              "d11","d12","d13","d14","d15","d16","d17",
              "d18","d19","d20","d21","d22","d23","d24",
              "d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
#else
CVTS_FUNC(s8, f32, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        // Widen 16 x s8 to four f32 quads, apply scale and shift, store.
        int8x16_t vline = vld1q_s8(_src + i);
        int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline));
        int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
        int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16));
        int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
        float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        vst1q_f32(_dst + i + 0, vline1_f32);
        vst1q_f32(_dst + i + 4, vline2_f32);
        vst1q_f32(_dst + i + 8, vline3_f32);
        vst1q_f32(_dst + i + 12, vline4_f32);
    }
})
#endif
1157
1158
// convertScale u16 -> u8: dst = saturate_cast<u8>(src * alpha + beta),
// processing 8 u16 values per iteration.  The intermediate vqmovn.s32
// saturates to s16, then vqmovun.s16 clamps negatives to 0 and caps at 255.
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(u16, u8, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // NOTE(review): vld1.8 reads the same 16 bytes as vld1.16 here (no
        // alignment hint is given), so the element-size suffix is harmless.
        __asm__ (
            "vld1.8 {d4-d5}, [%[src1]] \n\t"
            "vmovl.u16 q3, d4 \n\t"
            "vmovl.u16 q4, d5 \n\t"
            "vcvt.f32.u32 q5, q3 \n\t"
            "vcvt.f32.u32 q6, q4 \n\t"
            "vmul.f32 q7, q5, q0 \n\t"
            "vmul.f32 q8, q6, q0 \n\t"
            "vadd.f32 q9, q7, q1 \n\t"
            "vadd.f32 q10, q8, q1 \n\t"
            "vcvt.s32.f32 q11, q9 \n\t"
            "vcvt.s32.f32 q12, q10 \n\t"
            "vqmovn.s32 d26, q11 \n\t"
            "vqmovn.s32 d27, q12 \n\t"
            "vqmovun.s16 d28, q13 \n\t"
            "vst1.8 {d28}, [%[dst]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i),
              [dst] "r" (_dst + i + 0),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28"
        );
    }
})
#else
CVTS_FUNC(u16, u8, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // u16 -> u32 -> f32, scale+shift, f32 -> s32, then saturate down
        // through s16 to u8.
        uint16x8_t vline = vld1q_u16(_src + i);
        uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline));
        uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
        float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int16x4_t vRes1 = vqmovn_s32(vline1_s32);
        int16x4_t vRes2 = vqmovn_s32(vline2_s32);
        uint8x8_t vRes = vqmovun_s16(vcombine_s16(vRes1, vRes2));
        vst1_u8(_dst + i, vRes);
    }
})
#endif
1216
1217
// convertScale u16 -> s8: dst = saturate_cast<s8>(src * alpha + beta),
// 8 elements per iteration; vqmovn.s32 then vqmovn.s16 saturate to [-128,127].
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(u16, s8, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src1]] \n\t"
            "vmovl.u16 q3, d4 \n\t"
            "vmovl.u16 q4, d5 \n\t"
            "vcvt.f32.u32 q5, q3 \n\t"
            "vcvt.f32.u32 q6, q4 \n\t"
            "vmul.f32 q7, q5, q0 \n\t"
            "vmul.f32 q8, q6, q0 \n\t"
            "vadd.f32 q9, q7, q1 \n\t"
            "vadd.f32 q10, q8, q1 \n\t"
            "vcvt.s32.f32 q11, q9 \n\t"
            "vcvt.s32.f32 q12, q10 \n\t"
            "vqmovn.s32 d26, q11 \n\t"
            "vqmovn.s32 d27, q12 \n\t"
            "vqmovn.s16 d28, q13 \n\t"
            "vst1.8 {d28}, [%[dst]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i),
              [dst] "r" (_dst + i + 0),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28"
        );
    }
})
#else
CVTS_FUNC(u16, s8, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // u16 -> u32 -> f32, scale+shift, back to s32, then saturating
        // narrow through s16 down to s8.
        uint16x8_t vline = vld1q_u16(_src + i);
        uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline));
        uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
        float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int16x4_t vRes1 = vqmovn_s32(vline1_s32);
        int16x4_t vRes2 = vqmovn_s32(vline2_s32);
        int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
        vst1_s8(_dst + i, vRes);
    }
})
#endif
1275
1276
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
1277
CVTS_FUNC1(u16, 16,
1278
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
1279
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
1280
{
1281
for (size_t i = 0; i < w; i += 8)
1282
{
1283
internal::prefetch(_src + i);
1284
__asm__ (
1285
"vld1.16 {d4-d5}, [%[src]] \n\t"
1286
"vmovl.u16 q3, d4 \n\t"
1287
"vmovl.u16 q4, d5 \n\t"
1288
"vcvt.f32.u32 q5, q3 \n\t"
1289
"vcvt.f32.u32 q6, q4 \n\t"
1290
"vmul.f32 q7, q5, q0 \n\t"
1291
"vmul.f32 q8, q6, q0 \n\t"
1292
"vadd.f32 q9, q7, q1 \n\t"
1293
"vadd.f32 q10, q8, q1 \n\t"
1294
"vcvt.s32.f32 q11, q9 \n\t"
1295
"vcvt.s32.f32 q12, q10 \n\t"
1296
"vqmovun.s32 d26, q11 \n\t"
1297
"vqmovun.s32 d27, q12 \n\t"
1298
"vst1.16 {d26-d27}, [%[dst]] \n\t"
1299
: /*no output*/
1300
: [src] "r" (_src + i),
1301
[dst] "r" (_dst + i + 0),
1302
"w" (vshift), "w" (vscale)
1303
: "d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27"
1304
);
1305
}
1306
})
1307
#else
1308
CVTS_FUNC1(u16, 16,
1309
float32x4_t vscale = vdupq_n_f32((f32)alpha);
1310
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
1311
{
1312
for (size_t i = 0; i < w; i += 8)
1313
{
1314
internal::prefetch(_src + i);
1315
uint16x8_t vline = vld1q_u16(_src + i);
1316
uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline));
1317
uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline));
1318
float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
1319
float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
1320
vline1_f32 = vmulq_f32(vline1_f32, vscale);
1321
vline2_f32 = vmulq_f32(vline2_f32, vscale);
1322
vline1_f32 = vaddq_f32(vline1_f32, vshift);
1323
vline2_f32 = vaddq_f32(vline2_f32, vshift);
1324
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
1325
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
1326
uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
1327
uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
1328
vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
1329
}
1330
})
1331
#endif
1332
1333
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
1334
CVTS_FUNC(u16, s16, 8,
1335
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
1336
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
1337
{
1338
for (size_t i = 0; i < w; i += 8)
1339
{
1340
internal::prefetch(_src + i);
1341
__asm__ (
1342
"vld1.16 {d4-d5}, [%[src]] \n\t"
1343
"vmovl.u16 q3, d4 \n\t"
1344
"vmovl.u16 q4, d5 \n\t"
1345
"vcvt.f32.u32 q5, q3 \n\t"
1346
"vcvt.f32.u32 q6, q4 \n\t"
1347
"vmul.f32 q7, q5, q0 \n\t"
1348
"vmul.f32 q8, q6, q0 \n\t"
1349
"vadd.f32 q9, q7, q1 \n\t"
1350
"vadd.f32 q10, q8, q1 \n\t"
1351
"vcvt.s32.f32 q11, q9 \n\t"
1352
"vcvt.s32.f32 q12, q10 \n\t"
1353
"vqmovn.s32 d26, q11 \n\t"
1354
"vqmovn.s32 d27, q12 \n\t"
1355
"vst1.16 {d26-d27}, [%[dst]] \n\t"
1356
: /*no output*/
1357
: [src] "r" (_src + i),
1358
[dst] "r" (_dst + i + 0),
1359
"w" (vshift), "w" (vscale)
1360
: "d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27"
1361
);
1362
}
1363
})
1364
#else
1365
CVTS_FUNC(u16, s16, 8,
1366
float32x4_t vscale = vdupq_n_f32((f32)alpha);
1367
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
1368
{
1369
for (size_t i = 0; i < w; i += 8)
1370
{
1371
internal::prefetch(_src + i);
1372
uint16x8_t vline = vld1q_u16(_src + i);
1373
uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline));
1374
uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline));
1375
float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
1376
float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
1377
vline1_f32 = vmulq_f32(vline1_f32, vscale);
1378
vline2_f32 = vmulq_f32(vline2_f32, vscale);
1379
vline1_f32 = vaddq_f32(vline1_f32, vshift);
1380
vline2_f32 = vaddq_f32(vline2_f32, vshift);
1381
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
1382
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
1383
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
1384
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
1385
vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
1386
}
1387
})
1388
#endif
1389
1390
// convertScale u16 -> s32: dst = (s32)(src * alpha + beta), rounded to
// nearest via the +0.5f folded into the shift.  No narrowing stage needed.
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(u16, s32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.16 {d4-d5}, [%[src]] \n\t"
            "vmovl.u16 q3, d4 \n\t"
            "vmovl.u16 q4, d5 \n\t"
            "vcvt.f32.u32 q5, q3 \n\t"
            "vcvt.f32.u32 q6, q4 \n\t"
            "vmul.f32 q7, q5, q0 \n\t"
            "vmul.f32 q8, q6, q0 \n\t"
            "vadd.f32 q9, q7, q1 \n\t"
            "vadd.f32 q10, q8, q1 \n\t"
            "vcvt.s32.f32 q11, q9 \n\t"
            "vcvt.s32.f32 q12, q10 \n\t"
            "vst1.32 {d22-d23}, [%[dst1]] \n\t"
            "vst1.32 {d24-d25}, [%[dst2]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i),
              [dst2] "r" (_dst + i + 4),
              "w" (vshift), "w" (vscale)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25"
        );
    }
})
#else
CVTS_FUNC(u16, s32, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // u16 -> u32 -> f32, scale+shift, convert to s32 and store both quads.
        uint16x8_t vline = vld1q_u16(_src + i);
        uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline));
        uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
        float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        vst1q_s32(_dst + i + 0, vline1_s32);
        vst1q_s32(_dst + i + 4, vline2_s32);
    }
})
#endif
1445
1446
// convertScale u16 -> f32: dst = src * alpha + beta.
// Float destination, so no +0.5f rounding bias and no conversion back.
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(u16, f32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.16 {d4-d5}, [%[src]] \n\t"
            "vmovl.u16 q3, d4 \n\t"
            "vmovl.u16 q4, d5 \n\t"
            "vcvt.f32.u32 q5, q3 \n\t"
            "vcvt.f32.u32 q6, q4 \n\t"
            "vmul.f32 q7, q5, q0 \n\t"
            "vmul.f32 q8, q6, q0 \n\t"
            "vadd.f32 q9, q7, q1 \n\t"
            "vadd.f32 q10, q8, q1 \n\t"
            "vst1.32 {d18-d19}, [%[dst1]] \n\t"
            "vst1.32 {d20-d21}, [%[dst2]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21"
        );
    }
})
#else
CVTS_FUNC(u16, f32, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // u16 -> u32 -> f32, apply scale and shift, store both quads.
        uint16x8_t vline = vld1q_u16(_src + i);
        uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline));
        uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
        float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vst1q_f32(_dst + i + 0, vline1_f32);
        vst1q_f32(_dst + i + 4, vline2_f32);
    }
})
#endif
1497
1498
// convertScale s16 -> u8: dst = saturate_cast<u8>(src * alpha + beta),
// 8 elements per iteration; vqmovn.s32 then vqmovun.s16 clamp to [0, 255].
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(s16, u8, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src1]] \n\t"
            "vmovl.s16 q3, d4 \n\t"
            "vmovl.s16 q4, d5 \n\t"
            "vcvt.f32.s32 q5, q3 \n\t"
            "vcvt.f32.s32 q6, q4 \n\t"
            "vmul.f32 q7, q5, q0 \n\t"
            "vmul.f32 q8, q6, q0 \n\t"
            "vadd.f32 q9, q7, q1 \n\t"
            "vadd.f32 q10, q8, q1 \n\t"
            "vcvt.s32.f32 q11, q9 \n\t"
            "vcvt.s32.f32 q12, q10 \n\t"
            "vqmovn.s32 d26, q11 \n\t"
            "vqmovn.s32 d27, q12 \n\t"
            "vqmovun.s16 d28, q13 \n\t"
            "vst1.8 {d28}, [%[dst]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i),
              [dst] "r" (_dst + i + 0),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28"
        );
    }
})
#else
CVTS_FUNC(s16, u8, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // s16 -> s32 -> f32, scale+shift, back to s32, then saturate down
        // through s16 to u8 (negatives clamp to 0).
        int16x8_t vline = vld1q_s16(_src + i);
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int16x4_t vRes1 = vqmovn_s32(vline1_s32);
        int16x4_t vRes2 = vqmovn_s32(vline2_s32);
        uint8x8_t vRes = vqmovun_s16(vcombine_s16(vRes1, vRes2));
        vst1_u8(_dst + i, vRes);
    }
})
#endif
1556
1557
// convertScale s16 -> s8: dst = saturate_cast<s8>(src * alpha + beta),
// 8 elements per iteration; vqmovn.s32 then vqmovn.s16 saturate to [-128,127].
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(s16, s8, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src1]] \n\t"
            "vmovl.s16 q3, d4 \n\t"
            "vmovl.s16 q4, d5 \n\t"
            "vcvt.f32.s32 q5, q3 \n\t"
            "vcvt.f32.s32 q6, q4 \n\t"
            "vmul.f32 q7, q5, q0 \n\t"
            "vmul.f32 q8, q6, q0 \n\t"
            "vadd.f32 q9, q7, q1 \n\t"
            "vadd.f32 q10, q8, q1 \n\t"
            "vcvt.s32.f32 q11, q9 \n\t"
            "vcvt.s32.f32 q12, q10 \n\t"
            "vqmovn.s32 d26, q11 \n\t"
            "vqmovn.s32 d27, q12 \n\t"
            "vqmovn.s16 d28, q13 \n\t"
            "vst1.8 {d28}, [%[dst]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i),
              [dst] "r" (_dst + i + 0),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28"
        );
    }
})
#else
CVTS_FUNC(s16, s8, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // s16 -> s32 -> f32, scale+shift, back to s32, then saturating
        // narrow through s16 down to s8.
        int16x8_t vline = vld1q_s16(_src + i);
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int16x4_t vRes1 = vqmovn_s32(vline1_s32);
        int16x4_t vRes2 = vqmovn_s32(vline2_s32);
        int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
        vst1_s8(_dst + i, vRes);
    }
})
#endif
1615
1616
// convertScale s16 -> u16: dst = saturate_cast<u16>(src * alpha + beta),
// 8 elements per iteration; vqmovun.s32 clamps negatives to 0, caps at 65535.
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(s16, u16, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.16 {d4-d5}, [%[src]] \n\t"
            "vmovl.s16 q3, d4 \n\t"
            "vmovl.s16 q4, d5 \n\t"
            "vcvt.f32.s32 q5, q3 \n\t"
            "vcvt.f32.s32 q6, q4 \n\t"
            "vmul.f32 q7, q5, q0 \n\t"
            "vmul.f32 q8, q6, q0 \n\t"
            "vadd.f32 q9, q7, q1 \n\t"
            "vadd.f32 q10, q8, q1 \n\t"
            "vcvt.s32.f32 q11, q9 \n\t"
            "vcvt.s32.f32 q12, q10 \n\t"
            "vqmovun.s32 d26, q11 \n\t"
            "vqmovun.s32 d27, q12 \n\t"
            "vst1.16 {d26-d27}, [%[dst]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst] "r" (_dst + i + 0),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27"
        );
    }
})
#else
CVTS_FUNC(s16, u16, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // s16 -> s32 -> f32, scale+shift, back to s32, saturate to u16.
        int16x8_t vline = vld1q_s16(_src + i);
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
        uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
        vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
    }
})
#endif
1672
1673
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
1674
CVTS_FUNC1(s16, 16,
1675
register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
1676
register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
1677
{
1678
for (size_t i = 0; i < w; i += 8)
1679
{
1680
internal::prefetch(_src + i);
1681
__asm__ (
1682
"vld1.16 {d4-d5}, [%[src]] \n\t"
1683
"vmovl.s16 q3, d4 \n\t"
1684
"vmovl.s16 q4, d5 \n\t"
1685
"vcvt.f32.s32 q5, q3 \n\t"
1686
"vcvt.f32.s32 q6, q4 \n\t"
1687
"vmul.f32 q7, q5, q0 \n\t"
1688
"vmul.f32 q8, q6, q0 \n\t"
1689
"vadd.f32 q9, q7, q1 \n\t"
1690
"vadd.f32 q10, q8, q1 \n\t"
1691
"vcvt.s32.f32 q11, q9 \n\t"
1692
"vcvt.s32.f32 q12, q10 \n\t"
1693
"vqmovn.s32 d26, q11 \n\t"
1694
"vqmovn.s32 d27, q12 \n\t"
1695
"vst1.16 {d26-d27}, [%[dst]] \n\t"
1696
: /*no output*/
1697
: [src] "r" (_src + i),
1698
[dst] "r" (_dst + i + 0),
1699
"w" (vshift), "w" (vscale)
1700
: "d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27"
1701
);
1702
}
1703
})
1704
#else
1705
CVTS_FUNC1(s16, 16,
1706
float32x4_t vscale = vdupq_n_f32((f32)alpha);
1707
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
1708
{
1709
for (size_t i = 0; i < w; i += 8)
1710
{
1711
internal::prefetch(_src + i);
1712
int16x8_t vline = vld1q_s16(_src + i);
1713
int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline));
1714
int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline));
1715
float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
1716
float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
1717
vline1_f32 = vmulq_f32(vline1_f32, vscale);
1718
vline2_f32 = vmulq_f32(vline2_f32, vscale);
1719
vline1_f32 = vaddq_f32(vline1_f32, vshift);
1720
vline2_f32 = vaddq_f32(vline2_f32, vshift);
1721
vline1_s32 = vcvtq_s32_f32(vline1_f32);
1722
vline2_s32 = vcvtq_s32_f32(vline2_f32);
1723
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
1724
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
1725
vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
1726
}
1727
})
1728
#endif
1729
1730
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// convert-scale s16 -> s32: dst = src * alpha + beta, with a +0.5f bias
// folded into vshift so the truncating vcvt rounds toward nearest.
// Inline-asm variant for 32-bit GCC 4.x (< 4.7) only; other builds use
// the intrinsics path in the #else branch.
CVTS_FUNC(s16, s32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.16 {d4-d5}, [%[src]] \n\t"
            "vmovl.s16 q3, d4 \n\t"
            "vmovl.s16 q4, d5 \n\t"
            "vcvt.f32.s32 q5, q3 \n\t"
            "vcvt.f32.s32 q6, q4 \n\t"
            "vmul.f32 q7, q5, q0 \n\t"
            "vmul.f32 q8, q6, q0 \n\t"
            "vadd.f32 q9, q7, q1 \n\t"
            "vadd.f32 q10, q8, q1 \n\t"
            "vcvt.s32.f32 q11, q9 \n\t"
            "vcvt.s32.f32 q12, q10 \n\t"
            "vst1.32 {d22-d23}, [%[dst1]] \n\t"
            "vst1.32 {d24-d25}, [%[dst2]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25"
        );
    }
})
#else
// Intrinsics path: widen s16 halves to s32, scale/shift in f32, then
// store the two converted s32 quads directly (no narrowing needed).
CVTS_FUNC(s16, s32, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int16x8_t vline = vld1q_s16(_src + i);
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        vst1q_s32(_dst + i + 0, vline1_s32);
        vst1q_s32(_dst + i + 4, vline2_s32);
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// convert-scale s16 -> f32: dst = src * alpha + beta.  No +0.5f bias here,
// since the result stays in floating point (no truncating conversion).
// Inline-asm variant for 32-bit GCC 4.x (< 4.7) only.
CVTS_FUNC(s16, f32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.16 {d4-d5}, [%[src]] \n\t"
            "vmovl.s16 q3, d4 \n\t"
            "vmovl.s16 q4, d5 \n\t"
            "vcvt.f32.s32 q5, q3 \n\t"
            "vcvt.f32.s32 q6, q4 \n\t"
            "vmul.f32 q7, q5, q0 \n\t"
            "vmul.f32 q8, q6, q0 \n\t"
            "vadd.f32 q9, q7, q1 \n\t"
            "vadd.f32 q10, q8, q1 \n\t"
            "vst1.32 {d18-d19}, [%[dst1]] \n\t"
            "vst1.32 {d20-d21}, [%[dst2]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21"
        );
    }
})
#else
// Intrinsics path: widen, convert to f32, scale/shift, store both quads.
CVTS_FUNC(s16, f32, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int16x8_t vline = vld1q_s16(_src + i);
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vst1q_f32(_dst + i + 0, vline1_f32);
        vst1q_f32(_dst + i + 4, vline2_f32);
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// convert-scale s32 -> u8: dst = saturate_cast<u8>(src * alpha + beta + 0.5).
// vqmovun clamps negative results to 0, vqmovn.u16 then narrows to u8 with
// saturation at 255.  Inline-asm variant for 32-bit GCC 4.x (< 4.7) only.
CVTS_FUNC(s32, u8, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]] \n\t"
            "vld1.32 {d6-d7}, [%[src2]] \n\t"
            "vcvt.f32.s32 q4, q2 \n\t"
            "vcvt.f32.s32 q5, q3 \n\t"
            "vmul.f32 q6, q4, q0 \n\t"
            "vmul.f32 q7, q5, q0 \n\t"
            "vadd.f32 q8, q6, q1 \n\t"
            "vadd.f32 q9, q7, q1 \n\t"
            "vcvt.s32.f32 q10, q8 \n\t"
            "vcvt.s32.f32 q11, q9 \n\t"
            "vqmovun.s32 d24, q10 \n\t"
            "vqmovun.s32 d25, q11 \n\t"
            "vqmovn.u16 d26, q12 \n\t"
            "vst1.8 {d26}, [%[dst]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26"
        );
    }
})
#else
// Intrinsics path: two s32 quads -> f32, scale/shift, back to s32, then
// saturate-narrow s32 -> u16 -> u8 and store 8 bytes.
CVTS_FUNC(s32, u8, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int32x4_t vline1_s32 = vld1q_s32(_src + i + 0);
        int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
        uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
        uint8x8_t vRes = vqmovn_u16(vcombine_u16(vRes1, vRes2));
        vst1_u8(_dst + i, vRes);
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// convert-scale s32 -> s8: dst = saturate_cast<s8>(src * alpha + beta + 0.5).
// Same structure as the u8 variant above but with signed saturating
// narrowing (vqmovn).  Inline-asm variant for 32-bit GCC 4.x (< 4.7) only.
CVTS_FUNC(s32, s8, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]] \n\t"
            "vld1.32 {d6-d7}, [%[src2]] \n\t"
            "vcvt.f32.s32 q4, q2 \n\t"
            "vcvt.f32.s32 q5, q3 \n\t"
            "vmul.f32 q6, q4, q0 \n\t"
            "vmul.f32 q7, q5, q0 \n\t"
            "vadd.f32 q8, q6, q1 \n\t"
            "vadd.f32 q9, q7, q1 \n\t"
            "vcvt.s32.f32 q10, q8 \n\t"
            "vcvt.s32.f32 q11, q9 \n\t"
            "vqmovn.s32 d24, q10 \n\t"
            "vqmovn.s32 d25, q11 \n\t"
            "vqmovn.s16 d26, q12 \n\t"
            "vst1.8 {d26}, [%[dst]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26"
        );
    }
})
#else
// Intrinsics path: scale/shift in f32, convert back to s32, then
// saturate-narrow s32 -> s16 -> s8.
CVTS_FUNC(s32, s8, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int32x4_t vline1_s32 = vld1q_s32(_src + i + 0);
        int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int16x4_t vRes1 = vqmovn_s32(vline1_s32);
        int16x4_t vRes2 = vqmovn_s32(vline2_s32);
        int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
        vst1_s8(_dst + i, vRes);
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// convert-scale s32 -> u16: dst = saturate_cast<u16>(src * alpha + beta + 0.5).
// vqmovun clamps negatives to 0 and values above 65535 to 65535.
// Inline-asm variant for 32-bit GCC 4.x (< 4.7) only.
CVTS_FUNC(s32, u16, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]] \n\t"
            "vld1.32 {d6-d7}, [%[src2]] \n\t"
            "vcvt.f32.s32 q4, q2 \n\t"
            "vcvt.f32.s32 q5, q3 \n\t"
            "vmul.f32 q6, q4, q0 \n\t"
            "vmul.f32 q7, q5, q0 \n\t"
            "vadd.f32 q8, q6, q1 \n\t"
            "vadd.f32 q9, q7, q1 \n\t"
            "vcvt.s32.f32 q10, q8 \n\t"
            "vcvt.s32.f32 q11, q9 \n\t"
            "vqmovun.s32 d24, q10 \n\t"
            "vqmovun.s32 d25, q11 \n\t"
            "vst1.16 {d24-d25}, [%[dst]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25"
        );
    }
})
#else
// Intrinsics path: scale/shift in f32, convert back, saturate-narrow
// s32 -> u16 and store all eight results in one vector store.
CVTS_FUNC(s32, u16, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int32x4_t vline1_s32 = vld1q_s32(_src + i + 0);
        int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
        uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
        vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// convert-scale s32 -> s16: dst = saturate_cast<s16>(src * alpha + beta + 0.5).
// Inline-asm variant for 32-bit GCC 4.x (< 4.7) only.
CVTS_FUNC(s32, s16, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]] \n\t"
            "vld1.32 {d6-d7}, [%[src2]] \n\t"
            "vcvt.f32.s32 q4, q2 \n\t"
            "vcvt.f32.s32 q5, q3 \n\t"
            "vmul.f32 q6, q4, q0 \n\t"
            "vmul.f32 q7, q5, q0 \n\t"
            "vadd.f32 q8, q6, q1 \n\t"
            "vadd.f32 q9, q7, q1 \n\t"
            "vcvt.s32.f32 q10, q8 \n\t"
            "vcvt.s32.f32 q11, q9 \n\t"
            "vqmovn.s32 d24, q10 \n\t"
            "vqmovn.s32 d25, q11 \n\t"
            // NOTE(review): the store uses element size .8 although the data
            // is 16-bit; with no alignment qualifier this is functionally a
            // plain 16-byte store, but .16 would match the sibling u16 path.
            "vst1.8 {d24-d25}, [%[dst]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25"
        );
    }
})
#else
// Intrinsics path: scale/shift in f32, convert back, saturate-narrow
// s32 -> s16.
CVTS_FUNC(s32, s16, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int32x4_t vline1_s32 = vld1q_s32(_src + i + 0);
        int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int16x4_t vRes1 = vqmovn_s32(vline1_s32);
        int16x4_t vRes2 = vqmovn_s32(vline2_s32);
        vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// convert-scale s32 -> s32: dst = src * alpha + beta + 0.5, computed in f32
// and truncated back to s32 (no narrowing, so no saturation beyond the
// f32->s32 conversion itself).  Inline-asm variant for 32-bit GCC 4.x only.
CVTS_FUNC1(s32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]] \n\t"
            "vld1.32 {d6-d7}, [%[src2]] \n\t"
            "vcvt.f32.s32 q4, q2 \n\t"
            "vcvt.f32.s32 q5, q3 \n\t"
            "vmul.f32 q6, q4, q0 \n\t"
            "vmul.f32 q7, q5, q0 \n\t"
            "vadd.f32 q8, q6, q1 \n\t"
            "vadd.f32 q9, q7, q1 \n\t"
            "vcvt.s32.f32 q10, q8 \n\t"
            "vcvt.s32.f32 q11, q9 \n\t"
            "vst1.32 {d20-d21}, [%[dst1]] \n\t"
            "vst1.32 {d22-d23}, [%[dst2]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"
        );
    }
})
#else
// Intrinsics path: round-trip through f32 with scale and shift applied.
CVTS_FUNC1(s32, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int32x4_t vline1_s32 = vld1q_s32(_src + i + 0);
        int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        vst1q_s32(_dst + i + 0, vline1_s32);
        vst1q_s32(_dst + i + 4, vline2_s32);
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// convert-scale s32 -> f32: dst = src * alpha + beta.  No rounding bias,
// since the destination stays floating point.  Inline-asm variant for
// 32-bit GCC 4.x (< 4.7) only.
CVTS_FUNC(s32, f32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]] \n\t"
            "vld1.32 {d6-d7}, [%[src2]] \n\t"
            "vcvt.f32.s32 q4, q2 \n\t"
            "vcvt.f32.s32 q5, q3 \n\t"
            "vmul.f32 q6, q4, q0 \n\t"
            "vmul.f32 q7, q5, q0 \n\t"
            "vadd.f32 q8, q6, q1 \n\t"
            "vadd.f32 q9, q7, q1 \n\t"
            "vst1.32 {d16-d17}, [%[dst1]] \n\t"
            "vst1.32 {d18-d19}, [%[dst2]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i),
              [src2] "r" (_src + i + 4),
              [dst1] "r" (_dst + i),
              [dst2] "r" (_dst + i + 4),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"
        );
    }
})
#else
// Intrinsics path: convert to f32, scale, shift, store.
CVTS_FUNC(s32, f32, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int32x4_t vline1_s32 = vld1q_s32(_src + i + 0);
        int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vst1q_f32(_dst + i + 0, vline1_f32);
        vst1q_f32(_dst + i + 4, vline2_f32);
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// convert-scale f32 -> u8 via 16.16 fixed point: alpha and beta are
// pre-multiplied by 2^16, the product is converted to u32, adjusted with a
// vbic/vqsub correction, and vqrshrn #16 performs the rounding shift back
// down before the final u16 -> u8 saturating narrow.
// Inline-asm variant for 32-bit GCC 4.x (< 4.7) only.
CVTS_FUNC(f32, u8, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)((1 << 16)*alpha));
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)((1 << 16)*beta));
    register uint32x4_t vmask asm ("q2") = vdupq_n_u32(1<<16);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d6-d7}, [%[src1]] \n\t"
            "vld1.32 {d8-d9}, [%[src2]] \n\t"
            "vmul.f32 q5, q3, q0 \n\t"
            "vmul.f32 q6, q4, q0 \n\t"
            "vadd.f32 q7, q5, q1 \n\t"
            "vadd.f32 q8, q6, q1 \n\t"
            "vcvt.u32.f32 q9, q7 \n\t"
            "vcvt.u32.f32 q10, q8 \n\t"
            // NOTE(review): the mask for lane-set 1 is built from q6 (the
            // scaled-but-unshifted SECOND lane set) and the mask for lane-set
            // 2 from q7 (the shifted FIRST set).  The intrinsics path below
            // does the same, so behavior is consistent, but the crossed
            // operands look suspicious -- verify against the scalar reference.
            "vbic q11, q2, q6 \n\t"
            "vbic q12, q2, q7 \n\t"
            "vshr.u32 q13, q11, #16 \n\t"
            "vshr.u32 q14, q12, #16 \n\t"
            "vqsub.u32 q7, q9, q13 \n\t"
            "vqsub.u32 q8, q10, q14 \n\t"
            "vqrshrn.u32 d22, q7, #16 \n\t"
            "vqrshrn.u32 d23, q8, #16 \n\t"
            "vqmovn.u16 d30, q11 \n\t"
            "vst1.8 {d30}, [%[dst]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift), "w" (vmask)
            : "d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30"
        );
    }
})
#else
// Intrinsics path: same 16.16 fixed-point scheme as the asm variant above
// (including the crossed vline2/vline1Shifted mask operands -- see NOTE).
CVTS_FUNC(f32, u8, 8,
    float32x4_t vscale = vdupq_n_f32((f32)((1 << 16)*alpha));
    float32x4_t vshift = vdupq_n_f32((f32)((1 << 16)*beta));
    uint32x4_t vmask = vdupq_n_u32(1<<16);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        float32x4_t vline1_f32 = vld1q_f32(_src + i + 0);
        float32x4_t vline2_f32 = vld1q_f32(_src + i + 4);

        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        float32x4_t vline1Shifted_f32 = vaddq_f32(vline1_f32, vshift);
        float32x4_t vline2Shifted_f32 = vaddq_f32(vline2_f32, vshift);
        uint32x4_t vline1_u32 = vcvtq_u32_f32(vline1Shifted_f32);
        uint32x4_t vline2_u32 = vcvtq_u32_f32(vline2Shifted_f32);
        uint32x4_t vline1Mask = vbicq_u32(vmask, vreinterpretq_u32_f32(vline2_f32));
        uint32x4_t vline2Mask = vbicq_u32(vmask, vreinterpretq_u32_f32(vline1Shifted_f32));
        vline1Mask = vshrq_n_u32(vline1Mask, 16);
        vline2Mask = vshrq_n_u32(vline2Mask, 16);
        vline1_u32 = vqsubq_u32(vline1_u32, vline1Mask);
        vline2_u32 = vqsubq_u32(vline2_u32, vline2Mask);
        uint16x4_t vRes1 = vqrshrn_n_u32(vline1_u32, 16);
        uint16x4_t vRes2 = vqrshrn_n_u32(vline2_u32, 16);
        uint8x8_t vRes = vqmovn_u16(vcombine_u16(vRes1, vRes2));

        vst1_u8(_dst + i, vRes);
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// convert-scale f32 -> s8: dst = saturate_cast<s8>(src * alpha + beta + 0.5),
// via truncating f32->s32 then saturating narrow s32 -> s16 -> s8.
// Inline-asm variant for 32-bit GCC 4.x (< 4.7) only.
CVTS_FUNC(f32, s8, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]] \n\t"
            "vld1.32 {d6-d7}, [%[src2]] \n\t"
            "vmul.f32 q4, q2, q0 \n\t"
            "vmul.f32 q5, q3, q0 \n\t"
            "vadd.f32 q6, q4, q1 \n\t"
            "vadd.f32 q7, q5, q1 \n\t"
            "vcvt.s32.f32 q8, q6 \n\t"
            "vcvt.s32.f32 q9, q7 \n\t"
            "vqmovn.s32 d14, q8 \n\t"
            "vqmovn.s32 d15, q9 \n\t"
            "vqmovn.s16 d16, q7 \n\t"
            "vst1.8 {d16}, [%[dst]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"
        );
    }
})
#else
// Intrinsics path: scale/shift, truncating convert, then double
// saturating narrow down to s8.
CVTS_FUNC(f32, s8, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        float32x4_t vline1_f32 = vld1q_f32(_src + i + 0);
        float32x4_t vline2_f32 = vld1q_f32(_src + i + 4);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int16x4_t vRes1 = vqmovn_s32(vline1_s32);
        int16x4_t vRes2 = vqmovn_s32(vline2_s32);
        int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
        vst1_s8(_dst + i, vRes);
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// convert-scale f32 -> u16: dst = saturate_cast<u16>(src * alpha + beta + 0.5),
// via unsigned truncating conversion then saturating narrow u32 -> u16.
// Inline-asm variant for 32-bit GCC 4.x (< 4.7) only.
CVTS_FUNC(f32, u16, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]] \n\t"
            "vld1.32 {d6-d7}, [%[src2]] \n\t"
            "vmul.f32 q4, q2, q0 \n\t"
            "vmul.f32 q5, q3, q0 \n\t"
            "vadd.f32 q6, q4, q1 \n\t"
            "vadd.f32 q7, q5, q1 \n\t"
            "vcvt.u32.f32 q8, q6 \n\t"
            "vcvt.u32.f32 q9, q7 \n\t"
            "vqmovn.u32 d8, q8 \n\t"
            "vqmovn.u32 d9, q9 \n\t"
            "vst1.16 {d8-d9}, [%[dst]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"
        );
    }
})
#else
// Intrinsics path: scale/shift, convert f32 -> u32, saturating narrow
// to u16 and store all eight lanes at once.
CVTS_FUNC(f32, u16, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        float32x4_t vline1_f32 = vld1q_f32(_src + i + 0);
        float32x4_t vline2_f32 = vld1q_f32(_src + i + 4);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        uint32x4_t vline1_u32 = vcvtq_u32_f32(vline1_f32);
        uint32x4_t vline2_u32 = vcvtq_u32_f32(vline2_f32);
        uint16x4_t vRes1 = vqmovn_u32(vline1_u32);
        uint16x4_t vRes2 = vqmovn_u32(vline2_u32);
        vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// convert-scale f32 -> s16: dst = saturate_cast<s16>(src * alpha + beta + 0.5),
// via signed truncating conversion then saturating narrow s32 -> s16.
// Inline-asm variant for 32-bit GCC 4.x (< 4.7) only.
CVTS_FUNC(f32, s16, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]] \n\t"
            "vld1.32 {d6-d7}, [%[src2]] \n\t"
            "vmul.f32 q4, q2, q0 \n\t"
            "vmul.f32 q5, q3, q0 \n\t"
            "vadd.f32 q6, q4, q1 \n\t"
            "vadd.f32 q7, q5, q1 \n\t"
            "vcvt.s32.f32 q8, q6 \n\t"
            "vcvt.s32.f32 q9, q7 \n\t"
            "vqmovn.s32 d8, q8 \n\t"
            "vqmovn.s32 d9, q9 \n\t"
            "vst1.16 {d8-d9}, [%[dst]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"
        );
    }
})
#else
// Intrinsics path: scale/shift, convert f32 -> s32, saturating narrow
// to s16.
CVTS_FUNC(f32, s16, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        float32x4_t vline1_f32 = vld1q_f32(_src + i + 0);
        float32x4_t vline2_f32 = vld1q_f32(_src + i + 4);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int16x4_t vRes1 = vqmovn_s32(vline1_s32);
        int16x4_t vRes2 = vqmovn_s32(vline2_s32);
        vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// convert-scale f32 -> s32: dst = (s32)(src * alpha + beta + 0.5), using the
// truncating vcvt (no narrowing, so no extra saturation step).
// Inline-asm variant for 32-bit GCC 4.x (< 4.7) only.
CVTS_FUNC(f32, s32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]] \n\t"
            "vld1.32 {d6-d7}, [%[src2]] \n\t"
            "vmul.f32 q4, q2, q0 \n\t"
            "vmul.f32 q5, q3, q0 \n\t"
            "vadd.f32 q6, q4, q1 \n\t"
            "vadd.f32 q7, q5, q1 \n\t"
            "vcvt.s32.f32 q4, q6 \n\t"
            "vcvt.s32.f32 q5, q7 \n\t"
            "vst1.32 {d8-d9}, [%[dst1]] \n\t"
            "vst1.32 {d10-d11}, [%[dst2]] \n\t"
            : //no output
            : [src1] "r" (_src + i),
              [src2] "r" (_src + i + 4),
              [dst1] "r" (_dst + i),
              [dst2] "r" (_dst + i + 4),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15"
        );
    }
})
#else
// Intrinsics path: scale/shift in f32, truncating convert, store.
CVTS_FUNC(f32, s32, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        float32x4_t vline1_f32 = vld1q_f32(_src + i + 0);
        float32x4_t vline2_f32 = vld1q_f32(_src + i + 4);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        vst1q_s32(_dst + i + 0, vline1_s32);
        vst1q_s32(_dst + i + 4, vline2_s32);
    }
})
#endif
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
// convert-scale f32 -> f32: dst = src * alpha + beta (pure float, no
// rounding bias, no conversion).  Inline-asm variant for 32-bit GCC 4.x
// (< 4.7) only.
CVTS_FUNC1(f32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]] \n\t"
            "vld1.32 {d6-d7}, [%[src2]] \n\t"
            "vmul.f32 q4, q2, q0 \n\t"
            "vmul.f32 q5, q3, q0 \n\t"
            "vadd.f32 q6, q4, q1 \n\t"
            "vadd.f32 q7, q5, q1 \n\t"
            "vst1.32 {d12-d13}, [%[dst1]] \n\t"
            "vst1.32 {d14-d15}, [%[dst2]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"
        );
    }
})
#else
// Intrinsics path: multiply-add in f32 and store.
CVTS_FUNC1(f32, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        float32x4_t vline1_f32 = vld1q_f32(_src + i + 0);
        float32x4_t vline2_f32 = vld1q_f32(_src + i + 4);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vst1q_f32(_dst + i + 0, vline1_f32);
        vst1q_f32(_dst + i + 4, vline2_f32);
    }
})
#endif
} // namespace CAROTENE_NS
2499
2500