Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/3rdparty/carotene/src/mul.cpp
16337 views
1
/*
2
* By downloading, copying, installing or using the software you agree to this license.
3
* If you do not agree to this license, do not download, install,
4
* copy or use the software.
5
*
6
*
7
* License Agreement
8
* For Open Source Computer Vision Library
9
* (3-clause BSD License)
10
*
11
* Copyright (C) 2014-2016, NVIDIA Corporation, all rights reserved.
12
* Third party copyrights are property of their respective owners.
13
*
14
* Redistribution and use in source and binary forms, with or without modification,
15
* are permitted provided that the following conditions are met:
16
*
17
* * Redistributions of source code must retain the above copyright notice,
18
* this list of conditions and the following disclaimer.
19
*
20
* * Redistributions in binary form must reproduce the above copyright notice,
21
* this list of conditions and the following disclaimer in the documentation
22
* and/or other materials provided with the distribution.
23
*
24
* * Neither the names of the copyright holders nor the names of the contributors
25
* may be used to endorse or promote products derived from this software
26
* without specific prior written permission.
27
*
28
* This software is provided by the copyright holders and contributors "as is" and
29
* any express or implied warranties, including, but not limited to, the implied
30
* warranties of merchantability and fitness for a particular purpose are disclaimed.
31
* In no event shall copyright holders or contributors be liable for any direct,
32
* indirect, incidental, special, exemplary, or consequential damages
33
* (including, but not limited to, procurement of substitute goods or services;
34
* loss of use, data, or profits; or business interruption) however caused
35
* and on any theory of liability, whether in contract, strict liability,
36
* or tort (including negligence or otherwise) arising in any way out of
37
* the use of this software, even if advised of the possibility of such damage.
38
*/
39
40
#include "common.hpp"
41
#include "vtransform.hpp"
42
43
#include <cstring>
44
#include <cfloat>
45
#include <cmath>
46
#include <limits>
47
48
namespace CAROTENE_NS {
49
50
#ifdef CAROTENE_NEON
51
52
namespace {
53
54
/*
 * Reports whether 'scale' is, to within FLT_EPSILON, a whole number —
 * i.e. whether truncating it to s32 preserves its value.
 */
bool isIntegerScale(f32 scale)
{
    const f32 truncated = static_cast<f32>(static_cast<s32>(scale));
    return std::fabs(scale - truncated) < FLT_EPSILON;
}
58
59
/*
 * Element-wise multiply of two u8 rows where the scale is a negative power
 * of two: dst = (src0 * src1) >> shift.  'shift' is the compile-time log2
 * of the divisor.  cpolicy chooses saturating (clamp to 255) or wrapping
 * (modulo-256) narrowing of the 16-bit intermediate back to u8.
 */
template <s32 shift>
void mulu8(const Size2D &size,
           const u8 * src0Base, ptrdiff_t src0Stride,
           const u8 * src1Base, ptrdiff_t src1Stride,
           u8 * dstBase, ptrdiff_t dstStride,
           CONVERT_POLICY cpolicy)
{
    // Last start index from which a full 16- (resp. 8-) element vector fits.
    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t i = 0; i < size.height; ++i)
    {
        const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
        const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
        u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;

        if (cpolicy == CONVERT_POLICY_SATURATE)
        {
            for (; j < roiw16; j += 16)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);
                uint8x16_t v_src0 = vld1q_u8(src0 + j), v_src1 = vld1q_u8(src1 + j);

                // Widening multiply u8*u8 -> u16: the product cannot overflow.
                uint16x8_t v_dst0 = vmull_u8(vget_low_u8(v_src0), vget_low_u8(v_src1));
                uint16x8_t v_dst1 = vmull_u8(vget_high_u8(v_src0), vget_high_u8(v_src1));

                v_dst0 = vshrq_n_u16(v_dst0, shift);
                v_dst1 = vshrq_n_u16(v_dst1, shift);

                // vqmovn saturates anything above 255 while narrowing to u8.
                vst1q_u8(dst + j, vcombine_u8(vqmovn_u16(v_dst0), vqmovn_u16(v_dst1)));
            }
            for (; j < roiw8; j += 8)
            {
                uint16x8_t v_dst = vmull_u8(vld1_u8(src0 + j), vld1_u8(src1 + j));
                vst1_u8(dst + j, vqmovn_u16(vshrq_n_u16(v_dst, shift)));
            }

            // Scalar tail for the remaining < 8 pixels of the row.
            for (; j < size.width; j++)
            {
                u16 val = (u16)src0[j] * (u16)src1[j];
                dst[j] = internal::saturate_cast<u8>(val >> shift);
            }
        }
        else // CONVERT_POLICY_WRAP
        {
            for (; j < roiw16; j += 16)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);
                uint8x16_t v_src0 = vld1q_u8(src0 + j), v_src1 = vld1q_u8(src1 + j);

                uint16x8_t v_dst0 = vmull_u8(vget_low_u8(v_src0), vget_low_u8(v_src1));
                uint16x8_t v_dst1 = vmull_u8(vget_high_u8(v_src0), vget_high_u8(v_src1));

                v_dst0 = vshrq_n_u16(v_dst0, shift);
                v_dst1 = vshrq_n_u16(v_dst1, shift);

                // vmovn keeps only the low byte: modulo-256 wrap-around.
                vst1q_u8(dst + j, vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1)));
            }
            for (; j < roiw8; j += 8)
            {
                uint16x8_t v_dst = vmull_u8(vld1_u8(src0 + j), vld1_u8(src1 + j));
                vst1_u8(dst + j, vmovn_u16(vshrq_n_u16(v_dst, shift)));
            }

            for (; j < size.width; j++)
            {
                u16 val = (u16)src0[j] * (u16)src1[j];
                dst[j] = (u8)(val >> shift);
            }
        }
    }
}
134
135
/*
 * Element-wise multiply of two u8 rows into an s16 destination where the
 * scale is a negative power of two: dst = (src0 * src1) >> shift.
 * The u16 product of two u8 values always fits 16 bits, so saturation to
 * s16 only needs an upper clamp at 32767 (vminq against v_32767); the
 * WRAP policy reinterprets the u16 bits as s16 directly.
 */
template <s32 shift>
void muls16(const Size2D &size,
            const u8 * src0Base, ptrdiff_t src0Stride,
            const u8 * src1Base, ptrdiff_t src1Stride,
            s16 * dstBase, ptrdiff_t dstStride,
            CONVERT_POLICY cpolicy)
{
    // Last start index from which a full 16- (resp. 8-) element vector fits.
    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    uint16x8_t v_32767 = vdupq_n_u16(0x7FFF);

    for (size_t i = 0; i < size.height; ++i)
    {
        const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
        const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
        s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;

        if (cpolicy == CONVERT_POLICY_SATURATE)
        {
            for (; j < roiw16; j += 16)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);
                uint8x16_t v_src0 = vld1q_u8(src0 + j), v_src1 = vld1q_u8(src1 + j);

                // Widening multiply u8*u8 -> u16: the product cannot overflow.
                uint16x8_t v_dst0 = vmull_u8(vget_low_u8(v_src0), vget_low_u8(v_src1));
                uint16x8_t v_dst1 = vmull_u8(vget_high_u8(v_src0), vget_high_u8(v_src1));

                v_dst0 = vshrq_n_u16(v_dst0, shift);
                v_dst1 = vshrq_n_u16(v_dst1, shift);

                // Clamp at 32767 so the u16 bits are a valid non-negative s16.
                vst1q_s16(dst + j, vreinterpretq_s16_u16(vminq_u16(v_32767, v_dst0)));
                vst1q_s16(dst + j + 8, vreinterpretq_s16_u16(vminq_u16(v_32767, v_dst1)));
            }
            for (; j < roiw8; j += 8)
            {
                uint16x8_t v_dst = vmull_u8(vld1_u8(src0 + j), vld1_u8(src1 + j));
                v_dst = vshrq_n_u16(v_dst, shift);
                vst1q_s16(dst + j, vreinterpretq_s16_u16(vminq_u16(v_32767, v_dst)));
            }

            // Scalar tail for the remaining < 8 pixels of the row.
            for (; j < size.width; j++)
            {
                u16 val = (u16)src0[j] * (u16)src1[j];
                dst[j] = internal::saturate_cast<s16>(val >> shift);
            }
        }
        else // CONVERT_POLICY_WRAP
        {
            for (; j < roiw16; j += 16)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);
                uint8x16_t v_src0 = vld1q_u8(src0 + j), v_src1 = vld1q_u8(src1 + j);

                uint16x8_t v_dst0 = vmull_u8(vget_low_u8(v_src0), vget_low_u8(v_src1));
                uint16x8_t v_dst1 = vmull_u8(vget_high_u8(v_src0), vget_high_u8(v_src1));

                v_dst0 = vshrq_n_u16(v_dst0, shift);
                v_dst1 = vshrq_n_u16(v_dst1, shift);

                // WRAP: reinterpret the u16 bit pattern as s16 unchanged.
                vst1q_s16(dst + j, vreinterpretq_s16_u16(v_dst0));
                vst1q_s16(dst + j + 8, vreinterpretq_s16_u16(v_dst1));
            }
            for (; j < roiw8; j += 8)
            {
                uint16x8_t v_dst = vmull_u8(vld1_u8(src0 + j), vld1_u8(src1 + j));
                v_dst = vshrq_n_u16(v_dst, shift);
                vst1q_s16(dst + j, vreinterpretq_s16_u16(v_dst));
            }

            for (; j < size.width; j++)
            {
                u16 val = (u16)src0[j] * (u16)src1[j];
                dst[j] = (s16)(val >> shift);
            }
        }
    }
}
216
217
// Kernel-pointer signatures used to dispatch the power-of-two-scale
// specializations (mulu8<N> / muls16<N>) from a lookup table in mul().
typedef void (* mulFuncu8)(const Size2D &size,
                           const u8 * src0Base, ptrdiff_t src0Stride,
                           const u8 * src1Base, ptrdiff_t src1Stride,
                           u8 * dstBase, ptrdiff_t dstStride,
                           CONVERT_POLICY cpolicy);

typedef void (* mulFuncs16)(const Size2D &size,
                            const u8 * src0Base, ptrdiff_t src0Stride,
                            const u8 * src1Base, ptrdiff_t src1Stride,
                            s16 * dstBase, ptrdiff_t dstStride,
                            CONVERT_POLICY cpolicy);
228
229
} // namespace
230
231
#endif
232
233
/*
 * Element-wise scaled multiply of two u8 images: dst = src0 * src1 * scale,
 * with the intermediate converted back to u8 per cpolicy (saturate or wrap).
 *
 * Fast paths, in order:
 *  - if scale is so small the product can never reach 1, the output is all
 *    zeroes (memset per row);
 *  - if scale is a negative power of two, dispatch to a mulu8<shift> kernel;
 *  - if scale == 1 exactly, use a pure integer widening-multiply path;
 *  - otherwise compute per pixel in f32 and truncate.
 */
void mul(const Size2D &size,
         const u8 * src0Base, ptrdiff_t src0Stride,
         const u8 * src1Base, ptrdiff_t src1Stride,
         u8 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    internal::assertSupportedConfiguration();

#ifdef CAROTENE_NEON
    // Max product is UCHAR_MAX*UCHAR_MAX; if even that scaled stays below 1,
    // truncation makes every output pixel 0.
    if ((scale * UCHAR_MAX * UCHAR_MAX) < 1.0f)
    {
        for (size_t y = 0; y < size.height; ++y)
        {
            u8 * dst = internal::getRowPtr(dstBase, dstStride, y);
            std::memset(dst, 0, sizeof(u8) * size.width);
        }
        return;
    }

    // frexp decomposes scale = significand * 2^exp; scale is a power of two
    // <= 1 exactly when significand == 0.5 and exp <= 0.  After negation,
    // 'exp' is the right-shift amount used to index the kernel table.
    s32 iscale = static_cast<s32>(scale), exp = 0;
    f32 significand = frexp(scale, &exp);
    bool is_integer_scale = isIntegerScale(scale),
        is_power_of_2 = (significand == 0.5f) && (exp <= 0);
    exp = -exp + 1;

    if (is_power_of_2)
    {
        // NOTE(review): the zero-output early-exit above appears to bound the
        // shift to <= 15 here, keeping the index in range — verify.
        static const mulFuncu8 funcs[16] =
        {
            NULL,
            mulu8<1>,
            mulu8<2>,
            mulu8<3>,
            mulu8<4>,
            mulu8<5>,
            mulu8<6>,
            mulu8<7>,
            mulu8<8>,
            mulu8<9>,
            mulu8<10>,
            mulu8<11>,
            mulu8<12>,
            mulu8<13>,
            mulu8<14>,
            mulu8<15>
        };

        mulFuncu8 func = funcs[exp];

        func(size,
             src0Base, src0Stride,
             src1Base, src1Stride,
             dstBase, dstStride,
             cpolicy);

        return;
    }

    // Last start index from which a full 16- (resp. 8-) element vector fits.
    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t i = 0; i < size.height; ++i)
    {
        const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
        const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
        u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;

        if (cpolicy == CONVERT_POLICY_SATURATE)
        {
            if (is_integer_scale && iscale == 1)
            {
                // scale == 1: plain widening multiply, saturating narrow.
                for (; j < roiw16; j += 16)
                {
                    internal::prefetch(src0 + j);
                    internal::prefetch(src1 + j);
                    uint8x16_t v_src0 = vld1q_u8(src0 + j), v_src1 = vld1q_u8(src1 + j);

                    uint16x8_t v_dst0 = vmull_u8(vget_low_u8(v_src0), vget_low_u8(v_src1));
                    uint16x8_t v_dst1 = vmull_u8(vget_high_u8(v_src0), vget_high_u8(v_src1));

                    vst1q_u8(dst + j, vcombine_u8(vqmovn_u16(v_dst0), vqmovn_u16(v_dst1)));
                }
                for (; j < roiw8; j += 8)
                {
                    vst1_u8(dst + j, vqmovn_u16(vmull_u8(vld1_u8(src0 + j), vld1_u8(src1 + j))));
                }

                for (; j < size.width; j++)
                {
                    u16 val = (u16)src0[j] * (u16)src1[j];
                    dst[j] = internal::saturate_cast<u8>(val);
                }
            }
            else // generic case using floats
            {
                // Widen u8 -> u16 -> u32, multiply and scale in f32, then
                // narrow back with saturation at every step.
                for (; j < roiw16; j += 16)
                {
                    internal::prefetch(src0 + j);
                    internal::prefetch(src1 + j);

                    uint8x16_t v_src0 = vld1q_u8(src0 + j);
                    uint8x16_t v_src1 = vld1q_u8(src1 + j);

                    uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0));
                    uint16x8_t v_src1_p = vmovl_u8(vget_low_u8(v_src1));
                    float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))),
                                                                vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p)))), scale);
                    float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))),
                                                                vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p)))), scale);
                    v_src0_p = vmovl_u8(vget_high_u8(v_src0));
                    v_src1_p = vmovl_u8(vget_high_u8(v_src1));
                    float32x4_t v_dst2f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))),
                                                                vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p)))), scale);
                    float32x4_t v_dst3f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))),
                                                                vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p)))), scale);
                    uint16x8_t v_dst0u = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(v_dst0f)),
                                                      vqmovn_u32(vcvtq_u32_f32(v_dst1f)));
                    uint16x8_t v_dst1u = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(v_dst2f)),
                                                      vqmovn_u32(vcvtq_u32_f32(v_dst3f)));
                    vst1q_u8(dst + j, vcombine_u8(vqmovn_u16(v_dst0u), vqmovn_u16(v_dst1u)));
                }
                for (; j < roiw8; j += 8)
                {
                    uint16x8_t v_src0 = vmovl_u8(vld1_u8(src0 + j));
                    uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + j));
                    float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))),
                                                                vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1)))), scale);
                    float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))),
                                                                vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1)))), scale);
                    uint16x8_t v_dstu = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(v_dst0f)),
                                                     vqmovn_u32(vcvtq_u32_f32(v_dst1f)));
                    vst1_u8(dst + j, vqmovn_u16(v_dstu));
                }

                // Scalar tail: truncate toward zero to match the vector path.
                for (; j < size.width; j++)
                {
                    f32 fval = (f32)src0[j] * (f32)src1[j] * scale;
                    dst[j] = internal::saturate_cast<u8>((s32)trunc(fval));
                }
            }
        }
        else // CONVERT_POLICY_WRAP
        {
            if (is_integer_scale && iscale == 1)
            {
                // scale == 1: widening multiply, truncating (wrapping) narrow.
                for (; j < roiw16; j += 16)
                {
                    internal::prefetch(src0 + j);
                    internal::prefetch(src1 + j);
                    uint8x16_t v_src0 = vld1q_u8(src0 + j), v_src1 = vld1q_u8(src1 + j);

                    uint16x8_t v_dst0 = vmull_u8(vget_low_u8(v_src0), vget_low_u8(v_src1));
                    uint16x8_t v_dst1 = vmull_u8(vget_high_u8(v_src0), vget_high_u8(v_src1));

                    vst1q_u8(dst + j, vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1)));
                }
                for (; j < roiw8; j += 8)
                {
                    vst1_u8(dst + j, vmovn_u16(vmull_u8(vld1_u8(src0 + j), vld1_u8(src1 + j))));
                }

                for (; j < size.width; j++)
                {
                    u16 val = (u16)src0[j] * (u16)src1[j];
                    dst[j] = (u8)(val);
                }
            }
            else // generic case using floats
            {
                for (; j < roiw16; j += 16)
                {
                    internal::prefetch(src0 + j);
                    internal::prefetch(src1 + j);
                    uint8x16_t v_src0 = vld1q_u8(src0 + j);
                    uint8x16_t v_src1 = vld1q_u8(src1 + j);

                    uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0));
                    uint16x8_t v_src1_p = vmovl_u8(vget_low_u8(v_src1));
                    float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))),
                                                                vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p)))), scale);
                    float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))),
                                                                vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p)))), scale);
                    v_src0_p = vmovl_u8(vget_high_u8(v_src0));
                    v_src1_p = vmovl_u8(vget_high_u8(v_src1));
                    float32x4_t v_dst2f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))),
                                                                vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p)))), scale);
                    float32x4_t v_dst3f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))),
                                                                vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p)))), scale);
                    // vmovn (not vqmovn): keep low bits only, wrap-around.
                    uint16x8_t v_dst0u = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),
                                                      vmovn_u32(vcvtq_u32_f32(v_dst1f)));
                    uint16x8_t v_dst1u = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst2f)),
                                                      vmovn_u32(vcvtq_u32_f32(v_dst3f)));
                    vst1q_u8(dst + j, vcombine_u8(vmovn_u16(v_dst0u), vmovn_u16(v_dst1u)));
                }
                for (; j < roiw8; j += 8)
                {
                    uint16x8_t v_src0 = vmovl_u8(vld1_u8(src0 + j));
                    uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + j));
                    float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))),
                                                                vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1)))), scale);
                    float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))),
                                                                vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1)))), scale);
                    uint16x8_t v_dstu = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),
                                                     vmovn_u32(vcvtq_u32_f32(v_dst1f)));
                    vst1_u8(dst + j, vmovn_u16(v_dstu));
                }

                for (; j < size.width; j++)
                {
                    f32 fval = (f32)src0[j] * (f32)src1[j] * scale;
                    dst[j] = (u8)(s32)trunc(fval);
                }
            }
        }
    }
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)cpolicy;
    (void)scale;
#endif
}
462
463
/*
 * Element-wise scaled multiply of two u8 images into an s16 destination:
 * dst = src0 * src1 * scale, narrowed per cpolicy (saturate or wrap).
 *
 * Fast paths mirror the u8 overload: all-zero output when the scaled
 * product can never reach 1 (only for non-negative scale here, since a
 * negative product would truncate to 0 anyway only when scale >= 0),
 * muls16<shift> kernels for power-of-two scales, an integer path for
 * scale == 1, and an f32 fallback otherwise.
 */
void mul(const Size2D &size,
         const u8 * src0Base, ptrdiff_t src0Stride,
         const u8 * src1Base, ptrdiff_t src1Stride,
         s16 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    if (((scale * UCHAR_MAX * UCHAR_MAX) < 1.0f) && (scale >= 0))
    {
        for (size_t y = 0; y < size.height; ++y)
        {
            s16 * dst = internal::getRowPtr(dstBase, dstStride, y);
            std::memset(dst, 0, sizeof(s16) * size.width);
        }
        return;
    }

    // scale = significand * 2^exp; power of two <= 1 iff significand == 0.5
    // and exp <= 0.  After negation 'exp' is the kernel-table shift index.
    s32 iscale = static_cast<s32>(scale), exp = 0;
    f32 significand = frexp(scale, &exp);
    bool is_integer_scale = isIntegerScale(scale),
        is_power_of_2 = (significand == 0.5f) && (exp <= 0);
    exp = -exp + 1;

    if (is_power_of_2)
    {
        static const mulFuncs16 funcs[16] =
        {
            NULL,
            muls16<1>,
            muls16<2>,
            muls16<3>,
            muls16<4>,
            muls16<5>,
            muls16<6>,
            muls16<7>,
            muls16<8>,
            muls16<9>,
            muls16<10>,
            muls16<11>,
            muls16<12>,
            muls16<13>,
            muls16<14>,
            muls16<15>
        };

        mulFuncs16 func = funcs[exp];

        func(size,
             src0Base, src0Stride,
             src1Base, src1Stride,
             dstBase, dstStride,
             cpolicy);

        return;
    }

    // Last start index from which a full 16- (resp. 8-) element vector fits.
    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    uint16x8_t v_32767 = vdupq_n_u16(0x7FFF);

    for (size_t i = 0; i < size.height; ++i)
    {
        const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
        const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
        s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;

        if (cpolicy == CONVERT_POLICY_SATURATE)
        {
            if (is_integer_scale && iscale == 1)
            {
                // scale == 1: u8*u8 fits u16; clamp at 32767 for valid s16.
                for (; j < roiw16; j += 16)
                {
                    internal::prefetch(src0 + j);
                    internal::prefetch(src1 + j);
                    uint8x16_t v_src0 = vld1q_u8(src0 + j), v_src1 = vld1q_u8(src1 + j);

                    uint16x8_t v_dst0 = vmull_u8(vget_low_u8(v_src0), vget_low_u8(v_src1));
                    uint16x8_t v_dst1 = vmull_u8(vget_high_u8(v_src0), vget_high_u8(v_src1));

                    vst1q_s16(dst + j, vreinterpretq_s16_u16(vminq_u16(v_32767, v_dst0)));
                    vst1q_s16(dst + j +8, vreinterpretq_s16_u16(vminq_u16(v_32767, v_dst1)));
                }
                for (; j < roiw8; j += 8)
                {
                    uint16x8_t v_dst = vmull_u8(vld1_u8(src0 + j), vld1_u8(src1 + j));
                    vst1q_s16(dst + j, vreinterpretq_s16_u16(vminq_u16(v_32767, v_dst)));
                }

                for (; j < size.width; j++)
                {
                    u16 val = (u16)src0[j] * (u16)src1[j];
                    dst[j] = internal::saturate_cast<s16>(val);
                }
            }
            else // generic case using floats
            {
                // Widen to u32, multiply and scale in f32, narrow through
                // s32 with saturating vqmovn_s32 down to s16.
                for (; j < roiw16; j += 16)
                {
                    internal::prefetch(src0 + j);
                    internal::prefetch(src1 + j);
                    uint8x16_t v_src0 = vld1q_u8(src0 + j);
                    uint8x16_t v_src1 = vld1q_u8(src1 + j);

                    uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0));
                    uint16x8_t v_src1_p = vmovl_u8(vget_low_u8(v_src1));
                    float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))),
                                                                vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p)))), scale);
                    float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))),
                                                                vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p)))), scale);
                    vst1q_s16(dst + j, vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst0f)),
                                                    vqmovn_s32(vcvtq_s32_f32(v_dst1f))));

                    v_src0_p = vmovl_u8(vget_high_u8(v_src0));
                    v_src1_p = vmovl_u8(vget_high_u8(v_src1));
                    v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))),
                                                    vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p)))), scale);
                    v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))),
                                                    vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p)))), scale);
                    vst1q_s16(dst + j + 8, vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst0f)),
                                                        vqmovn_s32(vcvtq_s32_f32(v_dst1f))));
                }
                for (; j < roiw8; j += 8)
                {
                    uint16x8_t v_src0 = vmovl_u8(vld1_u8(src0 + j));
                    uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + j));
                    float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))),
                                                                vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1)))), scale);
                    float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))),
                                                                vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1)))), scale);
                    vst1q_s16(dst + j, vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst0f)),
                                                    vqmovn_s32(vcvtq_s32_f32(v_dst1f))));
                }

                // Scalar tail: truncate toward zero to match the vector path.
                for (; j < size.width; j++)
                {
                    f32 fval = (f32)src0[j] * (f32)src1[j] * scale;
                    dst[j] = internal::saturate_cast<s16>((s32)trunc(fval));
                }
            }
        }
        else // CONVERT_POLICY_WRAP
        {
            if (is_integer_scale && iscale == 1)
            {
                // WRAP: the u16 bit pattern is stored as s16 unchanged.
                for (; j < roiw16; j += 16)
                {
                    internal::prefetch(src0 + j);
                    internal::prefetch(src1 + j);
                    uint8x16_t v_src0 = vld1q_u8(src0 + j), v_src1 = vld1q_u8(src1 + j);

                    uint16x8_t v_dst0 = vmull_u8(vget_low_u8(v_src0), vget_low_u8(v_src1));
                    uint16x8_t v_dst1 = vmull_u8(vget_high_u8(v_src0), vget_high_u8(v_src1));

                    vst1q_s16(dst + j, vreinterpretq_s16_u16(v_dst0));
                    vst1q_s16(dst + j + 8, vreinterpretq_s16_u16(v_dst1));
                }
                for (; j < roiw8; j += 8)
                {
                    uint16x8_t v_dst = vmull_u8(vld1_u8(src0 + j), vld1_u8(src1 + j));
                    vst1q_s16(dst + j, vreinterpretq_s16_u16(v_dst));
                }

                for (; j < size.width; j++)
                {
                    u16 val = (u16)src0[j] * (u16)src1[j];
                    dst[j] = (s16)(val);
                }
            }
            else // generic case using floats
            {
                for (; j < roiw16; j += 16)
                {
                    internal::prefetch(src0 + j);
                    internal::prefetch(src1 + j);
                    uint8x16_t v_src0 = vld1q_u8(src0 + j);
                    uint8x16_t v_src1 = vld1q_u8(src1 + j);

                    uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0));
                    uint16x8_t v_src1_p = vmovl_u8(vget_low_u8(v_src1));
                    float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))),
                                                                vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p)))), scale);
                    float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))),
                                                                vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p)))), scale);
                    // vmovn_s32: wrapping narrow, keeps low 16 bits.
                    vst1q_s16(dst + j, vcombine_s16(vmovn_s32(vcvtq_s32_f32(v_dst0f)),
                                                    vmovn_s32(vcvtq_s32_f32(v_dst1f))));

                    v_src0_p = vmovl_u8(vget_high_u8(v_src0));
                    v_src1_p = vmovl_u8(vget_high_u8(v_src1));
                    v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))),
                                                    vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p)))), scale);
                    v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))),
                                                    vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p)))), scale);
                    vst1q_s16(dst + j + 8, vcombine_s16(vmovn_s32(vcvtq_s32_f32(v_dst0f)),
                                                        vmovn_s32(vcvtq_s32_f32(v_dst1f))));
                }
                for (; j < roiw8; j += 8)
                {
                    uint16x8_t v_src0 = vmovl_u8(vld1_u8(src0 + j));
                    uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + j));
                    float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))),
                                                                vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1)))), scale);
                    float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))),
                                                                vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1)))), scale);
                    vst1q_s16(dst + j, vcombine_s16(vmovn_s32(vcvtq_s32_f32(v_dst0f)),
                                                    vmovn_s32(vcvtq_s32_f32(v_dst1f))));
                }

                for (; j < size.width; j++)
                {
                    f32 fval = (f32)src0[j] * (f32)src1[j] * scale;
                    dst[j] = (s16)(s32)trunc(fval);
                }
            }
        }
    }
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)cpolicy;
    (void)scale;
#endif
}
694
695
/*
 * Element-wise scaled multiply of a u8 image with an s16 image into s16:
 * dst = src0 * src1 * scale, narrowed per cpolicy (saturate or wrap).
 *
 * Fast paths: all-zero output when scale == 0, a pure integer path
 * (widen to s32 via vmull_s16) when scale == 1, and an f32 fallback
 * otherwise.  No power-of-two shift kernels exist for this overload.
 */
void mul(const Size2D &size,
         const u8 * src0Base, ptrdiff_t src0Stride,
         const s16 * src1Base, ptrdiff_t src1Stride,
         s16 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    internal::assertSupportedConfiguration();

#ifdef CAROTENE_NEON
    if (scale == 0.0f)
    {
        for (size_t y = 0; y < size.height; ++y)
        {
            s16 * dst = internal::getRowPtr(dstBase, dstStride, y);
            std::memset(dst, 0, sizeof(s16) * size.width);
        }
        return;
    }

    // Last start index from which a full 16- (resp. 8-) element vector fits.
    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    bool is_integer_scale = isIntegerScale(scale);
    s32 iscale = static_cast<s32>(scale);

    for (size_t i = 0; i < size.height; ++i)
    {
        const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
        const s16 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
        s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;

        if (cpolicy == CONVERT_POLICY_SATURATE)
        {
            if (is_integer_scale && iscale == 1)
            {
                // scale == 1: widen u8 to s16 (always non-negative), multiply
                // into s32 with vmull_s16, narrow back with saturation.
                for (; j < roiw16; j += 16)
                {
                    internal::prefetch(src0 + j);
                    internal::prefetch(src1 + j);
                    uint8x16_t v_src0 = vld1q_u8(src0 + j);

                    int16x8_t v_src0_p = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0)));
                    int16x8_t v_src1_p = vld1q_s16(src1 + j);
                    int16x8_t v_dst = vcombine_s16(vqmovn_s32(vmull_s16(vget_low_s16(v_src0_p), vget_low_s16(v_src1_p))),
                                                   vqmovn_s32(vmull_s16(vget_high_s16(v_src0_p), vget_high_s16(v_src1_p))));
                    vst1q_s16(dst + j, v_dst);

                    v_src0_p = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0)));
                    v_src1_p = vld1q_s16(src1 + j + 8);
                    v_dst = vcombine_s16(vqmovn_s32(vmull_s16(vget_low_s16(v_src0_p), vget_low_s16(v_src1_p))),
                                         vqmovn_s32(vmull_s16(vget_high_s16(v_src0_p), vget_high_s16(v_src1_p))));
                    vst1q_s16(dst + j + 8, v_dst);
                }
                for (; j < roiw8; j += 8)
                {
                    int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vld1q_u8(src0 + j))));
                    int16x8_t v_src1 = vld1q_s16(src1 + j);
                    int16x8_t v_dst = vcombine_s16(vqmovn_s32(vmull_s16(vget_low_s16(v_src0), vget_low_s16(v_src1))),
                                                   vqmovn_s32(vmull_s16(vget_high_s16(v_src0), vget_high_s16(v_src1))));
                    vst1q_s16(dst + j, v_dst);
                }

                // Scalar tail for the remaining < 8 pixels of the row.
                for (; j < size.width; j++)
                {
                    s32 val = (s32)src0[j] * (s32)src1[j];
                    dst[j] = internal::saturate_cast<s16>(val);
                }
            }
            else // generic case using floats
            {
                for (; j < roiw16; j += 16)
                {
                    internal::prefetch(src0 + j);
                    internal::prefetch(src1 + j);
                    uint8x16_t v_src0 = vld1q_u8(src0 + j);

                    uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0));
                    int16x8_t v_src1_p = vld1q_s16(src1 + j);
                    float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))),
                                                                vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1_p)))), scale);
                    float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))),
                                                                vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1_p)))), scale);
                    vst1q_s16(dst + j, vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst0f)),
                                                    vqmovn_s32(vcvtq_s32_f32(v_dst1f))));

                    v_src0_p = vmovl_u8(vget_high_u8(v_src0));
                    v_src1_p = vld1q_s16(src1 + j + 8);
                    v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))),
                                                    vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1_p)))), scale);
                    v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))),
                                                    vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1_p)))), scale);
                    vst1q_s16(dst + j + 8, vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst0f)),
                                                        vqmovn_s32(vcvtq_s32_f32(v_dst1f))));
                }
                for (; j < roiw8; j += 8)
                {
                    uint16x8_t v_src0 = vmovl_u8(vld1_u8(src0 + j));
                    int16x8_t v_src1 = vld1q_s16(src1 + j);
                    float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))),
                                                                vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1)))), scale);
                    float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))),
                                                                vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1)))), scale);
                    vst1q_s16(dst + j, vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst0f)),
                                                    vqmovn_s32(vcvtq_s32_f32(v_dst1f))));
                }

                // Scalar tail: truncate toward zero to match the vector path.
                for (; j < size.width; j++)
                {
                    f32 fval = (f32)src0[j] * (f32)src1[j] * scale;
                    dst[j] = internal::saturate_cast<s16>((s32)trunc(fval));
                }
            }
        }
        else // CONVERT_POLICY_WRAP
        {
            if (is_integer_scale && iscale == 1)
            {
                // WRAP: vmovn_s32 keeps the low 16 bits of the s32 product.
                for (; j < roiw16; j += 16)
                {
                    internal::prefetch(src0 + j);
                    internal::prefetch(src1 + j);
                    uint8x16_t v_src0 = vld1q_u8(src0 + j);

                    int16x8_t v_src0_p = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0)));
                    int16x8_t v_src1_p = vld1q_s16(src1 + j);
                    int16x8_t v_dst = vcombine_s16(vmovn_s32(vmull_s16(vget_low_s16(v_src0_p), vget_low_s16(v_src1_p))),
                                                   vmovn_s32(vmull_s16(vget_high_s16(v_src0_p), vget_high_s16(v_src1_p))));
                    vst1q_s16(dst + j, v_dst);

                    v_src0_p = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0)));
                    v_src1_p = vld1q_s16(src1 + j + 8);
                    v_dst = vcombine_s16(vmovn_s32(vmull_s16(vget_low_s16(v_src0_p), vget_low_s16(v_src1_p))),
                                         vmovn_s32(vmull_s16(vget_high_s16(v_src0_p), vget_high_s16(v_src1_p))));
                    vst1q_s16(dst + j + 8, v_dst);
                }
                for (; j < roiw8; j += 8)
                {
                    int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vld1q_u8(src0 + j))));
                    int16x8_t v_src1 = vld1q_s16(src1 + j);
                    int16x8_t v_dst = vcombine_s16(vmovn_s32(vmull_s16(vget_low_s16(v_src0), vget_low_s16(v_src1))),
                                                   vmovn_s32(vmull_s16(vget_high_s16(v_src0), vget_high_s16(v_src1))));
                    vst1q_s16(dst + j, v_dst);
                }

                for (; j < size.width; j++)
                {
                    s32 val = (s32)src0[j] * (s32)src1[j];
                    dst[j] = (s16)(val);
                }
            }
            else // generic case using floats
            {
                for (; j < roiw16; j += 16)
                {
                    internal::prefetch(src0 + j);
                    internal::prefetch(src1 + j);
                    uint8x16_t v_src0 = vld1q_u8(src0 + j);

                    uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0));
                    int16x8_t v_src1_p = vld1q_s16(src1 + j);
                    float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))),
                                                                vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1_p)))), scale);
                    float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))),
                                                                vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1_p)))), scale);
                    vst1q_s16(dst + j, vcombine_s16(vmovn_s32(vcvtq_s32_f32(v_dst0f)),
                                                    vmovn_s32(vcvtq_s32_f32(v_dst1f))));

                    v_src0_p = vmovl_u8(vget_high_u8(v_src0));
                    v_src1_p = vld1q_s16(src1 + j + 8);
                    v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))),
                                                    vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1_p)))), scale);
                    v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))),
                                                    vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1_p)))), scale);
                    vst1q_s16(dst + j + 8, vcombine_s16(vmovn_s32(vcvtq_s32_f32(v_dst0f)),
                                                        vmovn_s32(vcvtq_s32_f32(v_dst1f))));
                }
                for (; j < roiw8; j += 8)
                {
                    uint16x8_t v_src0 = vmovl_u8(vld1_u8(src0 + j));
                    int16x8_t v_src1 = vld1q_s16(src1 + j);
                    float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))),
                                                                vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1)))), scale);
                    float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))),
                                                                vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1)))), scale);
                    vst1q_s16(dst + j, vcombine_s16(vmovn_s32(vcvtq_s32_f32(v_dst0f)),
                                                    vmovn_s32(vcvtq_s32_f32(v_dst1f))));
                }

                for (; j < size.width; j++)
                {
                    f32 fval = (f32)src0[j] * (f32)src1[j] * scale;
                    dst[j] = (s16)(s32)trunc(fval);
                }
            }
        }
    }
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)cpolicy;
    (void)scale;
#endif
}
905
906
namespace {
907
908
#ifdef CAROTENE_NEON
909
910
// Scaled multiply of two full-width vectors with saturation: recursively
// widens each half, multiplies at 32-bit width, and narrows back with
// saturating vqmovn.  The 32x4 specializations below are the recursion base.
template <typename T>
inline T mulSaturateQ(const T &v1, const T &v2, const float scale)
{
    return internal::vcombine(internal::vqmovn(mulSaturateQ(internal::vmovl(internal::vget_low(v1)),
                                                            internal::vmovl(internal::vget_low(v2)), scale)),
                              internal::vqmovn(mulSaturateQ(internal::vmovl(internal::vget_high(v1)),
                                                            internal::vmovl(internal::vget_high(v2)), scale))
                             );
}
// Base case: 32-bit lanes are multiplied and scaled in f32, then converted
// back to integer (conversion truncates toward zero).
template <>
inline int32x4_t mulSaturateQ<int32x4_t>(const int32x4_t &v1, const int32x4_t &v2, const float scale)
{ return vcvtq_s32_f32(vmulq_n_f32(vmulq_f32(vcvtq_f32_s32(v1), vcvtq_f32_s32(v2)), scale)); }
template <>
inline uint32x4_t mulSaturateQ<uint32x4_t>(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
{ return vcvtq_u32_f32(vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(v1), vcvtq_f32_u32(v2)), scale)); }
925
926
// 64-bit-register variant: widen both operands, delegate to the 128-bit
// version above, then narrow the result back with saturation.
template <typename T>
inline T mulSaturate(const T &v1, const T &v2, const float scale)
{
    return internal::vqmovn(mulSaturateQ(internal::vmovl(v1), internal::vmovl(v2), scale));
}
931
template <>
932
inline int32x2_t mulSaturate<int32x2_t>(const int32x2_t &v1, const int32x2_t &v2, const float scale)
933
{ return vcvt_s32_f32(vmul_n_f32(vmul_f32(vcvt_f32_s32(v1), vcvt_f32_s32(v2)), scale)); }
934
template <>
935
inline uint32x2_t mulSaturate<uint32x2_t>(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
936
{ return vcvt_u32_f32(vmul_n_f32(vmul_f32(vcvt_f32_u32(v1), vcvt_f32_u32(v2)), scale)); }
937
938
939
// Wrapping (modular) scaled multiply for a full 128-bit vector. Same
// recursive widen/compute/narrow structure as mulSaturateQ, but results are
// narrowed with vmovn (truncation), so out-of-range values wrap around.
template <typename T>
inline T mulWrapQ(const T &v1, const T &v2, const float scale)
{
    return internal::vcombine(internal::vmovn(mulWrapQ(internal::vmovl(internal::vget_low(v1)),
                                                       internal::vmovl(internal::vget_low(v2)), scale)),
                              internal::vmovn(mulWrapQ(internal::vmovl(internal::vget_high(v1)),
                                                       internal::vmovl(internal::vget_high(v2)), scale))
                             );
}
948
template <>
949
inline int32x4_t mulWrapQ<int32x4_t>(const int32x4_t &v1, const int32x4_t &v2, const float scale)
950
{ return vcvtq_s32_f32(vmulq_n_f32(vmulq_f32(vcvtq_f32_s32(v1), vcvtq_f32_s32(v2)), scale)); }
951
template <>
952
inline uint32x4_t mulWrapQ<uint32x4_t>(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
953
{ return vcvtq_u32_f32(vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(v1), vcvtq_f32_u32(v2)), scale)); }
954
955
// 64-bit-register variant of the wrapping multiply: widen, delegate to
// mulWrapQ, narrow with truncation (vmovn).
template <typename T>
inline T mulWrap(const T &v1, const T &v2, const float scale)
{
    return internal::vmovn(mulWrapQ(internal::vmovl(v1), internal::vmovl(v2), scale));
}
960
template <>
961
inline int32x2_t mulWrap<int32x2_t>(const int32x2_t &v1, const int32x2_t &v2, const float scale)
962
{ return vcvt_s32_f32(vmul_n_f32(vmul_f32(vcvt_f32_s32(v1), vcvt_f32_s32(v2)), scale)); }
963
template <>
964
inline uint32x2_t mulWrap<uint32x2_t>(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
965
{ return vcvt_u32_f32(vmul_n_f32(vmul_f32(vcvt_f32_u32(v1), vcvt_f32_u32(v2)), scale)); }
966
967
968
// Type-dispatched wrappers over the NEON right-shift intrinsics so template
// code (mulShift below) can shift any lane type with a compile-time count n.
// vshrq_n / vshr_n: plain right shift on 128-/64-bit registers.
template <int n> inline uint8x16_t vshrq_n(const uint8x16_t & v0) { return vshrq_n_u8 (v0, n); }
template <int n> inline int8x16_t vshrq_n(const int8x16_t & v0) { return vshrq_n_s8 (v0, n); }
template <int n> inline uint16x8_t vshrq_n(const uint16x8_t & v0) { return vshrq_n_u16(v0, n); }
template <int n> inline int16x8_t vshrq_n(const int16x8_t & v0) { return vshrq_n_s16(v0, n); }
template <int n> inline uint32x4_t vshrq_n(const uint32x4_t & v0) { return vshrq_n_u32(v0, n); }
template <int n> inline int32x4_t vshrq_n(const int32x4_t & v0) { return vshrq_n_s32(v0, n); }
template <int n> inline uint64x2_t vshrq_n(const uint64x2_t & v0) { return vshrq_n_u64(v0, n); }
template <int n> inline int64x2_t vshrq_n(const int64x2_t & v0) { return vshrq_n_s64(v0, n); }

template <int n> inline uint8x8_t vshr_n(const uint8x8_t & v0) { return vshr_n_u8 (v0, n); }
template <int n> inline int8x8_t vshr_n(const int8x8_t & v0) { return vshr_n_s8 (v0, n); }
template <int n> inline uint16x4_t vshr_n(const uint16x4_t & v0) { return vshr_n_u16(v0, n); }
template <int n> inline int16x4_t vshr_n(const int16x4_t & v0) { return vshr_n_s16(v0, n); }
template <int n> inline uint32x2_t vshr_n(const uint32x2_t & v0) { return vshr_n_u32(v0, n); }
template <int n> inline int32x2_t vshr_n(const int32x2_t & v0) { return vshr_n_s32(v0, n); }
template <int n> inline uint64x1_t vshr_n(const uint64x1_t & v0) { return vshr_n_u64(v0, n); }
template <int n> inline int64x1_t vshr_n(const int64x1_t & v0) { return vshr_n_s64(v0, n); }

// vrshrq_n / vrshr_n: rounding right shift (adds 2^(n-1) before shifting).
template <int n> inline uint8x16_t vrshrq_n(const uint8x16_t & v0) { return vrshrq_n_u8 (v0, n); }
template <int n> inline int8x16_t vrshrq_n(const int8x16_t & v0) { return vrshrq_n_s8 (v0, n); }
template <int n> inline uint16x8_t vrshrq_n(const uint16x8_t & v0) { return vrshrq_n_u16(v0, n); }
template <int n> inline int16x8_t vrshrq_n(const int16x8_t & v0) { return vrshrq_n_s16(v0, n); }
template <int n> inline uint32x4_t vrshrq_n(const uint32x4_t & v0) { return vrshrq_n_u32(v0, n); }
template <int n> inline int32x4_t vrshrq_n(const int32x4_t & v0) { return vrshrq_n_s32(v0, n); }
template <int n> inline uint64x2_t vrshrq_n(const uint64x2_t & v0) { return vrshrq_n_u64(v0, n); }
template <int n> inline int64x2_t vrshrq_n(const int64x2_t & v0) { return vrshrq_n_s64(v0, n); }

template <int n> inline uint8x8_t vrshr_n(const uint8x8_t & v0) { return vrshr_n_u8 (v0, n); }
template <int n> inline int8x8_t vrshr_n(const int8x8_t & v0) { return vrshr_n_s8 (v0, n); }
template <int n> inline uint16x4_t vrshr_n(const uint16x4_t & v0) { return vrshr_n_u16(v0, n); }
template <int n> inline int16x4_t vrshr_n(const int16x4_t & v0) { return vrshr_n_s16(v0, n); }
template <int n> inline uint32x2_t vrshr_n(const uint32x2_t & v0) { return vrshr_n_u32(v0, n); }
template <int n> inline int32x2_t vrshr_n(const int32x2_t & v0) { return vrshr_n_s32(v0, n); }
template <int n> inline uint64x1_t vrshr_n(const uint64x1_t & v0) { return vrshr_n_u64(v0, n); }
template <int n> inline int64x1_t vrshr_n(const int64x1_t & v0) { return vrshr_n_s64(v0, n); }
1003
1004
// Integer-only multiply kernel used when the scale is an exact power of two:
// dst[i] = (src0[i] * src1[i]) >> shift, rounding the discarded bits.
// T is the element type, WT the widened type that holds the exact product,
// shift a compile-time constant (callers instantiate 1..15). cpolicy picks
// saturating vs. wrapping narrowing back to T.
template <typename T, typename WT, s32 shift>
void mulShift(const Size2D &size,
              const T * src0Base, ptrdiff_t src0Stride,
              const T * src1Base, ptrdiff_t src1Stride,
              T * dstBase, ptrdiff_t dstStride,
              CONVERT_POLICY cpolicy)
{
    typedef typename internal::VecTraits<T>::vec128 vec128;
    typedef typename internal::VecTraits<WT>::vec128 wvec128;
    typedef typename internal::VecTraits<T>::vec64 vec64;
    // Step widths (in elements) of the 128-/64-bit passes, and the largest
    // start index from which a full vector still fits inside the row.
    const size_t step128 = 16 / sizeof(T);
    size_t roiw128 = size.width >= (step128 - 1) ? size.width - step128 + 1 : 0;
    const size_t step64 = 8 / sizeof(T);
    size_t roiw64 = size.width >= (step64 - 1) ? size.width - step64 + 1 : 0;

    // Single bit at position 'shift'; used for the tie-rounding correction.
    wvec128 v_mask = internal::vdupq_n((WT)(1<<shift));

    for (size_t i = 0; i < size.height; ++i)
    {
        const T * src0 = internal::getRowPtr(src0Base, src0Stride, i);
        const T * src1 = internal::getRowPtr(src1Base, src1Stride, i);
        T * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;

        if (cpolicy == CONVERT_POLICY_SATURATE)
        {
            for (; j < roiw128; j += step128)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);
                vec128 v_src0 = internal::vld1q(src0 + j), v_src1 = internal::vld1q(src1 + j);
                wvec128 v_mul0 = internal::vmull( internal::vget_low(v_src0), internal::vget_low(v_src1));
                wvec128 v_mul1 = internal::vmull(internal::vget_high(v_src0), internal::vget_high(v_src1));

                // vbicq(v_mask, v_mul) == (1<<shift) & ~mul: non-zero exactly
                // when bit 'shift' of the product (the LSB of the shifted
                // quotient) is 0. Shifting it down gives 0 or 1, which is
                // subtracted before the rounding shift (vrshrq_n adds
                // 2^(shift-1)); this biases exact ties toward the even
                // quotient (round-half-to-even). vqmovn then narrows with
                // saturation.
                vec64 v_res0 = internal::vqmovn(vrshrq_n<shift>(internal::vqsubq(v_mul0, vshrq_n<shift>(internal::vbicq(v_mask, v_mul0)) )));
                vec64 v_res1 = internal::vqmovn(vrshrq_n<shift>(internal::vqsubq(v_mul1, vshrq_n<shift>(internal::vbicq(v_mask, v_mul1)) )));

                internal::vst1q(dst + j, internal::vcombine(v_res0, v_res1));
            }
            for (; j < roiw64; j += step64)
            {
                wvec128 v_mul = internal::vmull(internal::vld1(src0 + j), internal::vld1(src1 + j));
                vec64 v_res = internal::vqmovn(vrshrq_n<shift>(internal::vqsubq(v_mul, vshrq_n<shift>(internal::vbicq(v_mask, v_mul)) )));
                internal::vst1(dst + j, v_res);
            }

            // Scalar tail: the same rounding expression in plain integers.
            for (; j < size.width; j++)
            {
                WT val = (WT)src0[j] * (WT)src1[j];
                dst[j] = internal::saturate_cast<T>((val - (((1<<shift) & ~val) >> shift) + (1<<(shift-1))) >> shift);
            }
        }
        else // CONVERT_POLICY_WRAP
        {
            // Identical arithmetic; results are narrowed with vmovn / a plain
            // cast, so out-of-range values wrap instead of clamping.
            for (; j < roiw128; j += step128)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);
                vec128 v_src0 = internal::vld1q(src0 + j), v_src1 = internal::vld1q(src1 + j);
                wvec128 v_mul0 = internal::vmull( internal::vget_low(v_src0), internal::vget_low(v_src1));
                wvec128 v_mul1 = internal::vmull(internal::vget_high(v_src0), internal::vget_high(v_src1));

                vec64 v_res0 = internal::vmovn(vrshrq_n<shift>(internal::vqsubq(v_mul0, vshrq_n<shift>(internal::vbicq(v_mask, v_mul0)) )));
                vec64 v_res1 = internal::vmovn(vrshrq_n<shift>(internal::vqsubq(v_mul1, vshrq_n<shift>(internal::vbicq(v_mask, v_mul1)) )));

                internal::vst1q(dst + j, internal::vcombine(v_res0, v_res1));
            }
            for (; j < roiw64; j += step64)
            {
                wvec128 v_mul = internal::vmull(internal::vld1(src0 + j), internal::vld1(src1 + j));
                vec64 v_res = internal::vmovn(vrshrq_n<shift>(internal::vqsubq(v_mul, vshrq_n<shift>(internal::vbicq(v_mask, v_mul)) )));
                internal::vst1(dst + j, v_res);
            }

            for (; j < size.width; j++)
            {
                WT val = (WT)src0[j] * (WT)src1[j];
                dst[j] = (T)((val - (((1<<shift) & ~val) >> shift) + (1<<(shift-1))) >> shift);
            }
        }
    }
}
1086
#endif
1087
1088
template <typename T, typename WT>
1089
void mul(const Size2D &size,
1090
const T * src0Base, ptrdiff_t src0Stride,
1091
const T * src1Base, ptrdiff_t src1Stride,
1092
T * dstBase, ptrdiff_t dstStride,
1093
f32 scale,
1094
CONVERT_POLICY cpolicy)
1095
{
1096
internal::assertSupportedConfiguration();
1097
1098
#ifdef CAROTENE_NEON
1099
typedef typename internal::VecTraits<T>::vec128 vec128;
1100
1101
typedef void (* mulFunc)(const Size2D &size,
1102
const T * src0Base, ptrdiff_t src0Stride,
1103
const T * src1Base, ptrdiff_t src1Stride,
1104
T * dstBase, ptrdiff_t dstStride,
1105
CONVERT_POLICY cpolicy);
1106
1107
if (scale == 0.0f ||
1108
(std::numeric_limits<T>::is_integer &&
1109
(scale * std::numeric_limits<T>::max() * std::numeric_limits<T>::max()) < 1.0f &&
1110
(scale * std::numeric_limits<T>::max() * std::numeric_limits<T>::max()) > -1.0f))
1111
{
1112
for (size_t y = 0; y < size.height; ++y)
1113
{
1114
T * dst = internal::getRowPtr(dstBase, dstStride, y);
1115
std::memset(dst, 0, sizeof(T) * size.width);
1116
}
1117
return;
1118
}
1119
1120
s32 iscale = static_cast<s32>(scale), exp = 0;
1121
f32 significand = frexp(scale, &exp);
1122
bool is_integer_scale = isIntegerScale(scale),
1123
is_power_of_2 = (significand == 0.5f) && (exp <= 0);
1124
exp = -exp + 1;
1125
1126
if (is_power_of_2)
1127
{
1128
static const mulFunc funcs[16] =
1129
{
1130
NULL,
1131
mulShift<T,WT,1>,
1132
mulShift<T,WT,2>,
1133
mulShift<T,WT,3>,
1134
mulShift<T,WT,4>,
1135
mulShift<T,WT,5>,
1136
mulShift<T,WT,6>,
1137
mulShift<T,WT,7>,
1138
mulShift<T,WT,8>,
1139
mulShift<T,WT,9>,
1140
mulShift<T,WT,10>,
1141
mulShift<T,WT,11>,
1142
mulShift<T,WT,12>,
1143
mulShift<T,WT,13>,
1144
mulShift<T,WT,14>,
1145
mulShift<T,WT,15>
1146
};
1147
1148
mulFunc func = funcs[exp];
1149
1150
func(size,
1151
src0Base, src0Stride,
1152
src1Base, src1Stride,
1153
dstBase, dstStride,
1154
cpolicy);
1155
1156
return;
1157
}
1158
1159
const size_t step128 = 16 / sizeof(T);
1160
size_t roiw128 = size.width >= (step128 - 1) ? size.width - step128 + 1 : 0;
1161
const size_t step64 = 8 / sizeof(T);
1162
size_t roiw64 = size.width >= (step64 - 1) ? size.width - step64 + 1 : 0;
1163
1164
for (size_t i = 0; i < size.height; ++i)
1165
{
1166
const T * src0 = internal::getRowPtr(src0Base, src0Stride, i);
1167
const T * src1 = internal::getRowPtr(src1Base, src1Stride, i);
1168
T * dst = internal::getRowPtr(dstBase, dstStride, i);
1169
size_t j = 0;
1170
1171
if (cpolicy == CONVERT_POLICY_SATURATE)
1172
{
1173
if (is_integer_scale && iscale == 1)
1174
{
1175
for (; j < roiw128; j += step128)
1176
{
1177
internal::prefetch(src0 + j);
1178
internal::prefetch(src1 + j);
1179
vec128 v_src0 = internal::vld1q(src0 + j), v_src1 = internal::vld1q(src1 + j);
1180
internal::vst1q(dst + j, internal::vcombine(
1181
internal::vqmovn(internal::vmull(internal::vget_low(v_src0),
1182
internal::vget_low(v_src1))),
1183
internal::vqmovn(internal::vmull(internal::vget_high(v_src0),
1184
internal::vget_high(v_src1)))
1185
)
1186
);
1187
}
1188
for (; j < roiw64; j += step64)
1189
{
1190
internal::vst1(dst + j, internal::vqmovn(internal::vmull(internal::vld1(src0 + j),
1191
internal::vld1(src1 + j))));
1192
}
1193
1194
for (; j < size.width; j++)
1195
{
1196
WT val = (WT)src0[j] * (WT)src1[j];
1197
dst[j] = internal::saturate_cast<T>(val);
1198
}
1199
}
1200
else // generic case using floats
1201
{
1202
for (; j < roiw128; j += step128)
1203
{
1204
internal::prefetch(src0 + j);
1205
internal::prefetch(src1 + j);
1206
internal::vst1q(dst + j, mulSaturateQ(internal::vld1q(src0 + j),
1207
internal::vld1q(src1 + j), scale));
1208
}
1209
for (; j < roiw64; j += step64)
1210
{
1211
internal::vst1(dst + j, mulSaturate(internal::vld1(src0 + j),
1212
internal::vld1(src1 + j), scale));
1213
}
1214
1215
for (; j < size.width; j++)
1216
{
1217
f32 fval = (f32)src0[j] * (f32)src1[j] * scale;
1218
dst[j] = internal::saturate_cast<T>(fval);
1219
}
1220
}
1221
}
1222
else // CONVERT_POLICY_WRAP
1223
{
1224
if (is_integer_scale && iscale == 1)
1225
{
1226
for (; j < roiw128; j += step128)
1227
{
1228
internal::prefetch(src0 + j);
1229
internal::prefetch(src1 + j);
1230
vec128 v_src0 = internal::vld1q(src0 + j), v_src1 = internal::vld1q(src1 + j);
1231
internal::vst1q(dst + j, internal::vcombine(
1232
internal::vmovn(internal::vmull(internal::vget_low(v_src0),
1233
internal::vget_low(v_src1))),
1234
internal::vmovn(internal::vmull(internal::vget_high(v_src0),
1235
internal::vget_high(v_src1)))
1236
)
1237
);
1238
}
1239
for (; j < roiw64; j += step64)
1240
{
1241
internal::vst1(dst + j, internal::vmovn(internal::vmull(internal::vld1(src0 + j),
1242
internal::vld1(src1 + j))));
1243
}
1244
1245
for (; j < size.width; j++)
1246
{
1247
WT val = (WT)src0[j] * (WT)src1[j];
1248
dst[j] = (T)(val);
1249
}
1250
}
1251
else // generic case using floats
1252
{
1253
for (; j < roiw128; j += step128)
1254
{
1255
internal::prefetch(src0 + j);
1256
internal::prefetch(src1 + j);
1257
internal::vst1q(dst + j, mulWrapQ(internal::vld1q(src0 + j),
1258
internal::vld1q(src1 + j), scale));
1259
}
1260
for (; j < roiw64; j += step64)
1261
{
1262
internal::vst1(dst + j, mulWrap(internal::vld1(src0 + j),
1263
internal::vld1(src1 + j), scale));
1264
}
1265
1266
for (; j < size.width; j++)
1267
{
1268
f32 fval = (f32)src0[j] * (f32)src1[j] * scale;
1269
dst[j] = (T)((s32)trunc(fval));
1270
}
1271
}
1272
}
1273
}
1274
#else
1275
(void)size;
1276
(void)src0Base;
1277
(void)src0Stride;
1278
(void)src1Base;
1279
(void)src1Stride;
1280
(void)dstBase;
1281
(void)dstStride;
1282
(void)cpolicy;
1283
(void)scale;
1284
#endif
1285
}
1286
1287
}
1288
1289
// Public entry point for s8 elements; instantiates the template with s16 as
// the widened product type.
void mul(const Size2D &size,
         const s8 * src0Base, ptrdiff_t src0Stride,
         const s8 * src1Base, ptrdiff_t src1Stride,
         s8 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    mul<s8,s16>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}
1298
1299
// Public entry point for u16 elements; instantiates the template with u32 as
// the widened product type.
void mul(const Size2D &size,
         const u16 * src0Base, ptrdiff_t src0Stride,
         const u16 * src1Base, ptrdiff_t src1Stride,
         u16 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    mul<u16,u32>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}
1308
1309
// Public entry point for s16 elements; instantiates the template with s32 as
// the widened product type.
void mul(const Size2D &size,
         const s16 * src0Base, ptrdiff_t src0Stride,
         const s16 * src1Base, ptrdiff_t src1Stride,
         s16 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    mul<s16,s32>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}
1318
1319
void mul(const Size2D &size,
1320
const s32 * src0Base, ptrdiff_t src0Stride,
1321
const s32 * src1Base, ptrdiff_t src1Stride,
1322
s32 * dstBase, ptrdiff_t dstStride,
1323
f64 scale,
1324
CONVERT_POLICY cpolicy)
1325
{
1326
internal::assertSupportedConfiguration();
1327
#ifdef CAROTENE_NEON
1328
typedef void (* mulFunc)(const Size2D &size,
1329
const s32 * src0Base, ptrdiff_t src0Stride,
1330
const s32 * src1Base, ptrdiff_t src1Stride,
1331
s32 * dstBase, ptrdiff_t dstStride,
1332
CONVERT_POLICY cpolicy);
1333
1334
if (!std::isnormal(scale) ||
1335
((scale * std::numeric_limits<s32>::max() * std::numeric_limits<s32>::max()) < 1.0f &&
1336
(scale * std::numeric_limits<s32>::max() * std::numeric_limits<s32>::max()) > -1.0f))
1337
{
1338
for (size_t y = 0; y < size.height; ++y)
1339
{
1340
s32 * dst = internal::getRowPtr(dstBase, dstStride, y);
1341
std::memset(dst, 0, sizeof(s32) * size.width);
1342
}
1343
return;
1344
}
1345
1346
s32 iscale = static_cast<s32>(scale), exp = 0;
1347
f64 significand = frexp(scale, &exp);
1348
bool is_integer_scale = isIntegerScale(scale),
1349
is_power_of_2 = (significand == 0.5) && (exp <= 0);
1350
exp = -exp + 1;
1351
1352
if (is_power_of_2)
1353
{
1354
static const mulFunc funcs[16] =
1355
{
1356
NULL,
1357
mulShift<s32,s64,1>,
1358
mulShift<s32,s64,2>,
1359
mulShift<s32,s64,3>,
1360
mulShift<s32,s64,4>,
1361
mulShift<s32,s64,5>,
1362
mulShift<s32,s64,6>,
1363
mulShift<s32,s64,7>,
1364
mulShift<s32,s64,8>,
1365
mulShift<s32,s64,9>,
1366
mulShift<s32,s64,10>,
1367
mulShift<s32,s64,11>,
1368
mulShift<s32,s64,12>,
1369
mulShift<s32,s64,13>,
1370
mulShift<s32,s64,14>,
1371
mulShift<s32,s64,15>
1372
};
1373
1374
mulFunc func = funcs[exp];
1375
1376
func(size,
1377
src0Base, src0Stride,
1378
src1Base, src1Stride,
1379
dstBase, dstStride,
1380
cpolicy);
1381
1382
return;
1383
}
1384
1385
size_t roiw128 = size.width >= 3 ? size.width - 3 : 0;
1386
size_t roiw64 = size.width >= 1 ? size.width - 1 : 0;
1387
1388
for (size_t i = 0; i < size.height; ++i)
1389
{
1390
const s32 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
1391
const s32 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
1392
s32 * dst = internal::getRowPtr(dstBase, dstStride, i);
1393
size_t j = 0;
1394
1395
if (cpolicy == CONVERT_POLICY_SATURATE)
1396
{
1397
if (is_integer_scale && iscale == 1)
1398
{
1399
for (; j < roiw128; j += 4)
1400
{
1401
internal::prefetch(src0 + j);
1402
internal::prefetch(src1 + j);
1403
int32x4_t v_src0 = internal::vld1q(src0 + j), v_src1 = internal::vld1q(src1 + j);
1404
internal::vst1q(dst + j, internal::vcombine(
1405
internal::vqmovn(internal::vmull(internal::vget_low(v_src0),
1406
internal::vget_low(v_src1))),
1407
internal::vqmovn(internal::vmull(internal::vget_high(v_src0),
1408
internal::vget_high(v_src1)))
1409
)
1410
);
1411
}
1412
for (; j < roiw64; j += 2)
1413
{
1414
internal::vst1(dst + j, internal::vqmovn(internal::vmull(internal::vld1(src0 + j),
1415
internal::vld1(src1 + j))));
1416
}
1417
1418
for (; j < size.width; j++)
1419
{
1420
s64 val = (s64)src0[j] * (s64)src1[j];
1421
dst[j] = internal::saturate_cast<s32>(val);
1422
}
1423
}
1424
else // generic case using floats
1425
{
1426
for (; j < size.width; j++)
1427
{
1428
f64 fval = src0[j] * src1[j] * scale;
1429
dst[j] = internal::saturate_cast<s32>(fval);
1430
}
1431
}
1432
}
1433
else // CONVERT_POLICY_WRAP
1434
{
1435
if (is_integer_scale && iscale == 1)
1436
{
1437
for (; j < roiw128; j += 4)
1438
{
1439
internal::prefetch(src0 + j);
1440
internal::prefetch(src1 + j);
1441
int32x4_t v_src0 = internal::vld1q(src0 + j), v_src1 = internal::vld1q(src1 + j);
1442
internal::vst1q(dst + j, internal::vcombine(
1443
internal::vmovn(internal::vmull(internal::vget_low(v_src0),
1444
internal::vget_low(v_src1))),
1445
internal::vmovn(internal::vmull(internal::vget_high(v_src0),
1446
internal::vget_high(v_src1)))
1447
)
1448
);
1449
}
1450
for (; j < roiw64; j += 2)
1451
{
1452
internal::vst1(dst + j, internal::vmovn(internal::vmull(internal::vld1(src0 + j),
1453
internal::vld1(src1 + j))));
1454
}
1455
1456
for (; j < size.width; j++)
1457
{
1458
s64 val = (s64)src0[j] * (s64)src1[j];
1459
dst[j] = (s32)(val);
1460
}
1461
}
1462
else // generic case using floats
1463
{
1464
for (; j < size.width; j++)
1465
{
1466
f64 fval = src0[j] * src1[j] * scale;
1467
dst[j] = (s32)trunc(fval);
1468
}
1469
}
1470
}
1471
}
1472
#else
1473
(void)size;
1474
(void)src0Base;
1475
(void)src0Stride;
1476
(void)src1Base;
1477
(void)src1Stride;
1478
(void)dstBase;
1479
(void)dstStride;
1480
(void)cpolicy;
1481
(void)scale;
1482
#endif
1483
}
1484
1485
void mul(const Size2D &size,
1486
const f32 * src0Base, ptrdiff_t src0Stride,
1487
const f32 * src1Base, ptrdiff_t src1Stride,
1488
f32 * dstBase, ptrdiff_t dstStride,
1489
f32 scale)
1490
{
1491
internal::assertSupportedConfiguration();
1492
#ifdef CAROTENE_NEON
1493
if (scale == 0.0f)
1494
{
1495
for (size_t y = 0; y < size.height; ++y)
1496
{
1497
f32 * dst = internal::getRowPtr(dstBase, dstStride, y);
1498
std::memset(dst, 0, sizeof(f32) * size.width);
1499
}
1500
return;
1501
}
1502
1503
size_t roiw128 = size.width >= 3 ? size.width - 3 : 0;
1504
size_t roiw64 = size.width >= 1 ? size.width - 1 : 0;
1505
1506
if (std::fabs(scale - 1.0f) < FLT_EPSILON)
1507
{
1508
for (size_t i = 0; i < size.height; ++i)
1509
{
1510
const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
1511
const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
1512
f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
1513
size_t j = 0;
1514
1515
for (; j < roiw128; j += 4)
1516
{
1517
internal::prefetch(src0 + j);
1518
internal::prefetch(src1 + j);
1519
vst1q_f32(dst + j, vmulq_f32(vld1q_f32(src0 + j), vld1q_f32(src1 + j)));
1520
}
1521
1522
for (; j < roiw64; j += 2)
1523
{
1524
vst1_f32(dst + j, vmul_f32(vld1_f32(src0 + j), vld1_f32(src1 + j)));
1525
}
1526
1527
for (; j < size.width; j++)
1528
{
1529
dst[j] = src0[j] * src1[j];
1530
}
1531
}
1532
}
1533
else
1534
{
1535
for (size_t i = 0; i < size.height; ++i)
1536
{
1537
const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
1538
const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
1539
f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
1540
size_t j = 0;
1541
1542
for (; j < roiw128; j += 4)
1543
{
1544
internal::prefetch(src0 + j);
1545
internal::prefetch(src1 + j);
1546
vst1q_f32(dst + j, vmulq_n_f32(vmulq_f32(vld1q_f32(src0 + j), vld1q_f32(src1 + j)), scale));
1547
}
1548
1549
for (; j < roiw64; j += 2)
1550
{
1551
vst1_f32(dst + j, vmul_n_f32(vmul_f32(vld1_f32(src0 + j), vld1_f32(src1 + j)), scale));
1552
}
1553
1554
for (; j < size.width; j++)
1555
{
1556
dst[j] = src0[j] * src1[j] * scale;
1557
}
1558
}
1559
}
1560
#else
1561
(void)size;
1562
(void)src0Base;
1563
(void)src0Stride;
1564
(void)src1Base;
1565
(void)src1Stride;
1566
(void)dstBase;
1567
(void)dstStride;
1568
(void)scale;
1569
#endif
1570
}
1571
1572
} // namespace CAROTENE_NS
1573
1574