Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/3rdparty/carotene/src/div.cpp
16337 views
1
/*
2
* By downloading, copying, installing or using the software you agree to this license.
3
* If you do not agree to this license, do not download, install,
4
* copy or use the software.
5
*
6
*
7
* License Agreement
8
* For Open Source Computer Vision Library
9
* (3-clause BSD License)
10
*
11
* Copyright (C) 2016, NVIDIA Corporation, all rights reserved.
12
* Third party copyrights are property of their respective owners.
13
*
14
* Redistribution and use in source and binary forms, with or without modification,
15
* are permitted provided that the following conditions are met:
16
*
17
* * Redistributions of source code must retain the above copyright notice,
18
* this list of conditions and the following disclaimer.
19
*
20
* * Redistributions in binary form must reproduce the above copyright notice,
21
* this list of conditions and the following disclaimer in the documentation
22
* and/or other materials provided with the distribution.
23
*
24
* * Neither the names of the copyright holders nor the names of the contributors
25
* may be used to endorse or promote products derived from this software
26
* without specific prior written permission.
27
*
28
* This software is provided by the copyright holders and contributors "as is" and
29
* any express or implied warranties, including, but not limited to, the implied
30
* warranties of merchantability and fitness for a particular purpose are disclaimed.
31
* In no event shall copyright holders or contributors be liable for any direct,
32
* indirect, incidental, special, exemplary, or consequential damages
33
* (including, but not limited to, procurement of substitute goods or services;
34
* loss of use, data, or profits; or business interruption) however caused
35
* and on any theory of liability, whether in contract, strict liability,
36
* or tort (including negligence or otherwise) arising in any way out of
37
* the use of this software, even if advised of the possibility of such damage.
38
*/
39
40
#include "common.hpp"
#include "vtransform.hpp"

#include <cstdint>
#include <cstring>
#include <cfloat>
#include <cmath>
#include <limits>
47
48
namespace CAROTENE_NS {
49
50
namespace {
51
52
#ifdef CAROTENE_NEON
53
54
// Round-to-nearest helper (128-bit): adds a 0.5 whose sign matches each lane
// of v, so that the truncating float->int conversion performed by the caller
// rounds half away from zero.
// Fix: the original mask was built with `1 << 31`, which shifts into the sign
// bit of a 32-bit signed int — undefined behaviour in C++11. INT32_MIN has
// the identical bit pattern (0x80000000) without the UB.
inline float32x4_t vroundq(const float32x4_t& v)
{
    const int32x4_t signMask = vdupq_n_s32(INT32_MIN), half = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
    // Copy v's per-lane sign bit onto 0.5f: yields +0.5 or -0.5 per lane.
    float32x4_t v_addition = vreinterpretq_f32_s32(vorrq_s32(half, vandq_s32(signMask, vreinterpretq_s32_f32(v))));
    return vaddq_f32(v, v_addition);
}
60
61
// Saturating element-wise division of packed integer vectors (128-bit):
// computes roughly saturate(scale * v1 / v2) per lane. The generic version
// widens both halves of the inputs (vmovl) and recurses until it reaches the
// 32-bit specializations below, then narrows back with saturation (vqmovn).
template <typename T>
inline T divSaturateQ(const T &v1, const T &v2, const float scale)
{
    return internal::vcombine(internal::vqmovn(divSaturateQ(internal::vmovl(internal::vget_low(v1)),
                                                            internal::vmovl(internal::vget_low(v2)), scale)),
                              internal::vqmovn(divSaturateQ(internal::vmovl(internal::vget_high(v1)),
                                                            internal::vmovl(internal::vget_high(v2)), scale))
                             );
}
// 32-bit base cases: evaluate (v1 * scale) * recip(v2) in f32 using the
// project reciprocal helper internal::vrecpq_f32 (defined elsewhere), and
// round to nearest (vroundq) before the truncating vcvtq_* conversion back
// to integer lanes.
template <>
inline int32x4_t divSaturateQ<int32x4_t>(const int32x4_t &v1, const int32x4_t &v2, const float scale)
{ return vcvtq_s32_f32(vroundq(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2))))); }
template <>
inline uint32x4_t divSaturateQ<uint32x4_t>(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
{ return vcvtq_u32_f32(vroundq(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2))))); }
76
77
// Round-to-nearest helper (64-bit counterpart of vroundq): adds a
// sign-matched 0.5 so the caller's truncating float->int conversion rounds
// half away from zero.
// Fix: replaced `1 << 31` (undefined behaviour for a 32-bit signed int in
// C++11 — shift into the sign bit) with INT32_MIN, whose bit pattern is the
// same 0x80000000.
inline float32x2_t vround(const float32x2_t& v)
{
    const int32x2_t signMask = vdup_n_s32(INT32_MIN), half = vreinterpret_s32_f32(vdup_n_f32(0.5f));
    // Copy v's per-lane sign bit onto 0.5f: yields +0.5 or -0.5 per lane.
    float32x2_t v_addition = vreinterpret_f32_s32(vorr_s32(half, vand_s32(signMask, vreinterpret_s32_f32(v))));
    return vadd_f32(v, v_addition);
}
83
84
// Saturating element-wise division of packed integer vectors (64-bit).
// The generic version widens to the next lane width and defers to the
// 128-bit divSaturateQ, narrowing the result back with saturation.
template <typename T>
inline T divSaturate(const T &v1, const T &v2, const float scale)
{
    return internal::vqmovn(divSaturateQ(internal::vmovl(v1), internal::vmovl(v2), scale));
}
// 32-bit base cases: (v1 * scale) * recip(v2) in f32, rounded to nearest
// (vround) before the truncating vcvt_* conversion back to integer lanes.
template <>
inline int32x2_t divSaturate<int32x2_t>(const int32x2_t &v1, const int32x2_t &v2, const float scale)
{ return vcvt_s32_f32(vround(vmul_f32(vmul_n_f32(vcvt_f32_s32(v1), scale), internal::vrecp_f32(vcvt_f32_s32(v2))))); }
template <>
inline uint32x2_t divSaturate<uint32x2_t>(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
{ return vcvt_u32_f32(vround(vmul_f32(vmul_n_f32(vcvt_f32_u32(v1), scale), internal::vrecp_f32(vcvt_f32_u32(v2))))); }
95
96
97
// Wrapping (modulo) element-wise division of packed integer vectors
// (128-bit). Same recursive widening scheme as divSaturateQ, but narrows
// with vmovn (truncating the high bits, i.e. wrap-around) instead of the
// saturating vqmovn.
template <typename T>
inline T divWrapQ(const T &v1, const T &v2, const float scale)
{
    return internal::vcombine(internal::vmovn(divWrapQ(internal::vmovl(internal::vget_low(v1)),
                                                       internal::vmovl(internal::vget_low(v2)), scale)),
                              internal::vmovn(divWrapQ(internal::vmovl(internal::vget_high(v1)),
                                                       internal::vmovl(internal::vget_high(v2)), scale))
                             );
}
// 32-bit base cases: unlike the saturating variants there is no vroundq
// step — the f32 result is simply truncated toward zero by vcvtq_*.
template <>
inline int32x4_t divWrapQ<int32x4_t>(const int32x4_t &v1, const int32x4_t &v2, const float scale)
{ return vcvtq_s32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2)))); }
template <>
inline uint32x4_t divWrapQ<uint32x4_t>(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
{ return vcvtq_u32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2)))); }
112
113
// Wrapping (modulo) element-wise division of packed integer vectors
// (64-bit): widens and defers to divWrapQ, narrowing back with the
// truncating vmovn.
template <typename T>
inline T divWrap(const T &v1, const T &v2, const float scale)
{
    return internal::vmovn(divWrapQ(internal::vmovl(v1), internal::vmovl(v2), scale));
}
// 32-bit base cases: truncating f32 -> integer conversion, no rounding step.
template <>
inline int32x2_t divWrap<int32x2_t>(const int32x2_t &v1, const int32x2_t &v2, const float scale)
{ return vcvt_s32_f32(vmul_f32(vmul_n_f32(vcvt_f32_s32(v1), scale), internal::vrecp_f32(vcvt_f32_s32(v2)))); }
template <>
inline uint32x2_t divWrap<uint32x2_t>(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
{ return vcvt_u32_f32(vmul_f32(vmul_n_f32(vcvt_f32_u32(v1), scale), internal::vrecp_f32(vcvt_f32_u32(v2)))); }
124
125
// Type-dispatch wrappers over the NEON "test bits" intrinsics. The kernels
// below call vtst(q)(x, x), which yields an all-ones lane wherever x != 0
// and an all-zeros lane where x == 0 — used as a divide-by-zero mask.
// The signed overloads reinterpret the intrinsics' unsigned result back to
// the signed lane type so the mask matches the operand type.
inline uint8x16_t vtstq(const uint8x16_t & v0, const uint8x16_t & v1) { return vtstq_u8 (v0, v1); }
inline uint16x8_t vtstq(const uint16x8_t & v0, const uint16x8_t & v1) { return vtstq_u16(v0, v1); }
inline uint32x4_t vtstq(const uint32x4_t & v0, const uint32x4_t & v1) { return vtstq_u32(v0, v1); }
inline int8x16_t  vtstq(const int8x16_t & v0, const int8x16_t & v1) { return vreinterpretq_s8_u8 (vtstq_s8 (v0, v1)); }
inline int16x8_t  vtstq(const int16x8_t & v0, const int16x8_t & v1) { return vreinterpretq_s16_u16(vtstq_s16(v0, v1)); }
inline int32x4_t  vtstq(const int32x4_t & v0, const int32x4_t & v1) { return vreinterpretq_s32_u32(vtstq_s32(v0, v1)); }

// 64-bit counterparts of the vtstq wrappers above.
inline uint8x8_t  vtst(const uint8x8_t & v0, const uint8x8_t & v1) { return vtst_u8 (v0, v1); }
inline uint16x4_t vtst(const uint16x4_t & v0, const uint16x4_t & v1) { return vtst_u16(v0, v1); }
inline uint32x2_t vtst(const uint32x2_t & v0, const uint32x2_t & v1) { return vtst_u32(v0, v1); }
inline int8x8_t   vtst(const int8x8_t & v0, const int8x8_t & v1) { return vreinterpret_s8_u8 (vtst_s8 (v0, v1)); }
inline int16x4_t  vtst(const int16x4_t & v0, const int16x4_t & v1) { return vreinterpret_s16_u16(vtst_s16(v0, v1)); }
inline int32x2_t  vtst(const int32x2_t & v0, const int32x2_t & v1) { return vreinterpret_s32_u32(vtst_s32(v0, v1)); }
138
#endif
139
140
// Element-wise integer division over a 2-D ROI:
//   dst = convert(scale * src0 / src1)   (per cpolicy: saturate or wrap),
// with the convention that lanes where src1 == 0 produce 0 (a vtst mask
// zeroes them; the scalar tail checks src1[j] explicitly).
// Strides are in bytes; rows are processed independently.
template <typename T>
void div(const Size2D &size,
         const T * src0Base, ptrdiff_t src0Stride,
         const T * src1Base, ptrdiff_t src1Stride,
         T * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    internal::assertSupportedConfiguration();

#ifdef CAROTENE_NEON
    typedef typename internal::VecTraits<T>::vec128 vec128;
    typedef typename internal::VecTraits<T>::vec64 vec64;

#if defined(__GNUC__) && (defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L)
    static_assert(std::numeric_limits<T>::is_integer, "template implementation is for integer types only");
#endif

    // Shortcut: if scale is zero, or so small that |scale * max(T)| < 1,
    // the whole output is treated as zero — just clear the destination.
    if (scale == 0.0f ||
        (std::numeric_limits<T>::is_integer &&
         (scale * std::numeric_limits<T>::max()) < 1.0f &&
         (scale * std::numeric_limits<T>::max()) > -1.0f))
    {
        for (size_t y = 0; y < size.height; ++y)
        {
            T * dst = internal::getRowPtr(dstBase, dstStride, y);
            std::memset(dst, 0, sizeof(T) * size.width);
        }
        return;
    }

    // Elements per 128-bit / 64-bit NEON iteration, and the loop bounds
    // guaranteeing a full vector load still fits within the row.
    const size_t step128 = 16 / sizeof(T);
    size_t roiw128 = size.width >= (step128 - 1) ? size.width - step128 + 1 : 0;
    const size_t step64 = 8 / sizeof(T);
    size_t roiw64 = size.width >= (step64 - 1) ? size.width - step64 + 1 : 0;

    for (size_t i = 0; i < size.height; ++i)
    {
        const T * src0 = internal::getRowPtr(src0Base, src0Stride, i);
        const T * src1 = internal::getRowPtr(src1Base, src1Stride, i);
        T * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;

        if (cpolicy == CONVERT_POLICY_SATURATE)
        {
            // 128-bit main loop, then 64-bit tail, then scalar tail.
            for (; j < roiw128; j += step128)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);

                vec128 v_src0 = internal::vld1q(src0 + j);
                vec128 v_src1 = internal::vld1q(src1 + j);

                // vtstq(x, x) is all-ones where x != 0; the AND forces
                // divide-by-zero lanes to 0.
                vec128 v_mask = vtstq(v_src1,v_src1);
                internal::vst1q(dst + j, internal::vandq(v_mask, divSaturateQ(v_src0, v_src1, scale)));
            }
            for (; j < roiw64; j += step64)
            {
                vec64 v_src0 = internal::vld1(src0 + j);
                vec64 v_src1 = internal::vld1(src1 + j);

                vec64 v_mask = vtst(v_src1,v_src1);
                internal::vst1(dst + j, internal::vand(v_mask,divSaturate(v_src0, v_src1, scale)));
            }
            // Scalar tail: f32 arithmetic, saturating conversion back to T.
            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? internal::saturate_cast<T>(scale * src0[j] / src1[j]) : 0;
            }
        }
        else // CONVERT_POLICY_WRAP
        {
            for (; j < roiw128; j += step128)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);

                vec128 v_src0 = internal::vld1q(src0 + j);
                vec128 v_src1 = internal::vld1q(src1 + j);

                vec128 v_mask = vtstq(v_src1,v_src1);
                internal::vst1q(dst + j, internal::vandq(v_mask, divWrapQ(v_src0, v_src1, scale)));
            }
            for (; j < roiw64; j += step64)
            {
                vec64 v_src0 = internal::vld1(src0 + j);
                vec64 v_src1 = internal::vld1(src1 + j);

                vec64 v_mask = vtst(v_src1,v_src1);
                internal::vst1(dst + j, internal::vand(v_mask,divWrap(v_src0, v_src1, scale)));
            }
            // Scalar tail: truncate toward zero, then wrap via the s32 -> T
            // narrowing cast (matches the vector path's vmovn behaviour).
            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? (T)((s32)trunc(scale * src0[j] / src1[j])) : 0;
            }
        }
    }
#else
    // Non-NEON build: unreachable past assertSupportedConfiguration();
    // silence unused-parameter warnings.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)cpolicy;
    (void)scale;
#endif
}
248
249
#ifdef CAROTENE_NEON
250
251
// Saturating element-wise reciprocal of packed integer vectors (128-bit):
// roughly saturate(scale / v2) per lane. Same recursive widening scheme as
// divSaturateQ: widen halves, recurse to the 32-bit base cases, narrow back
// with saturation (vqmovn).
template <typename T>
inline T recipSaturateQ(const T &v2, const float scale)
{
    return internal::vcombine(internal::vqmovn(recipSaturateQ(internal::vmovl(internal::vget_low(v2)), scale)),
                              internal::vqmovn(recipSaturateQ(internal::vmovl(internal::vget_high(v2)), scale))
                             );
}
// 32-bit base cases: recip(v2) * scale in f32, truncated by vcvtq_*.
// NOTE(review): unlike divSaturateQ there is no vroundq step here, so the
// saturating and wrapping variants round identically — presumably upstream
// behaviour; confirm before "fixing".
template <>
inline int32x4_t recipSaturateQ<int32x4_t>(const int32x4_t &v2, const float scale)
{ return vcvtq_s32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_s32(v2)), scale)); }
template <>
inline uint32x4_t recipSaturateQ<uint32x4_t>(const uint32x4_t &v2, const float scale)
{ return vcvtq_u32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_u32(v2)), scale)); }

// 64-bit counterpart: widens, defers to recipSaturateQ, narrows with
// saturation.
template <typename T>
inline T recipSaturate(const T &v2, const float scale)
{
    return internal::vqmovn(recipSaturateQ(internal::vmovl(v2), scale));
}
template <>
inline int32x2_t recipSaturate<int32x2_t>(const int32x2_t &v2, const float scale)
{ return vcvt_s32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_s32(v2)), scale)); }
template <>
inline uint32x2_t recipSaturate<uint32x2_t>(const uint32x2_t &v2, const float scale)
{ return vcvt_u32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_u32(v2)), scale)); }
276
277
278
// Wrapping (modulo) element-wise reciprocal of packed integer vectors
// (128-bit): same structure as recipSaturateQ but narrows with the
// truncating vmovn instead of saturating vqmovn.
template <typename T>
inline T recipWrapQ(const T &v2, const float scale)
{
    return internal::vcombine(internal::vmovn(recipWrapQ(internal::vmovl(internal::vget_low(v2)), scale)),
                              internal::vmovn(recipWrapQ(internal::vmovl(internal::vget_high(v2)), scale))
                             );
}
// 32-bit base cases: recip(v2) * scale in f32, truncated by vcvtq_*.
template <>
inline int32x4_t recipWrapQ<int32x4_t>(const int32x4_t &v2, const float scale)
{ return vcvtq_s32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_s32(v2)), scale)); }
template <>
inline uint32x4_t recipWrapQ<uint32x4_t>(const uint32x4_t &v2, const float scale)
{ return vcvtq_u32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_u32(v2)), scale)); }

// 64-bit counterpart: widens, defers to recipWrapQ, narrows with vmovn.
template <typename T>
inline T recipWrap(const T &v2, const float scale)
{
    return internal::vmovn(recipWrapQ(internal::vmovl(v2), scale));
}
template <>
inline int32x2_t recipWrap<int32x2_t>(const int32x2_t &v2, const float scale)
{ return vcvt_s32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_s32(v2)), scale)); }
template <>
inline uint32x2_t recipWrap<uint32x2_t>(const uint32x2_t &v2, const float scale)
{ return vcvt_u32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_u32(v2)), scale)); }
303
#endif
304
305
// Element-wise integer reciprocal over a 2-D ROI:
//   dst = convert(scale / src1)   (per cpolicy: saturate or wrap),
// with lanes where src1 == 0 producing 0 (vtst mask / explicit scalar
// check). Strides are in bytes.
template <typename T>
void recip(const Size2D &size,
           const T * src1Base, ptrdiff_t src1Stride,
           T * dstBase, ptrdiff_t dstStride,
           f32 scale,
           CONVERT_POLICY cpolicy)
{
    internal::assertSupportedConfiguration();

#ifdef CAROTENE_NEON
    typedef typename internal::VecTraits<T>::vec128 vec128;
    typedef typename internal::VecTraits<T>::vec64 vec64;

#if defined(__GNUC__) && (defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L)
    static_assert(std::numeric_limits<T>::is_integer, "template implementation is for integer types only");
#endif

    // Shortcut: |scale| < 1 means |scale / src1| < 1 for every non-zero
    // integer src1, so the whole output is treated as zero.
    if (scale == 0.0f ||
        (std::numeric_limits<T>::is_integer &&
         scale < 1.0f &&
         scale > -1.0f))
    {
        for (size_t y = 0; y < size.height; ++y)
        {
            T * dst = internal::getRowPtr(dstBase, dstStride, y);
            std::memset(dst, 0, sizeof(T) * size.width);
        }
        return;
    }

    // Elements per 128-bit / 64-bit NEON iteration, and the loop bounds
    // guaranteeing a full vector load still fits within the row.
    const size_t step128 = 16 / sizeof(T);
    size_t roiw128 = size.width >= (step128 - 1) ? size.width - step128 + 1 : 0;
    const size_t step64 = 8 / sizeof(T);
    size_t roiw64 = size.width >= (step64 - 1) ? size.width - step64 + 1 : 0;

    for (size_t i = 0; i < size.height; ++i)
    {
        const T * src1 = internal::getRowPtr(src1Base, src1Stride, i);
        T * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;

        if (cpolicy == CONVERT_POLICY_SATURATE)
        {
            // 128-bit main loop, then 64-bit tail, then scalar tail.
            for (; j < roiw128; j += step128)
            {
                internal::prefetch(src1 + j);

                vec128 v_src1 = internal::vld1q(src1 + j);

                // vtstq(x, x) is all-ones where x != 0; the AND forces
                // divide-by-zero lanes to 0.
                vec128 v_mask = vtstq(v_src1,v_src1);
                internal::vst1q(dst + j, internal::vandq(v_mask, recipSaturateQ(v_src1, scale)));
            }
            for (; j < roiw64; j += step64)
            {
                vec64 v_src1 = internal::vld1(src1 + j);

                vec64 v_mask = vtst(v_src1,v_src1);
                internal::vst1(dst + j, internal::vand(v_mask, recipSaturate(v_src1, scale)));
            }
            // Scalar tail: f32 arithmetic, saturating conversion back to T.
            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? internal::saturate_cast<T>(scale / src1[j]) : 0;
            }
        }
        else // CONVERT_POLICY_WRAP
        {
            for (; j < roiw128; j += step128)
            {
                internal::prefetch(src1 + j);

                vec128 v_src1 = internal::vld1q(src1 + j);

                vec128 v_mask = vtstq(v_src1,v_src1);
                internal::vst1q(dst + j, internal::vandq(v_mask, recipWrapQ(v_src1, scale)));
            }
            for (; j < roiw64; j += step64)
            {
                vec64 v_src1 = internal::vld1(src1 + j);

                vec64 v_mask = vtst(v_src1,v_src1);
                internal::vst1(dst + j, internal::vand(v_mask, recipWrap(v_src1, scale)));
            }
            // Scalar tail: truncate toward zero, then wrap via the s32 -> T
            // narrowing cast.
            for (; j < size.width; j++)
            {
                dst[j] = src1[j] ? (T)((s32)trunc(scale / src1[j])) : 0;
            }
        }
    }
#else
    // Non-NEON build: unreachable past assertSupportedConfiguration();
    // silence unused-parameter warnings.
    (void)size;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)cpolicy;
    (void)scale;
#endif
}
403
404
}
405
406
// Public entry points for integer element-wise division: each overload
// forwards to the templated div<T> implementation above for its lane type.
void div(const Size2D &size,
         const u8 * src0Base, ptrdiff_t src0Stride,
         const u8 * src1Base, ptrdiff_t src1Stride,
         u8 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<u8>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}

void div(const Size2D &size,
         const s8 * src0Base, ptrdiff_t src0Stride,
         const s8 * src1Base, ptrdiff_t src1Stride,
         s8 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<s8>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}

void div(const Size2D &size,
         const u16 * src0Base, ptrdiff_t src0Stride,
         const u16 * src1Base, ptrdiff_t src1Stride,
         u16 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<u16>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}

void div(const Size2D &size,
         const s16 * src0Base, ptrdiff_t src0Stride,
         const s16 * src1Base, ptrdiff_t src1Stride,
         s16 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<s16>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}

void div(const Size2D &size,
         const s32 * src0Base, ptrdiff_t src0Stride,
         const s32 * src1Base, ptrdiff_t src1Stride,
         s32 * dstBase, ptrdiff_t dstStride,
         f32 scale,
         CONVERT_POLICY cpolicy)
{
    div<s32>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}
455
456
// Floating-point element-wise division over a 2-D ROI:
//   dst = scale * src0 / src1
// No divide-by-zero masking here (f32 division follows IEEE semantics of
// the reciprocal helper); a dedicated fast path skips the scale multiply
// when scale is (within FLT_EPSILON of) 1.
void div(const Size2D &size,
         const f32 * src0Base, ptrdiff_t src0Stride,
         const f32 * src1Base, ptrdiff_t src1Stride,
         f32 * dstBase, ptrdiff_t dstStride,
         f32 scale)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // scale == 0 means the whole output is exactly zero.
    if (scale == 0.0f)
    {
        for (size_t y = 0; y < size.height; ++y)
        {
            f32 * dst = internal::getRowPtr(dstBase, dstStride, y);
            std::memset(dst, 0, sizeof(f32) * size.width);
        }
        return;
    }

    // Largest start index for which a full 4-lane / 2-lane vector fits.
    size_t roiw128 = size.width >= 3 ? size.width - 3 : 0;
    size_t roiw64 = size.width >= 1 ? size.width - 1 : 0;

    if (std::fabs(scale - 1.0f) < FLT_EPSILON)
    {
        // Fast path (scale ~= 1): dst = src0 * recip(src1).
        for (size_t i = 0; i < size.height; ++i)
        {
            const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
            const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
            f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
            size_t j = 0;

            for (; j < roiw128; j += 4)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);

                float32x4_t v_src0 = vld1q_f32(src0 + j);
                float32x4_t v_src1 = vld1q_f32(src1 + j);

                vst1q_f32(dst + j, vmulq_f32(v_src0, internal::vrecpq_f32(v_src1)));
            }

            for (; j < roiw64; j += 2)
            {
                float32x2_t v_src0 = vld1_f32(src0 + j);
                float32x2_t v_src1 = vld1_f32(src1 + j);

                vst1_f32(dst + j, vmul_f32(v_src0, internal::vrecp_f32(v_src1)));
            }

            // Scalar tail uses true division (not the reciprocal helper).
            for (; j < size.width; j++)
            {
                dst[j] = src0[j] / src1[j];
            }
        }
    }
    else
    {
        // General path: dst = (src0 * scale) * recip(src1).
        for (size_t i = 0; i < size.height; ++i)
        {
            const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
            const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
            f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
            size_t j = 0;

            for (; j < roiw128; j += 4)
            {
                internal::prefetch(src0 + j);
                internal::prefetch(src1 + j);

                float32x4_t v_src0 = vld1q_f32(src0 + j);
                float32x4_t v_src1 = vld1q_f32(src1 + j);

                vst1q_f32(dst + j, vmulq_f32(vmulq_n_f32(v_src0, scale),
                                             internal::vrecpq_f32(v_src1)));
            }

            for (; j < roiw64; j += 2)
            {
                float32x2_t v_src0 = vld1_f32(src0 + j);
                float32x2_t v_src1 = vld1_f32(src1 + j);

                vst1_f32(dst + j, vmul_f32(vmul_n_f32(v_src0, scale),
                                           internal::vrecp_f32(v_src1)));
            }

            for (; j < size.width; j++)
            {
                dst[j] = src0[j] * scale / src1[j];
            }
        }
    }
#else
    // Non-NEON build: unreachable past assertSupportedConfiguration();
    // silence unused-parameter warnings.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)scale;
#endif
}
558
559
// Public entry points for integer element-wise reciprocal: each overload
// forwards to the templated recip<T> implementation above for its lane type.
void reciprocal(const Size2D &size,
                const u8 * srcBase, ptrdiff_t srcStride,
                u8 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<u8>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}

void reciprocal(const Size2D &size,
                const s8 * srcBase, ptrdiff_t srcStride,
                s8 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<s8>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}

void reciprocal(const Size2D &size,
                const u16 * srcBase, ptrdiff_t srcStride,
                u16 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<u16>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}

void reciprocal(const Size2D &size,
                const s16 * srcBase, ptrdiff_t srcStride,
                s16 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<s16>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}

void reciprocal(const Size2D &size,
                const s32 * srcBase, ptrdiff_t srcStride,
                s32 * dstBase, ptrdiff_t dstStride,
                f32 scale,
                CONVERT_POLICY cpolicy)
{
    recip<s32>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}
603
604
// Floating-point element-wise reciprocal over a 2-D ROI:
//   dst = scale / src
// Mirrors the f32 div overload: zero-fill shortcut for scale == 0, and a
// fast path without the scale multiply when scale is ~1.
void reciprocal(const Size2D &size,
                const f32 * srcBase, ptrdiff_t srcStride,
                f32 * dstBase, ptrdiff_t dstStride,
                f32 scale)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // scale == 0 means the whole output is exactly zero.
    if (scale == 0.0f)
    {
        for (size_t y = 0; y < size.height; ++y)
        {
            f32 * dst = internal::getRowPtr(dstBase, dstStride, y);
            std::memset(dst, 0, sizeof(f32) * size.width);
        }
        return;
    }

    // Largest start index for which a full 4-lane / 2-lane vector fits.
    size_t roiw128 = size.width >= 3 ? size.width - 3 : 0;
    size_t roiw64 = size.width >= 1 ? size.width - 1 : 0;

    if (std::fabs(scale - 1.0f) < FLT_EPSILON)
    {
        // Fast path (scale ~= 1): dst = recip(src).
        for (size_t i = 0; i < size.height; ++i)
        {
            const f32 * src1 = internal::getRowPtr(srcBase, srcStride, i);
            f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
            size_t j = 0;

            for (; j < roiw128; j += 4)
            {
                internal::prefetch(src1 + j);

                float32x4_t v_src1 = vld1q_f32(src1 + j);

                vst1q_f32(dst + j, internal::vrecpq_f32(v_src1));
            }

            for (; j < roiw64; j += 2)
            {
                float32x2_t v_src1 = vld1_f32(src1 + j);

                vst1_f32(dst + j, internal::vrecp_f32(v_src1));
            }

            // Scalar tail uses true division (not the reciprocal helper).
            for (; j < size.width; j++)
            {
                dst[j] = 1.0f / src1[j];
            }
        }
    }
    else
    {
        // General path: dst = recip(src) * scale.
        for (size_t i = 0; i < size.height; ++i)
        {
            const f32 * src1 = internal::getRowPtr(srcBase, srcStride, i);
            f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
            size_t j = 0;

            for (; j < roiw128; j += 4)
            {
                internal::prefetch(src1 + j);

                float32x4_t v_src1 = vld1q_f32(src1 + j);

                vst1q_f32(dst + j, vmulq_n_f32(internal::vrecpq_f32(v_src1), scale));
            }

            for (; j < roiw64; j += 2)
            {
                float32x2_t v_src1 = vld1_f32(src1 + j);

                vst1_f32(dst + j, vmul_n_f32(internal::vrecp_f32(v_src1), scale));
            }

            for (; j < size.width; j++)
            {
                dst[j] = scale / src1[j];
            }
        }
    }
#else
    // Non-NEON build: unreachable past assertSupportedConfiguration();
    // silence unused-parameter warnings.
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)scale;
#endif
}
693
694
} // namespace CAROTENE_NS
695
696