#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
template <typename T, typename WT>
struct AddWrap
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::vec128 & v_dst) const
{
v_dst = internal::vaddq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::vec64 & v_dst) const
{
v_dst = internal::vadd(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = (T)((WT)src0[0] + (WT)src1[0]);
}
};
template <typename T, typename WT>
struct AddSaturate
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::vec128 & v_dst) const
{
v_dst = internal::vqaddq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::vec64 & v_dst) const
{
v_dst = internal::vqadd(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = internal::saturate_cast<T>((WT)src0[0] + (WT)src1[0]);
}
};
}
#endif
void add(const Size2D &size,
const u8 * src0Base, ptrdiff_t src0Stride,
const u8 * src1Base, ptrdiff_t src1Stride,
u8 *dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (policy == CONVERT_POLICY_SATURATE)
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddSaturate<u8, u16>());
}
else
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddWrap<u8, u16>());
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
void add(const Size2D &size,
const s8 * src0Base, ptrdiff_t src0Stride,
const s8 * src1Base, ptrdiff_t src1Stride,
s8 *dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (policy == CONVERT_POLICY_SATURATE)
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddSaturate<s8, s16>());
}
else
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddWrap<s8, s16>());
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
void add(const Size2D &size,
const u8 * src0Base, ptrdiff_t src0Stride,
const u8 * src1Base, ptrdiff_t src1Stride,
s16 *dstBase, ptrdiff_t dstStride,
CONVERT_POLICY)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
u16 * dst = internal::getRowPtr((u16 *)dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw32; j += 32)
{
internal::prefetch(src0 + j);
internal::prefetch(src1 + j);
uint8x16_t v_src00 = vld1q_u8(src0 + j), v_src01 = vld1q_u8(src0 + j + 16);
uint8x16_t v_src10 = vld1q_u8(src1 + j), v_src11 = vld1q_u8(src1 + j + 16);
vst1q_u16(dst + j, vaddl_u8(vget_low_u8(v_src00), vget_low_u8(v_src10)));
vst1q_u16(dst + j + 8, vaddl_u8(vget_high_u8(v_src00), vget_high_u8(v_src10)));
vst1q_u16(dst + j + 16, vaddl_u8(vget_low_u8(v_src01), vget_low_u8(v_src11)));
vst1q_u16(dst + j + 24, vaddl_u8(vget_high_u8(v_src01), vget_high_u8(v_src11)));
}
for (; j < roiw8; j += 8)
{
uint8x8_t v_src0 = vld1_u8(src0 + j);
uint8x8_t v_src1 = vld1_u8(src1 + j);
vst1q_u16(dst + j, vaddl_u8(v_src0, v_src1));
}
for (; j < size.width; j++)
dst[j] = (u16)src0[j] + (u16)src1[j];
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
void add(const Size2D &size,
const u8 * src0Base, ptrdiff_t src0Stride,
const s16 * src1Base, ptrdiff_t src1Stride,
s16 *dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
const s16 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
if (policy == CONVERT_POLICY_SATURATE)
{
for (; j < roiw16; j += 16)
{
internal::prefetch(src0 + j);
internal::prefetch(src1 + j);
uint8x16_t v_src0 = vld1q_u8(src0 + j);
int16x8_t v_src00 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0)));
int16x8_t v_src01 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0)));
int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8);
int16x8_t v_dst0 = vqaddq_s16(v_src00, v_src10);
int16x8_t v_dst1 = vqaddq_s16(v_src01, v_src11);
vst1q_s16(dst + j, v_dst0);
vst1q_s16(dst + j + 8, v_dst1);
}
for (; j < roiw8; j += 8)
{
int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src0 + j)));
int16x8_t v_src1 = vld1q_s16(src1 + j);
int16x8_t v_dst = vqaddq_s16(v_src0, v_src1);
vst1q_s16(dst + j, v_dst);
}
for (; j < size.width; j++)
dst[j] = internal::saturate_cast<s16>((s32)src0[j] + (s32)src1[j]);
}
else
{
for (; j < roiw16; j += 16)
{
internal::prefetch(src0 + j);
internal::prefetch(src1 + j);
uint8x16_t v_src0 = vld1q_u8(src0 + j);
int16x8_t v_src00 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0)));
int16x8_t v_src01 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0)));
int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8);
int16x8_t v_dst0 = vaddq_s16(v_src00, v_src10);
int16x8_t v_dst1 = vaddq_s16(v_src01, v_src11);
vst1q_s16(dst + j, v_dst0);
vst1q_s16(dst + j + 8, v_dst1);
}
for (; j < roiw8; j += 8)
{
int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src0 + j)));
int16x8_t v_src1 = vld1q_s16(src1 + j);
int16x8_t v_dst = vaddq_s16(v_src0, v_src1);
vst1q_s16(dst + j, v_dst);
}
for (; j < size.width; j++)
dst[j] = (s16)((s32)src0[j] + (s32)src1[j]);
}
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
void add(const Size2D &size,
const s16 * src0Base, ptrdiff_t src0Stride,
const s16 * src1Base, ptrdiff_t src1Stride,
s16 *dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (policy == CONVERT_POLICY_SATURATE)
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddSaturate<s16, s32>());
}
else
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddWrap<s16, s32>());
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
void add(const Size2D &size,
const u16 * src0Base, ptrdiff_t src0Stride,
const u16 * src1Base, ptrdiff_t src1Stride,
u16 * dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (policy == CONVERT_POLICY_SATURATE)
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddSaturate<u16, u32>());
}
else
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddWrap<u16, u32>());
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
void add(const Size2D &size,
const s32 * src0Base, ptrdiff_t src0Stride,
const s32 * src1Base, ptrdiff_t src1Stride,
s32 *dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (policy == CONVERT_POLICY_SATURATE)
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddSaturate<s32, s64>());
}
else
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddWrap<s32, s64>());
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
void add(const Size2D &size,
const u32 * src0Base, ptrdiff_t src0Stride,
const u32 * src1Base, ptrdiff_t src1Stride,
u32 * dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (policy == CONVERT_POLICY_SATURATE)
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddSaturate<u32, u64>());
}
else
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddWrap<u32, u64>());
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
void add(const Size2D &size,
const f32 * src0Base, ptrdiff_t src0Stride,
const f32 * src1Base, ptrdiff_t src1Stride,
f32 * dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddWrap<f32, f32>());
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
}