Path: blob/master/3rdparty/carotene/src/add_weighted.cpp
16337 views
/*1* By downloading, copying, installing or using the software you agree to this license.2* If you do not agree to this license, do not download, install,3* copy or use the software.4*5*6* License Agreement7* For Open Source Computer Vision Library8* (3-clause BSD License)9*10* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.11* Third party copyrights are property of their respective owners.12*13* Redistribution and use in source and binary forms, with or without modification,14* are permitted provided that the following conditions are met:15*16* * Redistributions of source code must retain the above copyright notice,17* this list of conditions and the following disclaimer.18*19* * Redistributions in binary form must reproduce the above copyright notice,20* this list of conditions and the following disclaimer in the documentation21* and/or other materials provided with the distribution.22*23* * Neither the names of the copyright holders nor the names of the contributors24* may be used to endorse or promote products derived from this software25* without specific prior written permission.26*27* This software is provided by the copyright holders and contributors "as is" and28* any express or implied warranties, including, but not limited to, the implied29* warranties of merchantability and fitness for a particular purpose are disclaimed.30* In no event shall copyright holders or contributors be liable for any direct,31* indirect, incidental, special, exemplary, or consequential damages32* (including, but not limited to, procurement of substitute goods or services;33* loss of use, data, or profits; or business interruption) however caused34* and on any theory of liability, whether in contract, strict liability,35* or tort (including negligence or otherwise) arising in any way out of36* the use of this software, even if advised of the possibility of such damage.37*/3839#include "common.hpp"40#include "vtransform.hpp"4142namespace CAROTENE_NS {4344#ifdef CAROTENE_NEON4546namespace {4748using namespace internal;4950template <typename T> struct TypeTraits;51template <> struct TypeTraits< u8> { typedef u16 wide; typedef u8 unsign; typedef uint8x16_t vec128; };52template <> struct TypeTraits< s8> { typedef s16 wide; typedef u8 unsign; typedef int8x16_t vec128; };53template <> struct TypeTraits<u16> { typedef u32 wide; typedef u8 narrow; typedef u16 unsign; typedef uint16x8_t vec128; };54template <> struct TypeTraits<s16> { typedef s32 wide; typedef s8 narrow; typedef u16 unsign; typedef int16x8_t vec128; };55template <> struct TypeTraits<u32> { typedef u64 wide; typedef u16 narrow; typedef u32 unsign; typedef uint32x4_t vec128; };56template <> struct TypeTraits<s32> { typedef s64 wide; typedef s16 narrow; typedef u32 unsign; typedef int32x4_t vec128; };57template <> struct TypeTraits<f32> { typedef f64 wide; typedef float32x4_t vec128; };5859template <typename T> struct wAdd60{61typedef T type;6263f32 alpha, beta, gamma;64typedef typename TypeTraits<T>::wide wtype;65wAdd<wtype> wideAdd;66wAdd(f32 _alpha, f32 _beta, f32 _gamma):67alpha(_alpha), beta(_beta), gamma(_gamma),68wideAdd(_alpha, _beta, _gamma) {}6970void operator() (const typename VecTraits<T>::vec128 & v_src0,71const typename VecTraits<T>::vec128 & v_src1,72typename VecTraits<T>::vec128 & v_dst) const73{74typename VecTraits<wtype>::vec128 vrl, vrh;75wideAdd(vmovl( vget_low(v_src0)), vmovl( vget_low(v_src1)), vrl);76wideAdd(vmovl(vget_high(v_src0)), vmovl(vget_high(v_src1)), vrh);7778v_dst = vcombine(vqmovn(vrl), vqmovn(vrh));79}8081void operator() (const typename VecTraits<T>::vec64 & v_src0,82const typename VecTraits<T>::vec64 & v_src1,83typename VecTraits<T>::vec64 & v_dst) const84{85typename VecTraits<wtype>::vec128 vr;86wideAdd(vmovl(v_src0), vmovl(v_src1), vr);8788v_dst = vqmovn(vr);89}9091void operator() (const T * src0, const T * src1, T * dst) const92{93dst[0] = saturate_cast<T>(alpha*src0[0] + beta*src1[0] + gamma);94}95};9697template <> struct wAdd<s32>98{99typedef s32 type;100101f32 alpha, beta, gamma;102float32x4_t valpha, vbeta, vgamma;103wAdd(f32 _alpha, f32 _beta, f32 _gamma):104alpha(_alpha), beta(_beta), gamma(_gamma)105{106valpha = vdupq_n_f32(_alpha);107vbeta = vdupq_n_f32(_beta);108vgamma = vdupq_n_f32(_gamma + 0.5);109}110111void operator() (const typename VecTraits<s32>::vec128 & v_src0,112const typename VecTraits<s32>::vec128 & v_src1,113typename VecTraits<s32>::vec128 & v_dst) const114{115float32x4_t vs1 = vcvtq_f32_s32(v_src0);116float32x4_t vs2 = vcvtq_f32_s32(v_src1);117118vs1 = vmlaq_f32(vgamma, vs1, valpha);119vs1 = vmlaq_f32(vs1, vs2, vbeta);120v_dst = vcvtq_s32_f32(vs1);121}122123void operator() (const typename VecTraits<s32>::vec64 & v_src0,124const typename VecTraits<s32>::vec64 & v_src1,125typename VecTraits<s32>::vec64 & v_dst) const126{127float32x2_t vs1 = vcvt_f32_s32(v_src0);128float32x2_t vs2 = vcvt_f32_s32(v_src1);129130vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha));131vs1 = vmla_f32(vs1, vs2, vget_low(vbeta));132v_dst = vcvt_s32_f32(vs1);133}134135void operator() (const s32 * src0, const s32 * src1, s32 * dst) const136{137dst[0] = saturate_cast<s32>(alpha*src0[0] + beta*src1[0] + gamma);138}139};140141template <> struct wAdd<u32>142{143typedef u32 type;144145f32 alpha, beta, gamma;146float32x4_t valpha, vbeta, vgamma;147wAdd(f32 _alpha, f32 _beta, f32 _gamma):148alpha(_alpha), beta(_beta), gamma(_gamma)149{150valpha = vdupq_n_f32(_alpha);151vbeta = vdupq_n_f32(_beta);152vgamma = vdupq_n_f32(_gamma + 0.5);153}154155void operator() (const typename VecTraits<u32>::vec128 & v_src0,156const typename VecTraits<u32>::vec128 & v_src1,157typename VecTraits<u32>::vec128 & v_dst) const158{159float32x4_t vs1 = vcvtq_f32_u32(v_src0);160float32x4_t vs2 = vcvtq_f32_u32(v_src1);161162vs1 = vmlaq_f32(vgamma, vs1, valpha);163vs1 = vmlaq_f32(vs1, vs2, vbeta);164v_dst = vcvtq_u32_f32(vs1);165}166167void operator() (const typename VecTraits<u32>::vec64 & v_src0,168const typename VecTraits<u32>::vec64 & v_src1,169typename VecTraits<u32>::vec64 & v_dst) const170{171float32x2_t vs1 = vcvt_f32_u32(v_src0);172float32x2_t vs2 = vcvt_f32_u32(v_src1);173174vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha));175vs1 = vmla_f32(vs1, vs2, vget_low(vbeta));176v_dst = vcvt_u32_f32(vs1);177}178179void operator() (const u32 * src0, const u32 * src1, u32 * dst) const180{181dst[0] = saturate_cast<u32>(alpha*src0[0] + beta*src1[0] + gamma);182}183};184185template <> struct wAdd<f32>186{187typedef f32 type;188189f32 alpha, beta, gamma;190float32x4_t valpha, vbeta, vgamma;191wAdd(f32 _alpha, f32 _beta, f32 _gamma):192alpha(_alpha), beta(_beta), gamma(_gamma)193{194valpha = vdupq_n_f32(_alpha);195vbeta = vdupq_n_f32(_beta);196vgamma = vdupq_n_f32(_gamma + 0.5);197}198199void operator() (const typename VecTraits<f32>::vec128 & v_src0,200const typename VecTraits<f32>::vec128 & v_src1,201typename VecTraits<f32>::vec128 & v_dst) const202{203float32x4_t vs1 = vmlaq_f32(vgamma, v_src0, valpha);204v_dst = vmlaq_f32(vs1, v_src1, vbeta);205}206207void operator() (const typename VecTraits<f32>::vec64 & v_src0,208const typename VecTraits<f32>::vec64 & v_src1,209typename VecTraits<f32>::vec64 & v_dst) const210{211float32x2_t vs1 = vmla_f32(vget_low(vgamma), v_src0, vget_low(valpha));212v_dst = vmla_f32(vs1, v_src1, vget_low(vbeta));213214}215216void operator() (const f32 * src0, const f32 * src1, f32 * dst) const217{218dst[0] = alpha*src0[0] + beta*src1[0] + gamma;219}220};221222} // namespace223224#define IMPL_ADDWEIGHTED(type) \225void addWeighted(const Size2D &size, \226const type * src0Base, ptrdiff_t src0Stride, \227const type * src1Base, ptrdiff_t src1Stride, \228type * dstBase, ptrdiff_t dstStride, \229f32 alpha, f32 beta, f32 gamma) \230{ \231internal::assertSupportedConfiguration(); \232wAdd<type> wgtAdd(alpha, \233beta, \234gamma); \235internal::vtransform(size, \236src0Base, src0Stride, \237src1Base, src1Stride, \238dstBase, dstStride, \239wgtAdd); \240}241242#else243244#define IMPL_ADDWEIGHTED(type) \245void addWeighted(const Size2D &, \246const type *, ptrdiff_t, \247const type *, ptrdiff_t, \248type *, ptrdiff_t, \249f32, f32, f32) \250{ \251internal::assertSupportedConfiguration(); \252}253254#endif255256IMPL_ADDWEIGHTED(u8)257IMPL_ADDWEIGHTED(s8)258IMPL_ADDWEIGHTED(u16)259IMPL_ADDWEIGHTED(s16)260IMPL_ADDWEIGHTED(u32)261IMPL_ADDWEIGHTED(s32)262IMPL_ADDWEIGHTED(f32)263264} // namespace CAROTENE_NS265266267