Path: blob/master/3rdparty/carotene/src/accumulate.cpp
16337 views
/*1* By downloading, copying, installing or using the software you agree to this license.2* If you do not agree to this license, do not download, install,3* copy or use the software.4*5*6* License Agreement7* For Open Source Computer Vision Library8* (3-clause BSD License)9*10* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.11* Third party copyrights are property of their respective owners.12*13* Redistribution and use in source and binary forms, with or without modification,14* are permitted provided that the following conditions are met:15*16* * Redistributions of source code must retain the above copyright notice,17* this list of conditions and the following disclaimer.18*19* * Redistributions in binary form must reproduce the above copyright notice,20* this list of conditions and the following disclaimer in the documentation21* and/or other materials provided with the distribution.22*23* * Neither the names of the copyright holders nor the names of the contributors24* may be used to endorse or promote products derived from this software25* without specific prior written permission.26*27* This software is provided by the copyright holders and contributors "as is" and28* any express or implied warranties, including, but not limited to, the implied29* warranties of merchantability and fitness for a particular purpose are disclaimed.30* In no event shall copyright holders or contributors be liable for any direct,31* indirect, incidental, special, exemplary, or consequential damages32* (including, but not limited to, procurement of substitute goods or services;33* loss of use, data, or profits; or business interruption) however caused34* and on any theory of liability, whether in contract, strict liability,35* or tort (including negligence or otherwise) arising in any way out of36* the use of this software, even if advised of the possibility of such damage.37*/383940#include "common.hpp"41#include "vtransform.hpp"4243#include <cstring>4445namespace CAROTENE_NS {4647void accumulate(const Size2D &size,48const u8 *srcBase, ptrdiff_t srcStride,49s16 *dstBase, ptrdiff_t dstStride)50{51internal::assertSupportedConfiguration();52#ifdef CAROTENE_NEON53size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;54size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;5556for (size_t i = 0; i < size.height; ++i)57{58const u8* src = internal::getRowPtr(srcBase, srcStride, i);59s16* dst = internal::getRowPtr(dstBase, dstStride, i);60size_t j = 0;6162for (; j < roiw16; j += 16)63{64internal::prefetch(src + j);65internal::prefetch(dst + j);66uint8x16_t v_src = vld1q_u8(src + j);67int16x8_t v_dst0 = vld1q_s16(dst + j);68int16x8_t v_dst1 = vld1q_s16(dst + j + 8);69int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));70int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));71v_dst0 = vqaddq_s16(v_dst0, v_src0);72v_dst1 = vqaddq_s16(v_dst1, v_src1);73vst1q_s16(dst + j, v_dst0);74vst1q_s16(dst + j + 8, v_dst1);75}76for (; j < roiw8; j += 8)77{78uint8x8_t v_src = vld1_u8(src + j);79int16x8_t v_src16 = vreinterpretq_s16_u16(vmovl_u8(v_src));80int16x8_t v_dst = vld1q_s16(dst + j);81v_dst = vqaddq_s16(v_dst, v_src16);82vst1q_s16(dst + j, v_dst);83}8485for (; j < size.width; j++)86dst[j] = internal::saturate_cast<s16>(src[j] + dst[j]);87}88#else89(void)size;90(void)srcBase;91(void)srcStride;92(void)dstBase;93(void)dstStride;94#endif95}9697#ifdef CAROTENE_NEON9899namespace {100101template <int shift>102void accumulateSquareConst(const Size2D &size,103const u8 *srcBase, ptrdiff_t srcStride,104s16 *dstBase, ptrdiff_t dstStride)105{106size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;107size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;108109for (size_t i = 0; i < size.height; ++i)110{111const u8* src = internal::getRowPtr(srcBase, srcStride, i);112s16* dst = internal::getRowPtr(dstBase, dstStride, i);113size_t j = 0;114115for (; j < roiw16; j += 16)116{117internal::prefetch(src + j);118internal::prefetch(dst + j);119uint8x16_t v_src = vld1q_u8(src + j);120int16x8_t v_dst0 = vld1q_s16(dst + j), v_dst1 = vld1q_s16(dst + j + 8);121int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));122int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));123124int16x4_t v_srclo = vget_low_s16(v_src0), v_srchi = vget_high_s16(v_src0);125v_dst0 = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst0))),126vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst0))));127128v_srclo = vget_low_s16(v_src1);129v_srchi = vget_high_s16(v_src1);130v_dst1 = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst1))),131vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst1))));132133vst1q_s16(dst + j, v_dst0);134vst1q_s16(dst + j + 8, v_dst1);135}136for (; j < roiw8; j += 8)137{138int16x8_t v_src = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));139int16x8_t v_dst = vld1q_s16(dst + j);140int16x4_t v_srclo = vget_low_s16(v_src), v_srchi = vget_high_s16(v_src);141v_dst = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst))),142vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst))));143vst1q_s16(dst + j, v_dst);144}145146for (; j < size.width; j++)147{148s32 srcVal = src[j];149dst[j] = internal::saturate_cast<s16>(dst[j] + ((srcVal * srcVal) >> shift));150}151}152}153154template <>155void accumulateSquareConst<0>(const Size2D &size,156const u8 *srcBase, ptrdiff_t srcStride,157s16 *dstBase, ptrdiff_t dstStride)158{159size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;160size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;161162for (size_t i = 0; i < size.height; ++i)163{164const u8* src = internal::getRowPtr(srcBase, srcStride, i);165s16* dst = internal::getRowPtr(dstBase, dstStride, i);166size_t j = 0;167168for (; j < roiw16; j += 16)169{170internal::prefetch(src + j);171internal::prefetch(dst + j);172uint8x16_t v_src = vld1q_u8(src + j);173int16x8_t v_dst0 = vld1q_s16(dst + j), v_dst1 = vld1q_s16(dst + j + 8);174int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));175int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));176177int16x4_t v_srclo = vget_low_s16(v_src0), v_srchi = vget_high_s16(v_src0);178v_dst0 = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst0))),179vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst0))));180181v_srclo = vget_low_s16(v_src1);182v_srchi = vget_high_s16(v_src1);183v_dst1 = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst1))),184vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst1))));185186vst1q_s16(dst + j, v_dst0);187vst1q_s16(dst + j + 8, v_dst1);188}189for (; j < roiw8; j += 8)190{191int16x8_t v_src = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));192int16x8_t v_dst = vld1q_s16(dst + j);193int16x4_t v_srclo = vget_low_s16(v_src), v_srchi = vget_high_s16(v_src);194v_dst = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst))),195vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst))));196vst1q_s16(dst + j, v_dst);197}198199for (; j < size.width; j++)200{201s32 srcVal = src[j];202dst[j] = internal::saturate_cast<s16>(dst[j] + srcVal * srcVal);203}204}205}206207typedef void (* accumulateSquareConstFunc)(const Size2D &size,208const u8 *srcBase, ptrdiff_t srcStride,209s16 *dstBase, ptrdiff_t dstStride);210211} // namespace212213#endif214215void accumulateSquare(const Size2D &size,216const u8 *srcBase, ptrdiff_t srcStride,217s16 *dstBase, ptrdiff_t dstStride,218u32 shift)219{220if (shift >= 16)221{222for (size_t i = 0; i < size.height; ++i)223{224s16 * dst = internal::getRowPtr(dstBase, dstStride, i);225std::memset(dst, 0, sizeof(s16) * size.width);226}227return;228}229230internal::assertSupportedConfiguration();231232#ifdef CAROTENE_NEON233// this ugly contruction is needed to avoid:234// /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant235// return (int16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 1);236237accumulateSquareConstFunc funcs[16] =238{239accumulateSquareConst<0>,240accumulateSquareConst<1>,241accumulateSquareConst<2>,242accumulateSquareConst<3>,243accumulateSquareConst<4>,244accumulateSquareConst<5>,245accumulateSquareConst<6>,246accumulateSquareConst<7>,247accumulateSquareConst<8>,248accumulateSquareConst<9>,249accumulateSquareConst<10>,250accumulateSquareConst<11>,251accumulateSquareConst<12>,252accumulateSquareConst<13>,253accumulateSquareConst<14>,254accumulateSquareConst<15>255}, func = funcs[shift];256257func(size, srcBase, srcStride, dstBase, dstStride);258#else259(void)size;260(void)srcBase;261(void)srcStride;262(void)dstBase;263(void)dstStride;264(void)shift;265#endif266}267268#ifdef CAROTENE_NEON269270namespace {271272struct AccumulateWeightedHalf273{274typedef u8 type;275276void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1,277uint8x16_t & v_dst) const278{279v_dst = vhaddq_u8(v_src0, v_src1);280}281282void operator() (const uint8x8_t & v_src0, const uint8x8_t & v_src1,283uint8x8_t & v_dst) const284{285v_dst = vhadd_u8(v_src0, v_src1);286}287288void operator() (const u8 * src0, const u8 * src1, u8 * dst) const289{290dst[0] = ((u16)(src0[0]) + src1[0]) >> 1;291}292};293294struct AccumulateWeighted295{296typedef u8 type;297298float alpha, beta;299float32x4_t v_alpha, v_beta;300301explicit AccumulateWeighted(float _alpha) :302alpha(_alpha), beta(1 - _alpha)303{304v_alpha = vdupq_n_f32(alpha);305v_beta = vdupq_n_f32(beta);306}307308void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1,309uint8x16_t & v_dst) const310{311uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0));312uint16x8_t v_src1_p = vmovl_u8(vget_low_u8(v_src1));313float32x4_t v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p))), v_beta),314v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))));315float32x4_t v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p))), v_beta),316v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))));317uint16x8_t v_dst0 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),318vmovn_u32(vcvtq_u32_f32(v_dst1f)));319320v_src0_p = vmovl_u8(vget_high_u8(v_src0));321v_src1_p = vmovl_u8(vget_high_u8(v_src1));322v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p))), v_beta),323v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))));324v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p))), v_beta),325v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))));326uint16x8_t v_dst1 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),327vmovn_u32(vcvtq_u32_f32(v_dst1f)));328329v_dst = vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1));330}331332void operator() (const uint8x8_t & _v_src0, const uint8x8_t & _v_src1,333uint8x8_t & v_dst) const334{335uint16x8_t v_src0 = vmovl_u8(_v_src0), v_src1 = vmovl_u8(_v_src1);336337float32x4_t v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), v_beta),338v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))));339float32x4_t v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), v_beta),340v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))));341uint16x8_t _v_dst = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),342vmovn_u32(vcvtq_u32_f32(v_dst1f)));343344v_dst = vmovn_u16(_v_dst);345}346347void operator() (const u8 * src0, const u8 * src1, u8 * dst) const348{349dst[0] = beta * src1[0] + alpha * src0[0];350}351};352353} // namespace354355#endif356357void accumulateWeighted(const Size2D &size,358const u8 *srcBase, ptrdiff_t srcStride,359u8 *dstBase, ptrdiff_t dstStride,360f32 alpha)361{362if (alpha == 0.0f)363return;364if (alpha == 1.0f)365{366for (size_t i = 0; i < size.height; ++i)367{368const u8 * src = internal::getRowPtr(srcBase, srcStride, i);369u8 * dst = internal::getRowPtr(dstBase, dstStride, i);370std::memcpy(dst, src, sizeof(u8) * size.width);371}372return;373}374375internal::assertSupportedConfiguration();376377#ifdef CAROTENE_NEON378// in this case we can use the following scheme:379// dst[p] = (src[p] + dst[p]) >> 1380// which is faster381if (alpha == 0.5f)382{383internal::vtransform(size,384srcBase, srcStride,385dstBase, dstStride,386dstBase, dstStride,387AccumulateWeightedHalf());388389return;390}391392internal::vtransform(size,393srcBase, srcStride,394dstBase, dstStride,395dstBase, dstStride,396AccumulateWeighted(alpha));397#else398(void)size;399(void)srcBase;400(void)srcStride;401(void)dstBase;402(void)dstStride;403(void)alpha;404#endif405}406407} //namespace CAROTENE_NS408409410