Path: blob/master/3rdparty/carotene/src/convolution.cpp
16337 views
/*1* By downloading, copying, installing or using the software you agree to this license.2* If you do not agree to this license, do not download, install,3* copy or use the software.4*5*6* License Agreement7* For Open Source Computer Vision Library8* (3-clause BSD License)9*10* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.11* Third party copyrights are property of their respective owners.12*13* Redistribution and use in source and binary forms, with or without modification,14* are permitted provided that the following conditions are met:15*16* * Redistributions of source code must retain the above copyright notice,17* this list of conditions and the following disclaimer.18*19* * Redistributions in binary form must reproduce the above copyright notice,20* this list of conditions and the following disclaimer in the documentation21* and/or other materials provided with the distribution.22*23* * Neither the names of the copyright holders nor the names of the contributors24* may be used to endorse or promote products derived from this software25* without specific prior written permission.26*27* This software is provided by the copyright holders and contributors "as is" and28* any express or implied warranties, including, but not limited to, the implied29* warranties of merchantability and fitness for a particular purpose are disclaimed.30* In no event shall copyright holders or contributors be liable for any direct,31* indirect, incidental, special, exemplary, or consequential damages32* (including, but not limited to, procurement of substitute goods or services;33* loss of use, data, or profits; or business interruption) however caused34* and on any theory of liability, whether in contract, strict liability,35* or tort (including negligence or otherwise) arising in any way out of36* the use of this software, even if advised of the possibility of such damage.37*/3839#include "common.hpp"40#include "saturate_cast.hpp"4142namespace CAROTENE_NS {4344bool isConvolutionSupported(const Size2D &size, const Size2D &ksize,45BORDER_MODE border)46{47return isSupportedConfiguration() && size.width >= 8 &&48(border == BORDER_MODE_CONSTANT ||49border == BORDER_MODE_REPLICATE) &&50(ksize.width == 3) && (ksize.height == 3);51}5253#ifdef CAROTENE_NEON5455namespace {5657template <int shift>58int32x4_t vshrq_s32(int32x4_t value)59{60return vshrq_n_s32(value, shift);61}6263template <>64int32x4_t vshrq_s32<0>(int32x4_t value)65{66return value;67}6869} // namespace7071typedef int32x4_t (* vshrq_s32_func)(int32x4_t value);7273#endif7475void convolution(const Size2D &size,76const u8 * srcBase, ptrdiff_t srcStride,77u8 * dstBase, ptrdiff_t dstStride,78BORDER_MODE border, u8 borderValue,79const Size2D & ksize, s16 * kernelBase, u32 scale)80{81internal::assertSupportedConfiguration(isConvolutionSupported(size, ksize, border));82#ifdef CAROTENE_NEON83const uint8x8_t v_zero_u8 = vdup_n_u8(0);84const uint8x8_t v_border = vdup_n_u8(borderValue);85const int32x4_t v_zero_s32 = vdupq_n_s32(0);8687uint8x8_t tprev[3] = { v_zero_u8, v_zero_u8, v_zero_u8 },88tcurr[3] = { v_zero_u8, v_zero_u8, v_zero_u8 },89tnext[3] = { v_zero_u8, v_zero_u8, v_zero_u8 };90uint8x8_t t0 = v_zero_u8, t1 = v_zero_u8, t2 = v_zero_u8;9192ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;93static const vshrq_s32_func vshrq_s32_a[33] =94{95vshrq_s32<0>,96vshrq_s32<1>,97vshrq_s32<2>,98vshrq_s32<3>,99vshrq_s32<4>,100vshrq_s32<5>,101vshrq_s32<6>,102vshrq_s32<7>,103vshrq_s32<8>,104vshrq_s32<9>,105vshrq_s32<10>,106vshrq_s32<11>,107vshrq_s32<12>,108vshrq_s32<13>,109vshrq_s32<14>,110vshrq_s32<15>,111vshrq_s32<16>,112vshrq_s32<17>,113vshrq_s32<18>,114vshrq_s32<19>,115vshrq_s32<20>,116vshrq_s32<21>,117vshrq_s32<22>,118vshrq_s32<23>,119vshrq_s32<24>,120vshrq_s32<25>,121vshrq_s32<26>,122vshrq_s32<27>,123vshrq_s32<28>,124vshrq_s32<29>,125vshrq_s32<30>,126vshrq_s32<31>,127vshrq_s32<32>128};129vshrq_s32_func vshrq_s32_p = vshrq_s32_a[scale];130131for (ptrdiff_t y = 0; y < height; ++y)132{133const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));134const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);135const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));136u8 * drow = internal::getRowPtr(dstBase, dstStride, y);137138u8 prevx[3] = { 0, 0, 0 },139currx[3] = { 0, 0, 0 },140nextx[3] = { 0, 0, 0 };141ptrdiff_t x = 0;142const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8);143144// perform vertical convolution145for ( ; x <= bwidth; x += 8)146{147internal::prefetch(srow0 + x);148internal::prefetch(srow1 + x);149internal::prefetch(srow2 + x);150151uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x);152uint8x8_t x1 = vld1_u8(srow1 + x);153uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x);154155// calculate values for plain CPU part below if needed156if (x + 8 >= bwidth)157{158ptrdiff_t x3 = x == width ? width - 1 : x;159ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);160161if (border == BORDER_MODE_CONSTANT && x4 < 0)162prevx[0] = prevx[1] = prevx[2] = borderValue;163else164{165prevx[0] = srow0 ? srow0[x4] : borderValue;166prevx[1] = srow1[x4] ;167prevx[2] = srow2 ? srow2[x4] : borderValue;168}169170currx[0] = srow0 ? srow0[x3] : borderValue;171currx[1] = srow1[x3] ;172currx[2] = srow2 ? srow2[x3] : borderValue;173}174175// make shift176if (x)177{178tprev[0] = tcurr[0];179tcurr[0] = tnext[0];180181tprev[1] = tcurr[1];182tcurr[1] = tnext[1];183184tprev[2] = tcurr[2];185tcurr[2] = tnext[2];186}187188tnext[0] = x0;189tnext[1] = x1;190tnext[2] = x2;191192// make extrapolation for the first elements193if (!x)194{195// make border196if (border == BORDER_MODE_CONSTANT)197tcurr[0] = tcurr[1] = tcurr[2] = v_border;198else if (border == BORDER_MODE_REPLICATE)199{200tcurr[0] = vdup_n_u8(vget_lane_u8(tnext[0], 0));201tcurr[1] = vdup_n_u8(vget_lane_u8(tnext[1], 0));202tcurr[2] = vdup_n_u8(vget_lane_u8(tnext[2], 0));203}204205continue;206}207208int32x4_t v_dst0 = v_zero_s32, v_dst1 = v_zero_s32;209210{211// combine 3 "shifted" vectors212t0 = vext_u8(tprev[0], tcurr[0], 7);213t1 = tcurr[0];214t2 = vext_u8(tcurr[0], tnext[0], 1);215216int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));217int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));218int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));219220v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[8]);221v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[7]);222v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[6]);223224v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[8]);225v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[7]);226v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[6]);227}228229{230// combine 3 "shifted" vectors231t0 = vext_u8(tprev[1], tcurr[1], 7);232t1 = tcurr[1];233t2 = vext_u8(tcurr[1], tnext[1], 1);234235int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));236int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));237int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));238239v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[5]);240v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[4]);241v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[3]);242243v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[5]);244v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[4]);245v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[3]);246}247248{249// combine 3 "shifted" vectors250t0 = vext_u8(tprev[2], tcurr[2], 7);251t1 = tcurr[2];252t2 = vext_u8(tcurr[2], tnext[2], 1);253254int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));255int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));256int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));257258v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[2]);259v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[1]);260v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[0]);261262v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[2]);263v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[1]);264v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[0]);265}266267268// make scale269v_dst0 = vshrq_s32_p(v_dst0);270v_dst1 = vshrq_s32_p(v_dst1);271272// and add them273vst1_u8(drow + x - 8, vqmovn_u16(vcombine_u16(vqmovun_s32(v_dst0),274vqmovun_s32(v_dst1))));275}276277x -= 8;278if (x == width)279--x;280281for ( ; x < width; ++x)282{283// make extrapolation for the last elements284if (x + 1 >= width)285{286if (border == BORDER_MODE_CONSTANT)287{288nextx[0] = borderValue;289nextx[1] = borderValue;290nextx[2] = borderValue;291}292else if (border == BORDER_MODE_REPLICATE)293{294nextx[0] = srow0[x];295nextx[1] = srow1[x];296nextx[2] = srow2[x];297}298}299else300{301nextx[0] = srow0 ? srow0[x + 1] : borderValue;302nextx[1] = srow1[x + 1] ;303nextx[2] = srow2 ? srow2[x + 1] : borderValue;304}305306s32 val = 0;307for (s32 _y = 0; _y < 3; ++_y)308val += prevx[_y] * kernelBase[(2 - _y) * 3 + 2] +309currx[_y] * kernelBase[(2 - _y) * 3 + 1] +310nextx[_y] * kernelBase[(2 - _y) * 3 + 0];311312drow[x] = internal::saturate_cast<u8>(val >> scale);313314// make shift315prevx[0] = currx[0];316currx[0] = nextx[0];317318prevx[1] = currx[1];319currx[1] = nextx[1];320321prevx[2] = currx[2];322currx[2] = nextx[2];323}324}325#else326(void)size;327(void)srcBase;328(void)srcStride;329(void)dstBase;330(void)dstStride;331(void)border;332(void)borderValue;333(void)ksize;334(void)kernelBase;335(void)scale;336#endif337}338339} // namespace CAROTENE_NS340341342