// Path: blob/master/3rdparty/carotene/src/gaussian_blur.cpp
// 16337 views
/*1* By downloading, copying, installing or using the software you agree to this license.2* If you do not agree to this license, do not download, install,3* copy or use the software.4*5*6* License Agreement7* For Open Source Computer Vision Library8* (3-clause BSD License)9*10* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.11* Third party copyrights are property of their respective owners.12*13* Redistribution and use in source and binary forms, with or without modification,14* are permitted provided that the following conditions are met:15*16* * Redistributions of source code must retain the above copyright notice,17* this list of conditions and the following disclaimer.18*19* * Redistributions in binary form must reproduce the above copyright notice,20* this list of conditions and the following disclaimer in the documentation21* and/or other materials provided with the distribution.22*23* * Neither the names of the copyright holders nor the names of the contributors24* may be used to endorse or promote products derived from this software25* without specific prior written permission.26*27* This software is provided by the copyright holders and contributors "as is" and28* any express or implied warranties, including, but not limited to, the implied29* warranties of merchantability and fitness for a particular purpose are disclaimed.30* In no event shall copyright holders or contributors be liable for any direct,31* indirect, incidental, special, exemplary, or consequential damages32* (including, but not limited to, procurement of substitute goods or services;33* loss of use, data, or profits; or business interruption) however caused34* and on any theory of liability, whether in contract, strict liability,35* or tort (including negligence or otherwise) arising in any way out of36* the use of this software, even if advised of the possibility of such damage.37*/3839#include "common.hpp"40#include "saturate_cast.hpp"41#include 
"separable_filter.hpp"4243namespace CAROTENE_NS {4445bool isGaussianBlur3x3Supported(const Size2D &size, BORDER_MODE border)46{47return isSupportedConfiguration() && size.width >= 8 &&48(border == BORDER_MODE_CONSTANT ||49border == BORDER_MODE_REPLICATE);50}5152void gaussianBlur3x3(const Size2D &size,53const u8 * srcBase, ptrdiff_t srcStride,54u8 * dstBase, ptrdiff_t dstStride,55BORDER_MODE border, u8 borderValue)56{57internal::assertSupportedConfiguration(isGaussianBlur3x3Supported(size, border));58#ifdef CAROTENE_NEON59const uint16x8_t v_border_x4 = vdupq_n_u16(borderValue << 2);60const uint16x8_t v_zero = vdupq_n_u16(0);61const uint8x8_t v_border = vdup_n_u8(borderValue);6263uint16x8_t tprev = v_zero, tcurr = v_zero, tnext = v_zero;64uint16x8_t t0 = v_zero, t1 = v_zero, t2 = v_zero;6566ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;6768for (ptrdiff_t y = 0; y < height; ++y)69{70const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));71const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);72const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));73u8 * drow = internal::getRowPtr(dstBase, dstStride, y);7475s16 prevx = 0, currx = 0, nextx = 0;76ptrdiff_t x = 0;77const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8);7879// perform vertical convolution80for ( ; x <= bwidth; x += 8)81{82internal::prefetch(srow0 + x);83internal::prefetch(srow1 + x);84internal::prefetch(srow2 + x);8586uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x);87uint8x8_t x1 = vld1_u8(srow1 + x);88uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x);8990// calculate values for plain CPU part below if needed91if (x + 8 >= bwidth)92{93ptrdiff_t x3 = x == width ? width - 1 : x;94ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? 
x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);9596if (border == BORDER_MODE_CONSTANT && x4 < 0)97prevx = borderValue;98else99prevx = (srow2 ? srow2[x4] : borderValue) + (srow1[x4] << 1) + (srow0 ? srow0[x4] : borderValue);100101currx = (srow2 ? srow2[x3] : borderValue) + (srow1[x3] << 1) + (srow0 ? srow0[x3] : borderValue);102}103104// make shift105if (x)106{107tprev = tcurr;108tcurr = tnext;109}110111// and calculate next value112tnext = vaddq_u16(vaddl_u8(x0, x2), vshll_n_u8(x1, 1));113114// make extrapolation for the first elements115if (!x)116{117// make border118if (border == BORDER_MODE_CONSTANT)119tcurr = v_border_x4;120else if (border == BORDER_MODE_REPLICATE)121tcurr = vdupq_n_u16(vgetq_lane_u16(tnext, 0));122123continue;124}125126// combine 3 "shifted" vectors127t0 = vextq_u16(tprev, tcurr, 7);128t1 = tcurr;129t2 = vextq_u16(tcurr, tnext, 1);130131// and add them132t0 = vqaddq_u16(vshlq_n_u16(t1, 1), vqaddq_u16(t0, t2));133vst1_u8(drow + x - 8, vshrn_n_u16(t0, 4));134}135136x -= 8;137if (x == width)138--x;139140for ( ; x < width; ++x)141{142// make extrapolation for the last elements143if (x + 1 >= width)144{145if (border == BORDER_MODE_CONSTANT)146nextx = borderValue << 2;147else if (border == BORDER_MODE_REPLICATE)148nextx = srow2[x] + (srow1[x] << 1) + srow0[x];149}150else151nextx = (srow2 ? srow2[x + 1] : borderValue) +152(srow1[x + 1] << 1) +153(srow0 ? 
srow0[x + 1] : borderValue);154155f32 val = (prevx + (currx << 1) + nextx) >> 4;156drow[x] = internal::saturate_cast<u8>((s32)val);157158// make shift159prevx = currx;160currx = nextx;161}162}163#else164(void)srcBase;165(void)srcStride;166(void)dstBase;167(void)dstStride;168(void)borderValue;169#endif170}171172bool isGaussianBlur3x3MarginSupported(const Size2D &size, BORDER_MODE border, Margin borderMargin)173{174return isSeparableFilter3x3Supported(size, border, 0, 0, borderMargin);175}176177void gaussianBlur3x3Margin(const Size2D &size,178const u8 * srcBase, ptrdiff_t srcStride,179u8 * dstBase, ptrdiff_t dstStride,180BORDER_MODE border, u8 borderValue, Margin borderMargin)181{182internal::assertSupportedConfiguration(isGaussianBlur3x3MarginSupported(size, border, borderMargin));183#ifdef CAROTENE_NEON184internal::sepFilter3x3<internal::RowFilter3x3S16_121, internal::ColFilter3x3U8_121>::process(185size, srcBase, srcStride, dstBase, dstStride,1860, 0, border, borderValue, borderMargin);187#else188(void)srcBase;189(void)srcStride;190(void)dstBase;191(void)dstStride;192(void)borderValue;193#endif194}195196bool isGaussianBlur5x5Supported(const Size2D &size, s32 cn, BORDER_MODE border)197{198return isSupportedConfiguration() &&199cn > 0 && cn <= 4 &&200size.width >= 8 && size.height >= 2 &&201(border == BORDER_MODE_CONSTANT ||202border == BORDER_MODE_REFLECT101 ||203border == BORDER_MODE_REFLECT ||204border == BORDER_MODE_REPLICATE ||205border == BORDER_MODE_WRAP);206}207208void gaussianBlur5x5(const Size2D &size, s32 cn,209const u8 * srcBase, ptrdiff_t srcStride,210u8 * dstBase, ptrdiff_t dstStride,211BORDER_MODE borderType, u8 borderValue, Margin borderMargin)212{213internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType));214#ifdef CAROTENE_NEON215size_t colsn = size.width * cn;216217std::vector<u8> _tmp;218u8 *tmp = 0;219if (borderType == BORDER_MODE_CONSTANT)220{221_tmp.assign(colsn + 4*cn, borderValue);222tmp = &_tmp[cn << 
1];223}224225ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;226ptrdiff_t idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn;227ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn;228ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;229230//1-line buffer231std::vector<u16> _buf(cn * (size.width + 4) + 32 / sizeof(u16));232u16* lane = internal::alignPtr(&_buf[cn << 1], 32);233234if (borderType == BORDER_MODE_CONSTANT)235for (s32 k = 0; k < cn; ++k)236{237lane[-cn+k] = borderValue;238lane[-cn-cn+k] = borderValue;239lane[colsn+k] = borderValue;240lane[colsn+cn+k] = borderValue;241}242243uint8x8_t vc6u8 = vmov_n_u8(6);244uint16x8_t vc6u16 = vmovq_n_u16(6);245uint16x8_t vc4u16 = vmovq_n_u16(4);246247for (size_t i = 0; i < size.height; ++i)248{249u8* dst = internal::getRowPtr(dstBase, dstStride, i);250//vertical convolution251ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, borderMargin.bottom);252ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);253ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom);254ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom);255256const u8* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp;257const u8* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;258const u8* ln2 = internal::getRowPtr(srcBase, srcStride, i);259const u8* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? 
internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp;260const u8* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp;261262size_t x = 0;263for (; x <= colsn - 8; x += 8)264{265internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2));266uint8x8_t v0 = vld1_u8(ln0+x);267uint8x8_t v1 = vld1_u8(ln1+x);268uint8x8_t v2 = vld1_u8(ln2+x);269uint8x8_t v3 = vld1_u8(ln3+x);270uint8x8_t v4 = vld1_u8(ln4+x);271272uint16x8_t v = vaddl_u8(v0, v4);273uint16x8_t v13 = vaddl_u8(v1, v3);274275v = vmlal_u8(v, v2, vc6u8);276v = vmlaq_u16(v, v13, vc4u16);277278vst1q_u16(lane + x, v);279}280for (; x < colsn; ++x)281lane[x] = ln0[x] + ln4[x] + u16(4) * (ln1[x] + ln3[x]) + u16(6) * ln2[x];282283//left&right borders284if (borderType != BORDER_MODE_CONSTANT)285for (s32 k = 0; k < cn; ++k)286{287lane[-cn+k] = lane[idx_l1 + k];288lane[-cn-cn+k] = lane[idx_l2 + k];289290lane[colsn+k] = lane[idx_r1 + k];291lane[colsn+cn+k] = lane[idx_r2 + k];292}293294//horizontal convolution295x = 0;296switch(cn)297{298case 1:299for (; x <= colsn - 8; x += 8)300{301internal::prefetch(lane + x);302303uint16x8_t lane0 = vld1q_u16(lane + x - 2);304uint16x8_t lane4 = vld1q_u16(lane + x + 2);305uint16x8_t lane1 = vld1q_u16(lane + x - 1);306uint16x8_t lane3 = vld1q_u16(lane + x + 1);307uint16x8_t lane2 = vld1q_u16(lane + x + 0);308309uint16x8_t ln04 = vaddq_u16(lane0, lane4);310uint16x8_t ln13 = vaddq_u16(lane1, lane3);311312uint16x8_t ln042 = vmlaq_u16(ln04, lane2, vc6u16);313uint16x8_t lsw = vmlaq_u16(ln042, ln13, vc4u16);314315uint8x8_t ls = vrshrn_n_u16(lsw, 8);316317vst1_u8(dst + x, ls);318}319break;320case 2:321for (; x <= colsn - 8*2; x += 8*2)322{323internal::prefetch(lane + x);324325u16* lidx0 = lane + x - 2*2;326u16* lidx1 = lane + x - 1*2;327u16* lidx3 = lane + x + 1*2;328u16* lidx4 = lane + x + 2*2;329#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)330__asm__ __volatile__ 
(331"vld2.16 {d0, d2}, [%[in0]]! \n\t"332"vld2.16 {d1, d3}, [%[in0]] \n\t"333"vld2.16 {d8, d10}, [%[in4]]! \n\t"334"vld2.16 {d9, d11}, [%[in4]] \n\t"335"vadd.i16 q0, q4 \n\t"336"vadd.i16 q1, q5 \n\t"337"vld2.16 {d16, d18}, [%[in1]]! \n\t"338"vld2.16 {d17, d19}, [%[in1]] \n\t"339"vld2.16 {d8, d10}, [%[in3]]! \n\t"340"vld2.16 {d9, d11}, [%[in3]] \n\t"341"vadd.i16 q4, q8 \n\t"342"vadd.i16 q5, q9 \n\t"343"vld2.16 {d16, d18}, [%[in2]] \n\t"344"vld2.16 {d17, d19}, [%[in22]] \n\t"345"vmla.i16 q0, q4, %q[c4] \n\t"346"vmla.i16 q1, q5, %q[c4] \n\t"347"vmla.i16 q0, q8, %q[c6] \n\t"348"vmla.i16 q1, q9, %q[c6] \n\t"349"vrshrn.u16 d8, q0, #8 \n\t"350"vrshrn.u16 d9, q1, #8 \n\t"351"vst2.8 {d8-d9}, [%[out]] \n\t"352: [in0] "=r" (lidx0),353[in1] "=r" (lidx1),354[in3] "=r" (lidx3),355[in4] "=r" (lidx4)356: [out] "r" (dst + x),357"0" (lidx0),358"1" (lidx1),359"2" (lidx3),360"3" (lidx4),361[in2] "r" (lane + x),362[in22] "r" (lane + x + 4*2),363[c4] "w" (vc4u16), [c6] "w" (vc6u16)364: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"365);366#else367uint16x8x2_t vLane0 = vld2q_u16(lidx0);368uint16x8x2_t vLane1 = vld2q_u16(lidx1);369uint16x8x2_t vLane2 = vld2q_u16(lane + x);370uint16x8x2_t vLane3 = vld2q_u16(lidx3);371uint16x8x2_t vLane4 = vld2q_u16(lidx4);372373uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane4.val[0]);374uint16x8_t vSum_1_5 = vaddq_u16(vLane0.val[1], vLane4.val[1]);375376uint16x8_t vSum_4_8 = vaddq_u16(vLane1.val[0], vLane3.val[0]);377uint16x8_t vSum_5_9 = vaddq_u16(vLane1.val[1], vLane3.val[1]);378379vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_4_8, vc4u16);380vSum_1_5 = vmlaq_u16(vSum_1_5, vSum_5_9, vc4u16);381vSum_0_4 = vmlaq_u16(vSum_0_4, vLane2.val[0], vc6u16);382vSum_1_5 = vmlaq_u16(vSum_1_5, vLane2.val[1], vc6u16);383384uint8x8x2_t vRes;385vRes.val[0] = vrshrn_n_u16(vSum_0_4, 8);386vRes.val[1] = vrshrn_n_u16(vSum_1_5, 8);387vst2_u8(dst + x, vRes);388#endif389}390break;391case 3:392for 
(; x <= colsn - 8*3; x += 8*3)393{394internal::prefetch(lane + x);395396u16* lidx0 = lane + x - 2*3;397u16* lidx1 = lane + x - 1*3;398u16* lidx3 = lane + x + 1*3;399u16* lidx4 = lane + x + 2*3;400#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)401__asm__ __volatile__ (402"vld3.16 {d0, d2, d4}, [%[in0]]! \n\t"403"vld3.16 {d1, d3, d5}, [%[in0]] \n\t"404"vld3.16 {d8, d10, d12}, [%[in4]]! \n\t"405"vld3.16 {d9, d11, d13}, [%[in4]] \n\t"406"vadd.i16 q0, q4 \n\t"407"vadd.i16 q1, q5 \n\t"408"vadd.i16 q2, q6 \n\t"409"vld3.16 {d16, d18, d20}, [%[in1]]! \n\t"410"vld3.16 {d17, d19, d21}, [%[in1]] \n\t"411"vld3.16 {d8, d10, d12}, [%[in3]]! \n\t"412"vld3.16 {d9, d11, d13}, [%[in3]] \n\t"413"vadd.i16 q4, q8 \n\t"414"vadd.i16 q5, q9 \n\t"415"vadd.i16 q6, q10 \n\t"416"vld3.16 {d16, d18, d20}, [%[in2]] \n\t"417"vld3.16 {d17, d19, d21}, [%[in22]] \n\t"418"vmla.i16 q0, q4, %q[c4] \n\t"419"vmla.i16 q1, q5, %q[c4] \n\t"420"vmla.i16 q2, q6, %q[c4] \n\t"421"vmla.i16 q0, q8, %q[c6] \n\t"422"vmla.i16 q1, q9, %q[c6] \n\t"423"vmla.i16 q2, q10, %q[c6] \n\t"424"vrshrn.u16 d8, q0, #8 \n\t"425"vrshrn.u16 d9, q1, #8 \n\t"426"vrshrn.u16 d10, q2, #8 \n\t"427"vst3.8 {d8-d10}, [%[out]] \n\t"428: [in0] "=r" (lidx0),429[in1] "=r" (lidx1),430[in3] "=r" (lidx3),431[in4] "=r" (lidx4)432: [out] "r" (dst + x),433"0" (lidx0),434"1" (lidx1),435"2" (lidx3),436"3" (lidx4),437[in2] "r" (lane + x),438[in22] "r" (lane + x + 4*3),439[c4] "w" (vc4u16), [c6] "w" (vc6u16)440: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"441);442#else443uint16x8x3_t vLane0 = vld3q_u16(lidx0);444uint16x8x3_t vLane1 = vld3q_u16(lidx1);445uint16x8x3_t vLane2 = vld3q_u16(lane + x);446uint16x8x3_t vLane3 = vld3q_u16(lidx3);447uint16x8x3_t vLane4 = vld3q_u16(lidx4);448449uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane4.val[0]);450uint16x8_t vSum_1_5 = vaddq_u16(vLane0.val[1], vLane4.val[1]);451uint16x8_t vSum_2_6 = 
vaddq_u16(vLane0.val[2], vLane4.val[2]);452453uint16x8_t vSum_3_1 = vaddq_u16(vLane3.val[0], vLane1.val[0]);454uint16x8_t vSum_4_2 = vaddq_u16(vLane3.val[1], vLane1.val[1]);455uint16x8_t vSum_5_6 = vaddq_u16(vLane3.val[2], vLane1.val[2]);456457vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_3_1, vc4u16);458vSum_1_5 = vmlaq_u16(vSum_1_5, vSum_4_2, vc4u16);459vSum_2_6 = vmlaq_u16(vSum_2_6, vSum_5_6, vc4u16);460461vSum_0_4 = vmlaq_u16(vSum_0_4, vLane2.val[0], vc6u16);462vSum_1_5 = vmlaq_u16(vSum_1_5, vLane2.val[1], vc6u16);463vSum_2_6 = vmlaq_u16(vSum_2_6, vLane2.val[2], vc6u16);464465uint8x8x3_t vRes;466vRes.val[0] = vrshrn_n_u16(vSum_0_4, 8);467vRes.val[1] = vrshrn_n_u16(vSum_1_5, 8);468vRes.val[2] = vrshrn_n_u16(vSum_2_6, 8);469470vst3_u8(dst + x, vRes);471#endif472}473break;474case 4:475for (; x <= colsn - 8*4; x += 8*4)476{477internal::prefetch(lane + x);478internal::prefetch(lane + x + 16);479480u16* lidx0 = lane + x - 2*4;481u16* lidx1 = lane + x - 1*4;482u16* lidx3 = lane + x + 1*4;483u16* lidx4 = lane + x + 2*4;484#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)485__asm__ __volatile__ (486"vld4.16 {d0, d2, d4, d6}, [%[in0]]! \n\t"487"vld4.16 {d1, d3, d5, d7}, [%[in0]] \n\t"488"vld4.16 {d8, d10, d12, d14}, [%[in4]]! \n\t"489"vld4.16 {d9, d11, d13, d15}, [%[in4]] \n\t"490"vadd.i16 q0, q4 \n\t"491"vadd.i16 q1, q5 \n\t"492"vadd.i16 q2, q6 \n\t"493"vadd.i16 q3, q7 \n\t"494"vld4.16 {d16, d18, d20, d22}, [%[in1]]! \n\t"495"vld4.16 {d17, d19, d21, d23}, [%[in1]] \n\t"496"vld4.16 {d8, d10, d12, d14}, [%[in3]]! 
\n\t"497"vld4.16 {d9, d11, d13, d15}, [%[in3]] \n\t"498"vadd.i16 q4, q8 \n\t"499"vadd.i16 q5, q9 \n\t"500"vadd.i16 q6, q10 \n\t"501"vadd.i16 q7, q11 \n\t"502"vld4.16 {d16, d18, d20, d22}, [%[in2],:256] \n\t"503"vld4.16 {d17, d19, d21, d23}, [%[in22],:256] \n\t"504"vmla.i16 q0, q4, %q[c4] \n\t"505"vmla.i16 q1, q5, %q[c4] \n\t"506"vmla.i16 q2, q6, %q[c4] \n\t"507"vmla.i16 q3, q7, %q[c4] \n\t"508"vmla.i16 q0, q8, %q[c6] \n\t"509"vmla.i16 q1, q9, %q[c6] \n\t"510"vmla.i16 q2, q10, %q[c6] \n\t"511"vmla.i16 q3, q11, %q[c6] \n\t"512"vrshrn.u16 d8, q0, #8 \n\t"513"vrshrn.u16 d9, q1, #8 \n\t"514"vrshrn.u16 d10, q2, #8 \n\t"515"vrshrn.u16 d11, q3, #8 \n\t"516"vst4.8 {d8-d11}, [%[out]] \n\t"517: [in0] "=r" (lidx0),518[in1] "=r" (lidx1),519[in3] "=r" (lidx3),520[in4] "=r" (lidx4)521: [out] "r" (dst + x),522"0" (lidx0),523"1" (lidx1),524"2" (lidx3),525"3" (lidx4),526[in2] "r" (lane + x),527[in22] "r" (lane + x + 4*4),528[c4] "w" (vc4u16), [c6] "w" (vc6u16)529: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"530);531#else532uint16x8x4_t vLane0 = vld4q_u16(lidx0);533uint16x8x4_t vLane2 = vld4q_u16(lidx4);534uint16x8x4_t vLane4 = vld4q_u16(lidx1);535uint16x8x4_t vLane6 = vld4q_u16(lidx3);536uint16x8x4_t vLane8 = vld4q_u16(lane + x);537538uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane2.val[0]);539uint16x8_t vSum_1_5 = vaddq_u16(vLane0.val[1], vLane2.val[1]);540uint16x8_t vSum_2_6 = vaddq_u16(vLane0.val[2], vLane2.val[2]);541uint16x8_t vSum_3_7 = vaddq_u16(vLane0.val[3], vLane2.val[3]);542543uint16x8_t vSum_4_8 = vaddq_u16(vLane4.val[0], vLane6.val[0]);544uint16x8_t vSum_5_9 = vaddq_u16(vLane4.val[1], vLane6.val[1]);545uint16x8_t vSum_6_10 = vaddq_u16(vLane4.val[2], vLane6.val[2]);546uint16x8_t vSum_7_11 = vaddq_u16(vLane4.val[3], vLane6.val[3]);547548vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_4_8, vc4u16);549vSum_1_5 = vmlaq_u16(vSum_1_5, vSum_5_9, vc4u16);550vSum_2_6 = vmlaq_u16(vSum_2_6, vSum_6_10, 
vc4u16);551vSum_3_7 = vmlaq_u16(vSum_3_7, vSum_7_11, vc4u16);552553vSum_0_4 = vmlaq_u16(vSum_0_4, vLane8.val[0], vc6u16);554vSum_1_5 = vmlaq_u16(vSum_1_5, vLane8.val[1], vc6u16);555vSum_2_6 = vmlaq_u16(vSum_2_6, vLane8.val[2], vc6u16);556vSum_3_7 = vmlaq_u16(vSum_3_7, vLane8.val[3], vc6u16);557558uint8x8x4_t vRes;559vRes.val[0] = vrshrn_n_u16(vSum_0_4, 8);560vRes.val[1] = vrshrn_n_u16(vSum_1_5, 8);561vRes.val[2] = vrshrn_n_u16(vSum_2_6, 8);562vRes.val[3] = vrshrn_n_u16(vSum_3_7, 8);563564vst4_u8(dst + x, vRes);565#endif566}567break;568}569for (s32 h = 0; h < cn; ++h)570{571u16* ln = lane + h;572u8* dt = dst + h;573for (size_t k = x; k < colsn; k += cn)574{575dt[k] = (u8)((ln[k-2*cn] + ln[k+2*cn]576+ u16(4) * (ln[k-cn] + ln[k+cn])577+ u16(6) * ln[k] + (1 << 7)) >> 8);578}579}580}581#else582(void)srcBase;583(void)srcStride;584(void)dstBase;585(void)dstStride;586(void)borderValue;587(void)borderMargin;588#endif589}590591void gaussianBlur5x5(const Size2D &size, s32 cn,592const u16 * srcBase, ptrdiff_t srcStride,593u16 * dstBase, ptrdiff_t dstStride,594BORDER_MODE borderType, u16 borderValue, Margin borderMargin)595{596internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType));597#ifdef CAROTENE_NEON598size_t colsn = size.width * cn;599600std::vector<u16> _tmp;601u16 *tmp = 0;602if (borderType == BORDER_MODE_CONSTANT)603{604_tmp.assign(colsn + 4*cn, borderValue);605tmp = &_tmp[cn << 1];606}607608ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;609ptrdiff_t idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn;610ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn;611ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;612613//1-line buffer614std::vector<u32> 
_buf(cn * (size.width + 4) + 32 / sizeof(u32));615u32* lane = internal::alignPtr(&_buf[cn << 1], 32);616617if (borderType == BORDER_MODE_CONSTANT)618for (s32 k = 0; k < cn; ++k)619{620lane[-cn+k] = borderValue;621lane[-cn-cn+k] = borderValue;622lane[colsn+k] = borderValue;623lane[colsn+cn+k] = borderValue;624}625626uint16x4_t vc6u16 = vmov_n_u16(6);627uint32x4_t vc6u32 = vmovq_n_u32(6);628uint32x4_t vc4u32 = vmovq_n_u32(4);629630for (size_t i = 0; i < size.height; ++i)631{632u16* dst = internal::getRowPtr(dstBase, dstStride, i);633//vertical convolution634ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, borderMargin.bottom);635ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);636ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom);637ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom);638639const u16* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp;640const u16* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;641const u16* ln2 = internal::getRowPtr(srcBase, srcStride, i);642const u16* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp;643const u16* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? 
internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp;644645size_t x = 0;646for (; x <= colsn - 4; x += 4)647{648internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2));649uint16x4_t v0 = vld1_u16(ln0+x);650uint16x4_t v1 = vld1_u16(ln1+x);651uint16x4_t v2 = vld1_u16(ln2+x);652uint16x4_t v3 = vld1_u16(ln3+x);653uint16x4_t v4 = vld1_u16(ln4+x);654655uint32x4_t v = vaddl_u16(v0, v4);656uint32x4_t v13 = vaddl_u16(v1, v3);657658v = vmlal_u16(v, v2, vc6u16);659v = vmlaq_u32(v, v13, vc4u32);660661vst1q_u32(lane + x, v);662}663for (; x < colsn; ++x)664lane[x] = ln0[x] + ln4[x] + 4*(ln1[x] + ln3[x]) + 6*ln2[x];665666//left&right borders667if (borderType != BORDER_MODE_CONSTANT)668for (s32 k = 0; k < cn; ++k)669{670lane[-cn+k] = lane[idx_l1 + k];671lane[-cn-cn+k] = lane[idx_l2 + k];672673lane[colsn+k] = lane[idx_r1 + k];674lane[colsn+cn+k] = lane[idx_r2 + k];675}676677//horizontal convolution678x = 0;679for (; x <= colsn - 4; x += 4)680{681internal::prefetch(lane + x);682683uint32x4_t lane0 = vld1q_u32(lane + x - 2);684uint32x4_t lane4 = vld1q_u32(lane + x + 2);685uint32x4_t lane1 = vld1q_u32(lane + x - 1);686uint32x4_t lane3 = vld1q_u32(lane + x + 1);687uint32x4_t lane2 = vld1q_u32(lane + x + 0);688689uint32x4_t ln04 = vaddq_u32(lane0, lane4);690uint32x4_t ln13 = vaddq_u32(lane1, lane3);691692uint32x4_t ln042 = vmlaq_u32(ln04, lane2, vc6u32);693uint32x4_t lsw = vmlaq_u32(ln042, ln13, vc4u32);694695uint16x4_t ls = vrshrn_n_u32(lsw, 8);696697vst1_u16(dst + x, ls);698}699for (s32 h = 0; h < cn; ++h)700{701u32* ln = lane + h;702u16* dt = dst + h;703for (size_t k = x; k < colsn; k += cn)704{705dt[k] = (u16)((ln[k-2*cn] + ln[k+2*cn] + 4*(ln[k-cn] + ln[k+cn]) + 6*ln[k] + (1<<7))>>8);706}707}708}709#else710(void)srcBase;711(void)srcStride;712(void)dstBase;713(void)dstStride;714(void)borderValue;715(void)borderMargin;716#endif717}718719void gaussianBlur5x5(const Size2D &size, s32 cn,720const s16 * srcBase, ptrdiff_t srcStride,721s16 * dstBase, ptrdiff_t 
dstStride,722BORDER_MODE borderType, s16 borderValue, Margin borderMargin)723{724internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType));725#ifdef CAROTENE_NEON726size_t colsn = size.width * cn;727728std::vector<s16> _tmp;729s16 *tmp = 0;730if (borderType == BORDER_MODE_CONSTANT)731{732_tmp.assign(colsn + 4*cn, borderValue);733tmp = &_tmp[cn << 1];734}735736ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;737ptrdiff_t idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn;738ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn;739ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;740741//1-line buffer742std::vector<s32> _buf(cn * (size.width + 4) + 32 / sizeof(s32));743s32* lane = internal::alignPtr(&_buf[cn << 1], 32);744745if (borderType == BORDER_MODE_CONSTANT)746for (s32 k = 0; k < cn; ++k)747{748lane[-cn+k] = borderValue;749lane[-cn-cn+k] = borderValue;750lane[colsn+k] = borderValue;751lane[colsn+cn+k] = borderValue;752}753754int16x4_t vc6s16 = vmov_n_s16(6);755int32x4_t vc6s32 = vmovq_n_s32(6);756int32x4_t vc4s32 = vmovq_n_s32(4);757758for (size_t i = 0; i < size.height; ++i)759{760s16* dst = internal::getRowPtr(dstBase, dstStride, i);761//vertical convolution762ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, borderMargin.bottom);763ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);764ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom);765ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom);766767const 
s16* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp;768const s16* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;769const s16* ln2 = internal::getRowPtr(srcBase, srcStride, i);770const s16* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp;771const s16* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp;772773size_t x = 0;774for (; x <= colsn - 4; x += 4)775{776internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2));777int16x4_t v0 = vld1_s16(ln0+x);778int16x4_t v1 = vld1_s16(ln1+x);779int16x4_t v2 = vld1_s16(ln2+x);780int16x4_t v3 = vld1_s16(ln3+x);781int16x4_t v4 = vld1_s16(ln4+x);782783int32x4_t v = vaddl_s16(v0, v4);784int32x4_t v13 = vaddl_s16(v1, v3);785786v = vmlal_s16(v, v2, vc6s16);787v = vmlaq_s32(v, v13, vc4s32);788789vst1q_s32(lane + x, v);790}791for (; x < colsn; ++x)792lane[x] = ln0[x] + ln4[x] + 4*(ln1[x] + ln3[x]) + 6*ln2[x];793794//left&right borders795if (borderType != BORDER_MODE_CONSTANT)796for (s32 k = 0; k < cn; ++k)797{798lane[-cn+k] = lane[idx_l1 + k];799lane[-cn-cn+k] = lane[idx_l2 + k];800801lane[colsn+k] = lane[idx_r1 + k];802lane[colsn+cn+k] = lane[idx_r2 + k];803}804805//horizontal convolution806x = 0;807switch(cn)808{809case 1:810case 2:811case 3:812for (; x <= colsn - 4; x += 4)813{814internal::prefetch(lane + x);815816int32x4_t lane0 = vld1q_s32(lane + x - 2);817int32x4_t lane4 = vld1q_s32(lane + x + 2);818int32x4_t lane1 = vld1q_s32(lane + x - 1);819int32x4_t lane3 = vld1q_s32(lane + x + 1);820int32x4_t lane2 = vld1q_s32(lane + x + 0);821822int32x4_t ln04 = vaddq_s32(lane0, lane4);823int32x4_t ln13 = vaddq_s32(lane1, lane3);824825int32x4_t ln042 = vmlaq_s32(ln04, lane2, vc6s32);826int32x4_t lsw = vmlaq_s32(ln042, ln13, vc4s32);827828int16x4_t ls = vrshrn_n_s32(lsw, 8);829830vst1_s16(dst + x, 
ls);831}832break;833case 4:834/* for (; x <= colsn - 4*4; x += 4*4)835{836internal::prefetch(lane + x);837internal::prefetch(lane + x + 16);838839ptrdiff_t* lidx0 = lane + x - 2*4;840ptrdiff_t* lidx1 = lane + x - 1*4;841ptrdiff_t* lidx3 = lane + x + 1*4;842ptrdiff_t* lidx4 = lane + x + 2*4;843844__asm__ __volatile__ (845"vld4.32 {d0, d2, d4, d6}, [%[in0]]! \n\t"846"vld4.32 {d1, d3, d5, d7}, [%[in0]] \n\t"847"vld4.32 {d8, d10, d12, d14}, [%[in4]]! \n\t"848"vld4.32 {d9, d11, d13, d15}, [%[in4]] \n\t"849"vadd.i32 q0, q4 \n\t"850"vadd.i32 q1, q5 \n\t"851"vadd.i32 q2, q6 \n\t"852"vadd.i32 q3, q7 \n\t"853"vld4.32 {d16, d18, d20, d22}, [%[in1]]! \n\t"854"vld4.32 {d17, d19, d21, d23}, [%[in1]] \n\t"855"vld4.32 {d8, d10, d12, d14}, [%[in3]]! \n\t"856"vld4.32 {d9, d11, d13, d15}, [%[in3]] \n\t"857"vadd.i32 q4, q8 \n\t"858"vadd.i32 q5, q9 \n\t"859"vadd.i32 q6, q10 \n\t"860"vadd.i32 q7, q11 \n\t"861"vld4.32 {d16, d18, d20, d22}, [%[in2],:256] \n\t"862"vld4.32 {d17, d19, d21, d23}, [%[in22],:256] \n\t"863"vmla.i32 q0, q4, %q[c4] \n\t"864"vmla.i32 q1, q5, %q[c4] \n\t"865"vmla.i32 q2, q6, %q[c4] \n\t"866"vmla.i32 q3, q7, %q[c4] \n\t"867"vmla.i32 q0, q8, %q[c6] \n\t"868"vmla.i32 q1, q9, %q[c6] \n\t"869"vmla.i32 q2, q10, %q[c6] \n\t"870"vmla.i32 q3, q11, %q[c6] \n\t"871"vrshrn.i32 d8, q0, #8 \n\t"872"vrshrn.i32 d9, q1, #8 \n\t"873"vrshrn.i32 d10, q2, #8 \n\t"874"vrshrn.i32 d11, q3, #8 \n\t"875"vst4.16 {d8-d11}, [%[out]] \n\t"876: [in0] "=r" (lidx0),877[in1] "=r" (lidx1),878[in3] "=r" (lidx3),879[in4] "=r" (lidx4)880: [out] "r" (dst + x),881"0" (lidx0),882"1" (lidx1),883"2" (lidx3),884"3" (lidx4),885[in2] "r" (lane + x),886[in22] "r" (lane + x + 4*2),887[c4] "w" (vc4s32), [c6] "w" (vc6s32)888: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"889);890*/891for (; x <= colsn - 4; x += 4)892{893internal::prefetch(lane + x);894895int32x4_t lane0 = vld1q_s32(lane + x - 2);896int32x4_t lane4 = 
vld1q_s32(lane + x + 2);897int32x4_t lane1 = vld1q_s32(lane + x - 1);898int32x4_t lane3 = vld1q_s32(lane + x + 1);899int32x4_t lane2 = vld1q_s32(lane + x + 0);900901int32x4_t ln04 = vaddq_s32(lane0, lane4);902int32x4_t ln13 = vaddq_s32(lane1, lane3);903904int32x4_t ln042 = vmlaq_s32(ln04, lane2, vc6s32);905int32x4_t lsw = vmlaq_s32(ln042, ln13, vc4s32);906907int16x4_t ls = vrshrn_n_s32(lsw, 8);908909vst1_s16(dst + x, ls);910}911break;912}913for (s32 h = 0; h < cn; ++h)914{915s32* ln = lane + h;916s16* dt = dst + h;917for (size_t k = x; k < colsn; k += cn)918{919dt[k] = (s16)((ln[k-2*cn] + ln[k+2*cn] + 4*(ln[k-cn] + ln[k+cn]) + 6*ln[k] + (1<<7))>>8);920}921}922}923#else924(void)srcBase;925(void)srcStride;926(void)dstBase;927(void)dstStride;928(void)borderValue;929(void)borderMargin;930#endif931}932933void gaussianBlur5x5(const Size2D &size, s32 cn,934const s32 * srcBase, ptrdiff_t srcStride,935s32 * dstBase, ptrdiff_t dstStride,936BORDER_MODE borderType, s32 borderValue, Margin borderMargin)937{938internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType));939#ifdef CAROTENE_NEON940size_t colsn = size.width * cn;941942std::vector<s32> _tmp;943s32 *tmp = 0;944if (borderType == BORDER_MODE_CONSTANT)945{946_tmp.assign(colsn + 4*cn, borderValue);947tmp = &_tmp[cn << 1];948}949950ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;951ptrdiff_t idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn;952ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn;953ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;954955//1-line buffer956std::vector<s32> _buf(cn * (size.width + 4) + 32 / sizeof(s32));957s32* lane = internal::alignPtr(&_buf[cn << 1], 32);958959if (borderType == 
BORDER_MODE_CONSTANT)960for (s32 k = 0; k < cn; ++k)961{962lane[-cn+k] = borderValue;963lane[-cn-cn+k] = borderValue;964lane[colsn+k] = borderValue;965lane[colsn+cn+k] = borderValue;966}967968int32x4_t vc6s32 = vmovq_n_s32(6);969int32x4_t vc4s32 = vmovq_n_s32(4);970971for (size_t i = 0; i < size.height; ++i)972{973s32* dst = internal::getRowPtr(dstBase, dstStride, i);974//vertical convolution975ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, borderMargin.bottom);976ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);977ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom);978ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom);979980const s32* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp;981const s32* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;982const s32* ln2 = internal::getRowPtr(srcBase, srcStride, i);983const s32* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp;984const s32* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? 
internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp;985986size_t x = 0;987for (; x <= colsn - 4; x += 4)988{989internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2));990int32x4_t v0 = vld1q_s32(ln0+x);991int32x4_t v1 = vld1q_s32(ln1+x);992int32x4_t v2 = vld1q_s32(ln2+x);993int32x4_t v3 = vld1q_s32(ln3+x);994int32x4_t v4 = vld1q_s32(ln4+x);995996int32x4_t v = vaddq_s32(v0, v4);997int32x4_t v13 = vaddq_s32(v1, v3);998999v = vmlaq_s32(v, v2, vc6s32);1000v = vmlaq_s32(v, v13, vc4s32);10011002vst1q_s32(lane + x, v);1003}1004for (; x < colsn; ++x)1005lane[x] = ln0[x] + ln4[x] + 4*(ln1[x] + ln3[x]) + 6*ln2[x];10061007//left&right borders1008if (borderType != BORDER_MODE_CONSTANT)1009for (s32 k = 0; k < cn; ++k)1010{1011lane[-cn+k] = lane[idx_l1 + k];1012lane[-cn-cn+k] = lane[idx_l2 + k];10131014lane[colsn+k] = lane[idx_r1 + k];1015lane[colsn+cn+k] = lane[idx_r2 + k];1016}10171018//horizontal convolution1019x = 0;1020for (; x <= colsn - 4; x += 4)1021{1022internal::prefetch(lane + x);10231024int32x4_t lane0 = vld1q_s32(lane + x - 2);1025int32x4_t lane4 = vld1q_s32(lane + x + 2);1026int32x4_t lane1 = vld1q_s32(lane + x - 1);1027int32x4_t lane3 = vld1q_s32(lane + x + 1);1028int32x4_t lane2 = vld1q_s32(lane + x + 0);10291030int32x4_t ln04 = vaddq_s32(lane0, lane4);1031int32x4_t ln13 = vaddq_s32(lane1, lane3);10321033int32x4_t ln042 = vmlaq_s32(ln04, lane2, vc6s32);1034int32x4_t lsw = vmlaq_s32(ln042, ln13, vc4s32);10351036vst1q_s32(dst + x, lsw);1037}1038for (s32 h = 0; h < cn; ++h)1039{1040s32* ln = lane + h;1041s32* dt = dst + h;1042for (size_t k = x; k < colsn; k += cn)1043{1044dt[k] = ln[k-2*cn] + ln[k+2*cn] + 4*(ln[k-cn] + ln[k+cn]) + 6*ln[k];1045}1046}1047}1048#else1049(void)srcBase;1050(void)srcStride;1051(void)dstBase;1052(void)dstStride;1053(void)borderValue;1054(void)borderMargin;1055#endif1056}10571058} // namespace CAROTENE_NS105910601061