// Source: blob/master/modules/imgproc/src/accum.simd.hpp (scraped page artifact; original view counter: "16354 views")
// This file is part of OpenCV project.1// It is subject to the license terms in the LICENSE file found in the top-level directory2// of this distribution and at http://opencv.org/license.html.34#include "opencv2/core/hal/intrin.hpp"56#define DEF_ACC_INT_FUNCS(suffix, type, acctype) \7void acc_##suffix(const type* src, acctype* dst, \8const uchar* mask, int len, int cn) \9{ \10CV_CPU_DISPATCH(acc_simd_, (src, dst, mask, len, cn), CV_CPU_DISPATCH_MODES_ALL); \11} \12void accSqr_##suffix(const type* src, acctype* dst, \13const uchar* mask, int len, int cn) \14{ \15CV_CPU_DISPATCH(accSqr_simd_, (src, dst, mask, len, cn), CV_CPU_DISPATCH_MODES_ALL); \16} \17void accProd_##suffix(const type* src1, const type* src2, \18acctype* dst, const uchar* mask, int len, int cn) \19{ \20CV_CPU_DISPATCH(accProd_simd_, (src1, src2, dst, mask, len, cn), CV_CPU_DISPATCH_MODES_ALL); \21} \22void accW_##suffix(const type* src, acctype* dst, \23const uchar* mask, int len, int cn, double alpha) \24{ \25CV_CPU_DISPATCH(accW_simd_, (src, dst, mask, len, cn, alpha), CV_CPU_DISPATCH_MODES_ALL); \26}27#define DEF_ACC_FLT_FUNCS(suffix, type, acctype) \28void acc_##suffix(const type* src, acctype* dst, \29const uchar* mask, int len, int cn) \30{ \31CV_CPU_DISPATCH(acc_simd_, (src, dst, mask, len, cn), CV_CPU_DISPATCH_MODES_ALL); \32} \33void accSqr_##suffix(const type* src, acctype* dst, \34const uchar* mask, int len, int cn) \35{ \36CV_CPU_DISPATCH(accSqr_simd_, (src, dst, mask, len, cn), CV_CPU_DISPATCH_MODES_ALL); \37} \38void accProd_##suffix(const type* src1, const type* src2, \39acctype* dst, const uchar* mask, int len, int cn) \40{ \41CV_CPU_DISPATCH(accProd_simd_, (src1, src2, dst, mask, len, cn), CV_CPU_DISPATCH_MODES_ALL); \42} \43void accW_##suffix(const type* src, acctype* dst, \44const uchar* mask, int len, int cn, double alpha) \45{ \46CV_CPU_DISPATCH(accW_simd_, (src, dst, mask, len, cn, alpha), CV_CPU_DISPATCH_MODES_ALL); \47}48#define DECLARATE_ACC_FUNCS(suffix, type, acctype) 
\49void acc_##suffix(const type* src, acctype* dst, const uchar* mask, int len, int cn); \50void accSqr_##suffix(const type* src, acctype* dst, const uchar* mask, int len, int cn); \51void accProd_##suffix(const type* src1, const type* src2, acctype* dst, const uchar* mask, int len, int cn); \52void accW_##suffix(const type* src, acctype* dst, const uchar* mask, int len, int cn, double alpha);535455namespace cv {5657DECLARATE_ACC_FUNCS(8u32f, uchar, float)58DECLARATE_ACC_FUNCS(8u64f, uchar, double)59DECLARATE_ACC_FUNCS(16u32f, ushort, float)60DECLARATE_ACC_FUNCS(16u64f, ushort, double)61DECLARATE_ACC_FUNCS(32f, float, float)62DECLARATE_ACC_FUNCS(32f64f, float, double)63DECLARATE_ACC_FUNCS(64f, double, double)6465CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN6667void acc_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn);68void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn);69void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn);70void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn);71void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn);72void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn);73void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int cn);74void accSqr_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn);75void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn);76void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn);77void accSqr_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn);78void accSqr_simd_(const float* src, float* dst, const uchar* mask, int len, int cn);79void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int cn);80void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, int cn);81void accProd_simd_(const uchar* src1, 
const uchar* src2, float* dst, const uchar* mask, int len, int cn);82void accProd_simd_(const ushort* src1, const ushort* src2, float* dst, const uchar* mask, int len, int cn);83void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const uchar* mask, int len, int cn);84void accProd_simd_(const ushort* src1, const ushort* src2, double* dst, const uchar* mask, int len, int cn);85void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar* mask, int len, int cn);86void accProd_simd_(const float* src1, const float* src2, double* dst, const uchar* mask, int len, int cn);87void accProd_simd_(const double* src1, const double* src2, double* dst, const uchar* mask, int len, int cn);88void accW_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn, double alpha);89void accW_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn, double alpha);90void accW_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn, double alpha);91void accW_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn, double alpha);92void accW_simd_(const float* src, float* dst, const uchar* mask, int len, int cn, double alpha);93void accW_simd_(const float* src, double* dst, const uchar* mask, int len, int cn, double alpha);94void accW_simd_(const double* src, double* dst, const uchar* mask, int len, int cn, double alpha);9596#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY97// todo: remove AVX branch after support it by universal intrinsics98template <typename T, typename AT>99void acc_general_(const T* src, AT* dst, const uchar* mask, int len, int cn, int start = 0 )100{101int i = start;102103if( !mask )104{105len *= cn;106#if CV_ENABLE_UNROLLED107for( ; i <= len - 4; i += 4 )108{109AT t0, t1;110t0 = src[i] + dst[i];111t1 = src[i+1] + dst[i+1];112dst[i] = t0; dst[i+1] = t1;113114t0 = src[i+2] + dst[i+2];115t1 = src[i+3] + dst[i+3];116dst[i+2] = t0; dst[i+3] = t1;117}118#endif119for( ; i < len; i++ 
)120{121dst[i] += src[i];122}123}124else125{126src += (i * cn);127dst += (i * cn);128for( ; i < len; i++, src += cn, dst += cn )129{130if( mask[i] )131{132for( int k = 0; k < cn; k++ )133{134dst[k] += src[k];135}136}137}138}139#if CV_AVX && !CV_AVX2140_mm256_zeroupper();141#elif CV_SIMD142vx_cleanup();143#endif144}145146template<typename T, typename AT> void147accSqr_general_( const T* src, AT* dst, const uchar* mask, int len, int cn, int start = 0 )148{149int i = start;150151if( !mask )152{153len *= cn;154#if CV_ENABLE_UNROLLED155for( ; i <= len - 4; i += 4 )156{157AT t0, t1;158t0 = (AT)src[i]*src[i] + dst[i];159t1 = (AT)src[i+1]*src[i+1] + dst[i+1];160dst[i] = t0; dst[i+1] = t1;161162t0 = (AT)src[i+2]*src[i+2] + dst[i+2];163t1 = (AT)src[i+3]*src[i+3] + dst[i+3];164dst[i+2] = t0; dst[i+3] = t1;165}166#endif167for( ; i < len; i++ )168{169dst[i] += (AT)src[i]*src[i];170}171}172else173{174src += (i * cn);175dst += (i * cn);176for( ; i < len; i++, src += cn, dst += cn )177{178if( mask[i] )179{180for( int k = 0; k < cn; k++ )181{182dst[k] += (AT)src[k]*src[k];183}184}185}186}187#if CV_AVX && !CV_AVX2188_mm256_zeroupper();189#elif CV_SIMD190vx_cleanup();191#endif192}193194template<typename T, typename AT> void195accProd_general_( const T* src1, const T* src2, AT* dst, const uchar* mask, int len, int cn, int start = 0 )196{197int i = start;198199if( !mask )200{201len *= cn;202#if CV_ENABLE_UNROLLED203for( ; i <= len - 4; i += 4 )204{205AT t0, t1;206t0 = (AT)src1[i]*src2[i] + dst[i];207t1 = (AT)src1[i+1]*src2[i+1] + dst[i+1];208dst[i] = t0; dst[i+1] = t1;209210t0 = (AT)src1[i+2]*src2[i+2] + dst[i+2];211t1 = (AT)src1[i+3]*src2[i+3] + dst[i+3];212dst[i+2] = t0; dst[i+3] = t1;213}214#endif215for( ; i < len; i++ )216{217dst[i] += (AT)src1[i]*src2[i];218}219}220else221{222src1 += (i * cn);223src2 += (i * cn);224dst += (i * cn);225for( ; i < len; i++, src1 += cn, src2 += cn, dst += cn )226{227if( mask[i] )228{229for( int k = 0; k < cn; k++ )230{231dst[k] += 
(AT)src1[k]*src2[k];232}233}234}235}236#if CV_AVX && !CV_AVX2237_mm256_zeroupper();238#elif CV_SIMD239vx_cleanup();240#endif241}242243template<typename T, typename AT> void244accW_general_( const T* src, AT* dst, const uchar* mask, int len, int cn, double alpha, int start = 0 )245{246AT a = (AT)alpha, b = 1 - a;247int i = start;248249if( !mask )250{251len *= cn;252#if CV_ENABLE_UNROLLED253for( ; i <= len - 4; i += 4 )254{255AT t0, t1;256t0 = src[i]*a + dst[i]*b;257t1 = src[i+1]*a + dst[i+1]*b;258dst[i] = t0; dst[i+1] = t1;259260t0 = src[i+2]*a + dst[i+2]*b;261t1 = src[i+3]*a + dst[i+3]*b;262dst[i+2] = t0; dst[i+3] = t1;263}264#endif265for( ; i < len; i++ )266{267dst[i] = src[i]*a + dst[i]*b;268}269}270else271{272src += (i * cn);273dst += (i * cn);274for( ; i < len; i++, src += cn, dst += cn )275{276if( mask[i] )277{278for( int k = 0; k < cn; k++ )279{280dst[k] = src[k]*a + dst[k]*b;281}282}283}284}285#if CV_AVX && !CV_AVX2286_mm256_zeroupper();287#elif CV_SIMD288vx_cleanup();289#endif290}291void acc_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn)292{293int x = 0;294#if CV_SIMD295const int cVectorWidth = v_uint8::nlanes;296const int step = v_float32::nlanes;297298if (!mask)299{300int size = len * cn;301for (; x <= size - cVectorWidth; x += cVectorWidth)302{303v_uint8 v_src = vx_load(src + x);304v_uint16 v_src0, v_src1;305v_expand(v_src, v_src0, v_src1);306307v_uint32 v_src00, v_src01, v_src10, v_src11;308v_expand(v_src0, v_src00, v_src01);309v_expand(v_src1, v_src10, v_src11);310311v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));312v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));313v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));314v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));315}316}317else318{319v_uint8 v_0 = vx_setall_u8(0);320if (cn == 1)321{322for ( ; x <= len 
- cVectorWidth; x += cVectorWidth)323{324v_uint8 v_mask = vx_load(mask + x);325v_mask = ~(v_0 == v_mask);326v_uint8 v_src = vx_load(src + x);327v_src = v_src & v_mask;328v_uint16 v_src0, v_src1;329v_expand(v_src, v_src0, v_src1);330331v_uint32 v_src00, v_src01, v_src10, v_src11;332v_expand(v_src0, v_src00, v_src01);333v_expand(v_src1, v_src10, v_src11);334335v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));336v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));337v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));338v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));339}340}341else if (cn == 3)342{343for ( ; x <= len - cVectorWidth; x += cVectorWidth)344{345v_uint8 v_mask = vx_load(mask + x);346v_mask = ~(v_0 == v_mask);347v_uint8 v_src0, v_src1, v_src2;348v_load_deinterleave(src + (x * cn), v_src0, v_src1, v_src2);349v_src0 = v_src0 & v_mask;350v_src1 = v_src1 & v_mask;351v_src2 = v_src2 & v_mask;352v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;353v_expand(v_src0, v_src00, v_src01);354v_expand(v_src1, v_src10, v_src11);355v_expand(v_src2, v_src20, v_src21);356357v_uint32 v_src000, v_src001, v_src010, v_src011;358v_uint32 v_src100, v_src101, v_src110, v_src111;359v_uint32 v_src200, v_src201, v_src210, v_src211;360v_expand(v_src00, v_src000, v_src001);361v_expand(v_src01, v_src010, v_src011);362v_expand(v_src10, v_src100, v_src101);363v_expand(v_src11, v_src110, v_src111);364v_expand(v_src20, v_src200, v_src201);365v_expand(v_src21, v_src210, v_src211);366367v_float32 v_dst000, v_dst001, v_dst010, v_dst011;368v_float32 v_dst100, v_dst101, v_dst110, v_dst111;369v_float32 v_dst200, v_dst201, v_dst210, v_dst211;370v_load_deinterleave(dst + (x * cn), v_dst000, v_dst100, v_dst200);371v_load_deinterleave(dst + ((x + step) * cn), v_dst001, v_dst101, v_dst201);372v_load_deinterleave(dst + ((x + 
step * 2) * cn), v_dst010, v_dst110, v_dst210);373v_load_deinterleave(dst + ((x + step * 3) * cn), v_dst011, v_dst111, v_dst211);374375v_dst000 += v_cvt_f32(v_reinterpret_as_s32(v_src000));376v_dst100 += v_cvt_f32(v_reinterpret_as_s32(v_src100));377v_dst200 += v_cvt_f32(v_reinterpret_as_s32(v_src200));378v_dst001 += v_cvt_f32(v_reinterpret_as_s32(v_src001));379v_dst101 += v_cvt_f32(v_reinterpret_as_s32(v_src101));380v_dst201 += v_cvt_f32(v_reinterpret_as_s32(v_src201));381v_dst010 += v_cvt_f32(v_reinterpret_as_s32(v_src010));382v_dst110 += v_cvt_f32(v_reinterpret_as_s32(v_src110));383v_dst210 += v_cvt_f32(v_reinterpret_as_s32(v_src210));384v_dst011 += v_cvt_f32(v_reinterpret_as_s32(v_src011));385v_dst111 += v_cvt_f32(v_reinterpret_as_s32(v_src111));386v_dst211 += v_cvt_f32(v_reinterpret_as_s32(v_src211));387388v_store_interleave(dst + (x * cn), v_dst000, v_dst100, v_dst200);389v_store_interleave(dst + ((x + step) * cn), v_dst001, v_dst101, v_dst201);390v_store_interleave(dst + ((x + step * 2) * cn), v_dst010, v_dst110, v_dst210);391v_store_interleave(dst + ((x + step * 3) * cn), v_dst011, v_dst111, v_dst211);392}393}394}395#endif // CV_SIMD396acc_general_(src, dst, mask, len, cn, x);397}398399void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn)400{401int x = 0;402#if CV_SIMD403const int cVectorWidth = v_uint16::nlanes;404const int step = v_float32::nlanes;405406if (!mask)407{408int size = len * cn;409for (; x <= size - cVectorWidth; x += cVectorWidth)410{411v_uint16 v_src = vx_load(src + x);412v_uint32 v_src0, v_src1;413v_expand(v_src, v_src0, v_src1);414415v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src0)));416v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src1)));417}418}419else420{421if (cn == 1)422{423v_uint16 v_0 = vx_setall_u16(0);424for ( ; x <= len - cVectorWidth; x += cVectorWidth)425{426v_uint16 v_mask = vx_load_expand(mask + x);427v_mask = ~(v_mask == 
v_0);428v_uint16 v_src = vx_load(src + x);429v_src = v_src & v_mask;430v_uint32 v_src0, v_src1;431v_expand(v_src, v_src0, v_src1);432433v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src0)));434v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src1)));435}436}437else if (cn == 3)438{439v_uint16 v_0 = vx_setall_u16(0);440for ( ; x <= len - cVectorWidth; x += cVectorWidth)441{442v_uint16 v_mask = vx_load_expand(mask + x);443v_mask = ~(v_mask == v_0);444v_uint16 v_src0, v_src1, v_src2;445v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);446v_src0 = v_src0 & v_mask;447v_src1 = v_src1 & v_mask;448v_src2 = v_src2 & v_mask;449v_uint32 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;450v_expand(v_src0, v_src00, v_src01);451v_expand(v_src1, v_src10, v_src11);452v_expand(v_src2, v_src20, v_src21);453454v_float32 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;455v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);456v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);457458v_dst00 += v_cvt_f32(v_reinterpret_as_s32(v_src00));459v_dst01 += v_cvt_f32(v_reinterpret_as_s32(v_src01));460v_dst10 += v_cvt_f32(v_reinterpret_as_s32(v_src10));461v_dst11 += v_cvt_f32(v_reinterpret_as_s32(v_src11));462v_dst20 += v_cvt_f32(v_reinterpret_as_s32(v_src20));463v_dst21 += v_cvt_f32(v_reinterpret_as_s32(v_src21));464465v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);466v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);467}468}469}470#endif // CV_SIMD471acc_general_(src, dst, mask, len, cn, x);472}473// todo: remove AVX branch after support it by universal intrinsics474void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn)475{476int x = 0;477#if CV_SIMD478const int cVectorWidth = v_uint16::nlanes;479const int step = v_float32::nlanes;480481if (!mask)482{483int size = len * cn;484#if CV_AVX && !CV_AVX2485for (; x <= size - 8 ; x += 
8)486{487__m256 v_src = _mm256_loadu_ps(src + x);488__m256 v_dst = _mm256_loadu_ps(dst + x);489v_dst = _mm256_add_ps(v_src, v_dst);490_mm256_storeu_ps(dst + x, v_dst);491}492#else493for (; x <= size - cVectorWidth; x += cVectorWidth)494{495v_store(dst + x, vx_load(dst + x) + vx_load(src + x));496v_store(dst + x + step, vx_load(dst + x + step) + vx_load(src + x + step));497}498#endif // CV_AVX && !CV_AVX2499}500else501{502v_float32 v_0 = vx_setzero_f32();503if (cn == 1)504{505for ( ; x <= len - cVectorWidth ; x += cVectorWidth)506{507v_uint16 v_masku16 = vx_load_expand(mask + x);508v_uint32 v_masku320, v_masku321;509v_expand(v_masku16, v_masku320, v_masku321);510v_float32 v_mask0 = v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0)));511v_float32 v_mask1 = v_reinterpret_as_f32(~(v_masku321 == v_reinterpret_as_u32(v_0)));512513v_store(dst + x, vx_load(dst + x) + (vx_load(src + x) & v_mask0));514v_store(dst + x + step, vx_load(dst + x + step) + (vx_load(src + x + step) & v_mask1));515}516}517else if (cn == 3)518{519for ( ; x <= len - cVectorWidth ; x += cVectorWidth)520{521v_uint16 v_masku16 = vx_load_expand(mask + x);522v_uint32 v_masku320, v_masku321;523v_expand(v_masku16, v_masku320, v_masku321);524v_float32 v_mask0 = v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0)));525v_float32 v_mask1 = v_reinterpret_as_f32(~(v_masku321 == v_reinterpret_as_u32(v_0)));526527v_float32 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;528v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20);529v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21);530v_src00 = v_src00 & v_mask0;531v_src01 = v_src01 & v_mask1;532v_src10 = v_src10 & v_mask0;533v_src11 = v_src11 & v_mask1;534v_src20 = v_src20 & v_mask0;535v_src21 = v_src21 & v_mask1;536537v_float32 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;538v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);539v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, 
v_dst21);540541v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);542v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);543}544}545}546#endif // CV_SIMD547acc_general_(src, dst, mask, len, cn, x);548}549550void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn)551{552int x = 0;553#if CV_SIMD_64F554const int cVectorWidth = v_uint8::nlanes;555const int step = v_float64::nlanes;556557if (!mask)558{559int size = len * cn;560for (; x <= size - cVectorWidth; x += cVectorWidth)561{562v_uint8 v_src = vx_load(src + x);563v_uint16 v_int0, v_int1;564v_expand(v_src, v_int0, v_int1);565566v_uint32 v_int00, v_int01, v_int10, v_int11;567v_expand(v_int0, v_int00, v_int01);568v_expand(v_int1, v_int10, v_int11);569570v_float64 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int00));571v_float64 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00));572v_float64 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int01));573v_float64 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01));574v_float64 v_src4 = v_cvt_f64(v_reinterpret_as_s32(v_int10));575v_float64 v_src5 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10));576v_float64 v_src6 = v_cvt_f64(v_reinterpret_as_s32(v_int11));577v_float64 v_src7 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11));578579v_float64 v_dst0 = vx_load(dst + x);580v_float64 v_dst1 = vx_load(dst + x + step);581v_float64 v_dst2 = vx_load(dst + x + step * 2);582v_float64 v_dst3 = vx_load(dst + x + step * 3);583v_float64 v_dst4 = vx_load(dst + x + step * 4);584v_float64 v_dst5 = vx_load(dst + x + step * 5);585v_float64 v_dst6 = vx_load(dst + x + step * 6);586v_float64 v_dst7 = vx_load(dst + x + step * 7);587588v_dst0 = v_dst0 + v_src0;589v_dst1 = v_dst1 + v_src1;590v_dst2 = v_dst2 + v_src2;591v_dst3 = v_dst3 + v_src3;592v_dst4 = v_dst4 + v_src4;593v_dst5 = v_dst5 + v_src5;594v_dst6 = v_dst6 + v_src6;595v_dst7 = v_dst7 + v_src7;596597v_store(dst + x, 
v_dst0);598v_store(dst + x + step, v_dst1);599v_store(dst + x + step * 2, v_dst2);600v_store(dst + x + step * 3, v_dst3);601v_store(dst + x + step * 4, v_dst4);602v_store(dst + x + step * 5, v_dst5);603v_store(dst + x + step * 6, v_dst6);604v_store(dst + x + step * 7, v_dst7);605}606}607else608{609v_uint8 v_0 = vx_setall_u8(0);610if (cn == 1)611{612for ( ; x <= len - cVectorWidth; x += cVectorWidth)613{614v_uint8 v_mask = vx_load(mask + x);615v_mask = ~(v_mask == v_0);616v_uint8 v_src = vx_load(src + x);617v_src = v_src & v_mask;618v_uint16 v_int0, v_int1;619v_expand(v_src, v_int0, v_int1);620621v_uint32 v_int00, v_int01, v_int10, v_int11;622v_expand(v_int0, v_int00, v_int01);623v_expand(v_int1, v_int10, v_int11);624625v_float64 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int00));626v_float64 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00));627v_float64 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int01));628v_float64 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01));629v_float64 v_src4 = v_cvt_f64(v_reinterpret_as_s32(v_int10));630v_float64 v_src5 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10));631v_float64 v_src6 = v_cvt_f64(v_reinterpret_as_s32(v_int11));632v_float64 v_src7 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11));633634v_float64 v_dst0 = vx_load(dst + x);635v_float64 v_dst1 = vx_load(dst + x + step);636v_float64 v_dst2 = vx_load(dst + x + step * 2);637v_float64 v_dst3 = vx_load(dst + x + step * 3);638v_float64 v_dst4 = vx_load(dst + x + step * 4);639v_float64 v_dst5 = vx_load(dst + x + step * 5);640v_float64 v_dst6 = vx_load(dst + x + step * 6);641v_float64 v_dst7 = vx_load(dst + x + step * 7);642643v_dst0 = v_dst0 + v_src0;644v_dst1 = v_dst1 + v_src1;645v_dst2 = v_dst2 + v_src2;646v_dst3 = v_dst3 + v_src3;647v_dst4 = v_dst4 + v_src4;648v_dst5 = v_dst5 + v_src5;649v_dst6 = v_dst6 + v_src6;650v_dst7 = v_dst7 + v_src7;651652v_store(dst + x, v_dst0);653v_store(dst + x + step, v_dst1);654v_store(dst + x + step * 2, v_dst2);655v_store(dst + x + step * 
3, v_dst3);656v_store(dst + x + step * 4, v_dst4);657v_store(dst + x + step * 5, v_dst5);658v_store(dst + x + step * 6, v_dst6);659v_store(dst + x + step * 7, v_dst7);660}661}662else if (cn == 3)663{664for ( ; x <= len - cVectorWidth; x += cVectorWidth)665{666v_uint8 v_mask = vx_load(mask + x);667v_mask = ~(v_0 == v_mask);668v_uint8 v_src0, v_src1, v_src2;669v_load_deinterleave(src + (x * cn), v_src0, v_src1, v_src2);670v_src0 = v_src0 & v_mask;671v_src1 = v_src1 & v_mask;672v_src2 = v_src2 & v_mask;673v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;674v_expand(v_src0, v_src00, v_src01);675v_expand(v_src1, v_src10, v_src11);676v_expand(v_src2, v_src20, v_src21);677678v_uint32 v_src000, v_src001, v_src010, v_src011;679v_uint32 v_src100, v_src101, v_src110, v_src111;680v_uint32 v_src200, v_src201, v_src210, v_src211;681v_expand(v_src00, v_src000, v_src001);682v_expand(v_src01, v_src010, v_src011);683v_expand(v_src10, v_src100, v_src101);684v_expand(v_src11, v_src110, v_src111);685v_expand(v_src20, v_src200, v_src201);686v_expand(v_src21, v_src210, v_src211);687688v_float64 v_src0000, v_src0001, v_src0010, v_src0011, v_src0100, v_src0101, v_src0110, v_src0111;689v_float64 v_src1000, v_src1001, v_src1010, v_src1011, v_src1100, v_src1101, v_src1110, v_src1111;690v_float64 v_src2000, v_src2001, v_src2010, v_src2011, v_src2100, v_src2101, v_src2110, v_src2111;691v_src0000 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src000)));692v_src0001 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src000)));693v_src0010 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src001)));694v_src0011 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src001)));695v_src0100 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src010)));696v_src0101 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src010)));697v_src0110 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src011)));698v_src0111 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src011)));699v_src1000 = 
v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src100)));700v_src1001 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src100)));701v_src1010 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src101)));702v_src1011 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src101)));703v_src1100 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src110)));704v_src1101 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src110)));705v_src1110 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src111)));706v_src1111 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src111)));707v_src2000 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src200)));708v_src2001 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src200)));709v_src2010 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src201)));710v_src2011 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src201)));711v_src2100 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src210)));712v_src2101 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src210)));713v_src2110 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src211)));714v_src2111 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src211)));715716v_float64 v_dst0000, v_dst0001, v_dst0010, v_dst0011, v_dst0100, v_dst0101, v_dst0110, v_dst0111;717v_float64 v_dst1000, v_dst1001, v_dst1010, v_dst1011, v_dst1100, v_dst1101, v_dst1110, v_dst1111;718v_float64 v_dst2000, v_dst2001, v_dst2010, v_dst2011, v_dst2100, v_dst2101, v_dst2110, v_dst2111;719v_load_deinterleave(dst + (x * cn), v_dst0000, v_dst1000, v_dst2000);720v_load_deinterleave(dst + ((x + step) * cn), v_dst0001, v_dst1001, v_dst2001);721v_load_deinterleave(dst + ((x + step * 2) * cn), v_dst0010, v_dst1010, v_dst2010);722v_load_deinterleave(dst + ((x + step * 3) * cn), v_dst0011, v_dst1011, v_dst2011);723v_load_deinterleave(dst + ((x + step * 4) * cn), v_dst0100, v_dst1100, v_dst2100);724v_load_deinterleave(dst + ((x + step * 5) * cn), v_dst0101, v_dst1101, v_dst2101);725v_load_deinterleave(dst + ((x + step * 6) * cn), v_dst0110, v_dst1110, 
v_dst2110);726v_load_deinterleave(dst + ((x + step * 7) * cn), v_dst0111, v_dst1111, v_dst2111);727728v_store_interleave(dst + (x * cn), v_dst0000 + v_src0000, v_dst1000 + v_src1000, v_dst2000 + v_src2000);729v_store_interleave(dst + ((x + step) * cn), v_dst0001 + v_src0001, v_dst1001 + v_src1001, v_dst2001 + v_src2001);730v_store_interleave(dst + ((x + step * 2) * cn), v_dst0010 + v_src0010, v_dst1010 + v_src1010, v_dst2010 + v_src2010);731v_store_interleave(dst + ((x + step * 3) * cn), v_dst0011 + v_src0011, v_dst1011 + v_src1011, v_dst2011 + v_src2011);732v_store_interleave(dst + ((x + step * 4) * cn), v_dst0100 + v_src0100, v_dst1100 + v_src1100, v_dst2100 + v_src2100);733v_store_interleave(dst + ((x + step * 5) * cn), v_dst0101 + v_src0101, v_dst1101 + v_src1101, v_dst2101 + v_src2101);734v_store_interleave(dst + ((x + step * 6) * cn), v_dst0110 + v_src0110, v_dst1110 + v_src1110, v_dst2110 + v_src2110);735v_store_interleave(dst + ((x + step * 7) * cn), v_dst0111 + v_src0111, v_dst1111 + v_src1111, v_dst2111 + v_src2111);736}737}738}739#endif // CV_SIMD_64F740acc_general_(src, dst, mask, len, cn, x);741}742743void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn)744{745int x = 0;746#if CV_SIMD_64F747const int cVectorWidth = v_uint16::nlanes;748const int step = v_float64::nlanes;749750if (!mask)751{752int size = len * cn;753for (; x <= size - cVectorWidth; x += cVectorWidth)754{755v_uint16 v_src = vx_load(src + x);756v_uint32 v_int0, v_int1;757v_expand(v_src, v_int0, v_int1);758759v_float64 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0));760v_float64 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0));761v_float64 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1));762v_float64 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1));763764v_float64 v_dst0 = vx_load(dst + x);765v_float64 v_dst1 = vx_load(dst + x + step);766v_float64 v_dst2 = vx_load(dst + x + step * 2);767v_float64 v_dst3 = vx_load(dst + x + step * 3);768769v_dst0 = 
v_dst0 + v_src0;770v_dst1 = v_dst1 + v_src1;771v_dst2 = v_dst2 + v_src2;772v_dst3 = v_dst3 + v_src3;773774v_store(dst + x, v_dst0);775v_store(dst + x + step, v_dst1);776v_store(dst + x + step * 2, v_dst2);777v_store(dst + x + step * 3, v_dst3);778}779}780else781{782v_uint16 v_0 = vx_setzero_u16();783if (cn == 1)784{785for ( ; x <= len - cVectorWidth; x += cVectorWidth)786{787v_uint16 v_mask = vx_load_expand(mask + x);788v_mask = ~(v_mask == v_0);789v_uint16 v_src = vx_load(src + x);790v_src = v_src & v_mask;791v_uint32 v_int0, v_int1;792v_expand(v_src, v_int0, v_int1);793794v_float64 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0));795v_float64 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0));796v_float64 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1));797v_float64 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1));798799v_float64 v_dst0 = vx_load(dst + x);800v_float64 v_dst1 = vx_load(dst + x + step);801v_float64 v_dst2 = vx_load(dst + x + step * 2);802v_float64 v_dst3 = vx_load(dst + x + step * 3);803804v_dst0 = v_dst0 + v_src0;805v_dst1 = v_dst1 + v_src1;806v_dst2 = v_dst2 + v_src2;807v_dst3 = v_dst3 + v_src3;808809v_store(dst + x, v_dst0);810v_store(dst + x + step, v_dst1);811v_store(dst + x + step * 2, v_dst2);812v_store(dst + x + step * 3, v_dst3);813}814}815if (cn == 3)816{817for ( ; x <= len - cVectorWidth; x += cVectorWidth)818{819v_uint16 v_mask = vx_load_expand(mask + x);820v_mask = ~(v_mask == v_0);821v_uint16 v_src0, v_src1, v_src2;822v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);823v_src0 = v_src0 & v_mask;824v_src1 = v_src1 & v_mask;825v_src2 = v_src2 & v_mask;826v_uint32 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21;827v_expand(v_src0, v_int00, v_int01);828v_expand(v_src1, v_int10, v_int11);829v_expand(v_src2, v_int20, v_int21);830831v_float64 v_src00 = v_cvt_f64(v_reinterpret_as_s32(v_int00));832v_float64 v_src01 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00));833v_float64 v_src02 = 
v_cvt_f64(v_reinterpret_as_s32(v_int01));834v_float64 v_src03 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01));835v_float64 v_src10 = v_cvt_f64(v_reinterpret_as_s32(v_int10));836v_float64 v_src11 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10));837v_float64 v_src12 = v_cvt_f64(v_reinterpret_as_s32(v_int11));838v_float64 v_src13 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11));839v_float64 v_src20 = v_cvt_f64(v_reinterpret_as_s32(v_int20));840v_float64 v_src21 = v_cvt_f64_high(v_reinterpret_as_s32(v_int20));841v_float64 v_src22 = v_cvt_f64(v_reinterpret_as_s32(v_int21));842v_float64 v_src23 = v_cvt_f64_high(v_reinterpret_as_s32(v_int21));843844v_float64 v_dst00, v_dst01, v_dst02, v_dst03, v_dst10, v_dst11, v_dst12, v_dst13, v_dst20, v_dst21, v_dst22, v_dst23;845v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);846v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);847v_load_deinterleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22);848v_load_deinterleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23);849850v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);851v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);852v_store_interleave(dst + (x + step * 2) * cn, v_dst02 + v_src02, v_dst12 + v_src12, v_dst22 + v_src22);853v_store_interleave(dst + (x + step * 3) * cn, v_dst03 + v_src03, v_dst13 + v_src13, v_dst23 + v_src23);854}855}856}857#endif // CV_SIMD_64F858acc_general_(src, dst, mask, len, cn, x);859}860861void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn)862{863int x = 0;864#if CV_SIMD_64F865const int cVectorWidth = v_float32::nlanes;866const int step = v_float64::nlanes;867868if (!mask)869{870int size = len * cn;871#if CV_AVX && !CV_AVX2872for (; x <= size - 8 ; x += 8)873{874__m256 v_src = _mm256_loadu_ps(src + x);875__m256d v_src0 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src, 0));876__m256d v_src1 = 
_mm256_cvtps_pd(_mm256_extractf128_ps(v_src, 1));877__m256d v_dst0 = _mm256_loadu_pd(dst + x);878__m256d v_dst1 = _mm256_loadu_pd(dst + x + 4);879v_dst0 = _mm256_add_pd(v_src0, v_dst0);880v_dst1 = _mm256_add_pd(v_src1, v_dst1);881_mm256_storeu_pd(dst + x, v_dst0);882_mm256_storeu_pd(dst + x + 4, v_dst1);883}884#else885for (; x <= size - cVectorWidth; x += cVectorWidth)886{887v_float32 v_src = vx_load(src + x);888v_float64 v_src0 = v_cvt_f64(v_src);889v_float64 v_src1 = v_cvt_f64_high(v_src);890891v_store(dst + x, vx_load(dst + x) + v_src0);892v_store(dst + x + step, vx_load(dst + x + step) + v_src1);893}894#endif // CV_AVX && !CV_AVX2895}896else897{898v_uint64 v_0 = vx_setzero_u64();899if (cn == 1)900{901for ( ; x <= len - cVectorWidth ; x += cVectorWidth)902{903v_uint32 v_masku32 = vx_load_expand_q(mask + x);904v_uint64 v_masku640, v_masku641;905v_expand(v_masku32, v_masku640, v_masku641);906v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));907v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));908909v_float32 v_src = vx_load(src + x);910v_float64 v_src0 = v_cvt_f64(v_src) & v_mask0;911v_float64 v_src1 = v_cvt_f64_high(v_src) & v_mask1;912913v_store(dst + x, vx_load(dst + x) + v_src0);914v_store(dst + x + step, vx_load(dst + x + step) + v_src1);915}916}917else if (cn == 3)918{919for ( ; x <= len - cVectorWidth ; x += cVectorWidth)920{921v_uint32 v_masku32 = vx_load_expand_q(mask + x);922v_uint64 v_masku640, v_masku641;923v_expand(v_masku32, v_masku640, v_masku641);924v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));925v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));926927v_float32 v_src0, v_src1, v_src2;928v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);929v_float64 v_src00 = v_cvt_f64(v_src0) & v_mask0;930v_float64 v_src01 = v_cvt_f64_high(v_src0) & v_mask1;931v_float64 v_src10 = v_cvt_f64(v_src1) & v_mask0;932v_float64 v_src11 = v_cvt_f64_high(v_src1) & v_mask1;933v_float64 v_src20 = 
v_cvt_f64(v_src2) & v_mask0;934v_float64 v_src21 = v_cvt_f64_high(v_src2) & v_mask1;935936v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;937v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);938v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);939940v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);941v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);942}943}944}945#endif // CV_SIMD_64F946acc_general_(src, dst, mask, len, cn, x);947}948949void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int cn)950{951int x = 0;952#if CV_SIMD_64F953const int cVectorWidth = v_float64::nlanes * 2;954const int step = v_float64::nlanes;955956if (!mask)957{958int size = len * cn;959#if CV_AVX && !CV_AVX2960for ( ; x <= size - 4 ; x += 4)961{962__m256d v_src = _mm256_loadu_pd(src + x);963__m256d v_dst = _mm256_loadu_pd(dst + x);964v_dst = _mm256_add_pd(v_dst, v_src);965_mm256_storeu_pd(dst + x, v_dst);966}967#else968for (; x <= size - cVectorWidth; x += cVectorWidth)969{970v_float64 v_src0 = vx_load(src + x);971v_float64 v_src1 = vx_load(src + x + step);972973v_store(dst + x, vx_load(dst + x) + v_src0);974v_store(dst + x + step, vx_load(dst + x + step) + v_src1);975}976#endif // CV_AVX && !CV_AVX2977}978else979{980v_uint64 v_0 = vx_setzero_u64();981if (cn == 1)982{983for ( ; x <= len - cVectorWidth ; x += cVectorWidth)984{985v_uint32 v_masku32 = vx_load_expand_q(mask + x);986v_uint64 v_masku640, v_masku641;987v_expand(v_masku32, v_masku640, v_masku641);988v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));989v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));990991v_float64 v_src0 = vx_load(src + x);992v_float64 v_src1 = vx_load(src + x + step);993994v_store(dst + x, vx_load(dst + x) + (v_src0 & v_mask0));995v_store(dst + x + step, vx_load(dst + x + step) + (v_src1 & v_mask1));996}997}998else if (cn == 
3)999{1000for ( ; x <= len - cVectorWidth ; x += cVectorWidth)1001{1002v_uint32 v_masku32 = vx_load_expand_q(mask + x);1003v_uint64 v_masku640, v_masku641;1004v_expand(v_masku32, v_masku640, v_masku641);1005v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));1006v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));10071008v_float64 v_src00, v_src10, v_src20, v_src01, v_src11, v_src21;1009v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20);1010v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21);1011v_src00 = v_src00 & v_mask0;1012v_src01 = v_src01 & v_mask1;1013v_src10 = v_src10 & v_mask0;1014v_src11 = v_src11 & v_mask1;1015v_src20 = v_src20 & v_mask0;1016v_src21 = v_src21 & v_mask1;10171018v_float64 v_dst00, v_dst10, v_dst20, v_dst01, v_dst11, v_dst21;1019v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);1020v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);10211022v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);1023v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);1024}1025}1026}1027#endif // CV_SIMD_64F1028acc_general_(src, dst, mask, len, cn, x);1029}10301031// square accumulate optimized by universal intrinsic1032void accSqr_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn)1033{1034int x = 0;1035#if CV_SIMD1036const int cVectorWidth = v_uint8::nlanes;1037const int step = v_float32::nlanes;10381039if (!mask)1040{1041int size = len * cn;1042for (; x <= size - cVectorWidth; x += cVectorWidth)1043{1044v_uint8 v_src = vx_load(src + x);1045v_uint16 v_src0, v_src1;1046v_expand(v_src, v_src0, v_src1);1047v_src0 = v_mul_wrap(v_src0, v_src0);1048v_src1 = v_mul_wrap(v_src1, v_src1);10491050v_uint32 v_src00, v_src01, v_src10, v_src11;1051v_expand(v_src0, v_src00, v_src01);1052v_expand(v_src1, v_src10, v_src11);10531054v_store(dst + x, vx_load(dst + x) + 
v_cvt_f32(v_reinterpret_as_s32(v_src00)));1055v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));1056v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));1057v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));1058}1059}1060else1061{1062v_uint8 v_0 = vx_setall_u8(0);1063if (cn == 1)1064{1065for ( ; x <= len - cVectorWidth ; x += cVectorWidth)1066{1067v_uint8 v_mask = vx_load(mask + x);1068v_mask = ~(v_0 == v_mask);1069v_uint8 v_src = vx_load(src + x);1070v_src = v_src & v_mask;1071v_uint16 v_src0, v_src1;1072v_expand(v_src, v_src0, v_src1);1073v_src0 = v_mul_wrap(v_src0, v_src0);1074v_src1 = v_mul_wrap(v_src1, v_src1);10751076v_uint32 v_src00, v_src01, v_src10, v_src11;1077v_expand(v_src0, v_src00, v_src01);1078v_expand(v_src1, v_src10, v_src11);10791080v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));1081v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));1082v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));1083v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));1084}1085}1086else if (cn == 3)1087{1088for ( ; x <= len - cVectorWidth ; x += cVectorWidth)1089{1090v_uint8 v_mask = vx_load(mask + x);1091v_mask = ~(v_0 == v_mask);10921093v_uint8 v_src0, v_src1, v_src2;1094v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);1095v_src0 = v_src0 & v_mask;1096v_src1 = v_src1 & v_mask;1097v_src2 = v_src2 & v_mask;10981099v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;1100v_expand(v_src0, v_src00, v_src01);1101v_expand(v_src1, v_src10, v_src11);1102v_expand(v_src2, v_src20, v_src21);1103v_src00 = v_mul_wrap(v_src00, v_src00);1104v_src01 = v_mul_wrap(v_src01, v_src01);1105v_src10 = v_mul_wrap(v_src10, v_src10);1106v_src11 = v_mul_wrap(v_src11, 
v_src11);1107v_src20 = v_mul_wrap(v_src20, v_src20);1108v_src21 = v_mul_wrap(v_src21, v_src21);11091110v_uint32 v_src000, v_src001, v_src010, v_src011;1111v_uint32 v_src100, v_src101, v_src110, v_src111;1112v_uint32 v_src200, v_src201, v_src210, v_src211;1113v_expand(v_src00, v_src000, v_src001);1114v_expand(v_src01, v_src010, v_src011);1115v_expand(v_src10, v_src100, v_src101);1116v_expand(v_src11, v_src110, v_src111);1117v_expand(v_src20, v_src200, v_src201);1118v_expand(v_src21, v_src210, v_src211);11191120v_float32 v_dst000, v_dst001, v_dst010, v_dst011;1121v_float32 v_dst100, v_dst101, v_dst110, v_dst111;1122v_float32 v_dst200, v_dst201, v_dst210, v_dst211;1123v_load_deinterleave(dst + x * cn, v_dst000, v_dst100, v_dst200);1124v_load_deinterleave(dst + (x + step) * cn, v_dst001, v_dst101, v_dst201);1125v_load_deinterleave(dst + (x + step * 2) * cn, v_dst010, v_dst110, v_dst210);1126v_load_deinterleave(dst + (x + step * 3) * cn, v_dst011, v_dst111, v_dst211);11271128v_dst000 += v_cvt_f32(v_reinterpret_as_s32(v_src000));1129v_dst001 += v_cvt_f32(v_reinterpret_as_s32(v_src001));1130v_dst010 += v_cvt_f32(v_reinterpret_as_s32(v_src010));1131v_dst011 += v_cvt_f32(v_reinterpret_as_s32(v_src011));11321133v_dst100 += v_cvt_f32(v_reinterpret_as_s32(v_src100));1134v_dst101 += v_cvt_f32(v_reinterpret_as_s32(v_src101));1135v_dst110 += v_cvt_f32(v_reinterpret_as_s32(v_src110));1136v_dst111 += v_cvt_f32(v_reinterpret_as_s32(v_src111));11371138v_dst200 += v_cvt_f32(v_reinterpret_as_s32(v_src200));1139v_dst201 += v_cvt_f32(v_reinterpret_as_s32(v_src201));1140v_dst210 += v_cvt_f32(v_reinterpret_as_s32(v_src210));1141v_dst211 += v_cvt_f32(v_reinterpret_as_s32(v_src211));11421143v_store_interleave(dst + x * cn, v_dst000, v_dst100, v_dst200);1144v_store_interleave(dst + (x + step) * cn, v_dst001, v_dst101, v_dst201);1145v_store_interleave(dst + (x + step * 2) * cn, v_dst010, v_dst110, v_dst210);1146v_store_interleave(dst + (x + step * 3) * cn, v_dst011, v_dst111, 
v_dst211);1147}1148}1149}1150#endif // CV_SIMD1151accSqr_general_(src, dst, mask, len, cn, x);1152}11531154void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn)1155{1156int x = 0;1157#if CV_SIMD1158const int cVectorWidth = v_uint16::nlanes;1159const int step = v_float32::nlanes;11601161if (!mask)1162{1163int size = len * cn;1164for (; x <= size - cVectorWidth; x += cVectorWidth)1165{1166v_uint16 v_src = vx_load(src + x);1167v_uint32 v_src0, v_src1;1168v_expand(v_src, v_src0, v_src1);11691170v_float32 v_float0, v_float1;1171v_float0 = v_cvt_f32(v_reinterpret_as_s32(v_src0));1172v_float1 = v_cvt_f32(v_reinterpret_as_s32(v_src1));11731174v_store(dst + x, v_fma(v_float0, v_float0, vx_load(dst + x)));1175v_store(dst + x + step, v_fma(v_float1, v_float1, vx_load(dst + x + step)));1176}1177}1178else1179{1180v_uint32 v_0 = vx_setzero_u32();1181if (cn == 1)1182{1183for ( ; x <= len - cVectorWidth ; x += cVectorWidth)1184{1185v_uint16 v_mask16 = vx_load_expand(mask + x);1186v_uint32 v_mask0, v_mask1;1187v_expand(v_mask16, v_mask0, v_mask1);1188v_mask0 = ~(v_mask0 == v_0);1189v_mask1 = ~(v_mask1 == v_0);1190v_uint16 v_src = vx_load(src + x);1191v_uint32 v_src0, v_src1;1192v_expand(v_src, v_src0, v_src1);1193v_src0 = v_src0 & v_mask0;1194v_src1 = v_src1 & v_mask1;11951196v_float32 v_float0, v_float1;1197v_float0 = v_cvt_f32(v_reinterpret_as_s32(v_src0));1198v_float1 = v_cvt_f32(v_reinterpret_as_s32(v_src1));11991200v_store(dst + x, v_fma(v_float0, v_float0, vx_load(dst + x)));1201v_store(dst + x + step, v_fma(v_float1, v_float1, vx_load(dst + x + step)));1202}1203}1204else if (cn == 3)1205{1206for ( ; x <= len - cVectorWidth ; x += cVectorWidth)1207{1208v_uint16 v_mask16 = vx_load_expand(mask + x);1209v_uint32 v_mask0, v_mask1;1210v_expand(v_mask16, v_mask0, v_mask1);1211v_mask0 = ~(v_mask0 == v_0);1212v_mask1 = ~(v_mask1 == v_0);12131214v_uint16 v_src0, v_src1, v_src2;1215v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);1216v_uint32 
v_int00, v_int01, v_int10, v_int11, v_int20, v_int21;1217v_expand(v_src0, v_int00, v_int01);1218v_expand(v_src1, v_int10, v_int11);1219v_expand(v_src2, v_int20, v_int21);1220v_int00 = v_int00 & v_mask0;1221v_int01 = v_int01 & v_mask1;1222v_int10 = v_int10 & v_mask0;1223v_int11 = v_int11 & v_mask1;1224v_int20 = v_int20 & v_mask0;1225v_int21 = v_int21 & v_mask1;12261227v_float32 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;1228v_src00 = v_cvt_f32(v_reinterpret_as_s32(v_int00));1229v_src01 = v_cvt_f32(v_reinterpret_as_s32(v_int01));1230v_src10 = v_cvt_f32(v_reinterpret_as_s32(v_int10));1231v_src11 = v_cvt_f32(v_reinterpret_as_s32(v_int11));1232v_src20 = v_cvt_f32(v_reinterpret_as_s32(v_int20));1233v_src21 = v_cvt_f32(v_reinterpret_as_s32(v_int21));12341235v_float32 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;1236v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);1237v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);12381239v_dst00 = v_fma(v_src00, v_src00, v_dst00);1240v_dst01 = v_fma(v_src01, v_src01, v_dst01);1241v_dst10 = v_fma(v_src10, v_src10, v_dst10);1242v_dst11 = v_fma(v_src11, v_src11, v_dst11);1243v_dst20 = v_fma(v_src20, v_src20, v_dst20);1244v_dst21 = v_fma(v_src21, v_src21, v_dst21);12451246v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);1247v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);1248}1249}1250}1251#endif // CV_SIMD1252accSqr_general_(src, dst, mask, len, cn, x);1253}12541255void accSqr_simd_(const float* src, float* dst, const uchar* mask, int len, int cn)1256{1257int x = 0;1258#if CV_SIMD1259const int cVectorWidth = v_uint16::nlanes;1260const int step = v_float32::nlanes;12611262if (!mask)1263{1264int size = len * cn;1265#if CV_AVX && !CV_AVX21266for ( ; x <= size - 8 ; x += 8)1267{1268__m256 v_src = _mm256_loadu_ps(src + x);1269__m256 v_dst = _mm256_loadu_ps(dst + x);1270v_src = _mm256_mul_ps(v_src, v_src);1271v_dst = _mm256_add_ps(v_src, 
v_dst);1272_mm256_storeu_ps(dst + x, v_dst);1273}1274#else1275for (; x <= size - cVectorWidth; x += cVectorWidth)1276{1277v_float32 v_src0 = vx_load(src + x);1278v_float32 v_src1 = vx_load(src + x + step);12791280v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x)));1281v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step)));1282}1283#endif // CV_AVX && !CV_AVX21284}1285else1286{1287v_uint32 v_0 = vx_setzero_u32();1288if (cn == 1)1289{1290for (; x <= len - cVectorWidth; x += cVectorWidth)1291{1292v_uint16 v_mask16 = vx_load_expand(mask + x);1293v_uint32 v_mask_0, v_mask_1;1294v_expand(v_mask16, v_mask_0, v_mask_1);1295v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask_0 == v_0));1296v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask_1 == v_0));1297v_float32 v_src0 = vx_load(src + x);1298v_float32 v_src1 = vx_load(src + x + step);1299v_src0 = v_src0 & v_mask0;1300v_src1 = v_src1 & v_mask1;13011302v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x)));1303v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step)));1304}1305}1306else if (cn == 3)1307{1308for (; x <= len - cVectorWidth; x += cVectorWidth)1309{1310v_uint16 v_mask16 = vx_load_expand(mask + x);1311v_uint32 v_mask_0, v_mask_1;1312v_expand(v_mask16, v_mask_0, v_mask_1);1313v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask_0 == v_0));1314v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask_1 == v_0));13151316v_float32 v_src00, v_src10, v_src20, v_src01, v_src11, v_src21;1317v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20);1318v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21);1319v_src00 = v_src00 & v_mask0;1320v_src01 = v_src01 & v_mask1;1321v_src10 = v_src10 & v_mask0;1322v_src11 = v_src11 & v_mask1;1323v_src20 = v_src20 & v_mask0;1324v_src21 = v_src21 & v_mask1;13251326v_float32 v_dst00, v_dst10, v_dst20, v_dst01, v_dst11, v_dst21;1327v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);1328v_load_deinterleave(dst + (x + step) * cn, 
v_dst01, v_dst11, v_dst21);13291330v_dst00 = v_fma(v_src00, v_src00, v_dst00);1331v_dst01 = v_fma(v_src01, v_src01, v_dst01);1332v_dst10 = v_fma(v_src10, v_src10, v_dst10);1333v_dst11 = v_fma(v_src11, v_src11, v_dst11);1334v_dst20 = v_fma(v_src20, v_src20, v_dst20);1335v_dst21 = v_fma(v_src21, v_src21, v_dst21);13361337v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);1338v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);1339}1340}1341}1342#endif // CV_SIMD1343accSqr_general_(src, dst, mask, len, cn, x);1344}13451346void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn)1347{1348int x = 0;1349#if CV_SIMD_64F1350const int cVectorWidth = v_uint16::nlanes;1351const int step = v_float64::nlanes;13521353if (!mask)1354{1355int size = len * cn;1356for (; x <= size - cVectorWidth; x += cVectorWidth)1357{1358v_uint16 v_int = vx_load_expand(src + x);13591360v_uint32 v_int0, v_int1;1361v_expand(v_int, v_int0, v_int1);13621363v_float64 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0));1364v_float64 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0));1365v_float64 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1));1366v_float64 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1));13671368v_float64 v_dst0 = vx_load(dst + x);1369v_float64 v_dst1 = vx_load(dst + x + step);1370v_float64 v_dst2 = vx_load(dst + x + step * 2);1371v_float64 v_dst3 = vx_load(dst + x + step * 3);13721373v_dst0 = v_fma(v_src0, v_src0, v_dst0);1374v_dst1 = v_fma(v_src1, v_src1, v_dst1);1375v_dst2 = v_fma(v_src2, v_src2, v_dst2);1376v_dst3 = v_fma(v_src3, v_src3, v_dst3);13771378v_store(dst + x, v_dst0);1379v_store(dst + x + step, v_dst1);1380v_store(dst + x + step * 2, v_dst2);1381v_store(dst + x + step * 3, v_dst3);1382}1383}1384else1385{1386v_uint16 v_0 = vx_setzero_u16();1387if (cn == 1)1388{1389for (; x <= len - cVectorWidth; x += cVectorWidth)1390{1391v_uint16 v_mask = vx_load_expand(mask + x);1392v_mask = ~(v_mask == v_0);1393v_uint16 v_src = 
vx_load_expand(src + x);1394v_uint16 v_int = v_src & v_mask;13951396v_uint32 v_int0, v_int1;1397v_expand(v_int, v_int0, v_int1);13981399v_float64 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0));1400v_float64 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0));1401v_float64 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1));1402v_float64 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1));14031404v_float64 v_dst0 = vx_load(dst + x);1405v_float64 v_dst1 = vx_load(dst + x + step);1406v_float64 v_dst2 = vx_load(dst + x + step * 2);1407v_float64 v_dst3 = vx_load(dst + x + step * 3);14081409v_dst0 = v_fma(v_src0, v_src0, v_dst0);1410v_dst1 = v_fma(v_src1, v_src1, v_dst1);1411v_dst2 = v_fma(v_src2, v_src2, v_dst2);1412v_dst3 = v_fma(v_src3, v_src3, v_dst3);14131414v_store(dst + x, v_dst0);1415v_store(dst + x + step, v_dst1);1416v_store(dst + x + step * 2, v_dst2);1417v_store(dst + x + step * 3, v_dst3);1418}1419}1420else if (cn == 3)1421{1422for (; x <= len - cVectorWidth * 2; x += cVectorWidth)1423{1424v_uint8 v_src0, v_src1, v_src2;1425v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);14261427v_uint16 v_int0 = v_expand_low(v_src0);1428v_uint16 v_int1 = v_expand_low(v_src1);1429v_uint16 v_int2 = v_expand_low(v_src2);14301431v_uint16 v_mask = vx_load_expand(mask + x);1432v_mask = ~(v_mask == v_0);1433v_int0 = v_int0 & v_mask;1434v_int1 = v_int1 & v_mask;1435v_int2 = v_int2 & v_mask;14361437v_uint32 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21;1438v_expand(v_int0, v_int00, v_int01);1439v_expand(v_int1, v_int10, v_int11);1440v_expand(v_int2, v_int20, v_int21);14411442v_float64 v_src00 = v_cvt_f64(v_reinterpret_as_s32(v_int00));1443v_float64 v_src01 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00));1444v_float64 v_src02 = v_cvt_f64(v_reinterpret_as_s32(v_int01));1445v_float64 v_src03 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01));1446v_float64 v_src10 = v_cvt_f64(v_reinterpret_as_s32(v_int10));1447v_float64 v_src11 = 
v_cvt_f64_high(v_reinterpret_as_s32(v_int10));1448v_float64 v_src12 = v_cvt_f64(v_reinterpret_as_s32(v_int11));1449v_float64 v_src13 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11));1450v_float64 v_src20 = v_cvt_f64(v_reinterpret_as_s32(v_int20));1451v_float64 v_src21 = v_cvt_f64_high(v_reinterpret_as_s32(v_int20));1452v_float64 v_src22 = v_cvt_f64(v_reinterpret_as_s32(v_int21));1453v_float64 v_src23 = v_cvt_f64_high(v_reinterpret_as_s32(v_int21));14541455v_float64 v_dst00, v_dst01, v_dst02, v_dst03, v_dst10, v_dst11, v_dst12, v_dst13, v_dst20, v_dst21, v_dst22, v_dst23;1456v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);1457v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);1458v_load_deinterleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22);1459v_load_deinterleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23);14601461v_dst00 = v_fma(v_src00, v_src00, v_dst00);1462v_dst01 = v_fma(v_src01, v_src01, v_dst01);1463v_dst02 = v_fma(v_src02, v_src02, v_dst02);1464v_dst03 = v_fma(v_src03, v_src03, v_dst03);1465v_dst10 = v_fma(v_src10, v_src10, v_dst10);1466v_dst11 = v_fma(v_src11, v_src11, v_dst11);1467v_dst12 = v_fma(v_src12, v_src12, v_dst12);1468v_dst13 = v_fma(v_src13, v_src13, v_dst13);1469v_dst20 = v_fma(v_src20, v_src20, v_dst20);1470v_dst21 = v_fma(v_src21, v_src21, v_dst21);1471v_dst22 = v_fma(v_src22, v_src22, v_dst22);1472v_dst23 = v_fma(v_src23, v_src23, v_dst23);14731474v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);1475v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);1476v_store_interleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22);1477v_store_interleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23);1478}1479}1480}1481#endif // CV_SIMD_64F1482accSqr_general_(src, dst, mask, len, cn, x);1483}14841485void accSqr_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn)1486{1487int x = 0;1488#if CV_SIMD_64F1489const int cVectorWidth = 
v_uint16::nlanes;1490const int step = v_float64::nlanes;14911492if (!mask)1493{1494int size = len * cn;1495for (; x <= size - cVectorWidth; x += cVectorWidth)1496{1497v_uint16 v_src = vx_load(src + x);1498v_uint32 v_int_0, v_int_1;1499v_expand(v_src, v_int_0, v_int_1);15001501v_int32 v_int0 = v_reinterpret_as_s32(v_int_0);1502v_int32 v_int1 = v_reinterpret_as_s32(v_int_1);15031504v_float64 v_src0 = v_cvt_f64(v_int0);1505v_float64 v_src1 = v_cvt_f64_high(v_int0);1506v_float64 v_src2 = v_cvt_f64(v_int1);1507v_float64 v_src3 = v_cvt_f64_high(v_int1);15081509v_float64 v_dst0 = vx_load(dst + x);1510v_float64 v_dst1 = vx_load(dst + x + step);1511v_float64 v_dst2 = vx_load(dst + x + step * 2);1512v_float64 v_dst3 = vx_load(dst + x + step * 3);15131514v_dst0 = v_fma(v_src0, v_src0, v_dst0);1515v_dst1 = v_fma(v_src1, v_src1, v_dst1);1516v_dst2 = v_fma(v_src2, v_src2, v_dst2);1517v_dst3 = v_fma(v_src3, v_src3, v_dst3);15181519v_store(dst + x, v_dst0);1520v_store(dst + x + step, v_dst1);1521v_store(dst + x + step * 2, v_dst2);1522v_store(dst + x + step * 3, v_dst3);1523}1524}1525else1526{1527v_uint16 v_0 = vx_setzero_u16();1528if (cn == 1)1529{1530for (; x <= len - cVectorWidth; x += cVectorWidth)1531{1532v_uint16 v_mask = vx_load_expand(mask + x);1533v_mask = ~(v_mask == v_0);1534v_uint16 v_src = vx_load(src + x);1535v_src = v_src & v_mask;1536v_uint32 v_int_0, v_int_1;1537v_expand(v_src, v_int_0, v_int_1);15381539v_int32 v_int0 = v_reinterpret_as_s32(v_int_0);1540v_int32 v_int1 = v_reinterpret_as_s32(v_int_1);15411542v_float64 v_src0 = v_cvt_f64(v_int0);1543v_float64 v_src1 = v_cvt_f64_high(v_int0);1544v_float64 v_src2 = v_cvt_f64(v_int1);1545v_float64 v_src3 = v_cvt_f64_high(v_int1);15461547v_float64 v_dst0 = vx_load(dst + x);1548v_float64 v_dst1 = vx_load(dst + x + step);1549v_float64 v_dst2 = vx_load(dst + x + step * 2);1550v_float64 v_dst3 = vx_load(dst + x + step * 3);15511552v_dst0 = v_fma(v_src0, v_src0, v_dst0);1553v_dst1 = v_fma(v_src1, v_src1, v_dst1);1554v_dst2 = 
v_fma(v_src2, v_src2, v_dst2);1555v_dst3 = v_fma(v_src3, v_src3, v_dst3);15561557v_store(dst + x, v_dst0);1558v_store(dst + x + step, v_dst1);1559v_store(dst + x + step * 2, v_dst2);1560v_store(dst + x + step * 3, v_dst3);1561}1562}1563else if (cn == 3)1564{1565for (; x <= len - cVectorWidth; x += cVectorWidth)1566{1567v_uint16 v_mask = vx_load_expand(mask + x);1568v_mask = ~(v_mask == v_0);1569v_uint16 v_src0, v_src1, v_src2;1570v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);1571v_src0 = v_src0 & v_mask;1572v_src1 = v_src1 & v_mask;1573v_src2 = v_src2 & v_mask;1574v_uint32 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21;1575v_expand(v_src0, v_int00, v_int01);1576v_expand(v_src1, v_int10, v_int11);1577v_expand(v_src2, v_int20, v_int21);15781579v_float64 v_src00 = v_cvt_f64(v_reinterpret_as_s32(v_int00));1580v_float64 v_src01 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00));1581v_float64 v_src02 = v_cvt_f64(v_reinterpret_as_s32(v_int01));1582v_float64 v_src03 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01));1583v_float64 v_src10 = v_cvt_f64(v_reinterpret_as_s32(v_int10));1584v_float64 v_src11 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10));1585v_float64 v_src12 = v_cvt_f64(v_reinterpret_as_s32(v_int11));1586v_float64 v_src13 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11));1587v_float64 v_src20 = v_cvt_f64(v_reinterpret_as_s32(v_int20));1588v_float64 v_src21 = v_cvt_f64_high(v_reinterpret_as_s32(v_int20));1589v_float64 v_src22 = v_cvt_f64(v_reinterpret_as_s32(v_int21));1590v_float64 v_src23 = v_cvt_f64_high(v_reinterpret_as_s32(v_int21));15911592v_float64 v_dst00, v_dst01, v_dst02, v_dst03;1593v_float64 v_dst10, v_dst11, v_dst12, v_dst13;1594v_float64 v_dst20, v_dst21, v_dst22, v_dst23;1595v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);1596v_load_deinterleave(dst + (x + step)* cn, v_dst01, v_dst11, v_dst21);1597v_load_deinterleave(dst + (x + step * 2)* cn, v_dst02, v_dst12, v_dst22);1598v_load_deinterleave(dst + (x + step * 3)* cn, 
v_dst03, v_dst13, v_dst23);15991600v_dst00 = v_fma(v_src00, v_src00, v_dst00);1601v_dst01 = v_fma(v_src01, v_src01, v_dst01);1602v_dst02 = v_fma(v_src02, v_src02, v_dst02);1603v_dst03 = v_fma(v_src03, v_src03, v_dst03);1604v_dst10 = v_fma(v_src10, v_src10, v_dst10);1605v_dst11 = v_fma(v_src11, v_src11, v_dst11);1606v_dst12 = v_fma(v_src12, v_src12, v_dst12);1607v_dst13 = v_fma(v_src13, v_src13, v_dst13);1608v_dst20 = v_fma(v_src20, v_src20, v_dst20);1609v_dst21 = v_fma(v_src21, v_src21, v_dst21);1610v_dst22 = v_fma(v_src22, v_src22, v_dst22);1611v_dst23 = v_fma(v_src23, v_src23, v_dst23);16121613v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);1614v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);1615v_store_interleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22);1616v_store_interleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23);1617}1618}1619}1620#endif // CV_SIMD_64F1621accSqr_general_(src, dst, mask, len, cn, x);1622}16231624void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int cn)1625{1626int x = 0;1627#if CV_SIMD_64F1628const int cVectorWidth = v_float32::nlanes;1629const int step = v_float64::nlanes;16301631if (!mask)1632{1633int size = len * cn;1634#if CV_AVX && !CV_AVX21635for (; x <= size - 8 ; x += 8)1636{1637__m256 v_src = _mm256_loadu_ps(src + x);1638__m256d v_src0 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src,0));1639__m256d v_src1 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src,1));1640__m256d v_dst0 = _mm256_loadu_pd(dst + x);1641__m256d v_dst1 = _mm256_loadu_pd(dst + x + 4);1642v_src0 = _mm256_mul_pd(v_src0, v_src0);1643v_src1 = _mm256_mul_pd(v_src1, v_src1);1644v_dst0 = _mm256_add_pd(v_src0, v_dst0);1645v_dst1 = _mm256_add_pd(v_src1, v_dst1);1646_mm256_storeu_pd(dst + x, v_dst0);1647_mm256_storeu_pd(dst + x + 4, v_dst1);1648}1649#else1650for (; x <= size - cVectorWidth; x += cVectorWidth)1651{1652v_float32 v_src = vx_load(src + x);1653v_float64 v_src0 = 
v_cvt_f64(v_src);1654v_float64 v_src1 = v_cvt_f64_high(v_src);16551656v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x)));1657v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step)));1658}1659#endif // CV_AVX && !CV_AVX21660}1661else1662{1663v_uint32 v_0 = vx_setzero_u32();1664if (cn == 1)1665{1666for (; x <= len - cVectorWidth; x += cVectorWidth)1667{1668v_uint32 v_mask = vx_load_expand_q(mask + x);;1669v_mask = ~(v_mask == v_0);1670v_float32 v_src = vx_load(src + x);1671v_src = v_src & v_reinterpret_as_f32(v_mask);1672v_float64 v_src0 = v_cvt_f64(v_src);1673v_float64 v_src1 = v_cvt_f64_high(v_src);16741675v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x)));1676v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step)));1677}1678}1679else if (cn == 3)1680{1681for (; x <= len - cVectorWidth; x += cVectorWidth)1682{1683v_uint32 v_mask = vx_load_expand_q(mask + x);1684v_mask = ~(v_mask == v_0);16851686v_float32 v_src0, v_src1, v_src2;1687v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);1688v_src0 = v_src0 & v_reinterpret_as_f32(v_mask);1689v_src1 = v_src1 & v_reinterpret_as_f32(v_mask);1690v_src2 = v_src2 & v_reinterpret_as_f32(v_mask);16911692v_float64 v_src00 = v_cvt_f64(v_src0);1693v_float64 v_src01 = v_cvt_f64_high(v_src0);1694v_float64 v_src10 = v_cvt_f64(v_src1);1695v_float64 v_src11 = v_cvt_f64_high(v_src1);1696v_float64 v_src20 = v_cvt_f64(v_src2);1697v_float64 v_src21 = v_cvt_f64_high(v_src2);16981699v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;1700v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);1701v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);17021703v_dst00 = v_fma(v_src00, v_src00, v_dst00);1704v_dst01 = v_fma(v_src01, v_src01, v_dst01);1705v_dst10 = v_fma(v_src10, v_src10, v_dst10);1706v_dst11 = v_fma(v_src11, v_src11, v_dst11);1707v_dst20 = v_fma(v_src20, v_src20, v_dst20);1708v_dst21 = v_fma(v_src21, v_src21, v_dst21);17091710v_store_interleave(dst + 
x * cn, v_dst00, v_dst10, v_dst20);1711v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);1712}1713}1714}1715#endif // CV_SIMD_64F1716accSqr_general_(src, dst, mask, len, cn, x);1717}17181719void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, int cn)1720{1721int x = 0;1722#if CV_SIMD_64F1723const int cVectorWidth = v_float64::nlanes * 2;1724const int step = v_float64::nlanes;17251726if (!mask)1727{1728int size = len * cn;1729#if CV_AVX && !CV_AVX21730for (; x <= size - 4 ; x += 4)1731{1732__m256d v_src = _mm256_loadu_pd(src + x);1733__m256d v_dst = _mm256_loadu_pd(dst + x);1734v_src = _mm256_mul_pd(v_src, v_src);1735v_dst = _mm256_add_pd(v_dst, v_src);1736_mm256_storeu_pd(dst + x, v_dst);1737}1738#else1739for (; x <= size - cVectorWidth; x += cVectorWidth)1740{1741v_float64 v_src0 = vx_load(src + x);1742v_float64 v_src1 = vx_load(src + x + step);1743v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x)));1744v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step)));1745}1746#endif // CV_AVX && !CV_AVX21747}1748else1749{1750v_uint64 v_0 = vx_setzero_u64();1751if (cn == 1)1752{1753for (; x <= len - cVectorWidth; x += cVectorWidth)1754{1755v_uint32 v_mask32 = vx_load_expand_q(mask + x);1756v_uint64 v_masku640, v_masku641;1757v_expand(v_mask32, v_masku640, v_masku641);1758v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));1759v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));1760v_float64 v_src0 = vx_load(src + x);1761v_float64 v_src1 = vx_load(src + x + step);1762v_src0 = v_src0 & v_mask0;1763v_src1 = v_src1 & v_mask1;1764v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x)));1765v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step)));1766}1767}1768else if (cn == 3)1769{1770for (; x <= len - cVectorWidth; x += cVectorWidth)1771{1772v_uint32 v_mask32 = vx_load_expand_q(mask + x);1773v_uint64 v_masku640, v_masku641;1774v_expand(v_mask32, v_masku640, 
v_masku641);1775v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));1776v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));17771778v_float64 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;1779v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20);1780v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21);1781v_src00 = v_src00 & v_mask0;1782v_src01 = v_src01 & v_mask1;1783v_src10 = v_src10 & v_mask0;1784v_src11 = v_src11 & v_mask1;1785v_src20 = v_src20 & v_mask0;1786v_src21 = v_src21 & v_mask1;17871788v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;1789v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);1790v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);17911792v_dst00 = v_fma(v_src00, v_src00, v_dst00);1793v_dst01 = v_fma(v_src01, v_src01, v_dst01);1794v_dst10 = v_fma(v_src10, v_src10, v_dst10);1795v_dst11 = v_fma(v_src11, v_src11, v_dst11);1796v_dst20 = v_fma(v_src20, v_src20, v_dst20);1797v_dst21 = v_fma(v_src21, v_src21, v_dst21);17981799v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);1800v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);1801}1802}1803}1804#endif // CV_SIMD_64F1805accSqr_general_(src, dst, mask, len, cn, x);1806}18071808// product accumulate optimized by universal intrinsic1809void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar* mask, int len, int cn)1810{1811int x = 0;1812#if CV_SIMD1813const int cVectorWidth = v_uint8::nlanes;1814const int step = v_uint32::nlanes;18151816if (!mask)1817{1818int size = len * cn;1819for (; x <= size - cVectorWidth; x += cVectorWidth)1820{1821v_uint8 v_1src = vx_load(src1 + x);1822v_uint8 v_2src = vx_load(src2 + x);18231824v_uint16 v_src0, v_src1;1825v_mul_expand(v_1src, v_2src, v_src0, v_src1);18261827v_uint32 v_src00, v_src01, v_src10, v_src11;1828v_expand(v_src0, v_src00, v_src01);1829v_expand(v_src1, v_src10, v_src11);18301831v_store(dst + x, vx_load(dst + 
x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));1832v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));1833v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));1834v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));1835}1836}1837else1838{1839v_uint8 v_0 = vx_setzero_u8();1840if (cn == 1)1841{1842for (; x <= len - cVectorWidth; x += cVectorWidth)1843{1844v_uint8 v_mask = vx_load(mask + x);1845v_mask = ~(v_mask == v_0);1846v_uint8 v_1src = vx_load(src1 + x);1847v_uint8 v_2src = vx_load(src2 + x);1848v_1src = v_1src & v_mask;1849v_2src = v_2src & v_mask;18501851v_uint16 v_src0, v_src1;1852v_mul_expand(v_1src, v_2src, v_src0, v_src1);18531854v_uint32 v_src00, v_src01, v_src10, v_src11;1855v_expand(v_src0, v_src00, v_src01);1856v_expand(v_src1, v_src10, v_src11);18571858v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));1859v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));1860v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));1861v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));1862}1863}1864else if (cn == 3)1865{1866for (; x <= len - cVectorWidth; x += cVectorWidth)1867{1868v_uint8 v_mask = vx_load(mask + x);1869v_mask = ~(v_mask == v_0);1870v_uint8 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2;1871v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2);1872v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2);1873v_1src0 = v_1src0 & v_mask;1874v_1src1 = v_1src1 & v_mask;1875v_1src2 = v_1src2 & v_mask;1876v_2src0 = v_2src0 & v_mask;1877v_2src1 = v_2src1 & v_mask;1878v_2src2 = v_2src2 & v_mask;18791880v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;1881v_mul_expand(v_1src0, v_2src0, v_src00, v_src01);1882v_mul_expand(v_1src1, v_2src1, 
v_src10, v_src11);1883v_mul_expand(v_1src2, v_2src2, v_src20, v_src21);18841885v_uint32 v_src000, v_src001, v_src002, v_src003, v_src100, v_src101, v_src102, v_src103, v_src200, v_src201, v_src202, v_src203;1886v_expand(v_src00, v_src000, v_src001);1887v_expand(v_src01, v_src002, v_src003);1888v_expand(v_src10, v_src100, v_src101);1889v_expand(v_src11, v_src102, v_src103);1890v_expand(v_src20, v_src200, v_src201);1891v_expand(v_src21, v_src202, v_src203);18921893v_float32 v_dst000, v_dst001, v_dst002, v_dst003, v_dst100, v_dst101, v_dst102, v_dst103, v_dst200, v_dst201, v_dst202, v_dst203;1894v_load_deinterleave(dst + x * cn, v_dst000, v_dst100, v_dst200);1895v_load_deinterleave(dst + (x + step) * cn, v_dst001, v_dst101, v_dst201);1896v_load_deinterleave(dst + (x + step * 2) * cn, v_dst002, v_dst102, v_dst202);1897v_load_deinterleave(dst + (x + step * 3) * cn, v_dst003, v_dst103, v_dst203);1898v_dst000 = v_dst000 + v_cvt_f32(v_reinterpret_as_s32(v_src000));1899v_dst001 = v_dst001 + v_cvt_f32(v_reinterpret_as_s32(v_src001));1900v_dst002 = v_dst002 + v_cvt_f32(v_reinterpret_as_s32(v_src002));1901v_dst003 = v_dst003 + v_cvt_f32(v_reinterpret_as_s32(v_src003));1902v_dst100 = v_dst100 + v_cvt_f32(v_reinterpret_as_s32(v_src100));1903v_dst101 = v_dst101 + v_cvt_f32(v_reinterpret_as_s32(v_src101));1904v_dst102 = v_dst102 + v_cvt_f32(v_reinterpret_as_s32(v_src102));1905v_dst103 = v_dst103 + v_cvt_f32(v_reinterpret_as_s32(v_src103));1906v_dst200 = v_dst200 + v_cvt_f32(v_reinterpret_as_s32(v_src200));1907v_dst201 = v_dst201 + v_cvt_f32(v_reinterpret_as_s32(v_src201));1908v_dst202 = v_dst202 + v_cvt_f32(v_reinterpret_as_s32(v_src202));1909v_dst203 = v_dst203 + v_cvt_f32(v_reinterpret_as_s32(v_src203));19101911v_store_interleave(dst + x * cn, v_dst000, v_dst100, v_dst200);1912v_store_interleave(dst + (x + step) * cn, v_dst001, v_dst101, v_dst201);1913v_store_interleave(dst + (x + step * 2) * cn, v_dst002, v_dst102, v_dst202);1914v_store_interleave(dst + (x + step * 3) * cn, 
// Accumulate per-element product: dst[i] += (float)src1[i] * (float)src2[i] for ushort
// inputs, optionally gated by a uchar mask (mask == 0 suppresses the update).
// Vectorized with OpenCV universal intrinsics; the remaining tail (and any cn this
// function does not vectorize) is finished by the scalar accProd_general_ fallback,
// which resumes at the final value of x.
void accProd_simd_(const ushort* src1, const ushort* src2, float* dst, const uchar* mask, int len, int cn)
{
    int x = 0;
#if CV_SIMD
    const int cVectorWidth = v_uint16::nlanes;   // ushort elements consumed per iteration
    const int step = v_float32::nlanes;          // floats per vector (= cVectorWidth / 2)

    if (!mask)
    {
        // No mask: process the row as a flat array of len*cn values.
        int size = len * cn;
        for (; x <= size - cVectorWidth; x += cVectorWidth)
        {
            v_uint16 v_1src = vx_load(src1 + x);
            v_uint16 v_2src = vx_load(src2 + x);

            // Widen u16 -> two u32 halves before the float conversion.
            v_uint32 v_1src0, v_1src1, v_2src0, v_2src1;
            v_expand(v_1src, v_1src0, v_1src1);
            v_expand(v_2src, v_2src0, v_2src1);

            // Values fit in 16 bits, so reinterpreting u32 as s32 is value-preserving.
            v_float32 v_1float0 = v_cvt_f32(v_reinterpret_as_s32(v_1src0));
            v_float32 v_1float1 = v_cvt_f32(v_reinterpret_as_s32(v_1src1));
            v_float32 v_2float0 = v_cvt_f32(v_reinterpret_as_s32(v_2src0));
            v_float32 v_2float1 = v_cvt_f32(v_reinterpret_as_s32(v_2src1));

            // dst += src1 * src2 via fused multiply-add.
            v_store(dst + x, v_fma(v_1float0, v_2float0, vx_load(dst + x)));
            v_store(dst + x + step, v_fma(v_1float1, v_2float1, vx_load(dst + x + step)));
        }
    }
    else
    {
        v_uint16 v_0 = vx_setzero_u16();
        if (cn == 1)
        {
            for (; x <= len - cVectorWidth; x += cVectorWidth)
            {
                // One mask byte per pixel: widen to u16 lanes, then compare-to-zero and
                // invert to get all-ones lanes where mask != 0.
                v_uint16 v_mask = vx_load_expand(mask + x);
                v_mask = ~(v_0 == v_mask);

                // Zero out masked-off pixels so their product contributes nothing.
                v_uint16 v_1src = vx_load(src1 + x) & v_mask;
                v_uint16 v_2src = vx_load(src2 + x) & v_mask;

                v_uint32 v_1src0, v_1src1, v_2src0, v_2src1;
                v_expand(v_1src, v_1src0, v_1src1);
                v_expand(v_2src, v_2src0, v_2src1);

                v_float32 v_1float0 = v_cvt_f32(v_reinterpret_as_s32(v_1src0));
                v_float32 v_1float1 = v_cvt_f32(v_reinterpret_as_s32(v_1src1));
                v_float32 v_2float0 = v_cvt_f32(v_reinterpret_as_s32(v_2src0));
                v_float32 v_2float1 = v_cvt_f32(v_reinterpret_as_s32(v_2src1));

                v_store(dst + x, v_fma(v_1float0, v_2float0, vx_load(dst + x)));
                v_store(dst + x + step, v_fma(v_1float1, v_2float1, vx_load(dst + x + step)));
            }
        }
        else if (cn == 3)
        {
            for (; x <= len - cVectorWidth; x += cVectorWidth)
            {
                v_uint16 v_mask = vx_load_expand(mask + x);
                v_mask = ~(v_0 == v_mask);

                // Deinterleave RGB-like triples; the same per-pixel mask applies to all
                // three channels.
                v_uint16 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2;
                v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2);
                v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2);
                v_1src0 = v_1src0 & v_mask;
                v_1src1 = v_1src1 & v_mask;
                v_1src2 = v_1src2 & v_mask;
                v_2src0 = v_2src0 & v_mask;
                v_2src1 = v_2src1 & v_mask;
                v_2src2 = v_2src2 & v_mask;

                v_uint32 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21, v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21;
                v_expand(v_1src0, v_1src00, v_1src01);
                v_expand(v_1src1, v_1src10, v_1src11);
                v_expand(v_1src2, v_1src20, v_1src21);
                v_expand(v_2src0, v_2src00, v_2src01);
                v_expand(v_2src1, v_2src10, v_2src11);
                v_expand(v_2src2, v_2src20, v_2src21);

                v_float32 v_1float00 = v_cvt_f32(v_reinterpret_as_s32(v_1src00));
                v_float32 v_1float01 = v_cvt_f32(v_reinterpret_as_s32(v_1src01));
                v_float32 v_1float10 = v_cvt_f32(v_reinterpret_as_s32(v_1src10));
                v_float32 v_1float11 = v_cvt_f32(v_reinterpret_as_s32(v_1src11));
                v_float32 v_1float20 = v_cvt_f32(v_reinterpret_as_s32(v_1src20));
                v_float32 v_1float21 = v_cvt_f32(v_reinterpret_as_s32(v_1src21));
                v_float32 v_2float00 = v_cvt_f32(v_reinterpret_as_s32(v_2src00));
                v_float32 v_2float01 = v_cvt_f32(v_reinterpret_as_s32(v_2src01));
                v_float32 v_2float10 = v_cvt_f32(v_reinterpret_as_s32(v_2src10));
                v_float32 v_2float11 = v_cvt_f32(v_reinterpret_as_s32(v_2src11));
                v_float32 v_2float20 = v_cvt_f32(v_reinterpret_as_s32(v_2src20));
                v_float32 v_2float21 = v_cvt_f32(v_reinterpret_as_s32(v_2src21));

                // Accumulate channel-planar, then re-interleave back into dst.
                v_float32 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
                v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
                v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);

                v_dst00 = v_fma(v_1float00, v_2float00, v_dst00);
                v_dst01 = v_fma(v_1float01, v_2float01, v_dst01);
                v_dst10 = v_fma(v_1float10, v_2float10, v_dst10);
                v_dst11 = v_fma(v_1float11, v_2float11, v_dst11);
                v_dst20 = v_fma(v_1float20, v_2float20, v_dst20);
                v_dst21 = v_fma(v_1float21, v_2float21, v_dst21);

                v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
                v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
            }
        }
    }
#endif // CV_SIMD
    accProd_general_(src1, src2, dst, mask, len, cn, x);
}
// Accumulate per-element product: dst[i] += src1[i] * src2[i] for float inputs,
// optionally gated by a uchar mask. Uses raw AVX on AVX-only builds (no AVX2),
// otherwise universal intrinsics; the scalar accProd_general_ finishes the tail.
void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar* mask, int len, int cn)
{
    int x = 0;
#if CV_SIMD
    const int cVectorWidth = v_uint16::nlanes;   // = 2 * v_float32::nlanes; two float vectors per pass
    const int step = v_float32::nlanes;

    if (!mask)
    {
        int size = len * cn;
#if CV_AVX && !CV_AVX2
        for (; x <= size - 8 ; x += 8)
        {
            __m256 v_src0 = _mm256_loadu_ps(src1 + x);
            __m256 v_src1 = _mm256_loadu_ps(src2 + x);
            __m256 v_dst = _mm256_loadu_ps(dst + x);
            __m256 v_src = _mm256_mul_ps(v_src0, v_src1);
            v_dst = _mm256_add_ps(v_src, v_dst);
            _mm256_storeu_ps(dst + x, v_dst);
        }
#else
        for (; x <= size - cVectorWidth; x += cVectorWidth)
        {
            v_store(dst + x, v_fma(vx_load(src1 + x), vx_load(src2 + x), vx_load(dst + x)));
            v_store(dst + x + step, v_fma(vx_load(src1 + x + step), vx_load(src2 + x + step), vx_load(dst + x + step)));
        }
#endif // CV_AVX && !CV_AVX2
    }
    else
    {
        v_uint32 v_0 = vx_setzero_u32();
        if (cn == 1)
        {
            for (; x <= len - cVectorWidth; x += cVectorWidth)
            {
                // Widen mask bytes to u32, compare-to-zero and invert, then reinterpret
                // the all-ones/all-zeros lanes as float bit-masks.
                v_uint32 v_mask32_0 = vx_load_expand_q(mask + x);
                v_uint32 v_mask32_1 = vx_load_expand_q(mask + x + step);
                v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask32_0 == v_0));
                v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask32_1 == v_0));

                // AND-ing the product with the mask zeroes masked-off contributions.
                v_store(dst + x, vx_load(dst + x) + ((vx_load(src1 + x) * vx_load(src2 + x)) & v_mask0));
                v_store(dst + x + step, vx_load(dst + x + step) + ((vx_load(src1 + x + step) * vx_load(src2 + x + step)) & v_mask1));
            }
        }
        else if (cn == 3)
        {
            for (; x <= len - cVectorWidth; x += cVectorWidth)
            {
                v_uint32 v_mask32_0 = vx_load_expand_q(mask + x);
                v_uint32 v_mask32_1 = vx_load_expand_q(mask + x + step);
                v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask32_0 == v_0));
                v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask32_1 == v_0));

                // Channel-planar processing of interleaved 3-channel data.
                v_float32 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21;
                v_float32 v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21;
                v_load_deinterleave(src1 + x * cn, v_1src00, v_1src10, v_1src20);
                v_load_deinterleave(src2 + x * cn, v_2src00, v_2src10, v_2src20);
                v_load_deinterleave(src1 + (x + step) * cn, v_1src01, v_1src11, v_1src21);
                v_load_deinterleave(src2 + (x + step) * cn, v_2src01, v_2src11, v_2src21);

                v_float32 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
                v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
                v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);

                // The same per-pixel mask is applied to all three channel products.
                v_store_interleave(dst + x * cn, v_dst00 + ((v_1src00 * v_2src00) & v_mask0), v_dst10 + ((v_1src10 * v_2src10) & v_mask0), v_dst20 + ((v_1src20 * v_2src20) & v_mask0));
                v_store_interleave(dst + (x + step) * cn, v_dst01 + ((v_1src01 * v_2src01) & v_mask1), v_dst11 + ((v_1src11 * v_2src11) & v_mask1), v_dst21 + ((v_1src21 * v_2src21) & v_mask1));
            }
        }
    }
#endif // CV_SIMD
    accProd_general_(src1, src2, dst, mask, len, cn, x);
}
// Accumulate per-element product into a double accumulator: dst[i] += src1[i] * src2[i]
// for uchar inputs, optionally gated by a uchar mask. Requires double-capable SIMD
// (CV_SIMD_64F); the scalar accProd_general_ fallback finishes the tail.
void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const uchar* mask, int len, int cn)
{
    int x = 0;
#if CV_SIMD_64F
    const int cVectorWidth = v_uint16::nlanes;   // uchar pixels widened to u16 per iteration
    const int step = v_float64::nlanes;          // doubles per vector (= cVectorWidth / 4)

    if (!mask)
    {
        int size = len * cn;
        for (; x <= size - cVectorWidth; x += cVectorWidth)
        {
            // u8 -> u16 -> u32; values fit in 8 bits so the s32 reinterpret is exact.
            v_uint16 v_1int = vx_load_expand(src1 + x);
            v_uint16 v_2int = vx_load_expand(src2 + x);

            v_uint32 v_1int_0, v_1int_1, v_2int_0, v_2int_1;
            v_expand(v_1int, v_1int_0, v_1int_1);
            v_expand(v_2int, v_2int_0, v_2int_1);

            v_int32 v_1int0 = v_reinterpret_as_s32(v_1int_0);
            v_int32 v_1int1 = v_reinterpret_as_s32(v_1int_1);
            v_int32 v_2int0 = v_reinterpret_as_s32(v_2int_0);
            v_int32 v_2int1 = v_reinterpret_as_s32(v_2int_1);

            v_float64 v_dst0 = vx_load(dst + x);
            v_float64 v_dst1 = vx_load(dst + x + step);
            v_float64 v_dst2 = vx_load(dst + x + step * 2);
            v_float64 v_dst3 = vx_load(dst + x + step * 3);

            // Each s32 vector yields two double vectors (low and high halves).
            v_dst0 = v_fma(v_cvt_f64(v_1int0), v_cvt_f64(v_2int0), v_dst0);
            v_dst1 = v_fma(v_cvt_f64_high(v_1int0), v_cvt_f64_high(v_2int0), v_dst1);
            v_dst2 = v_fma(v_cvt_f64(v_1int1), v_cvt_f64(v_2int1), v_dst2);
            v_dst3 = v_fma(v_cvt_f64_high(v_1int1), v_cvt_f64_high(v_2int1), v_dst3);

            v_store(dst + x, v_dst0);
            v_store(dst + x + step, v_dst1);
            v_store(dst + x + step * 2, v_dst2);
            v_store(dst + x + step * 3, v_dst3);
        }
    }
    else
    {
        v_uint16 v_0 = vx_setzero_u16();
        if (cn == 1)
        {
            for (; x <= len - cVectorWidth; x += cVectorWidth)
            {
                // All-ones u16 lanes where mask != 0; zero out suppressed pixels.
                v_uint16 v_mask = vx_load_expand(mask + x);
                v_mask = ~(v_mask == v_0);
                v_uint16 v_1int = vx_load_expand(src1 + x) & v_mask;
                v_uint16 v_2int = vx_load_expand(src2 + x) & v_mask;

                v_uint32 v_1int_0, v_1int_1, v_2int_0, v_2int_1;
                v_expand(v_1int, v_1int_0, v_1int_1);
                v_expand(v_2int, v_2int_0, v_2int_1);

                v_int32 v_1int0 = v_reinterpret_as_s32(v_1int_0);
                v_int32 v_1int1 = v_reinterpret_as_s32(v_1int_1);
                v_int32 v_2int0 = v_reinterpret_as_s32(v_2int_0);
                v_int32 v_2int1 = v_reinterpret_as_s32(v_2int_1);

                v_float64 v_dst0 = vx_load(dst + x);
                v_float64 v_dst1 = vx_load(dst + x + step);
                v_float64 v_dst2 = vx_load(dst + x + step * 2);
                v_float64 v_dst3 = vx_load(dst + x + step * 3);

                v_dst0 = v_fma(v_cvt_f64(v_1int0), v_cvt_f64(v_2int0), v_dst0);
                v_dst1 = v_fma(v_cvt_f64_high(v_1int0), v_cvt_f64_high(v_2int0), v_dst1);
                v_dst2 = v_fma(v_cvt_f64(v_1int1), v_cvt_f64(v_2int1), v_dst2);
                v_dst3 = v_fma(v_cvt_f64_high(v_1int1), v_cvt_f64_high(v_2int1), v_dst3);

                v_store(dst + x, v_dst0);
                v_store(dst + x + step, v_dst1);
                v_store(dst + x + step * 2, v_dst2);
                v_store(dst + x + step * 3, v_dst3);
            }
        }
        else if (cn == 3)
        {
            // NOTE: v_load_deinterleave reads v_uint8::nlanes (= 2 * cVectorWidth) pixels
            // per channel, but only the low halves (v_expand_low) are consumed and x
            // advances by cVectorWidth. The "- cVectorWidth * 2" bound keeps the extra
            // reads inside the buffer.
            for (; x <= len - cVectorWidth * 2; x += cVectorWidth)
            {
                v_uint8 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2;
                v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2);
                v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2);

                v_uint16 v_1int0 = v_expand_low(v_1src0);
                v_uint16 v_1int1 = v_expand_low(v_1src1);
                v_uint16 v_1int2 = v_expand_low(v_1src2);
                v_uint16 v_2int0 = v_expand_low(v_2src0);
                v_uint16 v_2int1 = v_expand_low(v_2src1);
                v_uint16 v_2int2 = v_expand_low(v_2src2);

                v_uint16 v_mask = vx_load_expand(mask + x);
                v_mask = ~(v_mask == v_0);
                v_1int0 = v_1int0 & v_mask;
                v_1int1 = v_1int1 & v_mask;
                v_1int2 = v_1int2 & v_mask;
                v_2int0 = v_2int0 & v_mask;
                v_2int1 = v_2int1 & v_mask;
                v_2int2 = v_2int2 & v_mask;

                v_uint32 v_1int00, v_1int01, v_1int10, v_1int11, v_1int20, v_1int21;
                v_uint32 v_2int00, v_2int01, v_2int10, v_2int11, v_2int20, v_2int21;
                v_expand(v_1int0, v_1int00, v_1int01);
                v_expand(v_1int1, v_1int10, v_1int11);
                v_expand(v_1int2, v_1int20, v_1int21);
                v_expand(v_2int0, v_2int00, v_2int01);
                v_expand(v_2int1, v_2int10, v_2int11);
                v_expand(v_2int2, v_2int20, v_2int21);

                // Channel-planar accumulation over four double vectors per channel.
                v_float64 v_dst00, v_dst01, v_dst02, v_dst03, v_dst10, v_dst11, v_dst12, v_dst13, v_dst20, v_dst21, v_dst22, v_dst23;
                v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
                v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
                v_load_deinterleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22);
                v_load_deinterleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23);

                v_dst00 = v_fma(v_cvt_f64(v_reinterpret_as_s32(v_1int00)), v_cvt_f64(v_reinterpret_as_s32(v_2int00)), v_dst00);
                v_dst01 = v_fma(v_cvt_f64_high(v_reinterpret_as_s32(v_1int00)), v_cvt_f64_high(v_reinterpret_as_s32(v_2int00)), v_dst01);
                v_dst02 = v_fma(v_cvt_f64(v_reinterpret_as_s32(v_1int01)), v_cvt_f64(v_reinterpret_as_s32(v_2int01)), v_dst02);
                v_dst03 = v_fma(v_cvt_f64_high(v_reinterpret_as_s32(v_1int01)), v_cvt_f64_high(v_reinterpret_as_s32(v_2int01)), v_dst03);
                v_dst10 = v_fma(v_cvt_f64(v_reinterpret_as_s32(v_1int10)), v_cvt_f64(v_reinterpret_as_s32(v_2int10)), v_dst10);
                v_dst11 = v_fma(v_cvt_f64_high(v_reinterpret_as_s32(v_1int10)), v_cvt_f64_high(v_reinterpret_as_s32(v_2int10)), v_dst11);
                v_dst12 = v_fma(v_cvt_f64(v_reinterpret_as_s32(v_1int11)), v_cvt_f64(v_reinterpret_as_s32(v_2int11)), v_dst12);
                v_dst13 = v_fma(v_cvt_f64_high(v_reinterpret_as_s32(v_1int11)), v_cvt_f64_high(v_reinterpret_as_s32(v_2int11)), v_dst13);
                v_dst20 = v_fma(v_cvt_f64(v_reinterpret_as_s32(v_1int20)), v_cvt_f64(v_reinterpret_as_s32(v_2int20)), v_dst20);
                v_dst21 = v_fma(v_cvt_f64_high(v_reinterpret_as_s32(v_1int20)), v_cvt_f64_high(v_reinterpret_as_s32(v_2int20)), v_dst21);
                v_dst22 = v_fma(v_cvt_f64(v_reinterpret_as_s32(v_1int21)), v_cvt_f64(v_reinterpret_as_s32(v_2int21)), v_dst22);
                v_dst23 = v_fma(v_cvt_f64_high(v_reinterpret_as_s32(v_1int21)), v_cvt_f64_high(v_reinterpret_as_s32(v_2int21)), v_dst23);

                v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
                v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
                v_store_interleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22);
                v_store_interleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23);
            }
        }
    }
#endif // CV_SIMD_64F
    accProd_general_(src1, src2, dst, mask, len, cn, x);
}
// Accumulate per-element product into a double accumulator: dst[i] += src1[i] * src2[i]
// for ushort inputs, optionally gated by a uchar mask. Requires CV_SIMD_64F; the
// scalar accProd_general_ fallback finishes the tail starting at the final x.
void accProd_simd_(const ushort* src1, const ushort* src2, double* dst, const uchar* mask, int len, int cn)
{
    int x = 0;
#if CV_SIMD_64F
    const int cVectorWidth = v_uint16::nlanes;   // ushort elements per iteration
    const int step = v_float64::nlanes;          // doubles per vector (= cVectorWidth / 4)

    if (!mask)
    {
        int size = len * cn;
        for (; x <= size - cVectorWidth; x += cVectorWidth)
        {
            v_uint16 v_1src = vx_load(src1 + x);
            v_uint16 v_2src = vx_load(src2 + x);

            // u16 -> u32 halves; 16-bit values make the s32 reinterpret exact.
            v_uint32 v_1int_0, v_1int_1, v_2int_0, v_2int_1;
            v_expand(v_1src, v_1int_0, v_1int_1);
            v_expand(v_2src, v_2int_0, v_2int_1);

            v_int32 v_1int0 = v_reinterpret_as_s32(v_1int_0);
            v_int32 v_1int1 = v_reinterpret_as_s32(v_1int_1);
            v_int32 v_2int0 = v_reinterpret_as_s32(v_2int_0);
            v_int32 v_2int1 = v_reinterpret_as_s32(v_2int_1);

            v_float64 v_dst0 = vx_load(dst + x);
            v_float64 v_dst1 = vx_load(dst + x + step);
            v_float64 v_dst2 = vx_load(dst + x + step * 2);
            v_float64 v_dst3 = vx_load(dst + x + step * 3);

            // Each s32 vector converts to two double vectors (low/high halves).
            v_dst0 = v_fma(v_cvt_f64(v_1int0), v_cvt_f64(v_2int0), v_dst0);
            v_dst1 = v_fma(v_cvt_f64_high(v_1int0), v_cvt_f64_high(v_2int0), v_dst1);
            v_dst2 = v_fma(v_cvt_f64(v_1int1), v_cvt_f64(v_2int1), v_dst2);
            v_dst3 = v_fma(v_cvt_f64_high(v_1int1), v_cvt_f64_high(v_2int1), v_dst3);

            v_store(dst + x, v_dst0);
            v_store(dst + x + step, v_dst1);
            v_store(dst + x + step * 2, v_dst2);
            v_store(dst + x + step * 3, v_dst3);
        }
    }
    else
    {
        v_uint16 v_0 = vx_setzero_u16();
        if (cn == 1)
        {
            for (; x <= len - cVectorWidth; x += cVectorWidth)
            {
                // All-ones lanes where mask != 0; AND zeroes suppressed pixels.
                v_uint16 v_mask = vx_load_expand(mask + x);
                v_mask = ~(v_mask == v_0);
                v_uint16 v_1src = vx_load(src1 + x);
                v_uint16 v_2src = vx_load(src2 + x);
                v_1src = v_1src & v_mask;
                v_2src = v_2src & v_mask;

                v_uint32 v_1int_0, v_1int_1, v_2int_0, v_2int_1;
                v_expand(v_1src, v_1int_0, v_1int_1);
                v_expand(v_2src, v_2int_0, v_2int_1);

                v_int32 v_1int0 = v_reinterpret_as_s32(v_1int_0);
                v_int32 v_1int1 = v_reinterpret_as_s32(v_1int_1);
                v_int32 v_2int0 = v_reinterpret_as_s32(v_2int_0);
                v_int32 v_2int1 = v_reinterpret_as_s32(v_2int_1);

                v_float64 v_dst0 = vx_load(dst + x);
                v_float64 v_dst1 = vx_load(dst + x + step);
                v_float64 v_dst2 = vx_load(dst + x + step * 2);
                v_float64 v_dst3 = vx_load(dst + x + step * 3);

                v_dst0 = v_fma(v_cvt_f64(v_1int0), v_cvt_f64(v_2int0), v_dst0);
                v_dst1 = v_fma(v_cvt_f64_high(v_1int0), v_cvt_f64_high(v_2int0), v_dst1);
                v_dst2 = v_fma(v_cvt_f64(v_1int1), v_cvt_f64(v_2int1), v_dst2);
                v_dst3 = v_fma(v_cvt_f64_high(v_1int1), v_cvt_f64_high(v_2int1), v_dst3);

                v_store(dst + x, v_dst0);
                v_store(dst + x + step, v_dst1);
                v_store(dst + x + step * 2, v_dst2);
                v_store(dst + x + step * 3, v_dst3);
            }
        }
        else if (cn == 3)
        {
            for (; x <= len - cVectorWidth; x += cVectorWidth)
            {
                v_uint16 v_mask = vx_load_expand(mask + x);
                v_mask = ~(v_mask == v_0);
                // Deinterleave the three channels; one mask applies to all of them.
                v_uint16 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2;
                v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2);
                v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2);
                v_1src0 = v_1src0 & v_mask;
                v_1src1 = v_1src1 & v_mask;
                v_1src2 = v_1src2 & v_mask;
                v_2src0 = v_2src0 & v_mask;
                v_2src1 = v_2src1 & v_mask;
                v_2src2 = v_2src2 & v_mask;

                v_uint32 v_1int_00, v_1int_01, v_2int_00, v_2int_01;
                v_uint32 v_1int_10, v_1int_11, v_2int_10, v_2int_11;
                v_uint32 v_1int_20, v_1int_21, v_2int_20, v_2int_21;
                v_expand(v_1src0, v_1int_00, v_1int_01);
                v_expand(v_1src1, v_1int_10, v_1int_11);
                v_expand(v_1src2, v_1int_20, v_1int_21);
                v_expand(v_2src0, v_2int_00, v_2int_01);
                v_expand(v_2src1, v_2int_10, v_2int_11);
                v_expand(v_2src2, v_2int_20, v_2int_21);

                v_int32 v_1int00 = v_reinterpret_as_s32(v_1int_00);
                v_int32 v_1int01 = v_reinterpret_as_s32(v_1int_01);
                v_int32 v_1int10 = v_reinterpret_as_s32(v_1int_10);
                v_int32 v_1int11 = v_reinterpret_as_s32(v_1int_11);
                v_int32 v_1int20 = v_reinterpret_as_s32(v_1int_20);
                v_int32 v_1int21 = v_reinterpret_as_s32(v_1int_21);
                v_int32 v_2int00 = v_reinterpret_as_s32(v_2int_00);
                v_int32 v_2int01 = v_reinterpret_as_s32(v_2int_01);
                v_int32 v_2int10 = v_reinterpret_as_s32(v_2int_10);
                v_int32 v_2int11 = v_reinterpret_as_s32(v_2int_11);
                v_int32 v_2int20 = v_reinterpret_as_s32(v_2int_20);
                v_int32 v_2int21 = v_reinterpret_as_s32(v_2int_21);

                // Four double vectors per channel, accumulated channel-planar.
                v_float64 v_dst00, v_dst01, v_dst02, v_dst03;
                v_float64 v_dst10, v_dst11, v_dst12, v_dst13;
                v_float64 v_dst20, v_dst21, v_dst22, v_dst23;
                v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
                v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
                v_load_deinterleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22);
                v_load_deinterleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23);

                v_dst00 = v_fma(v_cvt_f64(v_1int00), v_cvt_f64(v_2int00), v_dst00);
                v_dst01 = v_fma(v_cvt_f64_high(v_1int00), v_cvt_f64_high(v_2int00), v_dst01);
                v_dst02 = v_fma(v_cvt_f64(v_1int01), v_cvt_f64(v_2int01), v_dst02);
                v_dst03 = v_fma(v_cvt_f64_high(v_1int01), v_cvt_f64_high(v_2int01), v_dst03);
                v_dst10 = v_fma(v_cvt_f64(v_1int10), v_cvt_f64(v_2int10), v_dst10);
                v_dst11 = v_fma(v_cvt_f64_high(v_1int10), v_cvt_f64_high(v_2int10), v_dst11);
                v_dst12 = v_fma(v_cvt_f64(v_1int11), v_cvt_f64(v_2int11), v_dst12);
                v_dst13 = v_fma(v_cvt_f64_high(v_1int11), v_cvt_f64_high(v_2int11), v_dst13);
                v_dst20 = v_fma(v_cvt_f64(v_1int20), v_cvt_f64(v_2int20), v_dst20);
                v_dst21 = v_fma(v_cvt_f64_high(v_1int20), v_cvt_f64_high(v_2int20), v_dst21);
                v_dst22 = v_fma(v_cvt_f64(v_1int21), v_cvt_f64(v_2int21), v_dst22);
                v_dst23 = v_fma(v_cvt_f64_high(v_1int21), v_cvt_f64_high(v_2int21), v_dst23);

                v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
                v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
                v_store_interleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22);
                v_store_interleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23);
            }
        }
    }
#endif // CV_SIMD_64F
    accProd_general_(src1, src2, dst, mask, len, cn, x);
}
cn, x);2395}23962397void accProd_simd_(const float* src1, const float* src2, double* dst, const uchar* mask, int len, int cn)2398{2399int x = 0;2400#if CV_SIMD_64F2401const int cVectorWidth = v_float32::nlanes;2402const int step = v_float64::nlanes;24032404if (!mask)2405{2406int size = len * cn;2407#if CV_AVX && !CV_AVX22408for ( ; x <= size - 8 ; x += 8)2409{2410__m256 v_1src = _mm256_loadu_ps(src1 + x);2411__m256 v_2src = _mm256_loadu_ps(src2 + x);2412__m256d v_src00 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_1src,0));2413__m256d v_src01 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_1src,1));2414__m256d v_src10 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_2src,0));2415__m256d v_src11 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_2src,1));2416__m256d v_dst0 = _mm256_loadu_pd(dst + x);2417__m256d v_dst1 = _mm256_loadu_pd(dst + x + 4);2418__m256d v_src0 = _mm256_mul_pd(v_src00, v_src10);2419__m256d v_src1 = _mm256_mul_pd(v_src01, v_src11);2420v_dst0 = _mm256_add_pd(v_src0, v_dst0);2421v_dst1 = _mm256_add_pd(v_src1, v_dst1);2422_mm256_storeu_pd(dst + x, v_dst0);2423_mm256_storeu_pd(dst + x + 4, v_dst1);2424}2425#else2426for (; x <= size - cVectorWidth; x += cVectorWidth)2427{2428v_float32 v_1src = vx_load(src1 + x);2429v_float32 v_2src = vx_load(src2 + x);24302431v_float64 v_1src0 = v_cvt_f64(v_1src);2432v_float64 v_1src1 = v_cvt_f64_high(v_1src);2433v_float64 v_2src0 = v_cvt_f64(v_2src);2434v_float64 v_2src1 = v_cvt_f64_high(v_2src);24352436v_store(dst + x, v_fma(v_1src0, v_2src0, vx_load(dst + x)));2437v_store(dst + x + step, v_fma(v_1src1, v_2src1, vx_load(dst + x + step)));2438}2439#endif // CV_AVX && !CV_AVX22440}2441else2442{2443v_uint32 v_0 = vx_setzero_u32();2444if (cn == 1)2445{2446for (; x <= len - cVectorWidth; x += cVectorWidth)2447{2448v_uint32 v_mask = vx_load_expand_q(mask + x);2449v_mask = ~(v_mask == v_0);2450v_float32 v_1src = vx_load(src1 + x);2451v_float32 v_2src = vx_load(src2 + x);2452v_1src = v_1src & v_reinterpret_as_f32(v_mask);2453v_2src = v_2src & 
// Accumulate per-element product: dst[i] += src1[i] * src2[i] for double inputs,
// optionally gated by a uchar mask. Processes two double vectors per iteration;
// the scalar accProd_general_ fallback finishes the tail starting at the final x.
void accProd_simd_(const double* src1, const double* src2, double* dst, const uchar* mask, int len, int cn)
{
    int x = 0;
#if CV_SIMD_64F
    const int cVectorWidth = v_float64::nlanes * 2;  // two double vectors per pass
    const int step = v_float64::nlanes;

    if (!mask)
    {
        int size = len * cn;
#if CV_AVX && !CV_AVX2
        for ( ; x <= size - 4 ; x += 4)
        {
            __m256d v_src0 = _mm256_loadu_pd(src1 + x);
            __m256d v_src1 = _mm256_loadu_pd(src2 + x);
            __m256d v_dst = _mm256_loadu_pd(dst + x);
            v_src0 = _mm256_mul_pd(v_src0, v_src1);
            v_dst = _mm256_add_pd(v_dst, v_src0);
            _mm256_storeu_pd(dst + x, v_dst);
        }
#else
        for (; x <= size - cVectorWidth; x += cVectorWidth)
        {
            v_float64 v_src00 = vx_load(src1 + x);
            v_float64 v_src01 = vx_load(src1 + x + step);
            v_float64 v_src10 = vx_load(src2 + x);
            v_float64 v_src11 = vx_load(src2 + x + step);

            v_store(dst + x, v_fma(v_src00, v_src10, vx_load(dst + x)));
            v_store(dst + x + step, v_fma(v_src01, v_src11, vx_load(dst + x + step)));
        }
#endif
    }
    else
    {
        // todo: try fma
        v_uint64 v_0 = vx_setzero_u64();
        if (cn == 1)
        {
            for (; x <= len - cVectorWidth; x += cVectorWidth)
            {
                // Mask bytes -> u32 -> two u64 vectors; compare-to-zero + invert gives
                // all-ones double-wide lanes where mask != 0.
                v_uint32 v_mask32 = vx_load_expand_q(mask + x);
                v_uint64 v_masku640, v_masku641;
                v_expand(v_mask32, v_masku640, v_masku641);
                v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
                v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));

                v_float64 v_src00 = vx_load(src1 + x);
                v_float64 v_src01 = vx_load(src1 + x + step);
                v_float64 v_src10 = vx_load(src2 + x);
                v_float64 v_src11 = vx_load(src2 + x + step);

                // AND the product with the mask so suppressed lanes add +0.0.
                v_store(dst + x, vx_load(dst + x) + ((v_src00 * v_src10) & v_mask0));
                v_store(dst + x + step, vx_load(dst + x + step) + ((v_src01 * v_src11) & v_mask1));
            }
        }
        else if (cn == 3)
        {
            for (; x <= len - cVectorWidth; x += cVectorWidth)
            {
                v_uint32 v_mask32 = vx_load_expand_q(mask + x);
                v_uint64 v_masku640, v_masku641;
                v_expand(v_mask32, v_masku640, v_masku641);
                v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
                v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));

                v_float64 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21;
                v_float64 v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21;
                v_load_deinterleave(src1 + x * cn, v_1src00, v_1src10, v_1src20);
                v_load_deinterleave(src1 + (x + step) * cn, v_1src01, v_1src11, v_1src21);
                v_load_deinterleave(src2 + x * cn, v_2src00, v_2src10, v_2src20);
                v_load_deinterleave(src2 + (x + step) * cn, v_2src01, v_2src11, v_2src21);
                // Masking one factor is enough: the product of a zeroed lane is zero.
                v_float64 v_src00 = (v_1src00 & v_mask0) * v_2src00;
                v_float64 v_src01 = (v_1src01 & v_mask1) * v_2src01;
                v_float64 v_src10 = (v_1src10 & v_mask0) * v_2src10;
                v_float64 v_src11 = (v_1src11 & v_mask1) * v_2src11;
                v_float64 v_src20 = (v_1src20 & v_mask0) * v_2src20;
                v_float64 v_src21 = (v_1src21 & v_mask1) * v_2src21;

                v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
                v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
                v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);

                v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
                v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
            }
        }
    }
#endif // CV_SIMD_64F
    accProd_general_(src1, src2, dst, mask, len, cn, x);
}
v_masku641);2563v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));2564v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));25652566v_float64 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21;2567v_float64 v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21;2568v_load_deinterleave(src1 + x * cn, v_1src00, v_1src10, v_1src20);2569v_load_deinterleave(src1 + (x + step) * cn, v_1src01, v_1src11, v_1src21);2570v_load_deinterleave(src2 + x * cn, v_2src00, v_2src10, v_2src20);2571v_load_deinterleave(src2 + (x + step) * cn, v_2src01, v_2src11, v_2src21);2572v_float64 v_src00 = (v_1src00 & v_mask0) * v_2src00;2573v_float64 v_src01 = (v_1src01 & v_mask1) * v_2src01;2574v_float64 v_src10 = (v_1src10 & v_mask0) * v_2src10;2575v_float64 v_src11 = (v_1src11 & v_mask1) * v_2src11;2576v_float64 v_src20 = (v_1src20 & v_mask0) * v_2src20;2577v_float64 v_src21 = (v_1src21 & v_mask1) * v_2src21;25782579v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;2580v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);2581v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);25822583v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);2584v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);2585}2586}2587}2588#endif // CV_SIMD_64F2589accProd_general_(src1, src2, dst, mask, len, cn, x);2590}25912592// running weight accumulate optimized by universal intrinsic2593void accW_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn, double alpha)2594{2595int x = 0;2596#if CV_SIMD2597const v_float32 v_alpha = vx_setall_f32((float)alpha);2598const v_float32 v_beta = vx_setall_f32((float)(1.0f - alpha));2599const int cVectorWidth = v_uint8::nlanes;2600const int step = v_float32::nlanes;26012602if (!mask)2603{2604int size = len * cn;2605for (; x <= size - cVectorWidth; x += cVectorWidth)2606{2607v_uint8 v_src = vx_load(src + 
x);26082609v_uint16 v_src0, v_src1;2610v_expand(v_src, v_src0, v_src1);26112612v_uint32 v_src00, v_src01, v_src10, v_src11;2613v_expand(v_src0, v_src00, v_src01);2614v_expand(v_src1, v_src10, v_src11);26152616v_float32 v_dst00 = vx_load(dst + x);2617v_float32 v_dst01 = vx_load(dst + x + step);2618v_float32 v_dst10 = vx_load(dst + x + step * 2);2619v_float32 v_dst11 = vx_load(dst + x + step * 3);26202621v_dst00 = v_fma(v_dst00, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src00)) * v_alpha);2622v_dst01 = v_fma(v_dst01, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src01)) * v_alpha);2623v_dst10 = v_fma(v_dst10, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src10)) * v_alpha);2624v_dst11 = v_fma(v_dst11, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src11)) * v_alpha);26252626v_store(dst + x, v_dst00);2627v_store(dst + x + step, v_dst01);2628v_store(dst + x + step * 2, v_dst10);2629v_store(dst + x + step * 3, v_dst11);2630}2631}2632#endif // CV_SIMD2633accW_general_(src, dst, mask, len, cn, alpha, x);2634}26352636void accW_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn, double alpha)2637{2638int x = 0;2639#if CV_SIMD2640const v_float32 v_alpha = vx_setall_f32((float)alpha);2641const v_float32 v_beta = vx_setall_f32((float)(1.0f - alpha));2642const int cVectorWidth = v_uint16::nlanes;2643const int step = v_float32::nlanes;26442645if (!mask)2646{2647int size = len * cn;2648for (; x <= size - cVectorWidth; x += cVectorWidth)2649{2650v_uint16 v_src = vx_load(src + x);2651v_uint32 v_int0, v_int1;2652v_expand(v_src, v_int0, v_int1);26532654v_float32 v_dst0 = vx_load(dst + x);2655v_float32 v_dst1 = vx_load(dst + x + step);2656v_dst0 = v_fma(v_dst0, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_int0)) * v_alpha);2657v_dst1 = v_fma(v_dst1, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_int1)) * v_alpha);26582659v_store(dst + x, v_dst0);2660v_store(dst + x + step, v_dst1);2661}2662}2663#endif // CV_SIMD2664accW_general_(src, dst, mask, len, cn, alpha, x);2665}26662667void 
accW_simd_(const float* src, float* dst, const uchar* mask, int len, int cn, double alpha)
{
    // Running weighted average (cv::accumulateWeighted), float -> float:
    //   dst[i] = dst[i] * (1 - alpha) + src[i] * alpha
    // Only the unmasked case is vectorized; the masked case and the vector-width
    // remainder are handled by the scalar fallback accW_general_ starting at x.
    int x = 0;
#if CV_AVX && !CV_AVX2
    const __m256 v_alpha = _mm256_set1_ps((float)alpha);
    const __m256 v_beta = _mm256_set1_ps((float)(1.0f - alpha));
    const int cVectorWidth = 16;  // two 8-float AVX vectors per iteration

    if (!mask)
    {
        int size = len * cn;  // process the row as a flat array of size len*cn
        for ( ; x <= size - cVectorWidth ; x += cVectorWidth)
        {
            // dst = dst*beta + src*alpha, 2x unrolled (8 floats per vector)
            _mm256_storeu_ps(dst + x, _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(dst + x), v_beta), _mm256_mul_ps(_mm256_loadu_ps(src + x), v_alpha)));
            _mm256_storeu_ps(dst + x + 8, _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(dst + x + 8), v_beta), _mm256_mul_ps(_mm256_loadu_ps(src + x + 8), v_alpha)));
        }
    }
#elif CV_SIMD
    const v_float32 v_alpha = vx_setall_f32((float)alpha);
    const v_float32 v_beta = vx_setall_f32((float)(1.0f - alpha));
    const int cVectorWidth = v_uint16::nlanes;  // = 2 * v_float32::nlanes (2x unroll)
    const int step = v_float32::nlanes;

    if (!mask)
    {
        int size = len * cn;
        for (; x <= size - cVectorWidth; x += cVectorWidth)
        {
            v_float32 v_dst0 = vx_load(dst + x);
            v_float32 v_dst1 = vx_load(dst + x + step);

            // fused multiply-add: dst*beta + src*alpha
            v_dst0 = v_fma(v_dst0, v_beta, vx_load(src + x) * v_alpha);
            v_dst1 = v_fma(v_dst1, v_beta, vx_load(src + x + step) * v_alpha);

            v_store(dst + x, v_dst0);
            v_store(dst + x + step, v_dst1);
        }
    }
#endif // CV_SIMD
    accW_general_(src, dst, mask, len, cn, alpha, x);
}

void accW_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn, double alpha)
{
    // Running weighted average, uchar -> double:
    //   dst[i] = dst[i] * (1 - alpha) + src[i] * alpha
    // Masked pixels and the tail are delegated to accW_general_ starting at x.
    int x = 0;
#if CV_SIMD_64F
    const v_float64 v_alpha = vx_setall_f64(alpha);
    const v_float64 v_beta = vx_setall_f64(1.0f - alpha);  // 1.0f promotes to double before the subtraction
    const int cVectorWidth = v_uint16::nlanes;  // one u16 vector expands to 4 f64 vectors
    const int step = v_float64::nlanes;

    if (!mask)
    {
        int size = len * cn;
        for (; x <= size - cVectorWidth; x += cVectorWidth)
        {
            // widen u8 -> u16 -> u32; the u32 values fit in s32, so the
            // reinterpret below is value-preserving for the f64 conversion
            v_uint16 v_src16 = vx_load_expand(src + x);

            v_uint32 v_int_0, v_int_1;
            v_expand(v_src16, v_int_0, v_int_1);

            v_int32 v_int0 = v_reinterpret_as_s32(v_int_0);
            v_int32 v_int1 = v_reinterpret_as_s32(v_int_1);

            // s32 -> f64: each s32 vector yields a low and a high f64 vector
            v_float64 v_src0 = v_cvt_f64(v_int0);
            v_float64 v_src1 = v_cvt_f64_high(v_int0);
            v_float64 v_src2 = v_cvt_f64(v_int1);
            v_float64 v_src3 = v_cvt_f64_high(v_int1);

            v_float64 v_dst0 = vx_load(dst + x);
            v_float64 v_dst1 = vx_load(dst + x + step);
            v_float64 v_dst2 = vx_load(dst + x + step * 2);
            v_float64 v_dst3 = vx_load(dst + x + step * 3);

            // dst = dst*beta + src*alpha via fused multiply-add
            v_dst0 = v_fma(v_dst0, v_beta, v_src0 * v_alpha);
            v_dst1 = v_fma(v_dst1, v_beta, v_src1 * v_alpha);
            v_dst2 = v_fma(v_dst2, v_beta, v_src2 * v_alpha);
            v_dst3 = v_fma(v_dst3, v_beta, v_src3 * v_alpha);

            v_store(dst + x, v_dst0);
            v_store(dst + x + step, v_dst1);
            v_store(dst + x + step * 2, v_dst2);
            v_store(dst + x + step * 3, v_dst3);
        }
    }
#endif // CV_SIMD_64F
    accW_general_(src, dst, mask, len, cn, alpha, x);
}

void accW_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn, double alpha)
{
    // Running weighted average, ushort -> double:
    //   dst[i] = dst[i] * (1 - alpha) + src[i] * alpha
    // Masked pixels and the tail are delegated to accW_general_ starting at x.
    int x = 0;
#if CV_SIMD_64F
    const v_float64 v_alpha = vx_setall_f64(alpha);
    const v_float64 v_beta = vx_setall_f64(1.0f - alpha);  // 1.0f promotes to double before the subtraction
    const int cVectorWidth = v_uint16::nlanes;  // one u16 vector expands to 4 f64 vectors
    const int step = v_float64::nlanes;

    if (!mask)
    {
        int size = len * cn;
        for (; x <= size - cVectorWidth; x += cVectorWidth)
        {
            // widen u16 -> u32; values fit in s32, so the reinterpret is
            // value-preserving for the f64 conversion
            v_uint16 v_src = vx_load(src + x);
            v_uint32 v_int_0, v_int_1;
            v_expand(v_src, v_int_0, v_int_1);

            v_int32 v_int0 = v_reinterpret_as_s32(v_int_0);
            v_int32 v_int1 = v_reinterpret_as_s32(v_int_1);

            // s32 -> f64: low and high halves of each s32 vector
            v_float64 v_src00 = v_cvt_f64(v_int0);
            v_float64 v_src01 = v_cvt_f64_high(v_int0);
            v_float64 v_src10 = v_cvt_f64(v_int1);
            v_float64 v_src11 = v_cvt_f64_high(v_int1);

            v_float64 v_dst00 = vx_load(dst + x);
            v_float64 v_dst01 = vx_load(dst + x + step);
            v_float64 v_dst10 = vx_load(dst + x + step * 2);
            v_float64 v_dst11 = vx_load(dst + x + step * 3);

            // dst = dst*beta + src*alpha via fused multiply-add
            v_dst00 = v_fma(v_dst00, v_beta, v_src00 * v_alpha);
            v_dst01 = v_fma(v_dst01, v_beta, v_src01 * v_alpha);
            v_dst10 = v_fma(v_dst10, v_beta, v_src10 * v_alpha);
            v_dst11 = v_fma(v_dst11, v_beta, v_src11 * v_alpha);

            v_store(dst + x, v_dst00);
            v_store(dst + x + step, v_dst01);
            v_store(dst + x + step * 2, v_dst10);
            v_store(dst + x + step * 3, v_dst11);
        }
    }
#endif // CV_SIMD_64F
    accW_general_(src, dst, mask, len, cn, alpha, x);
}

void accW_simd_(const float* src, double* dst, const uchar* mask, int len, int cn, double alpha)
{
    // Running weighted average, float -> double:
    //   dst[i] = dst[i] * (1 - alpha) + src[i] * alpha
    // Masked pixels and the tail are delegated to accW_general_ starting at x.
    int x = 0;
#if CV_AVX && !CV_AVX2
    const __m256d v_alpha = _mm256_set1_pd(alpha);
    const __m256d v_beta = _mm256_set1_pd(1.0f - alpha);  // 1.0f promotes to double before the subtraction
    const int cVectorWidth = 16;  // 16 floats -> 4 vectors of 4 doubles per iteration

    if (!mask)
    {
        int size = len * cn;
        for ( ; x <= size - cVectorWidth ; x += cVectorWidth)
        {
            __m256 v_src0 = _mm256_loadu_ps(src + x);
            __m256 v_src1 = _mm256_loadu_ps(src + x + 8);
            // convert each 128-bit half (4 floats) to 4 doubles
            __m256d v_src00 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src0,0));
            __m256d v_src01 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src0,1));
            __m256d v_src10 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src1,0));
            __m256d v_src11 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src1,1));

            // dst = dst*beta + src*alpha, 4 doubles per store
            _mm256_storeu_pd(dst + x, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x), v_beta), _mm256_mul_pd(v_src00, v_alpha)));
            _mm256_storeu_pd(dst + x + 4, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 4), v_beta), _mm256_mul_pd(v_src01, v_alpha)));
            _mm256_storeu_pd(dst + x + 8, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 8), v_beta), _mm256_mul_pd(v_src10, v_alpha)));
            _mm256_storeu_pd(dst + x + 12, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 12), v_beta), _mm256_mul_pd(v_src11, v_alpha)));
        }
    }
#elif CV_SIMD_64F
    const v_float64 v_alpha = vx_setall_f64(alpha);
    const v_float64 v_beta = vx_setall_f64(1.0f - alpha);  // 1.0f promotes to double before the subtraction
    const int cVectorWidth = v_float32::nlanes * 2;  // two f32 vectors -> four f64 vectors
    const int step = v_float64::nlanes;

    if (!mask)
    {
        int size = len * cn;
        for (; x <= size - cVectorWidth; x += cVectorWidth)
        {
            // f32 -> f64: each f32 vector yields a low and a high f64 vector
            v_float32 v_src0 = vx_load(src + x);
            v_float32 v_src1 = vx_load(src + x + v_float32::nlanes);
            v_float64 v_src00 = v_cvt_f64(v_src0);
            v_float64 v_src01 = v_cvt_f64_high(v_src0);
            v_float64 v_src10 = v_cvt_f64(v_src1);
            v_float64 v_src11 = v_cvt_f64_high(v_src1);

            v_float64 v_dst00 = vx_load(dst + x);
            v_float64 v_dst01 = vx_load(dst + x + step);
            v_float64 v_dst10 = vx_load(dst + x + step * 2);
            v_float64 v_dst11 = vx_load(dst + x + step * 3);

            // dst = dst*beta + src*alpha via fused multiply-add
            v_dst00 = v_fma(v_dst00, v_beta, v_src00 * v_alpha);
            v_dst01 = v_fma(v_dst01, v_beta, v_src01 * v_alpha);
            v_dst10 = v_fma(v_dst10, v_beta, v_src10 * v_alpha);
            v_dst11 = v_fma(v_dst11, v_beta, v_src11 * v_alpha);

            v_store(dst + x, v_dst00);
            v_store(dst + x + step, v_dst01);
            v_store(dst + x + step * 2, v_dst10);
            v_store(dst + x + step * 3, v_dst11);
        }
    }
#endif // CV_SIMD_64F
    accW_general_(src, dst, mask, len, cn, alpha, x);
}

void accW_simd_(const double* src, double* dst, const uchar* mask, int len, int cn, double alpha)
{
    // Running weighted average, double -> double:
    //   dst[i] = dst[i] * (1 - alpha) + src[i] * alpha
    // Masked pixels and the tail are delegated to accW_general_ starting at x.
    int x = 0;
#if CV_AVX && !CV_AVX2
    const __m256d v_alpha = _mm256_set1_pd(alpha);
    const __m256d v_beta = _mm256_set1_pd(1.0f - alpha);  // 1.0f promotes to double before the subtraction
    const int cVectorWidth = 8;  // two 4-double AVX vectors per iteration

    if (!mask)
    {
        int size = len * cn;
        for ( ; x <= size - cVectorWidth ; x += cVectorWidth)
        {
            __m256d v_src0 = _mm256_loadu_pd(src + x);
            __m256d v_src1 = _mm256_loadu_pd(src + x + 4);

            // dst = dst*beta + src*alpha, 2x unrolled
            _mm256_storeu_pd(dst + x, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x), v_beta), _mm256_mul_pd(v_src0, v_alpha)));
            _mm256_storeu_pd(dst + x + 4, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 4), v_beta), _mm256_mul_pd(v_src1, v_alpha)));
        }
    }
#elif CV_SIMD_64F
    const v_float64 v_alpha = vx_setall_f64(alpha);
    const v_float64 v_beta = vx_setall_f64(1.0f - alpha);  // 1.0f promotes to double before the subtraction
    const int cVectorWidth = v_float64::nlanes * 2;  // 2x unroll
    const int step = v_float64::nlanes;

    if (!mask)
    {
        int size = len * cn;
        for (; x <= size - cVectorWidth; x += cVectorWidth)
        {
            v_float64 v_src0 = vx_load(src + x);
            v_float64 v_src1 = vx_load(src + x + step);

            v_float64 v_dst0 = vx_load(dst + x);
            v_float64 v_dst1 = vx_load(dst + x + step);

            // dst = dst*beta + src*alpha via fused multiply-add
            v_dst0 = v_fma(v_dst0, v_beta, v_src0 * v_alpha);
            v_dst1 = v_fma(v_dst1, v_beta, v_src1 * v_alpha);

            v_store(dst + x, v_dst0);
            v_store(dst + x + step, v_dst1);
        }
    }
#endif // CV_SIMD_64F
    accW_general_(src, dst, mask, len, cn, alpha, x);
}

#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY

CV_CPU_OPTIMIZATION_NAMESPACE_END

} // namespace cv

///* End of file. */