Path: blob/master/modules/core/src/count_non_zero.cpp
16337 views
// This file is part of OpenCV project.1// It is subject to the license terms in the LICENSE file found in the top-level directory2// of this distribution and at http://opencv.org/license.html345#include "precomp.hpp"6#include "opencl_kernels_core.hpp"7#include "stat.hpp"89namespace cv {1011template<typename T>12static int countNonZero_(const T* src, int len )13{14int i=0, nz = 0;15#if CV_ENABLE_UNROLLED16for(; i <= len - 4; i += 4 )17nz += (src[i] != 0) + (src[i+1] != 0) + (src[i+2] != 0) + (src[i+3] != 0);18#endif19for( ; i < len; i++ )20nz += src[i] != 0;21return nz;22}2324static int countNonZero8u( const uchar* src, int len )25{26int i=0, nz = 0;27#if CV_SIMD28int len0 = len & -v_uint8::nlanes;29v_uint8 v_zero = vx_setzero_u8();30v_uint8 v_one = vx_setall_u8(1);3132v_uint32 v_sum32 = vx_setzero_u32();33while (i < len0)34{35v_uint16 v_sum16 = vx_setzero_u16();36int j = i;37while (j < std::min(len0, i + 65280 * v_uint16::nlanes))38{39v_uint8 v_sum8 = vx_setzero_u8();40int k = j;41for (; k < std::min(len0, j + 255 * v_uint8::nlanes); k += v_uint8::nlanes)42v_sum8 += v_one & (vx_load(src + k) == v_zero);43v_uint16 part1, part2;44v_expand(v_sum8, part1, part2);45v_sum16 += part1 + part2;46j = k;47}48v_uint32 part1, part2;49v_expand(v_sum16, part1, part2);50v_sum32 += part1 + part2;51i = j;52}53nz = i - v_reduce_sum(v_sum32);54v_cleanup();55#endif56for( ; i < len; i++ )57nz += src[i] != 0;58return nz;59}6061static int countNonZero16u( const ushort* src, int len )62{63int i = 0, nz = 0;64#if CV_SIMD65int len0 = len & -v_int8::nlanes;66v_uint16 v_zero = vx_setzero_u16();67v_int8 v_one = vx_setall_s8(1);6869v_int32 v_sum32 = vx_setzero_s32();70while (i < len0)71{72v_int16 v_sum16 = vx_setzero_s16();73int j = i;74while (j < std::min(len0, i + 32766 * v_int16::nlanes))75{76v_int8 v_sum8 = vx_setzero_s8();77int k = j;78for (; k < std::min(len0, j + 127 * v_int8::nlanes); k += v_int8::nlanes)79v_sum8 += v_one & v_pack(v_reinterpret_as_s16(vx_load(src + k) == v_zero), v_reinterpret_as_s16(vx_load(src + k + v_uint16::nlanes) == v_zero));80v_int16 part1, part2;81v_expand(v_sum8, part1, part2);82v_sum16 += part1 + part2;83j = k;84}85v_int32 part1, part2;86v_expand(v_sum16, part1, part2);87v_sum32 += part1 + part2;88i = j;89}90nz = i - v_reduce_sum(v_sum32);91v_cleanup();92#endif93return nz + countNonZero_(src + i, len - i);94}9596static int countNonZero32s( const int* src, int len )97{98int i = 0, nz = 0;99#if CV_SIMD100int len0 = len & -v_int8::nlanes;101v_int32 v_zero = vx_setzero_s32();102v_int8 v_one = vx_setall_s8(1);103104v_int32 v_sum32 = vx_setzero_s32();105while (i < len0)106{107v_int16 v_sum16 = vx_setzero_s16();108int j = i;109while (j < std::min(len0, i + 32766 * v_int16::nlanes))110{111v_int8 v_sum8 = vx_setzero_s8();112int k = j;113for (; k < std::min(len0, j + 127 * v_int8::nlanes); k += v_int8::nlanes)114v_sum8 += v_one & v_pack(115v_pack(vx_load(src + k ) == v_zero, vx_load(src + k + v_int32::nlanes) == v_zero),116v_pack(vx_load(src + k + 2*v_int32::nlanes) == v_zero, vx_load(src + k + 3*v_int32::nlanes) == v_zero)117);118v_int16 part1, part2;119v_expand(v_sum8, part1, part2);120v_sum16 += part1 + part2;121j = k;122}123v_int32 part1, part2;124v_expand(v_sum16, part1, part2);125v_sum32 += part1 + part2;126i = j;127}128nz = i - v_reduce_sum(v_sum32);129v_cleanup();130#endif131return nz + countNonZero_(src + i, len - i);132}133134static int countNonZero32f( const float* src, int len )135{136int i = 0, nz = 0;137#if CV_SIMD138int len0 = len & -v_int8::nlanes;139v_float32 v_zero = vx_setzero_f32();140v_int8 v_one = vx_setall_s8(1);141142v_int32 v_sum32 = vx_setzero_s32();143while (i < len0)144{145v_int16 v_sum16 = vx_setzero_s16();146int j = i;147while (j < std::min(len0, i + 32766 * v_int16::nlanes))148{149v_int8 v_sum8 = vx_setzero_s8();150int k = j;151for (; k < std::min(len0, j + 127 * v_int8::nlanes); k += v_int8::nlanes)152v_sum8 += v_one & v_pack(153v_pack(v_reinterpret_as_s32(vx_load(src + k ) == v_zero), v_reinterpret_as_s32(vx_load(src + k + v_float32::nlanes) == v_zero)),154v_pack(v_reinterpret_as_s32(vx_load(src + k + 2*v_float32::nlanes) == v_zero), v_reinterpret_as_s32(vx_load(src + k + 3*v_float32::nlanes) == v_zero))155);156v_int16 part1, part2;157v_expand(v_sum8, part1, part2);158v_sum16 += part1 + part2;159j = k;160}161v_int32 part1, part2;162v_expand(v_sum16, part1, part2);163v_sum32 += part1 + part2;164i = j;165}166nz = i - v_reduce_sum(v_sum32);167v_cleanup();168#endif169return nz + countNonZero_(src + i, len - i);170}171172static int countNonZero64f( const double* src, int len )173{174return countNonZero_(src, len);175}176177typedef int (*CountNonZeroFunc)(const uchar*, int);178179static CountNonZeroFunc getCountNonZeroTab(int depth)180{181static CountNonZeroFunc countNonZeroTab[] =182{183(CountNonZeroFunc)GET_OPTIMIZED(countNonZero8u), (CountNonZeroFunc)GET_OPTIMIZED(countNonZero8u),184(CountNonZeroFunc)GET_OPTIMIZED(countNonZero16u), (CountNonZeroFunc)GET_OPTIMIZED(countNonZero16u),185(CountNonZeroFunc)GET_OPTIMIZED(countNonZero32s), (CountNonZeroFunc)GET_OPTIMIZED(countNonZero32f),186(CountNonZeroFunc)GET_OPTIMIZED(countNonZero64f), 0187};188189return countNonZeroTab[depth];190}191192193#ifdef HAVE_OPENCL194static bool ocl_countNonZero( InputArray _src, int & res )195{196int type = _src.type(), depth = CV_MAT_DEPTH(type), kercn = ocl::predictOptimalVectorWidth(_src);197bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;198199if (depth == CV_64F && !doubleSupport)200return false;201202int dbsize = ocl::Device::getDefault().maxComputeUnits();203size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();204205int wgs2_aligned = 1;206while (wgs2_aligned < (int)wgs)207wgs2_aligned <<= 1;208wgs2_aligned >>= 1;209210ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,211format("-D srcT=%s -D srcT1=%s -D cn=1 -D OP_COUNT_NON_ZERO"212" -D WGS=%d -D kercn=%d -D WGS2_ALIGNED=%d%s%s",213ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),214ocl::typeToStr(depth), (int)wgs, kercn,215wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "",216_src.isContinuous() ? " -D HAVE_SRC_CONT" : ""));217if (k.empty())218return false;219220UMat src = _src.getUMat(), db(1, dbsize, CV_32SC1);221k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),222dbsize, ocl::KernelArg::PtrWriteOnly(db));223224size_t globalsize = dbsize * wgs;225if (k.run(1, &globalsize, &wgs, true))226return res = saturate_cast<int>(cv::sum(db.getMat(ACCESS_READ))[0]), true;227return false;228}229#endif230231#if defined HAVE_IPP232static bool ipp_countNonZero( Mat &src, int &res )233{234CV_INSTRUMENT_REGION_IPP();235236#if IPP_VERSION_X100 < 201801237// Poor performance of SSE42238if(cv::ipp::getIppTopFeatures() == ippCPUID_SSE42)239return false;240#endif241242Ipp32s count = 0;243int depth = src.depth();244245if(src.dims <= 2)246{247IppStatus status;248IppiSize size = {src.cols*src.channels(), src.rows};249250if(depth == CV_8U)251status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_8u_C1R, (const Ipp8u *)src.ptr(), (int)src.step, size, &count, 0, 0);252else if(depth == CV_32F)253status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_32f_C1R, (const Ipp32f *)src.ptr(), (int)src.step, size, &count, 0, 0);254else255return false;256257if(status < 0)258return false;259260res = size.width*size.height - count;261}262else263{264IppStatus status;265const Mat *arrays[] = {&src, NULL};266Mat planes[1];267NAryMatIterator it(arrays, planes, 1);268IppiSize size = {(int)it.size*src.channels(), 1};269res = 0;270for (size_t i = 0; i < it.nplanes; i++, ++it)271{272if(depth == CV_8U)273status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_8u_C1R, it.planes->ptr<Ipp8u>(), (int)it.planes->step, size, &count, 0, 0);274else if(depth == CV_32F)275status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_32f_C1R, it.planes->ptr<Ipp32f>(), (int)it.planes->step, size, &count, 0, 0);276else277return false;278279if(status < 0 || (int)it.planes->total()*src.channels() < count)280return false;281282res += (int)it.planes->total()*src.channels() - count;283}284}285286return true;287}288#endif289290} // cv::291292int cv::countNonZero( InputArray _src )293{294CV_INSTRUMENT_REGION();295296int type = _src.type(), cn = CV_MAT_CN(type);297CV_Assert( cn == 1 );298299#if defined HAVE_OPENCL || defined HAVE_IPP300int res = -1;301#endif302303#ifdef HAVE_OPENCL304CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,305ocl_countNonZero(_src, res),306res)307#endif308309Mat src = _src.getMat();310CV_IPP_RUN_FAST(ipp_countNonZero(src, res), res);311312CountNonZeroFunc func = getCountNonZeroTab(src.depth());313CV_Assert( func != 0 );314315const Mat* arrays[] = {&src, 0};316uchar* ptrs[1] = {};317NAryMatIterator it(arrays, ptrs);318int total = (int)it.size, nz = 0;319320for( size_t i = 0; i < it.nplanes; i++, ++it )321nz += func( ptrs[0], total );322323return nz;324}325326void cv::findNonZero( InputArray _src, OutputArray _idx )327{328CV_INSTRUMENT_REGION();329330Mat src = _src.getMat();331CV_Assert( src.channels() == 1 && src.dims == 2 );332333int depth = src.depth();334std::vector<Point> idxvec;335int rows = src.rows, cols = src.cols;336AutoBuffer<int> buf_(cols + 1);337int* buf = buf_.data();338339for( int i = 0; i < rows; i++ )340{341int j, k = 0;342const uchar* ptr8 = src.ptr(i);343if( depth == CV_8U || depth == CV_8S )344{345for( j = 0; j < cols; j++ )346if( ptr8[j] != 0 ) buf[k++] = j;347}348else if( depth == CV_16U || depth == CV_16S )349{350const ushort* ptr16 = (const ushort*)ptr8;351for( j = 0; j < cols; j++ )352if( ptr16[j] != 0 ) buf[k++] = j;353}354else if( depth == CV_32S )355{356const int* ptr32s = (const int*)ptr8;357for( j = 0; j < cols; j++ )358if( ptr32s[j] != 0 ) buf[k++] = j;359}360else if( depth == CV_32F )361{362const float* ptr32f = (const float*)ptr8;363for( j = 0; j < cols; j++ )364if( ptr32f[j] != 0 ) buf[k++] = j;365}366else367{368const double* ptr64f = (const double*)ptr8;369for( j = 0; j < cols; j++ )370if( ptr64f[j] != 0 ) buf[k++] = j;371}372373if( k > 0 )374{375size_t sz = idxvec.size();376idxvec.resize(sz + k);377for( j = 0; j < k; j++ )378idxvec[sz + j] = Point(buf[j], i);379}380}381382if( idxvec.empty() || (_idx.kind() == _InputArray::MAT && !_idx.getMatRef().isContinuous()) )383_idx.release();384385if( !idxvec.empty() )386Mat(idxvec).copyTo(_idx);387}388389390