Path: blob/master/3rdparty/carotene/src/count_nonzero.cpp
16337 views
/*1* By downloading, copying, installing or using the software you agree to this license.2* If you do not agree to this license, do not download, install,3* copy or use the software.4*5*6* License Agreement7* For Open Source Computer Vision Library8* (3-clause BSD License)9*10* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.11* Third party copyrights are property of their respective owners.12*13* Redistribution and use in source and binary forms, with or without modification,14* are permitted provided that the following conditions are met:15*16* * Redistributions of source code must retain the above copyright notice,17* this list of conditions and the following disclaimer.18*19* * Redistributions in binary form must reproduce the above copyright notice,20* this list of conditions and the following disclaimer in the documentation21* and/or other materials provided with the distribution.22*23* * Neither the names of the copyright holders nor the names of the contributors24* may be used to endorse or promote products derived from this software25* without specific prior written permission.26*27* This software is provided by the copyright holders and contributors "as is" and28* any express or implied warranties, including, but not limited to, the implied29* warranties of merchantability and fitness for a particular purpose are disclaimed.30* In no event shall copyright holders or contributors be liable for any direct,31* indirect, incidental, special, exemplary, or consequential damages32* (including, but not limited to, procurement of substitute goods or services;33* loss of use, data, or profits; or business interruption) however caused34* and on any theory of liability, whether in contract, strict liability,35* or tort (including negligence or otherwise) arising in any way out of36* the use of this software, even if advised of the possibility of such damage.37*/3839#include "common.hpp"4041#include <limits>4243namespace CAROTENE_NS {4445s32 countNonZero(const Size2D &_size,46const u8 * srcBase, ptrdiff_t srcStride)47{48internal::assertSupportedConfiguration();49#ifdef CAROTENE_NEON50Size2D size(_size);51if (srcStride == (ptrdiff_t)(size.width))52{53size.width *= size.height;54size.height = 1;55}56size_t roiw16 = size.width & ~15u;57s32 result = 0;58for(size_t k = 0; k < size.height; ++k)59{60const u8* src = internal::getRowPtr( srcBase, srcStride, k);61size_t i = 0;6263#define COUNTNONZERO8U_BLOCK_SIZE (16*255)64uint8x16_t vc1 = vmovq_n_u8(1);65for (; i < roiw16;)66{67size_t lim = std::min(i + COUNTNONZERO8U_BLOCK_SIZE, size.width) - 16;68uint8x16_t vs = vmovq_n_u8(0);6970for (; i <= lim; i+= 16)71{72internal::prefetch(src + i);73uint8x16_t vln = vld1q_u8(src + i);74uint8x16_t vnz = vminq_u8(vln, vc1);75vs = vaddq_u8(vs, vnz);76}7778uint32x4_t vs4 = vpaddlq_u16(vpaddlq_u8(vs));79uint32x2_t vs2 = vadd_u32(vget_low_u32(vs4), vget_high_u32(vs4));8081s32 s[2];82vst1_u32((u32*)s, vs2);8384if (s[0] < 0 || s[1] < 0)//saturate in case of overflow ~ 2GB of non-zeros...85{86return 0x7fFFffFF;87}88result += (s[0] += s[1]);89if (s[0] < 0 || result < 0)90{91return 0x7fFFffFF;92}93}94for (; i < size.width; i++)95result += (src[i] != 0)?1:0;96if (result < 0)//saturate in case of overflow ~ 2GB of non-zeros...97{98return 0x7fFFffFF;99}100}101return result;102#else103(void)_size;104(void)srcBase;105(void)srcStride;106107return 0;108#endif109}110111s32 countNonZero(const Size2D &_size,112const u16 * srcBase, ptrdiff_t srcStride)113{114internal::assertSupportedConfiguration();115#ifdef CAROTENE_NEON116Size2D size(_size);117if (srcStride == (ptrdiff_t)(size.width))118{119size.width *= size.height;120size.height = 1;121}122size_t roiw8 = size.width & ~7u;123s32 result = 0;124for(size_t k = 0; k < size.height; ++k)125{126const u16* src = internal::getRowPtr( srcBase, srcStride, k);127size_t i = 0;128129#define COUNTNONZERO16U_BLOCK_SIZE (8*(256*256-1))130uint16x8_t vc1 = vmovq_n_u16(1);131for (; i < roiw8;)132{133size_t lim = std::min(i + COUNTNONZERO16U_BLOCK_SIZE, size.width) - 8;134uint16x8_t vs = vmovq_n_u16(0);135136for (; i <= lim; i+= 8)137{138internal::prefetch(src + i);139uint16x8_t vln = vld1q_u16(src + i);140uint16x8_t vnz = vminq_u16(vln, vc1);141vs = vaddq_u16(vs, vnz);142}143144uint32x4_t vs4 = vpaddlq_u16(vs);145uint32x2_t vs2 = vadd_u32(vget_low_u32(vs4), vget_high_u32(vs4));146147s32 s[2];148vst1_u32((u32*)s, vs2);149150if (s[0] < 0 || s[1] < 0)//saturate in case of overflow ~ 4GB of non-zeros...151{152return 0x7fFFffFF;153}154result += (s[0] += s[1]);155if (s[0] < 0 || result < 0)156{157return 0x7fFFffFF;158}159}160for (; i < size.width; i++)161result += (src[i] != 0)?1:0;162if (result < 0)//saturate in case of overflow ~ 4GB of non-zeros...163{164return 0x7fFFffFF;165}166}167return result;168#else169(void)_size;170(void)srcBase;171(void)srcStride;172173return 0;174#endif175}176177s32 countNonZero(const Size2D &_size,178const s32 * srcBase, ptrdiff_t srcStride)179{180internal::assertSupportedConfiguration();181#ifdef CAROTENE_NEON182Size2D size(_size);183if (srcStride == (ptrdiff_t)(size.width))184{185size.width *= size.height;186size.height = 1;187}188size_t roiw4 = size.width & ~3u;189s32 result = 0;190for(size_t k = 0; k < size.height; ++k)191{192const u32* src = (const u32*)internal::getRowPtr( srcBase, srcStride, k);193u32 i = 0;194195uint32x4_t vc1 = vmovq_n_u32(1);196uint32x4_t vs = vmovq_n_u32(0);197198for (; i < roiw4; i += 4 )199{200internal::prefetch(src + i);201uint32x4_t vln = vld1q_u32(src + i);202uint32x4_t vnz = vminq_u32(vln, vc1);203vs = vqaddq_u32(vs, vnz);204}205206uint32x2_t vs2 = vqadd_u32(vget_low_u32(vs), vget_high_u32(vs));207208s32 s[2];209vst1_u32((u32*)s, vs2);210211if (s[0] < 0 || s[1] < 0)//saturate in case of overflow ~ 8GB of non-zeros...212{213return 0x7fFFffFF;214}215result += (s[0] += s[1]);216if (s[0] < 0 || result < 0)217{218return 0x7fFFffFF;219}220221for (; i < size.width; i++)222result += (src[i] != 0)?1:0;223if (result < 0)//saturate in case of overflow ~ 8GB of non-zeros...224{225return 0x7fFFffFF;226}227}228return result;229#else230(void)_size;231(void)srcBase;232(void)srcStride;233234return 0;235#endif236}237238s32 countNonZero(const Size2D &_size,239const f32 * srcBase, ptrdiff_t srcStride)240{241internal::assertSupportedConfiguration();242#ifdef CAROTENE_NEON243Size2D size(_size);244if (srcStride == (ptrdiff_t)(size.width))245{246size.width *= size.height;247size.height = 1;248}249size_t roiw4 = size.width & ~3u;250s32 result = 0;251for(size_t k = 0; k < size.height; ++k)252{253const f32* src = internal::getRowPtr( srcBase, srcStride, k);254size_t i = 0;255256float32x4_t vc0 = vmovq_n_f32(0);257int32x4_t vs = vmovq_n_s32(0);258259for (; i < roiw4; i += 4 )260{261internal::prefetch(src + i);262float32x4_t vln = vld1q_f32(src + i);263int32x4_t vnz = vreinterpretq_s32_u32(vmvnq_u32(vceqq_f32(vln, vc0)));264vs = vqaddq_s32(vs, vnz);265}266267int32x2_t vs2 = vqneg_s32(vqadd_s32(vget_low_s32(vs), vget_high_s32(vs)));268269int s[2];270vst1_s32(s, vs2);271272result += (s[0] += s[1]);273if (s[0] < 0 || result < 0)//case of overflow ~ 8GB of non-zeros...274{275return 0x7fFFffFF;276}277278for (; i < size.width; i++)279result += (src[i] < std::numeric_limits<float>::min() && src[i] > -std::numeric_limits<float>::min())?0:1;280281if (result < 0)282{283return 0x7fFFffFF;284}285}286return result;287#else288(void)_size;289(void)srcBase;290(void)srcStride;291292return 0;293#endif294}295296s32 countNonZero(const Size2D &_size,297const f64 * srcBase, ptrdiff_t srcStride)298{299internal::assertSupportedConfiguration();300#ifdef CAROTENE_NEON301Size2D size(_size);302if (srcStride == (ptrdiff_t)(size.width))303{304size.width *= size.height;305size.height = 1;306}307size_t roiw8 = size.width & ~7u;308size_t roiw4 = size.width & ~3u;309size_t roiw2 = size.width & ~1u;310uint64x2_t vmask1 = vdupq_n_u64(0x7fFFffFFffFFffFFULL); //will treat denormals as non-zero311uint32x4_t vc0 = vmovq_n_u32(0);312313s32 result = 0;314for(size_t k = 0; k < size.height; ++k)315{316const f64* src = internal::getRowPtr( srcBase, srcStride, k);317size_t i = 0;318319int32x2_t vs1 = vmov_n_s32(0);320int32x2_t vs2 = vmov_n_s32(0);321int32x2_t vs3 = vmov_n_s32(0);322int32x2_t vs4 = vmov_n_s32(0);323324for (; i < roiw8; i += 8 )325{326internal::prefetch(src + i + 6);327uint64x2_t vln1 = vld1q_u64((const u64*)(src + i));328uint64x2_t vln2 = vld1q_u64((const u64*)(src + i + 2));329uint64x2_t vln3 = vld1q_u64((const u64*)(src + i + 4));330uint64x2_t vln4 = vld1q_u64((const u64*)(src + i + 6));331332uint64x2_t vm1 = vandq_u64(vln1, vmask1);333uint64x2_t vm2 = vandq_u64(vln2, vmask1);334uint64x2_t vm3 = vandq_u64(vln3, vmask1);335uint64x2_t vm4 = vandq_u64(vln4, vmask1);336337uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);338uint32x4_t vequ2 = vceqq_u32(vreinterpretq_u32_u64(vm2), vc0);339uint32x4_t vequ3 = vceqq_u32(vreinterpretq_u32_u64(vm3), vc0);340uint32x4_t vequ4 = vceqq_u32(vreinterpretq_u32_u64(vm4), vc0);341342uint32x4_t vlx1 = vmvnq_u32(vequ1);343uint32x4_t vlx2 = vmvnq_u32(vequ2);344uint32x4_t vlx3 = vmvnq_u32(vequ3);345uint32x4_t vlx4 = vmvnq_u32(vequ4);346347int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));348int32x2_t vnz2 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx2), vget_high_u32(vlx2)));349int32x2_t vnz3 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx3), vget_high_u32(vlx3)));350int32x2_t vnz4 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx4), vget_high_u32(vlx4)));351352vs1 = vqadd_s32(vs1, vnz1);353vs2 = vqadd_s32(vs2, vnz2);354vs3 = vqadd_s32(vs3, vnz3);355vs4 = vqadd_s32(vs4, vnz4);356}357358if (i < roiw4)359{360internal::prefetch(src + i + 2);361uint64x2_t vln1 = vld1q_u64((const u64*)(src + i));362uint64x2_t vln2 = vld1q_u64((const u64*)(src + i + 2));363364uint64x2_t vm1 = vandq_u64(vln1, vmask1);365uint64x2_t vm2 = vandq_u64(vln2, vmask1);366367uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);368uint32x4_t vequ2 = vceqq_u32(vreinterpretq_u32_u64(vm2), vc0);369370uint32x4_t vlx1 = vmvnq_u32(vequ1);371uint32x4_t vlx2 = vmvnq_u32(vequ2);372373int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));374int32x2_t vnz2 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx2), vget_high_u32(vlx2)));375376vs1 = vqadd_s32(vs1, vnz1);377vs2 = vqadd_s32(vs2, vnz2);378i += 4;379}380381if (i < roiw2)382{383internal::prefetch(src + i);384uint64x2_t vln1 = vld1q_u64((const u64*)(src + i));385386uint64x2_t vm1 = vandq_u64(vln1, vmask1);387388uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);389390uint32x4_t vlx1 = vmvnq_u32(vequ1);391392int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));393394vs1 = vqadd_s32(vs1, vnz1);395i += 2;396}397398vs1 = vqadd_s32(vs1, vs2);399vs3 = vqadd_s32(vs3, vs4);400vs1 = vqadd_s32(vs1, vs3);401int32x2_t vsneg = vqneg_s32(vs1);402403s32 s[2];404vst1_s32(s, vsneg);405406result += (s[0] += s[1]);407if (s[0] < 0 || result < 0)//case of overflow ~ 16GB of non-zeros...408{409return 0x7fFFffFF;410}411412for (; i < size.width; i++)413result += (src[i] < std::numeric_limits<double>::min() && src[i] > -std::numeric_limits<double>::min())?0:1;414if (result < 0)415{416return 0x7fFFffFF;417}418}419return result;420#else421(void)_size;422(void)srcBase;423(void)srcStride;424425return 0;426#endif427}428429} // namespace CAROTENE_NS430431432