Path: blob/master/3rdparty/carotene/src/fill_minmaxloc.cpp
16337 views
/*1* By downloading, copying, installing or using the software you agree to this license.2* If you do not agree to this license, do not download, install,3* copy or use the software.4*5*6* License Agreement7* For Open Source Computer Vision Library8* (3-clause BSD License)9*10* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.11* Third party copyrights are property of their respective owners.12*13* Redistribution and use in source and binary forms, with or without modification,14* are permitted provided that the following conditions are met:15*16* * Redistributions of source code must retain the above copyright notice,17* this list of conditions and the following disclaimer.18*19* * Redistributions in binary form must reproduce the above copyright notice,20* this list of conditions and the following disclaimer in the documentation21* and/or other materials provided with the distribution.22*23* * Neither the names of the copyright holders nor the names of the contributors24* may be used to endorse or promote products derived from this software25* without specific prior written permission.26*27* This software is provided by the copyright holders and contributors "as is" and28* any express or implied warranties, including, but not limited to, the implied29* warranties of merchantability and fitness for a particular purpose are disclaimed.30* In no event shall copyright holders or contributors be liable for any direct,31* indirect, incidental, special, exemplary, or consequential damages32* (including, but not limited to, procurement of substitute goods or services;33* loss of use, data, or profits; or business interruption) however caused34* and on any theory of liability, whether in contract, strict liability,35* or tort (including negligence or otherwise) arising in any way out of36* the use of this software, even if advised of the possibility of such damage.37*/3839#include "common.hpp"4041namespace CAROTENE_NS {4243#ifdef CAROTENE_NEON4445namespace {4647template <typename T>48void process(const T * src, size_t j0, size_t j1, size_t i,49T minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,50T maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)51{52for (size_t j = j0; j < j1; ++j)53{54T val = src[j];5556if (val == maxVal)57{58if (maxLocCount < maxLocCapacity)59{60maxLocPtr[maxLocCount] = j;61maxLocPtr[maxLocCount + 1] = i;62}63maxLocCount += 2;64}6566if (val == minVal)67{68if (minLocCount < minLocCapacity)69{70minLocPtr[minLocCount] = j;71minLocPtr[minLocCount + 1] = i;72}73minLocCount += 2;74}75}76}7778} // namespace7980#endif8182void fillMinMaxLocs(const Size2D & size,83const u8 * srcBase, ptrdiff_t srcStride,84u8 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,85u8 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)86{87internal::assertSupportedConfiguration();88#ifdef CAROTENE_NEON89size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;90size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;9192uint8x16_t v_maxval16 = vdupq_n_u8(maxVal), v_minval16 = vdupq_n_u8(minVal);93uint8x8_t v_maxval8 = vdup_n_u8(maxVal), v_minval8 = vdup_n_u8(minVal);9495u64 mask[2] = { 0ul };9697minLocCapacity <<= 1;98maxLocCapacity <<= 1;99100for (size_t i = 0; i < size.height; ++i)101{102const u8 * src = internal::getRowPtr(srcBase, srcStride, i);103size_t j = 0;104105for ( ; j < roiw16; j += 16)106{107internal::prefetch(src + j);108uint8x16_t v_src = vld1q_u8(src + j);109110uint8x16_t v_maxmask = vceqq_u8(v_src, v_maxval16);111uint8x16_t v_minmask = vceqq_u8(v_src, v_minval16);112uint8x16_t v_mask = vorrq_u8(v_maxmask, v_minmask);113114vst1q_u8((u8 *)&mask[0], v_mask);115116if (mask[0])117process(src, j, j + 8, i,118minVal, minLocPtr, minLocCount, minLocCapacity,119maxVal, maxLocPtr, maxLocCount, maxLocCapacity);120if (mask[1])121process(src, j + 8, j + 16, i,122minVal, minLocPtr, minLocCount, minLocCapacity,123maxVal, maxLocPtr, maxLocCount, maxLocCapacity);124}125for ( ; j < roiw8; j += 8)126{127uint8x8_t v_src = vld1_u8(src + j);128129uint8x8_t v_maxmask = vceq_u8(v_src, v_maxval8);130uint8x8_t v_minmask = vceq_u8(v_src, v_minval8);131uint8x8_t v_mask = vorr_u8(v_maxmask, v_minmask);132133vst1_u8((u8 *)&mask[0], v_mask);134135if (mask[0])136process(src, j, j + 8, i,137minVal, minLocPtr, minLocCount, minLocCapacity,138maxVal, maxLocPtr, maxLocCount, maxLocCapacity);139}140141process(src, j, size.width, i,142minVal, minLocPtr, minLocCount, minLocCapacity,143maxVal, maxLocPtr, maxLocCount, maxLocCapacity);144}145146minLocCount >>= 1;147maxLocCount >>= 1;148#else149(void)size;150(void)srcBase;151(void)srcStride;152(void)minVal;153(void)minLocPtr;154(void)minLocCount;155(void)minLocCapacity;156(void)maxVal;157(void)maxLocPtr;158(void)maxLocCount;159(void)maxLocCapacity;160#endif161}162163void fillMinMaxLocs(const Size2D & size,164const u16 * srcBase, ptrdiff_t srcStride,165u16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,166u16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)167{168internal::assertSupportedConfiguration();169#ifdef CAROTENE_NEON170size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;171size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;172173uint16x8_t v_maxval8 = vdupq_n_u16(maxVal),174v_minval8 = vdupq_n_u16(minVal);175u64 mask[2] = { 0ul };176177minLocCapacity <<= 1;178maxLocCapacity <<= 1;179180for (size_t i = 0; i < size.height; ++i)181{182const u16 * src = internal::getRowPtr(srcBase, srcStride, i);183size_t j = 0;184185for ( ; j < roiw16; j += 16)186{187internal::prefetch(src + j);188uint16x8_t v_src0 = vld1q_u16(src + j), v_src1 = vld1q_u16(src + j + 8);189190uint16x8_t v_mask0 = vorrq_u16(vceqq_u16(v_src0, v_maxval8), vceqq_u16(v_src0, v_minval8));191uint16x8_t v_mask1 = vorrq_u16(vceqq_u16(v_src1, v_maxval8), vceqq_u16(v_src1, v_minval8));192193vst1q_u8((u8 *)&mask[0], vcombine_u8(vmovn_u16(v_mask0), vmovn_u16(v_mask1)));194195if (mask[0])196process(src, j, j + 8, i,197minVal, minLocPtr, minLocCount, minLocCapacity,198maxVal, maxLocPtr, maxLocCount, maxLocCapacity);199if (mask[1])200process(src, j + 8, j + 16, i,201minVal, minLocPtr, minLocCount, minLocCapacity,202maxVal, maxLocPtr, maxLocCount, maxLocCapacity);203}204for ( ; j < roiw8; j += 8)205{206internal::prefetch(src + j);207uint16x8_t v_src = vld1q_u16(src + j);208209uint16x8_t v_maxmask = vceqq_u16(v_src, v_maxval8);210uint16x8_t v_minmask = vceqq_u16(v_src, v_minval8);211uint16x8_t v_mask = vorrq_u16(v_maxmask, v_minmask);212213vst1_u8((u8 *)&mask[0], vmovn_u16(v_mask));214215if (mask[0])216process(src, j, j + 8, i,217minVal, minLocPtr, minLocCount, minLocCapacity,218maxVal, maxLocPtr, maxLocCount, maxLocCapacity);219}220221process(src, j, size.width, i,222minVal, minLocPtr, minLocCount, minLocCapacity,223maxVal, maxLocPtr, maxLocCount, maxLocCapacity);224}225226minLocCount >>= 1;227maxLocCount >>= 1;228#else229(void)size;230(void)srcBase;231(void)srcStride;232(void)minVal;233(void)minLocPtr;234(void)minLocCount;235(void)minLocCapacity;236(void)maxVal;237(void)maxLocPtr;238(void)maxLocCount;239(void)maxLocCapacity;240#endif241}242243void fillMinMaxLocs(const Size2D & size,244const s16 * srcBase, ptrdiff_t srcStride,245s16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,246s16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)247{248internal::assertSupportedConfiguration();249#ifdef CAROTENE_NEON250size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;251size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;252253int16x8_t v_maxval8 = vdupq_n_s16(maxVal),254v_minval8 = vdupq_n_s16(minVal);255u64 mask[2] = { 0ul };256257minLocCapacity <<= 1;258maxLocCapacity <<= 1;259260for (size_t i = 0; i < size.height; ++i)261{262const s16 * src = internal::getRowPtr(srcBase, srcStride, i);263size_t j = 0;264265for ( ; j < roiw16; j += 16)266{267internal::prefetch(src + j);268int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8);269270uint16x8_t v_mask0 = vorrq_u16(vceqq_s16(v_src0, v_maxval8), vceqq_s16(v_src0, v_minval8));271uint16x8_t v_mask1 = vorrq_u16(vceqq_s16(v_src1, v_maxval8), vceqq_s16(v_src1, v_minval8));272273vst1q_u8((u8 *)&mask[0], vcombine_u8(vmovn_u16(v_mask0), vmovn_u16(v_mask1)));274275if (mask[0])276process(src, j, j + 8, i,277minVal, minLocPtr, minLocCount, minLocCapacity,278maxVal, maxLocPtr, maxLocCount, maxLocCapacity);279if (mask[1])280process(src, j + 8, j + 16, i,281minVal, minLocPtr, minLocCount, minLocCapacity,282maxVal, maxLocPtr, maxLocCount, maxLocCapacity);283}284for ( ; j < roiw8; j += 8)285{286internal::prefetch(src + j);287int16x8_t v_src = vld1q_s16(src + j);288289uint16x8_t v_maxmask = vceqq_s16(v_src, v_maxval8);290uint16x8_t v_minmask = vceqq_s16(v_src, v_minval8);291uint16x8_t v_mask = vorrq_u16(v_maxmask, v_minmask);292293vst1_u8((u8 *)&mask[0], vmovn_u16(v_mask));294295if (mask[0])296process(src, j, j + 8, i,297minVal, minLocPtr, minLocCount, minLocCapacity,298maxVal, maxLocPtr, maxLocCount, maxLocCapacity);299}300301process(src, j, size.width, i,302minVal, minLocPtr, minLocCount, minLocCapacity,303maxVal, maxLocPtr, maxLocCount, maxLocCapacity);304}305306minLocCount >>= 1;307maxLocCount >>= 1;308#else309(void)size;310(void)srcBase;311(void)srcStride;312(void)minVal;313(void)minLocPtr;314(void)minLocCount;315(void)minLocCapacity;316(void)maxVal;317(void)maxLocPtr;318(void)maxLocCount;319(void)maxLocCapacity;320#endif321}322323void fillMinMaxLocs(const Size2D & size,324const s32 * srcBase, ptrdiff_t srcStride,325s32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,326s32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)327{328internal::assertSupportedConfiguration();329#ifdef CAROTENE_NEON330size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;331332int32x4_t v_maxval4 = vdupq_n_s32(maxVal),333v_minval4 = vdupq_n_s32(minVal);334u64 mask = 0ul;335336minLocCapacity <<= 1;337maxLocCapacity <<= 1;338339for (size_t i = 0; i < size.height; ++i)340{341const s32 * src = internal::getRowPtr(srcBase, srcStride, i);342size_t j = 0;343344for ( ; j < roiw8; j += 8)345{346internal::prefetch(src + j);347int32x4_t v_src0 = vld1q_s32(src + j), v_src1 = vld1q_s32(src + j + 4);348349uint32x4_t v_mask0 = vorrq_u32(vceqq_s32(v_src0, v_maxval4), vceqq_s32(v_src0, v_minval4));350uint32x4_t v_mask1 = vorrq_u32(vceqq_s32(v_src1, v_maxval4), vceqq_s32(v_src1, v_minval4));351352vst1_u8((u8 *)&mask, vmovn_u16(vcombine_u16(vmovn_u32(v_mask0), vmovn_u32(v_mask1))));353354if (mask)355process(src, j, j + 8, i,356minVal, minLocPtr, minLocCount, minLocCapacity,357maxVal, maxLocPtr, maxLocCount, maxLocCapacity);358}359360process(src, j, size.width, i,361minVal, minLocPtr, minLocCount, minLocCapacity,362maxVal, maxLocPtr, maxLocCount, maxLocCapacity);363}364365minLocCount >>= 1;366maxLocCount >>= 1;367#else368(void)size;369(void)srcBase;370(void)srcStride;371(void)minVal;372(void)minLocPtr;373(void)minLocCount;374(void)minLocCapacity;375(void)maxVal;376(void)maxLocPtr;377(void)maxLocCount;378(void)maxLocCapacity;379#endif380}381382void fillMinMaxLocs(const Size2D & size,383const u32 * srcBase, ptrdiff_t srcStride,384u32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,385u32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)386{387internal::assertSupportedConfiguration();388#ifdef CAROTENE_NEON389size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;390391uint32x4_t v_maxval4 = vdupq_n_u32(maxVal),392v_minval4 = vdupq_n_u32(minVal);393u64 mask = 0ul;394395minLocCapacity <<= 1;396maxLocCapacity <<= 1;397398for (size_t i = 0; i < size.height; ++i)399{400const u32 * src = internal::getRowPtr(srcBase, srcStride, i);401size_t j = 0;402403for ( ; j < roiw8; j += 8)404{405internal::prefetch(src + j);406uint32x4_t v_src0 = vld1q_u32(src + j), v_src1 = vld1q_u32(src + j + 4);407408uint32x4_t v_mask0 = vorrq_u32(vceqq_u32(v_src0, v_maxval4), vceqq_u32(v_src0, v_minval4));409uint32x4_t v_mask1 = vorrq_u32(vceqq_u32(v_src1, v_maxval4), vceqq_u32(v_src1, v_minval4));410411vst1_u8((u8 *)&mask, vmovn_u16(vcombine_u16(vmovn_u32(v_mask0), vmovn_u32(v_mask1))));412413if (mask)414process(src, j, j + 8, i,415minVal, minLocPtr, minLocCount, minLocCapacity,416maxVal, maxLocPtr, maxLocCount, maxLocCapacity);417}418419process(src, j, size.width, i,420minVal, minLocPtr, minLocCount, minLocCapacity,421maxVal, maxLocPtr, maxLocCount, maxLocCapacity);422}423424minLocCount >>= 1;425maxLocCount >>= 1;426#else427(void)size;428(void)srcBase;429(void)srcStride;430(void)minVal;431(void)minLocPtr;432(void)minLocCount;433(void)minLocCapacity;434(void)maxVal;435(void)maxLocPtr;436(void)maxLocCount;437(void)maxLocCapacity;438#endif439}440441} // namespace CAROTENE_NS442443444