// Path: blob/master/3rdparty/carotene/src/minmaxloc.cpp
// 16337 views
/*1* By downloading, copying, installing or using the software you agree to this license.2* If you do not agree to this license, do not download, install,3* copy or use the software.4*5*6* License Agreement7* For Open Source Computer Vision Library8* (3-clause BSD License)9*10* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.11* Third party copyrights are property of their respective owners.12*13* Redistribution and use in source and binary forms, with or without modification,14* are permitted provided that the following conditions are met:15*16* * Redistributions of source code must retain the above copyright notice,17* this list of conditions and the following disclaimer.18*19* * Redistributions in binary form must reproduce the above copyright notice,20* this list of conditions and the following disclaimer in the documentation21* and/or other materials provided with the distribution.22*23* * Neither the names of the copyright holders nor the names of the contributors24* may be used to endorse or promote products derived from this software25* without specific prior written permission.26*27* This software is provided by the copyright holders and contributors "as is" and28* any express or implied warranties, including, but not limited to, the implied29* warranties of merchantability and fitness for a particular purpose are disclaimed.30* In no event shall copyright holders or contributors be liable for any direct,31* indirect, incidental, special, exemplary, or consequential damages32* (including, but not limited to, procurement of substitute goods or services;33* loss of use, data, or profits; or business interruption) however caused34* and on any theory of liability, whether in contract, strict liability,35* or tort (including negligence or otherwise) arising in any way out of36* the use of this software, even if advised of the possibility of such damage.37*/3839#include "common.hpp"40#include "vtransform.hpp"4142#include <limits>4344namespace 
CAROTENE_NS {4546#ifdef CAROTENE_NEON4748namespace {4950template <typename T>51void minMaxVals(const Size2D &size,52const T * srcBase, ptrdiff_t srcStride,53T * pMinVal, T * pMaxVal)54{55using namespace internal;5657typedef typename VecTraits<T>::vec128 vec128;58typedef typename VecTraits<T>::vec64 vec64;5960u32 step_base = 32 / sizeof(T), step_tail = 8 / sizeof(T);61size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0;62size_t roiw_tail = size.width >= (step_tail - 1) ? size.width - step_tail + 1 : 0;6364T maxVal = std::numeric_limits<T>::min();65T minVal = std::numeric_limits<T>::max();66vec128 v_min_base = vdupq_n(minVal), v_max_base = vdupq_n(maxVal);67vec64 v_min_tail = vdup_n(minVal), v_max_tail = vdup_n(maxVal);6869for (size_t i = 0; i < size.height; ++i)70{71const T * src = getRowPtr(srcBase, srcStride, i);72size_t j = 0;7374for (; j < roiw_base; j += step_base)75{76prefetch(src + j);77vec128 v_src0 = vld1q(src + j), v_src1 = vld1q(src + j + 16 / sizeof(T));78v_min_base = vminq(v_min_base, v_src0);79v_max_base = vmaxq(v_max_base, v_src0);80v_min_base = vminq(v_min_base, v_src1);81v_max_base = vmaxq(v_max_base, v_src1);82}83for (; j < roiw_tail; j += step_tail)84{85vec64 v_src0 = vld1(src + j);86v_min_tail = vmin(v_min_tail, v_src0);87v_max_tail = vmax(v_max_tail, v_src0);88}8990for (; j < size.width; j++)91{92T srcval = src[j];93minVal = std::min(srcval, minVal);94maxVal = std::max(srcval, maxVal);95}96}9798// collect min & max values99T ar[16 / sizeof(T)];100vst1q(ar, vcombine(vmin(v_min_tail, vmin(vget_low(v_min_base), vget_high(v_min_base))),101vmax(v_max_tail, vmax(vget_low(v_max_base), vget_high(v_max_base)))));102103for (size_t x = 0; x < 8u / sizeof(T); ++x)104{105minVal = std::min(minVal, ar[x]);106maxVal = std::max(maxVal, ar[x + 8 / sizeof(T)]);107}108109if (pMaxVal)110*pMaxVal = maxVal;111if (pMinVal)112*pMinVal = minVal;113}114115} // namespace116117#endif118119void minMaxVals(const Size2D &size,120const u8 * 
srcBase, ptrdiff_t srcStride,121u8 * pMinVal, u8 * pMaxVal)122{123internal::assertSupportedConfiguration();124#ifdef CAROTENE_NEON125minMaxVals<u8>(size,126srcBase, srcStride,127pMinVal, pMaxVal);128#else129(void)size;130(void)srcBase;131(void)srcStride;132(void)pMinVal;133(void)pMaxVal;134#endif135}136137void minMaxVals(const Size2D &size,138const s16 * srcBase, ptrdiff_t srcStride,139s16 * pMinVal, s16 * pMaxVal)140{141internal::assertSupportedConfiguration();142#ifdef CAROTENE_NEON143minMaxVals<s16>(size,144srcBase, srcStride,145pMinVal, pMaxVal);146#else147(void)size;148(void)srcBase;149(void)srcStride;150(void)pMinVal;151(void)pMaxVal;152#endif153}154155void minMaxVals(const Size2D &size,156const u16 * srcBase, ptrdiff_t srcStride,157u16 * pMinVal, u16 * pMaxVal)158{159internal::assertSupportedConfiguration();160#ifdef CAROTENE_NEON161minMaxVals<u16>(size,162srcBase, srcStride,163pMinVal, pMaxVal);164#else165(void)size;166(void)srcBase;167(void)srcStride;168(void)pMinVal;169(void)pMaxVal;170#endif171}172173void minMaxVals(const Size2D &size,174const s32 * srcBase, ptrdiff_t srcStride,175s32 * pMinVal, s32 * pMaxVal)176{177internal::assertSupportedConfiguration();178#ifdef CAROTENE_NEON179minMaxVals<s32>(size,180srcBase, srcStride,181pMinVal, pMaxVal);182#else183(void)size;184(void)srcBase;185(void)srcStride;186(void)pMinVal;187(void)pMaxVal;188#endif189}190191void minMaxVals(const Size2D &size,192const u32 * srcBase, ptrdiff_t srcStride,193u32 * pMinVal, u32 * pMaxVal)194{195internal::assertSupportedConfiguration();196#ifdef CAROTENE_NEON197minMaxVals<u32>(size,198srcBase, srcStride,199pMinVal, pMaxVal);200#else201(void)size;202(void)srcBase;203(void)srcStride;204(void)pMinVal;205(void)pMaxVal;206#endif207}208209void minMaxLoc(const Size2D &size,210const f32 * srcBase, ptrdiff_t srcStride,211f32 &minVal, size_t &minCol, size_t &minRow,212f32 &maxVal, size_t &maxCol, size_t &maxRow)213{214internal::assertSupportedConfiguration();215#ifdef CAROTENE_NEON216minVal 
= srcBase[0];217minCol = 0;218minRow = 0;219maxVal = srcBase[0];220maxCol = 0;221maxRow = 0;222for(size_t l = 0, i = 0; l < size.height; ++l, i = 0)223{224const f32 * src = internal::getRowPtr( srcBase, srcStride, l);225if (size.width >= 16)226{227u32 tmp0123[4] = { 0, 1, 2, 3 };228uint32x4_t c4 = vdupq_n_u32(4);229230#if SIZE_MAX > UINT32_MAX231size_t boundAll = size.width - (4 - 1);232for(size_t b = 0; i < boundAll; b = i)233{234size_t bound = std::min<size_t>(boundAll, b + 0xffffFFFC);235#else236{237size_t bound = size.width - (4 - 1);238#endif239uint32x4_t lineIdxOffset = vld1q_u32(tmp0123);240float32x4_t n_min = vdupq_n_f32(minVal);241uint32x4_t n_minIdx = vdupq_n_u32(0xffffFFFC);242float32x4_t n_max = vdupq_n_f32(maxVal);243uint32x4_t n_maxIdx = vdupq_n_u32(0xffffFFFC);244245for(; i < bound; i+=4)246{247internal::prefetch(src + i);248float32x4_t line = vld1q_f32(src + i);249250uint32x4_t minmask = vcltq_f32(line, n_min);251uint32x4_t maxmask = vcgtq_f32(line, n_max);252253n_min = vbslq_f32(minmask, line, n_min);254n_minIdx = vbslq_u32(minmask, lineIdxOffset, n_minIdx);255n_max = vbslq_f32(maxmask, line, n_max);256n_maxIdx = vbslq_u32(maxmask, lineIdxOffset, n_maxIdx);257258// idx[] +=4259lineIdxOffset = vaddq_u32(lineIdxOffset, c4);260}261262f32 fmin[4], fmax[4];263u32 fminIdx[4], fmaxIdx[4];264265vst1q_f32(fmin, n_min);266vst1q_f32(fmax, n_max);267268vst1q_u32(fminIdx, n_minIdx);269vst1q_u32(fmaxIdx, n_maxIdx);270271size_t minIdx = fminIdx[0];272size_t maxIdx = fmaxIdx[0];273minVal = fmin[0];274maxVal = fmax[0];275276for (s32 j = 1; j < 4; ++j)277{278f32 minval = fmin[j];279f32 maxval = fmax[j];280if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))281{282minIdx = fminIdx[j];283minVal = minval;284}285if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))286{287maxIdx = fmaxIdx[j];288maxVal = maxval;289}290}291if(minIdx < 0xffffFFFC)292{293#if SIZE_MAX > UINT32_MAX294minCol = b + minIdx;295#else296minCol = minIdx;297#endif298minRow = 
l;299}300if(maxIdx < 0xffffFFFC)301{302#if SIZE_MAX > UINT32_MAX303maxCol = b + maxIdx;304#else305maxCol = maxIdx;306#endif307maxRow = l;308}309}310}311for(; i < size.width; ++i )312{313float val = src[i];314if( val < minVal )315{316minVal = val;317minCol = i;318minRow = l;319}320else if( val > maxVal )321{322maxVal = val;323maxCol = i;324maxRow = l;325}326}327}328#else329(void)size;330(void)srcBase;331(void)srcStride;332(void)minVal;333(void)minCol;334(void)minRow;335(void)maxVal;336(void)maxCol;337(void)maxRow;338#endif339}340341void minMaxLoc(const Size2D &size,342const f32 * srcBase, ptrdiff_t srcStride,343const u8 * maskBase, ptrdiff_t maskStride,344f32 &minVal, size_t &minCol, size_t &minRow,345f32 &maxVal, size_t &maxCol, size_t &maxRow)346{347internal::assertSupportedConfiguration();348#ifdef CAROTENE_NEON349minVal = std::numeric_limits<f32>::max();350minCol = size.width;351minRow = size.height;352maxVal = -std::numeric_limits<f32>::max();353maxCol = size.width;354maxRow = size.height;355for(size_t l = 0, i = 0; l < size.height; ++l, i = 0)356{357const f32 * src = internal::getRowPtr( srcBase, srcStride, l);358const u8 * mask = internal::getRowPtr( maskBase, maskStride, l);359if (size.width >= 16)360{361u32 tmp0123[4] = { 0, 1, 2, 3 };362uint32x4_t uOne = vdupq_n_u32(1);363uint32x4_t c4 = vdupq_n_u32(4);364365#if SIZE_MAX > UINT32_MAX366size_t boundAll = size.width - (4 - 1);367for(size_t b = 0; i < boundAll; b = i)368{369size_t bound = std::min<size_t>(boundAll, b + 0xffffFFFC);370#else371{372size_t bound = size.width - (4 - 1);373#endif374uint32x4_t lineIdxOffset = vld1q_u32(tmp0123);375float32x4_t n_min = vdupq_n_f32(minVal);376uint32x4_t n_minIdx = vdupq_n_u32(0xffffFFFC);377float32x4_t n_max = vdupq_n_f32(maxVal);378uint32x4_t n_maxIdx = vdupq_n_u32(0xffffFFFC);379380for(; i < bound; i+=4)381{382internal::prefetch(src + i);383internal::prefetch(mask + i);384float32x4_t line = vld1q_f32(src + i);385uint8x8_t maskLine = vld1_u8(mask + i);386387uint32x4_t 
maskLine4 = vmovl_u16(vget_low_u16(vmovl_u8(maskLine)));388maskLine4 = vcgeq_u32(maskLine4, uOne);389390uint32x4_t minmask = vcltq_f32(line, n_min);391uint32x4_t maxmask = vcgtq_f32(line, n_max);392393minmask = vandq_u32(minmask, maskLine4);394maxmask = vandq_u32(maxmask, maskLine4);395396n_min = vbslq_f32(minmask, line, n_min);397n_minIdx = vbslq_u32(minmask, lineIdxOffset, n_minIdx);398n_max = vbslq_f32(maxmask, line, n_max);399n_maxIdx = vbslq_u32(maxmask, lineIdxOffset, n_maxIdx);400401// idx[] +=4402lineIdxOffset = vaddq_u32(lineIdxOffset, c4);403}404405f32 fmin[4], fmax[4];406u32 fminIdx[4], fmaxIdx[4];407408vst1q_f32(fmin, n_min);409vst1q_f32(fmax, n_max);410411vst1q_u32(fminIdx, n_minIdx);412vst1q_u32(fmaxIdx, n_maxIdx);413414size_t minIdx = fminIdx[0];415size_t maxIdx = fmaxIdx[0];416minVal = fmin[0];417maxVal = fmax[0];418419for (s32 j = 1; j < 4; ++j)420{421f32 minval = fmin[j];422f32 maxval = fmax[j];423if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))424{425minIdx = fminIdx[j];426minVal = minval;427}428if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))429{430maxIdx = fmaxIdx[j];431maxVal = maxval;432}433}434if(minIdx < 0xffffFFFC)435{436#if SIZE_MAX > UINT32_MAX437minCol = b + minIdx;438#else439minCol = minIdx;440#endif441minRow = l;442}443if(maxIdx < 0xffffFFFC)444{445#if SIZE_MAX > UINT32_MAX446maxCol = b + maxIdx;447#else448maxCol = maxIdx;449#endif450maxRow = l;451}452}453}454for(; i < size.width; i++ )455{456if (!mask[i])457continue;458f32 val = src[i];459if( val < minVal )460{461minVal = val;462minCol = i;463minRow = l;464}465if( val > maxVal )466{467maxVal = val;468maxCol = i;469maxRow = l;470}471}472}473#else474(void)size;475(void)srcBase;476(void)srcStride;477(void)maskBase;478(void)maskStride;479(void)minVal;480(void)minCol;481(void)minRow;482(void)maxVal;483(void)maxCol;484(void)maxRow;485#endif486}487488void minMaxLoc(const Size2D &size,489const s32 * srcBase, ptrdiff_t srcStride,490s32 &minVal, size_t 
&minCol, size_t &minRow,491s32 &maxVal, size_t &maxCol, size_t &maxRow)492{493internal::assertSupportedConfiguration();494#ifdef CAROTENE_NEON495minVal = srcBase[0];496minCol = 0;497minRow = 0;498maxVal = srcBase[0];499maxCol = 0;500maxRow = 0;501for(size_t l = 0, i = 0; l < size.height; ++l, i = 0)502{503const s32 * src = internal::getRowPtr( srcBase, srcStride, l);504if (size.width >= 16)505{506u32 tmp0123[4] = { 0, 1, 2, 3 };507uint32x4_t c4 = vdupq_n_u32(4);508509#if SIZE_MAX > UINT32_MAX510size_t boundAll = size.width - (4 - 1);511for(size_t b = 0; i < boundAll; b = i)512{513size_t bound = std::min<size_t>(boundAll, b + 0xffffFFFC);514#else515{516size_t bound = size.width - (4 - 1);517#endif518uint32x4_t lineIdxOffset = vld1q_u32(tmp0123);519int32x4_t n_min = vdupq_n_s32(minVal);520uint32x4_t n_minIdx = vdupq_n_u32(0xffffFFFC);521int32x4_t n_max = vdupq_n_s32(maxVal);522uint32x4_t n_maxIdx = vdupq_n_u32(0xffffFFFC);523524for(; i < bound; i+=4 )525{526internal::prefetch(src + i);527int32x4_t line = vld1q_s32(src + i);528529uint32x4_t minmask = vcltq_s32(line, n_min);530uint32x4_t maxmask = vcgtq_s32(line, n_max);531532n_min = vbslq_s32(minmask, line, n_min);533n_minIdx = vbslq_u32(minmask, lineIdxOffset, n_minIdx);534n_max = vbslq_s32(maxmask, line, n_max);535n_maxIdx = vbslq_u32(maxmask, lineIdxOffset, n_maxIdx);536537// idx[] +=4538lineIdxOffset = vaddq_u32(lineIdxOffset, c4);539}540541s32 fmin[4], fmax[4];542u32 fminIdx[4], fmaxIdx[4];543544vst1q_s32(fmin, n_min);545vst1q_s32(fmax, n_max);546547vst1q_u32(fminIdx, n_minIdx);548vst1q_u32(fmaxIdx, n_maxIdx);549550size_t minIdx = fminIdx[0];551size_t maxIdx = fmaxIdx[0];552minVal = fmin[0];553maxVal = fmax[0];554555for (s32 j = 1; j < 4; ++j)556{557s32 minval = fmin[j];558s32 maxval = fmax[j];559if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))560{561minIdx = fminIdx[j];562minVal = minval;563}564if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))565{566maxIdx = fmaxIdx[j];567maxVal 
= maxval;568}569}570if(minIdx < 0xffffFFFC)571{572#if SIZE_MAX > UINT32_MAX573minCol = b + minIdx;574#else575minCol = minIdx;576#endif577minRow = l;578}579if(maxIdx < 0xffffFFFC)580{581#if SIZE_MAX > UINT32_MAX582maxCol = b + maxIdx;583#else584maxCol = maxIdx;585#endif586maxRow = l;587}588}589}590for(; i < size.width; ++i )591{592s32 val = src[i];593if( val < minVal )594{595minVal = val;596minCol = i;597minRow = l;598}599else if( val > maxVal )600{601maxVal = val;602maxCol = i;603maxRow = l;604}605}606}607#else608(void)size;609(void)srcBase;610(void)srcStride;611(void)minVal;612(void)minCol;613(void)minRow;614(void)maxVal;615(void)maxCol;616(void)maxRow;617#endif618}619620void minMaxLoc(const Size2D &size,621const s16 * srcBase, ptrdiff_t srcStride,622s16 &minVal, size_t &minCol, size_t &minRow,623s16 &maxVal, size_t &maxCol, size_t &maxRow)624{625internal::assertSupportedConfiguration();626#ifdef CAROTENE_NEON627minVal = srcBase[0];628minCol = 0;629minRow = 0;630maxVal = srcBase[0];631maxCol = 0;632maxRow = 0;633for(size_t l = 0, i = 0; l < size.height; ++l, i = 0)634{635const s16 * src = internal::getRowPtr( srcBase, srcStride, l);636if (size.width >= 32)637{638u32 tmp0123[4] = { 0, 1, 2, 3 };639uint32x4_t c8 = vdupq_n_u32(8);640641#if SIZE_MAX > UINT32_MAX642size_t boundAll = size.width - (8 - 1);643for(size_t b = 0; i < boundAll; b = i)644{645size_t bound = std::min<size_t>(boundAll, b + 0xffffFFF8);646#else647{648size_t bound = size.width - (8 - 1);649#endif650uint32x4_t lineIdxOffset = vld1q_u32(tmp0123);651int16x8_t n_min = vdupq_n_s16(minVal);652uint32x4_t n_minIdxl = vdupq_n_u32(0xffffFFF8);653uint32x4_t n_minIdxh = vdupq_n_u32(0xffffFFF8);654int16x8_t n_max = vdupq_n_s16(maxVal);655uint32x4_t n_maxIdxl = vdupq_n_u32(0xffffFFF8);656uint32x4_t n_maxIdxh = vdupq_n_u32(0xffffFFF8);657658for(; i < bound; i+=8 )659{660internal::prefetch(src + i);661int16x8_t line = vld1q_s16(src + i);662663uint16x8_t minmask = vcltq_s16(line, n_min);664uint16x8_t maxmask = 
vcgtq_s16(line, n_max);665666n_min = vbslq_s16(minmask, line, n_min);667uint16x4_t minml = vget_low_u16(minmask);668uint16x4_t minmh = vget_high_u16(minmask);669uint32x4_t minml2 = vmovl_u16(minml);670uint32x4_t minmh2 = vmovl_u16(minmh);671minml2 = vqshlq_n_u32(minml2, 31);672minmh2 = vqshlq_n_u32(minmh2, 31);673n_minIdxl = vbslq_u32(minml2, lineIdxOffset, n_minIdxl);674n_minIdxh = vbslq_u32(minmh2, lineIdxOffset, n_minIdxh);675676n_max = vbslq_s16(maxmask, line, n_max);677uint16x4_t maxml = vget_low_u16(maxmask);678uint16x4_t maxmh = vget_high_u16(maxmask);679uint32x4_t maxml2 = vmovl_u16(maxml);680uint32x4_t maxmh2 = vmovl_u16(maxmh);681maxml2 = vqshlq_n_u32(maxml2, 31);682maxmh2 = vqshlq_n_u32(maxmh2, 31);683n_maxIdxl = vbslq_u32(maxml2, lineIdxOffset, n_maxIdxl);684n_maxIdxh = vbslq_u32(maxmh2, lineIdxOffset, n_maxIdxh);685686// idx[] +=8687lineIdxOffset = vaddq_u32(lineIdxOffset, c8);688}689690// fix high part of indexes691uint32x4_t c4 = vdupq_n_u32((int32_t) 4);692n_minIdxh = vaddq_u32(n_minIdxh, c4);693n_maxIdxh = vaddq_u32(n_maxIdxh, c4);694695s16 fmin[8], fmax[8];696u32 fminIdx[8], fmaxIdx[8];697698vst1q_s16(fmin, n_min);699vst1q_s16(fmax, n_max);700vst1q_u32(fminIdx+0, n_minIdxl);701vst1q_u32(fmaxIdx+0, n_maxIdxl);702vst1q_u32(fminIdx+4, n_minIdxh);703vst1q_u32(fmaxIdx+4, n_maxIdxh);704705size_t minIdx = fminIdx[0];706size_t maxIdx = fmaxIdx[0];707minVal = fmin[0];708maxVal = fmax[0];709710for (s32 j = 1; j < 8; ++j)711{712s16 minval = fmin[j];713s16 maxval = fmax[j];714if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))715{716minIdx = fminIdx[j];717minVal = minval;718}719if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))720{721maxIdx = fmaxIdx[j];722maxVal = maxval;723}724}725if(minIdx < 0xffffFFF8)726{727#if SIZE_MAX > UINT32_MAX728minCol = b + minIdx;729#else730minCol = minIdx;731#endif732minRow = l;733}734if(maxIdx < 0xffffFFF8)735{736#if SIZE_MAX > UINT32_MAX737maxCol = b + maxIdx;738#else739maxCol = 
maxIdx;740#endif741maxRow = l;742}743}744}745for(; i < size.width; ++i )746{747short val = src[i];748if( val < minVal )749{750minVal = val;751minCol = i;752minRow = l;753}754else if( val > maxVal )755{756maxVal = val;757maxCol = i;758maxRow = l;759}760}761}762#else763(void)size;764(void)srcBase;765(void)srcStride;766(void)minVal;767(void)minCol;768(void)minRow;769(void)maxVal;770(void)maxCol;771(void)maxRow;772#endif773}774775void minMaxLoc(const Size2D &size,776const u16 * srcBase, ptrdiff_t srcStride,777u16 &minVal, size_t &minCol, size_t &minRow,778u16 &maxVal, size_t &maxCol, size_t &maxRow)779{780internal::assertSupportedConfiguration();781#ifdef CAROTENE_NEON782minVal = srcBase[0];783minCol = 0;784minRow = 0;785maxVal = srcBase[0];786maxCol = 0;787maxRow = 0;788for(size_t l = 0, i = 0; l < size.height; ++l, i = 0)789{790const u16 * src = internal::getRowPtr( srcBase, srcStride, l);791if (size.width >= 32)792{793u32 tmp0123[4] = { 0, 1, 2, 3 };794uint32x4_t c8 = vdupq_n_u32(8);795796#if SIZE_MAX > UINT32_MAX797size_t boundAll = size.width - (8 - 1);798for(size_t b = 0; i < boundAll; b = i)799{800size_t bound = std::min<size_t>(boundAll, b + 0xffffFFF8);801#else802{803size_t bound = size.width - (8 - 1);804#endif805uint32x4_t lineIdxOffset = vld1q_u32(tmp0123);806uint16x8_t n_min = vdupq_n_u16(minVal);807uint32x4_t n_minIdxl = vdupq_n_u32(0xffffFFF8);808uint32x4_t n_minIdxh = vdupq_n_u32(0xffffFFF8);809uint16x8_t n_max = vdupq_n_u16(maxVal);810uint32x4_t n_maxIdxl = vdupq_n_u32(0xffffFFF8);811uint32x4_t n_maxIdxh = vdupq_n_u32(0xffffFFF8);812813for(; i < bound; i+=8 )814{815internal::prefetch(src + i);816uint16x8_t line = vld1q_u16(src + i);817818uint16x8_t minmask = vcltq_u16(line, n_min);819uint16x8_t maxmask = vcgtq_u16(line, n_max);820821n_min = vbslq_u16(minmask, line, n_min);822uint16x4_t minml = vget_low_u16(minmask);823uint16x4_t minmh = vget_high_u16(minmask);824uint32x4_t minml2 = vmovl_u16(minml);825uint32x4_t minmh2 = vmovl_u16(minmh);826minml2 = 
vqshlq_n_u32(minml2, 31);827minmh2 = vqshlq_n_u32(minmh2, 31);828n_minIdxl = vbslq_u32(minml2, lineIdxOffset, n_minIdxl);829n_minIdxh = vbslq_u32(minmh2, lineIdxOffset, n_minIdxh);830831n_max = vbslq_u16(maxmask, line, n_max);832uint16x4_t maxml = vget_low_u16(maxmask);833uint16x4_t maxmh = vget_high_u16(maxmask);834uint32x4_t maxml2 = vmovl_u16(maxml);835uint32x4_t maxmh2 = vmovl_u16(maxmh);836maxml2 = vqshlq_n_u32(maxml2, 31);837maxmh2 = vqshlq_n_u32(maxmh2, 31);838n_maxIdxl = vbslq_u32(maxml2, lineIdxOffset, n_maxIdxl);839n_maxIdxh = vbslq_u32(maxmh2, lineIdxOffset, n_maxIdxh);840841// idx[] +=8842lineIdxOffset = vaddq_u32(lineIdxOffset, c8);843}844845// fix high part of indexes846uint32x4_t c4 = vdupq_n_u32(4);847n_minIdxh = vaddq_u32(n_minIdxh, c4);848n_maxIdxh = vaddq_u32(n_maxIdxh, c4);849850u16 fmin[8], fmax[8];851u32 fminIdx[8], fmaxIdx[8];852853vst1q_u16(fmin, n_min);854vst1q_u16(fmax, n_max);855vst1q_u32(fminIdx+0, n_minIdxl);856vst1q_u32(fmaxIdx+0, n_maxIdxl);857vst1q_u32(fminIdx+4, n_minIdxh);858vst1q_u32(fmaxIdx+4, n_maxIdxh);859860size_t minIdx = fminIdx[0];861size_t maxIdx = fmaxIdx[0];862minVal = fmin[0];863maxVal = fmax[0];864865for (s32 j = 1; j < 8; ++j)866{867u16 minval = fmin[j];868u16 maxval = fmax[j];869if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))870{871minIdx = fminIdx[j];872minVal = minval;873}874if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))875{876maxIdx = fmaxIdx[j];877maxVal = maxval;878}879}880if(minIdx < 0xffffFFF8)881{882#if SIZE_MAX > UINT32_MAX883minCol = b + minIdx;884#else885minCol = minIdx;886#endif887minRow = l;888}889if(maxIdx < 0xffffFFF8)890{891#if SIZE_MAX > UINT32_MAX892maxCol = b + maxIdx;893#else894maxCol = maxIdx;895#endif896maxRow = l;897}898}899}900for(; i < size.width; ++i )901{902u16 val = src[i];903if( val < minVal )904{905minVal = val;906minCol = i;907minRow = l;908}909else if( val > maxVal )910{911maxVal = val;912maxCol = i;913maxRow = 
l;914}915}916}917#else918(void)size;919(void)srcBase;920(void)srcStride;921(void)minVal;922(void)minCol;923(void)minRow;924(void)maxVal;925(void)maxCol;926(void)maxRow;927#endif928}929930#ifdef CAROTENE_NEON931namespace {932933void minMaxLocBlock(const u8 * src, u32 len,934u8 &minVal, u16 &minIdx,935u8 &maxVal, u16 &maxIdx)936{937u16 tmp0123[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };938939uint8x16_t n_min = vdupq_n_u8(src[0]);940uint16x8_t n_minIdxl = vdupq_n_u16(0);941uint16x8_t n_minIdxh = vdupq_n_u16(0);942uint8x16_t n_max = vdupq_n_u8(src[0]);943uint16x8_t n_maxIdxl = vdupq_n_u16(0);944uint16x8_t n_maxIdxh = vdupq_n_u16(0);945uint16x8_t c16 = vdupq_n_u16(16);946uint16x8_t lineIdxOffset = vld1q_u16(tmp0123);947948s32 i = 0;949s32 bound = len - (16 - 1);950for(; i < bound; i+=16 )951{952internal::prefetch(src + i);953uint8x16_t line = vld1q_u8(src + i);954955uint8x16_t minmask = vcltq_u8(line, n_min);956uint8x16_t maxmask = vcgtq_u8(line, n_max);957958n_min = vbslq_u8(minmask, line, n_min);959uint8x8_t minml = vget_low_u8(minmask);960uint8x8_t minmh = vget_high_u8(minmask);961uint16x8_t minml2 = vmovl_u8(minml);962uint16x8_t minmh2 = vmovl_u8(minmh);963minml2 = vqshlq_n_u16(minml2, 15);964minmh2 = vqshlq_n_u16(minmh2, 15);965n_minIdxl = vbslq_u16(minml2, lineIdxOffset, n_minIdxl);966n_minIdxh = vbslq_u16(minmh2, lineIdxOffset, n_minIdxh);967968n_max = vbslq_u8(maxmask, line, n_max);969uint8x8_t maxml = vget_low_u8(maxmask);970uint8x8_t maxmh = vget_high_u8(maxmask);971uint16x8_t maxml2 = vmovl_u8(maxml);972uint16x8_t maxmh2 = vmovl_u8(maxmh);973maxml2 = vqshlq_n_u16(maxml2, 15);974maxmh2 = vqshlq_n_u16(maxmh2, 15);975n_maxIdxl = vbslq_u16(maxml2, lineIdxOffset, n_maxIdxl);976n_maxIdxh = vbslq_u16(maxmh2, lineIdxOffset, n_maxIdxh);977978// idx[] +=16979lineIdxOffset = vaddq_u16(lineIdxOffset, c16);980}981982// fix high part of indexes983uint16x8_t c8 = vdupq_n_u16(8);984n_minIdxh = vaddq_u16(n_minIdxh, c8);985n_maxIdxh = vaddq_u16(n_maxIdxh, c8);986987u8 fmin[16], 
fmax[16];988u16 fminIdx[16], fmaxIdx[16];989/*{990uint8x8_t min_low = vget_low_u8(n_min);991uint8x8_t min_high = vget_high_u8(n_min);992uint8x8_t max_low = vget_low_u8(n_max);993uint8x8_t max_high = vget_high_u8(n_max);994995uint8x8_t minmask = vclt_u8(min_low, min_high);996uint8x8_t maxmask = vcgt_u8(max_low, max_high);997998uint8x8_t min2 = vbsl_u8(minmask, min_low, min_high);999uint8x8_t max2 = vbsl_u8(maxmask, max_low, max_high);10001001uint16x8_t minidxmask = vmovl_u8(minmask);1002uint16x8_t maxidxmask = vmovl_u8(maxmask);1003minidxmask = vqshlq_n_u16(minidxmask, 15);1004maxidxmask = vqshlq_n_u16(maxidxmask, 15);10051006uint16x8_t n_minIdx = vbslq_u16(minidxmask, n_minIdxl, n_minIdxh);1007uint16x8_t n_maxIdx = vbslq_u16(maxidxmask, n_maxIdxl, n_maxIdxh);10081009vst1_u8((uint8_t*)fmin, min2);1010vst1_u8((uint8_t*)fmax, max2);10111012vst1q_u16((uint16_t*)(fminIdx), n_minIdx);1013vst1q_u16((uint16_t*)(fmaxIdx), n_maxIdx);1014}*/10151016vst1q_u8(fmin, n_min);1017vst1q_u8(fmax, n_max);1018vst1q_u16(fminIdx+0, n_minIdxl);1019vst1q_u16(fmaxIdx+0, n_maxIdxl);1020vst1q_u16(fminIdx+8, n_minIdxh);1021vst1q_u16(fmaxIdx+8, n_maxIdxh);10221023minIdx = fminIdx[0];1024maxIdx = fmaxIdx[0];1025minVal = fmin[0];1026maxVal = fmax[0];10271028for (s32 j = 1; j < 16; ++j)1029{1030u8 minval = fmin[j];1031u8 maxval = fmax[j];1032if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))1033{1034minIdx = fminIdx[j];1035minVal = minval;1036}1037if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))1038{1039maxIdx = fmaxIdx[j];1040maxVal = maxval;1041}1042}10431044for(; i < (s32)len; ++i )1045{1046u8 val = src[i];1047if( val < minVal )1048{1049minVal = val;1050minIdx = (u16)i;1051}1052else if( val > maxVal )1053{1054maxVal = val;1055maxIdx = (u16)i;1056}1057}1058}10591060void minMaxLocBlock(const s8 * src, u32 len,1061s8 &minVal, u16 &minIdx,1062s8 &maxVal, u16 &maxIdx)1063{1064u16 tmp0123[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };10651066int8x16_t 
n_min = vdupq_n_s8(src[0]);1067uint16x8_t n_minIdxl = vdupq_n_u16(0);1068uint16x8_t n_minIdxh = vdupq_n_u16(0);1069int8x16_t n_max = vdupq_n_s8(src[0]);1070uint16x8_t n_maxIdxl = vdupq_n_u16(0);1071uint16x8_t n_maxIdxh = vdupq_n_u16(0);1072uint16x8_t c16 = vdupq_n_u16(16);1073uint16x8_t lineIdxOffset = vld1q_u16(tmp0123);10741075s32 i = 0;1076s32 bound = len - (16 - 1);1077for(; i < bound; i+=16 )1078{1079internal::prefetch(src + i);1080int8x16_t line = vld1q_s8(src + i);10811082uint8x16_t minmask = vcltq_s8(line, n_min);1083uint8x16_t maxmask = vcgtq_s8(line, n_max);10841085n_min = vbslq_s8(minmask, line, n_min);1086uint8x8_t minml = vget_low_u8(minmask);1087uint8x8_t minmh = vget_high_u8(minmask);1088uint16x8_t minml2 = vmovl_u8(minml);1089uint16x8_t minmh2 = vmovl_u8(minmh);1090minml2 = vqshlq_n_u16(minml2, 15);1091minmh2 = vqshlq_n_u16(minmh2, 15);1092n_minIdxl = vbslq_u16(minml2, lineIdxOffset, n_minIdxl);1093n_minIdxh = vbslq_u16(minmh2, lineIdxOffset, n_minIdxh);10941095n_max = vbslq_s8(maxmask, line, n_max);1096uint8x8_t maxml = vget_low_u8(maxmask);1097uint8x8_t maxmh = vget_high_u8(maxmask);1098uint16x8_t maxml2 = vmovl_u8(maxml);1099uint16x8_t maxmh2 = vmovl_u8(maxmh);1100maxml2 = vqshlq_n_u16(maxml2, 15);1101maxmh2 = vqshlq_n_u16(maxmh2, 15);1102n_maxIdxl = vbslq_u16(maxml2, lineIdxOffset, n_maxIdxl);1103n_maxIdxh = vbslq_u16(maxmh2, lineIdxOffset, n_maxIdxh);11041105// idx[] +=161106lineIdxOffset = vaddq_u16(lineIdxOffset, c16);1107}11081109// fix high part of indexes1110uint16x8_t c8 = vdupq_n_u16(8);1111n_minIdxh = vaddq_u16(n_minIdxh, c8);1112n_maxIdxh = vaddq_u16(n_maxIdxh, c8);11131114s8 fmin[16], fmax[16];1115u16 fminIdx[16], fmaxIdx[16];11161117vst1q_s8(fmin, n_min);1118vst1q_s8(fmax, n_max);1119vst1q_u16(fminIdx+0, n_minIdxl);1120vst1q_u16(fmaxIdx+0, n_maxIdxl);1121vst1q_u16(fminIdx+8, n_minIdxh);1122vst1q_u16(fmaxIdx+8, n_maxIdxh);11231124minIdx = fminIdx[0];1125maxIdx = fmaxIdx[0];1126minVal = fmin[0];1127maxVal = fmax[0];11281129for (s32 j = 
1; j < 16; ++j)1130{1131s8 minval = fmin[j];1132s8 maxval = fmax[j];1133if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))1134{1135minIdx = fminIdx[j];1136minVal = minval;1137}1138if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))1139{1140maxIdx = fmaxIdx[j];1141maxVal = maxval;1142}1143}11441145for(; i < (s32)len; ++i )1146{1147s8 val = src[i];1148if( val < minVal )1149{1150minVal = val;1151minIdx = (u16)i;1152}1153else if( val > maxVal )1154{1155maxVal = val;1156maxIdx = (u16)i;1157}1158}1159}11601161} // namespace1162#endif // CAROTENE_NEON11631164#define USHORT_BLOCK_MAX_SIZE (1 << 16)11651166void minMaxLoc(const Size2D &size,1167const u8 * srcBase, ptrdiff_t srcStride,1168u8 &minVal, size_t &minCol, size_t &minRow,1169u8 &maxVal, size_t &maxCol, size_t &maxRow)1170{1171internal::assertSupportedConfiguration();1172#ifdef CAROTENE_NEON1173minVal = srcBase[0];1174minCol = 0;1175minRow = 0;1176maxVal = srcBase[0];1177maxCol = 0;1178maxRow = 0;1179for(size_t l = 0; l < size.height; ++l)1180{1181const u8 * src = internal::getRowPtr( srcBase, srcStride, l);1182if (size.width > 128)1183{1184for(size_t blockStart = 0; blockStart < size.width; blockStart += USHORT_BLOCK_MAX_SIZE)1185{1186u8 locMinVal, locMaxVal;1187u16 locMinIdx, locMaxIdx;1188size_t tail = size.width - blockStart;1189minMaxLocBlock(src + blockStart, tail < USHORT_BLOCK_MAX_SIZE ? 
tail : USHORT_BLOCK_MAX_SIZE,1190locMinVal, locMinIdx, locMaxVal, locMaxIdx);11911192if (locMinVal == 0 && locMaxVal == 255)1193{1194minCol = blockStart + locMinIdx;1195maxCol = blockStart + locMaxIdx;1196minRow = l;1197maxRow = l;1198minVal = 0;1199maxVal = 255;1200return;1201}1202else1203{1204if (locMinVal < minVal)1205{1206minCol = blockStart + locMinIdx;1207minRow = l;1208minVal = locMinVal;1209}1210if (locMaxVal > maxVal)1211{1212maxCol = blockStart + locMaxIdx;1213maxRow = l;1214maxVal = locMaxVal;1215}1216}1217}1218}1219else1220{1221for(size_t i = 0; i < size.width; ++i )1222{1223u8 val = src[i];1224if( val < minVal )1225{1226minVal = val;1227minCol = i;1228minRow = l;1229}1230else if( val > maxVal )1231{1232maxVal = val;1233maxCol = i;1234maxRow = l;1235}1236}1237}12381239}1240#else1241(void)size;1242(void)srcBase;1243(void)srcStride;1244(void)minVal;1245(void)minCol;1246(void)minRow;1247(void)maxVal;1248(void)maxCol;1249(void)maxRow;1250#endif1251}12521253void minMaxLoc(const Size2D &size,1254const s8 * srcBase, ptrdiff_t srcStride,1255s8 &minVal, size_t &minCol, size_t &minRow,1256s8 &maxVal, size_t &maxCol, size_t &maxRow)1257{1258internal::assertSupportedConfiguration();1259#ifdef CAROTENE_NEON1260minVal = srcBase[0];1261minCol = 0;1262minRow = 0;1263maxVal = srcBase[0];1264maxCol = 0;1265maxRow = 0;1266for(size_t l = 0; l < size.height; ++l)1267{1268const s8 * src = internal::getRowPtr( srcBase, srcStride, l);1269if (size.width > 128)1270{1271for(size_t blockStart = 0; blockStart < size.width; blockStart += USHORT_BLOCK_MAX_SIZE)1272{1273s8 locMinVal, locMaxVal;1274u16 locMinIdx, locMaxIdx;1275size_t tail = size.width - blockStart;1276minMaxLocBlock(src + blockStart, tail < USHORT_BLOCK_MAX_SIZE ? 
tail : USHORT_BLOCK_MAX_SIZE,1277locMinVal, locMinIdx, locMaxVal, locMaxIdx);12781279if (locMinVal == -128 && locMaxVal == 127)1280{1281minCol = blockStart + locMinIdx;1282maxCol = blockStart + locMaxIdx;1283minRow = l;1284maxRow = l;1285minVal = -128;1286maxVal = 127;1287return;1288}1289else1290{1291if (locMinVal < minVal)1292{1293minCol = blockStart + locMinIdx;1294minRow = l;1295minVal = locMinVal;1296}1297if (locMaxVal > maxVal)1298{1299maxCol = blockStart + locMaxIdx;1300maxRow = l;1301maxVal = locMaxVal;1302}1303}1304}1305}1306else1307{1308for(size_t i = 0; i < size.width; ++i )1309{1310s8 val = src[i];1311if( val < minVal )1312{1313minVal = val;1314minRow = l;1315minCol = i;1316}1317else if( val > maxVal )1318{1319maxVal = val;1320maxRow = l;1321maxCol = i;1322}1323}1324}1325}1326#else1327(void)size;1328(void)srcBase;1329(void)srcStride;1330(void)minVal;1331(void)minCol;1332(void)minRow;1333(void)maxVal;1334(void)maxCol;1335(void)maxRow;1336#endif1337}13381339} // namespace CAROTENE_NS134013411342