// Path: blob/master/3rdparty/carotene/src/meanstddev.cpp
// 16337 views
/*1* By downloading, copying, installing or using the software you agree to this license.2* If you do not agree to this license, do not download, install,3* copy or use the software.4*5*6* License Agreement7* For Open Source Computer Vision Library8* (3-clause BSD License)9*10* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.11* Third party copyrights are property of their respective owners.12*13* Redistribution and use in source and binary forms, with or without modification,14* are permitted provided that the following conditions are met:15*16* * Redistributions of source code must retain the above copyright notice,17* this list of conditions and the following disclaimer.18*19* * Redistributions in binary form must reproduce the above copyright notice,20* this list of conditions and the following disclaimer in the documentation21* and/or other materials provided with the distribution.22*23* * Neither the names of the copyright holders nor the names of the contributors24* may be used to endorse or promote products derived from this software25* without specific prior written permission.26*27* This software is provided by the copyright holders and contributors "as is" and28* any express or implied warranties, including, but not limited to, the implied29* warranties of merchantability and fitness for a particular purpose are disclaimed.30* In no event shall copyright holders or contributors be liable for any direct,31* indirect, incidental, special, exemplary, or consequential damages32* (including, but not limited to, procurement of substitute goods or services;33* loss of use, data, or profits; or business interruption) however caused34* and on any theory of liability, whether in contract, strict liability,35* or tort (including negligence or otherwise) arising in any way out of36* the use of this software, even if advised of the possibility of such damage.37*/3839#include "common.hpp"4041#include <cmath>4243namespace CAROTENE_NS {4445void 
meanStdDev(const Size2D &size,46const u8 * srcBase, ptrdiff_t srcStride,47f32 * pMean, f32 * pStdDev)48{49internal::assertSupportedConfiguration();50#ifdef CAROTENE_NEON51f64 fsum = 0.0f, fsqsum = 0.0f;52sqsum(size, srcBase, srcStride, &fsum, &fsqsum, 1);5354// calc mean and stddev55f64 itotal = 1.0 / size.total();56f64 mean = fsum * itotal;57f64 stddev = sqrt(std::max(fsqsum * itotal - mean * mean, 0.0));5859if (pMean)60*pMean = mean;61if (pStdDev)62*pStdDev = stddev;63#else64(void)size;65(void)srcBase;66(void)srcStride;67(void)pMean;68(void)pStdDev;69#endif70}7172void meanStdDev(const Size2D &size,73const u16 * srcBase, ptrdiff_t srcStride,74f32 * pMean, f32 * pStdDev)75{76internal::assertSupportedConfiguration();77#ifdef CAROTENE_NEON78size_t blockSize0 = 1 << 10, roiw4 = size.width & ~3;79f64 fsum = 0.0f, fsqsum = 0.0f;8081f32 arsum[8];82uint32x4_t v_zero = vdupq_n_u32(0u), v_sum;83float32x4_t v_zero_f = vdupq_n_f32(0.0f), v_sqsum;8485for (size_t i = 0; i < size.height; ++i)86{87const u16 * src = internal::getRowPtr(srcBase, srcStride, i);88size_t j = 0u;8990while (j < roiw4)91{92size_t blockSize = std::min(roiw4 - j, blockSize0) + j;93v_sum = v_zero;94v_sqsum = v_zero_f;9596for ( ; j + 16 < blockSize ; j += 16)97{98internal::prefetch(src + j);99uint16x8_t v_src0 = vld1q_u16(src + j), v_src1 = vld1q_u16(src + j + 8);100101// 0102uint32x4_t v_srclo = vmovl_u16(vget_low_u16(v_src0));103uint32x4_t v_srchi = vmovl_u16(vget_high_u16(v_src0));104v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi));105float32x4_t v_srclo_f = vcvtq_f32_u32(v_srclo);106float32x4_t v_srchi_f = vcvtq_f32_u32(v_srchi);107v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f);108v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f);109110// 1111v_srclo = vmovl_u16(vget_low_u16(v_src1));112v_srchi = vmovl_u16(vget_high_u16(v_src1));113v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi));114v_srclo_f = vcvtq_f32_u32(v_srclo);115v_srchi_f = vcvtq_f32_u32(v_srchi);116v_sqsum = vmlaq_f32(v_sqsum, 
v_srclo_f, v_srclo_f);117v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f);118}119120for ( ; j < blockSize; j += 4)121{122uint32x4_t v_src = vmovl_u16(vld1_u16(src + j));123float32x4_t v_src_f = vcvtq_f32_u32(v_src);124v_sum = vaddq_u32(v_sum, v_src);125v_sqsum = vmlaq_f32(v_sqsum, v_src_f, v_src_f);126}127128vst1q_f32(arsum, vcvtq_f32_u32(v_sum));129vst1q_f32(arsum + 4, v_sqsum);130131fsum += (f64)arsum[0] + arsum[1] + arsum[2] + arsum[3];132fsqsum += (f64)arsum[4] + arsum[5] + arsum[6] + arsum[7];133}134135// collect a few last elements in the current row136for ( ; j < size.width; ++j)137{138f32 srcval = src[j];139fsum += srcval;140fsqsum += srcval * srcval;141}142}143144// calc mean and stddev145f64 itotal = 1.0 / size.total();146f64 mean = fsum * itotal;147f64 stddev = sqrt(std::max(fsqsum * itotal - mean * mean, 0.0));148149if (pMean)150*pMean = mean;151if (pStdDev)152*pStdDev = stddev;153#else154(void)size;155(void)srcBase;156(void)srcStride;157(void)pMean;158(void)pStdDev;159#endif160}161162} // namespace CAROTENE_NS163164165