Path: blob/master/modules/objdetect/src/haar.avx.cpp
16337 views
/*M///////////////////////////////////////////////////////////////////////////////////////1//2// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.3//4// By downloading, copying, installing or using the software you agree to this license.5// If you do not agree to this license, do not download, install,6// copy or use the software.7//8//9// Intel License Agreement10// For Open Source Computer Vision Library11//12// Copyright (C) 2000, Intel Corporation, all rights reserved.13// Third party copyrights are property of their respective owners.14//15// Redistribution and use in source and binary forms, with or without modification,16// are permitted provided that the following conditions are met:17//18// * Redistribution's of source code must retain the above copyright notice,19// this list of conditions and the following disclaimer.20//21// * Redistribution's in binary form must reproduce the above copyright notice,22// this list of conditions and the following disclaimer in the documentation23// and/or other materials provided with the distribution.24//25// * The name of Intel Corporation may not be used to endorse or promote products26// derived from this software without specific prior written permission.27//28// This software is provided by the copyright holders and contributors "as is" and29// any express or implied warranties, including, but not limited to, the implied30// warranties of merchantability and fitness for a particular purpose are disclaimed.31// In no event shall the Intel Corporation or contributors be liable for any direct,32// indirect, incidental, special, exemplary, or consequential damages33// (including, but not limited to, procurement of substitute goods or services;34// loss of use, data, or profits; or business interruption) however caused35// and on any theory of liability, whether in contract, strict liability,36// or tort (including negligence or otherwise) arising in any way out of37// the use of this software, even if advised of the possibility of such damage.38//39//M*/4041/* Haar features calculation */4243#include "precomp.hpp"44#include "haar.hpp"4546namespace cv_haar_avx47{4849// AVX version icvEvalHidHaarClassifier. Process 8 CvHidHaarClassifiers per call. Check AVX support before invocation!!50#if CV_HAAR_USE_AVX51double icvEvalHidHaarClassifierAVX(CvHidHaarClassifier* classifier,52double variance_norm_factor, size_t p_offset)53{54int CV_DECL_ALIGNED(32) idxV[8] = { 0,0,0,0,0,0,0,0 };55uchar flags[8] = { 0,0,0,0,0,0,0,0 };56CvHidHaarTreeNode* nodes[8];57double res = 0;58uchar exitConditionFlag = 0;59for (;;)60{61float CV_DECL_ALIGNED(32) tmp[8] = { 0,0,0,0,0,0,0,0 };62nodes[0] = (classifier + 0)->node + idxV[0];63nodes[1] = (classifier + 1)->node + idxV[1];64nodes[2] = (classifier + 2)->node + idxV[2];65nodes[3] = (classifier + 3)->node + idxV[3];66nodes[4] = (classifier + 4)->node + idxV[4];67nodes[5] = (classifier + 5)->node + idxV[5];68nodes[6] = (classifier + 6)->node + idxV[6];69nodes[7] = (classifier + 7)->node + idxV[7];7071__m256 t = _mm256_set1_ps(static_cast<float>(variance_norm_factor));7273t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,74nodes[6]->threshold,75nodes[5]->threshold,76nodes[4]->threshold,77nodes[3]->threshold,78nodes[2]->threshold,79nodes[1]->threshold,80nodes[0]->threshold));8182__m256 offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[0], p_offset),83calc_sumf(nodes[6]->feature.rect[0], p_offset),84calc_sumf(nodes[5]->feature.rect[0], p_offset),85calc_sumf(nodes[4]->feature.rect[0], p_offset),86calc_sumf(nodes[3]->feature.rect[0], p_offset),87calc_sumf(nodes[2]->feature.rect[0], p_offset),88calc_sumf(nodes[1]->feature.rect[0], p_offset),89calc_sumf(nodes[0]->feature.rect[0], p_offset));9091__m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight,92nodes[6]->feature.rect[0].weight,93nodes[5]->feature.rect[0].weight,94nodes[4]->feature.rect[0].weight,95nodes[3]->feature.rect[0].weight,96nodes[2]->feature.rect[0].weight,97nodes[1]->feature.rect[0].weight,98nodes[0]->feature.rect[0].weight);99100__m256 sum = _mm256_mul_ps(offset, weight);101102offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[1], p_offset),103calc_sumf(nodes[6]->feature.rect[1], p_offset),104calc_sumf(nodes[5]->feature.rect[1], p_offset),105calc_sumf(nodes[4]->feature.rect[1], p_offset),106calc_sumf(nodes[3]->feature.rect[1], p_offset),107calc_sumf(nodes[2]->feature.rect[1], p_offset),108calc_sumf(nodes[1]->feature.rect[1], p_offset),109calc_sumf(nodes[0]->feature.rect[1], p_offset));110111weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight,112nodes[6]->feature.rect[1].weight,113nodes[5]->feature.rect[1].weight,114nodes[4]->feature.rect[1].weight,115nodes[3]->feature.rect[1].weight,116nodes[2]->feature.rect[1].weight,117nodes[1]->feature.rect[1].weight,118nodes[0]->feature.rect[1].weight);119120sum = _mm256_add_ps(sum, _mm256_mul_ps(offset, weight));121122if (nodes[0]->feature.rect[2].p0)123tmp[0] = calc_sumf(nodes[0]->feature.rect[2], p_offset) * nodes[0]->feature.rect[2].weight;124if (nodes[1]->feature.rect[2].p0)125tmp[1] = calc_sumf(nodes[1]->feature.rect[2], p_offset) * nodes[1]->feature.rect[2].weight;126if (nodes[2]->feature.rect[2].p0)127tmp[2] = calc_sumf(nodes[2]->feature.rect[2], p_offset) * nodes[2]->feature.rect[2].weight;128if (nodes[3]->feature.rect[2].p0)129tmp[3] = calc_sumf(nodes[3]->feature.rect[2], p_offset) * nodes[3]->feature.rect[2].weight;130if (nodes[4]->feature.rect[2].p0)131tmp[4] = calc_sumf(nodes[4]->feature.rect[2], p_offset) * nodes[4]->feature.rect[2].weight;132if (nodes[5]->feature.rect[2].p0)133tmp[5] = calc_sumf(nodes[5]->feature.rect[2], p_offset) * nodes[5]->feature.rect[2].weight;134if (nodes[6]->feature.rect[2].p0)135tmp[6] = calc_sumf(nodes[6]->feature.rect[2], p_offset) * nodes[6]->feature.rect[2].weight;136if (nodes[7]->feature.rect[2].p0)137tmp[7] = calc_sumf(nodes[7]->feature.rect[2], p_offset) * nodes[7]->feature.rect[2].weight;138139sum = _mm256_add_ps(sum, _mm256_load_ps(tmp));140141__m256 left = _mm256_set_ps(static_cast<float>(nodes[7]->left), static_cast<float>(nodes[6]->left),142static_cast<float>(nodes[5]->left), static_cast<float>(nodes[4]->left),143static_cast<float>(nodes[3]->left), static_cast<float>(nodes[2]->left),144static_cast<float>(nodes[1]->left), static_cast<float>(nodes[0]->left));145__m256 right = _mm256_set_ps(static_cast<float>(nodes[7]->right), static_cast<float>(nodes[6]->right),146static_cast<float>(nodes[5]->right), static_cast<float>(nodes[4]->right),147static_cast<float>(nodes[3]->right), static_cast<float>(nodes[2]->right),148static_cast<float>(nodes[1]->right), static_cast<float>(nodes[0]->right));149150_mm256_store_si256((__m256i*)idxV, _mm256_cvttps_epi32(_mm256_blendv_ps(right, left, _mm256_cmp_ps(sum, t, _CMP_LT_OQ))));151152for (int i = 0; i < 8; i++)153{154if (idxV[i] <= 0)155{156if (!flags[i])157{158exitConditionFlag++;159flags[i] = 1;160res += (classifier + i)->alpha[-idxV[i]];161}162idxV[i] = 0;163}164}165if (exitConditionFlag == 8)166return res;167}168}169170double icvEvalHidHaarStumpClassifierAVX(CvHidHaarClassifier* classifier,171double variance_norm_factor, size_t p_offset)172{173float CV_DECL_ALIGNED(32) tmp[8] = { 0,0,0,0,0,0,0,0 };174CvHidHaarTreeNode* nodes[8];175176nodes[0] = classifier[0].node;177nodes[1] = classifier[1].node;178nodes[2] = classifier[2].node;179nodes[3] = classifier[3].node;180nodes[4] = classifier[4].node;181nodes[5] = classifier[5].node;182nodes[6] = classifier[6].node;183nodes[7] = classifier[7].node;184185__m256 t = _mm256_set1_ps(static_cast<float>(variance_norm_factor));186187t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,188nodes[6]->threshold,189nodes[5]->threshold,190nodes[4]->threshold,191nodes[3]->threshold,192nodes[2]->threshold,193nodes[1]->threshold,194nodes[0]->threshold));195196__m256 offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[0], p_offset),197calc_sumf(nodes[6]->feature.rect[0], p_offset),198calc_sumf(nodes[5]->feature.rect[0], p_offset),199calc_sumf(nodes[4]->feature.rect[0], p_offset),200calc_sumf(nodes[3]->feature.rect[0], p_offset),201calc_sumf(nodes[2]->feature.rect[0], p_offset),202calc_sumf(nodes[1]->feature.rect[0], p_offset),203calc_sumf(nodes[0]->feature.rect[0], p_offset));204205__m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight,206nodes[6]->feature.rect[0].weight,207nodes[5]->feature.rect[0].weight,208nodes[4]->feature.rect[0].weight,209nodes[3]->feature.rect[0].weight,210nodes[2]->feature.rect[0].weight,211nodes[1]->feature.rect[0].weight,212nodes[0]->feature.rect[0].weight);213214__m256 sum = _mm256_mul_ps(offset, weight);215216offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[1], p_offset),217calc_sumf(nodes[6]->feature.rect[1], p_offset),218calc_sumf(nodes[5]->feature.rect[1], p_offset),219calc_sumf(nodes[4]->feature.rect[1], p_offset),220calc_sumf(nodes[3]->feature.rect[1], p_offset),221calc_sumf(nodes[2]->feature.rect[1], p_offset),222calc_sumf(nodes[1]->feature.rect[1], p_offset),223calc_sumf(nodes[0]->feature.rect[1], p_offset));224225weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight,226nodes[6]->feature.rect[1].weight,227nodes[5]->feature.rect[1].weight,228nodes[4]->feature.rect[1].weight,229nodes[3]->feature.rect[1].weight,230nodes[2]->feature.rect[1].weight,231nodes[1]->feature.rect[1].weight,232nodes[0]->feature.rect[1].weight);233234sum = _mm256_add_ps(sum, _mm256_mul_ps(offset, weight));235236if (nodes[0]->feature.rect[2].p0)237tmp[0] = calc_sumf(nodes[0]->feature.rect[2], p_offset) * nodes[0]->feature.rect[2].weight;238if (nodes[1]->feature.rect[2].p0)239tmp[1] = calc_sumf(nodes[1]->feature.rect[2], p_offset) * nodes[1]->feature.rect[2].weight;240if (nodes[2]->feature.rect[2].p0)241tmp[2] = calc_sumf(nodes[2]->feature.rect[2], p_offset) * nodes[2]->feature.rect[2].weight;242if (nodes[3]->feature.rect[2].p0)243tmp[3] = calc_sumf(nodes[3]->feature.rect[2], p_offset) * nodes[3]->feature.rect[2].weight;244if (nodes[4]->feature.rect[2].p0)245tmp[4] = calc_sumf(nodes[4]->feature.rect[2], p_offset) * nodes[4]->feature.rect[2].weight;246if (nodes[5]->feature.rect[2].p0)247tmp[5] = calc_sumf(nodes[5]->feature.rect[2], p_offset) * nodes[5]->feature.rect[2].weight;248if (nodes[6]->feature.rect[2].p0)249tmp[6] = calc_sumf(nodes[6]->feature.rect[2], p_offset) * nodes[6]->feature.rect[2].weight;250if (nodes[7]->feature.rect[2].p0)251tmp[7] = calc_sumf(nodes[7]->feature.rect[2], p_offset) * nodes[7]->feature.rect[2].weight;252253sum = _mm256_add_ps(sum, _mm256_load_ps(tmp));254255__m256 alpha0 = _mm256_set_ps(classifier[7].alpha[0],256classifier[6].alpha[0],257classifier[5].alpha[0],258classifier[4].alpha[0],259classifier[3].alpha[0],260classifier[2].alpha[0],261classifier[1].alpha[0],262classifier[0].alpha[0]);263__m256 alpha1 = _mm256_set_ps(classifier[7].alpha[1],264classifier[6].alpha[1],265classifier[5].alpha[1],266classifier[4].alpha[1],267classifier[3].alpha[1],268classifier[2].alpha[1],269classifier[1].alpha[1],270classifier[0].alpha[1]);271272__m256 outBuf = _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ));273outBuf = _mm256_hadd_ps(outBuf, outBuf);274outBuf = _mm256_hadd_ps(outBuf, outBuf);275_mm256_store_ps(tmp, outBuf);276return (tmp[0] + tmp[4]);277}278279double icvEvalHidHaarStumpClassifierTwoRectAVX(CvHidHaarClassifier* classifier,280double variance_norm_factor, size_t p_offset)281{282float CV_DECL_ALIGNED(32) buf[8];283CvHidHaarTreeNode* nodes[8];284nodes[0] = classifier[0].node;285nodes[1] = classifier[1].node;286nodes[2] = classifier[2].node;287nodes[3] = classifier[3].node;288nodes[4] = classifier[4].node;289nodes[5] = classifier[5].node;290nodes[6] = classifier[6].node;291nodes[7] = classifier[7].node;292293__m256 t = _mm256_set1_ps(static_cast<float>(variance_norm_factor));294t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,295nodes[6]->threshold,296nodes[5]->threshold,297nodes[4]->threshold,298nodes[3]->threshold,299nodes[2]->threshold,300nodes[1]->threshold,301nodes[0]->threshold));302303__m256 offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[0], p_offset),304calc_sumf(nodes[6]->feature.rect[0], p_offset),305calc_sumf(nodes[5]->feature.rect[0], p_offset),306calc_sumf(nodes[4]->feature.rect[0], p_offset),307calc_sumf(nodes[3]->feature.rect[0], p_offset),308calc_sumf(nodes[2]->feature.rect[0], p_offset),309calc_sumf(nodes[1]->feature.rect[0], p_offset),310calc_sumf(nodes[0]->feature.rect[0], p_offset));311312__m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight,313nodes[6]->feature.rect[0].weight,314nodes[5]->feature.rect[0].weight,315nodes[4]->feature.rect[0].weight,316nodes[3]->feature.rect[0].weight,317nodes[2]->feature.rect[0].weight,318nodes[1]->feature.rect[0].weight,319nodes[0]->feature.rect[0].weight);320321__m256 sum = _mm256_mul_ps(offset, weight);322323offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[1], p_offset),324calc_sumf(nodes[6]->feature.rect[1], p_offset),325calc_sumf(nodes[5]->feature.rect[1], p_offset),326calc_sumf(nodes[4]->feature.rect[1], p_offset),327calc_sumf(nodes[3]->feature.rect[1], p_offset),328calc_sumf(nodes[2]->feature.rect[1], p_offset),329calc_sumf(nodes[1]->feature.rect[1], p_offset),330calc_sumf(nodes[0]->feature.rect[1], p_offset));331332weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight,333nodes[6]->feature.rect[1].weight,334nodes[5]->feature.rect[1].weight,335nodes[4]->feature.rect[1].weight,336nodes[3]->feature.rect[1].weight,337nodes[2]->feature.rect[1].weight,338nodes[1]->feature.rect[1].weight,339nodes[0]->feature.rect[1].weight);340341sum = _mm256_add_ps(sum, _mm256_mul_ps(offset, weight));342343__m256 alpha0 = _mm256_set_ps(classifier[7].alpha[0],344classifier[6].alpha[0],345classifier[5].alpha[0],346classifier[4].alpha[0],347classifier[3].alpha[0],348classifier[2].alpha[0],349classifier[1].alpha[0],350classifier[0].alpha[0]);351__m256 alpha1 = _mm256_set_ps(classifier[7].alpha[1],352classifier[6].alpha[1],353classifier[5].alpha[1],354classifier[4].alpha[1],355classifier[3].alpha[1],356classifier[2].alpha[1],357classifier[1].alpha[1],358classifier[0].alpha[1]);359360_mm256_store_ps(buf, _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ)));361return (buf[0] + buf[1] + buf[2] + buf[3] + buf[4] + buf[5] + buf[6] + buf[7]);362}363364#endif //CV_HAAR_USE_AVX365366}367368/* End of file. */369370371