// Path: blob/master/modules/dnn/src/layers/detection_output_layer.cpp
// 16337 views
/*M ///////////////////////////////////////////////////////////////////////////////////////1//2// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.3//4// By downloading, copying, installing or using the software you agree to this license.5// If you do not agree to this license, do not download, install,6// copy or use the software.7//8//9// License Agreement10// For Open Source Computer Vision Library11//12// Copyright (C) 2013, OpenCV Foundation, all rights reserved.13// Copyright (C) 2017, Intel Corporation, all rights reserved.14// Third party copyrights are property of their respective owners.15//16// Redistribution and use in source and binary forms, with or without modification,17// are permitted provided that the following conditions are met:18//19// * Redistribution's of source code must retain the above copyright notice,20// this list of conditions and the following disclaimer.21//22// * Redistribution's in binary form must reproduce the above copyright notice,23// this list of conditions and the following disclaimer in the documentation24// and/or other materials provided with the distribution.25//26// * The name of the copyright holders may not be used to endorse or promote products27// derived from this software without specific prior written permission.28//29// This software is provided by the copyright holders and contributors "as is" and30// any express or implied warranties, including, but not limited to, the implied31// warranties of merchantability and fitness for a particular purpose are disclaimed.32// In no event shall the Intel Corporation or contributors be liable for any direct,33// indirect, incidental, special, exemplary, or consequential damages34// (including, but not limited to, procurement of substitute goods or services;35// loss of use, data, or profits; or business interruption) however caused36// and on any theory of liability, whether in contract, strict liability,37// or tort (including negligence or otherwise) 
arising in any way out of38// the use of this software, even if advised of the possibility of such damage.39//40//M*/4142#include "../precomp.hpp"43#include "layers_common.hpp"44#include "../op_inf_engine.hpp"45#include <float.h>46#include <string>47#include "../nms.inl.hpp"4849#ifdef HAVE_OPENCL50#include "opencl_kernels_dnn.hpp"51#endif5253namespace cv54{55namespace dnn56{5758namespace util59{6061class NormalizedBBox62{63public:64float xmin, ymin, xmax, ymax;6566NormalizedBBox()67: xmin(0), ymin(0), xmax(0), ymax(0), has_size_(false), size_(0) {}6869float size() const { return size_; }7071bool has_size() const { return has_size_; }7273void set_size(float value) { size_ = value; has_size_ = true; }7475void clear_size() { size_ = 0; has_size_ = false; }7677private:78bool has_size_;79float size_;80};8182template <typename T>83static inline bool SortScorePairDescend(const std::pair<float, T>& pair1,84const std::pair<float, T>& pair2)85{86return pair1.first > pair2.first;87}8889static inline float caffe_box_overlap(const util::NormalizedBBox& a, const util::NormalizedBBox& b);9091static inline float caffe_norm_box_overlap(const util::NormalizedBBox& a, const util::NormalizedBBox& b);9293} // namespace9495class DetectionOutputLayerImpl CV_FINAL : public DetectionOutputLayer96{97public:98unsigned _numClasses;99bool _shareLocation;100int _numLocClasses;101102int _backgroundLabelId;103104cv::String _codeType;105106bool _varianceEncodedInTarget;107int _keepTopK;108float _confidenceThreshold;109110float _nmsThreshold;111int _topK;112// Whenever predicted bounding boxes are represented in YXHW instead of XYWH layout.113bool _locPredTransposed;114// It's true whenever predicted bounding boxes and proposals are normalized to [0, 1].115bool _bboxesNormalized;116bool _clip;117bool _groupByClasses;118119enum { _numAxes = 4 };120static const std::string _layerName;121122typedef std::map<int, std::vector<util::NormalizedBBox> > LabelBBox;123124bool getParameterDict(const 
LayerParams ¶ms,125const std::string ¶meterName,126DictValue& result)127{128if (!params.has(parameterName))129{130return false;131}132133result = params.get(parameterName);134return true;135}136137template<typename T>138T getParameter(const LayerParams ¶ms,139const std::string ¶meterName,140const size_t &idx=0,141const bool required=true,142const T& defaultValue=T())143{144DictValue dictValue;145bool success = getParameterDict(params, parameterName, dictValue);146if(!success)147{148if(required)149{150std::string message = _layerName;151message += " layer parameter does not contain ";152message += parameterName;153message += " parameter.";154CV_Error(Error::StsBadArg, message);155}156else157{158return defaultValue;159}160}161return dictValue.get<T>(idx);162}163164void getCodeType(const LayerParams ¶ms)165{166String codeTypeString = toLowerCase(params.get<String>("code_type"));167if (codeTypeString == "center_size")168_codeType = "CENTER_SIZE";169else170_codeType = "CORNER";171}172173DetectionOutputLayerImpl(const LayerParams ¶ms)174{175_numClasses = getParameter<unsigned>(params, "num_classes");176_shareLocation = getParameter<bool>(params, "share_location");177_numLocClasses = _shareLocation ? 
1 : _numClasses;178_backgroundLabelId = getParameter<int>(params, "background_label_id");179_varianceEncodedInTarget = getParameter<bool>(params, "variance_encoded_in_target", 0, false, false);180_keepTopK = getParameter<int>(params, "keep_top_k");181_confidenceThreshold = getParameter<float>(params, "confidence_threshold", 0, false, -FLT_MAX);182_topK = getParameter<int>(params, "top_k", 0, false, -1);183_locPredTransposed = getParameter<bool>(params, "loc_pred_transposed", 0, false, false);184_bboxesNormalized = getParameter<bool>(params, "normalized_bbox", 0, false, true);185_clip = getParameter<bool>(params, "clip", 0, false, false);186_groupByClasses = getParameter<bool>(params, "group_by_classes", 0, false, true);187188getCodeType(params);189190// Parameters used in nms.191_nmsThreshold = getParameter<float>(params, "nms_threshold");192CV_Assert(_nmsThreshold > 0.);193194setParamsFrom(params);195}196197virtual bool supportBackend(int backendId) CV_OVERRIDE198{199return backendId == DNN_BACKEND_OPENCV ||200backendId == DNN_BACKEND_INFERENCE_ENGINE && !_locPredTransposed && _bboxesNormalized && !_clip;201}202203bool getMemoryShapes(const std::vector<MatShape> &inputs,204const int requiredOutputs,205std::vector<MatShape> &outputs,206std::vector<MatShape> &internals) const CV_OVERRIDE207{208CV_Assert(inputs.size() >= 3);209CV_Assert(inputs[0][0] == inputs[1][0]);210211int numPriors = inputs[2][2] / 4;212CV_Assert((numPriors * _numLocClasses * 4) == total(inputs[0], 1));213CV_Assert(int(numPriors * _numClasses) == total(inputs[1], 1));214CV_Assert(inputs[2][1] == 1 + (int)(!_varianceEncodedInTarget));215216// num() and channels() are 1.217// Since the number of bboxes to be kept is unknown before nms, we manually218// set it to maximal number of detections, [keep_top_k] parameter.219// Each row is a 7 dimension std::vector, which stores220// [image_id, label, confidence, xmin, ymin, xmax, ymax]221outputs.resize(1, shape(1, 1, _keepTopK, 7));222223return 
false;224}225226#ifdef HAVE_OPENCL227// Decode all bboxes in a batch228bool ocl_DecodeBBoxesAll(UMat& loc_mat, UMat& prior_mat,229const int num, const int numPriors, const bool share_location,230const int num_loc_classes, const int background_label_id,231const cv::String& code_type, const bool variance_encoded_in_target,232const bool clip, std::vector<LabelBBox>& all_decode_bboxes)233{234UMat outmat = UMat(loc_mat.dims, loc_mat.size, CV_32F);235size_t nthreads = loc_mat.total();236String kernel_name;237238if (code_type == "CORNER")239kernel_name = "DecodeBBoxesCORNER";240else if (code_type == "CENTER_SIZE")241kernel_name = "DecodeBBoxesCENTER_SIZE";242else243return false;244245for (int i = 0; i < num; ++i)246{247ocl::Kernel kernel(kernel_name.c_str(), ocl::dnn::detection_output_oclsrc);248kernel.set(0, (int)nthreads);249kernel.set(1, ocl::KernelArg::PtrReadOnly(loc_mat));250kernel.set(2, ocl::KernelArg::PtrReadOnly(prior_mat));251kernel.set(3, (int)variance_encoded_in_target);252kernel.set(4, (int)numPriors);253kernel.set(5, (int)share_location);254kernel.set(6, (int)num_loc_classes);255kernel.set(7, (int)background_label_id);256kernel.set(8, (int)clip);257kernel.set(9, (int)_locPredTransposed);258kernel.set(10, ocl::KernelArg::PtrWriteOnly(outmat));259260if (!kernel.run(1, &nthreads, NULL, false))261return false;262}263264all_decode_bboxes.clear();265all_decode_bboxes.resize(num);266{267Mat mat = outmat.getMat(ACCESS_READ);268const float* decode_data = mat.ptr<float>();269for (int i = 0; i < num; ++i)270{271LabelBBox& decode_bboxes = all_decode_bboxes[i];272for (int c = 0; c < num_loc_classes; ++c)273{274int label = share_location ? 
-1 : c;275decode_bboxes[label].resize(numPriors);276for (int p = 0; p < numPriors; ++p)277{278int startIdx = p * num_loc_classes * 4;279util::NormalizedBBox& bbox = decode_bboxes[label][p];280bbox.xmin = decode_data[startIdx + c * 4];281bbox.ymin = decode_data[startIdx + c * 4 + 1];282bbox.xmax = decode_data[startIdx + c * 4 + 2];283bbox.ymax = decode_data[startIdx + c * 4 + 3];284}285}286}287}288return true;289}290291void ocl_GetConfidenceScores(const UMat& inp1, const int num,292const int numPredsPerClass, const int numClasses,293std::vector<Mat>& confPreds)294{295int shape[] = { numClasses, numPredsPerClass };296for (int i = 0; i < num; i++)297confPreds.push_back(Mat(2, shape, CV_32F));298299shape[0] = num * numPredsPerClass;300shape[1] = inp1.total() / shape[0];301UMat umat = inp1.reshape(1, 2, &shape[0]);302for (int i = 0; i < num; ++i)303{304Range ranges[] = { Range(i * numPredsPerClass, (i + 1) * numPredsPerClass), Range::all() };305transpose(umat(ranges), confPreds[i]);306}307}308309bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)310{311std::vector<UMat> inputs;312std::vector<UMat> outputs;313314bool use_half = (inps.depth() == CV_16S);315if (use_half)316{317std::vector<UMat> orig_inputs;318std::vector<UMat> orig_outputs;319320inps.getUMatVector(orig_inputs);321outs.getUMatVector(orig_outputs);322323inputs.resize(orig_inputs.size());324for (size_t i = 0; i < orig_inputs.size(); i++)325convertFp16(orig_inputs[i], inputs[i]);326}327else328{329inps.getUMatVector(inputs);330outs.getUMatVector(outputs);331}332333std::vector<LabelBBox> allDecodedBBoxes;334std::vector<Mat> allConfidenceScores;335336int num = inputs[0].size[0];337338// extract predictions from input layers339{340int numPriors = inputs[2].size[2] / 4;341342// Retrieve all confidences343ocl_GetConfidenceScores(inputs[1], num, numPriors, _numClasses, allConfidenceScores);344345// Decode all loc predictions to bboxes346bool ret = 
ocl_DecodeBBoxesAll(inputs[0], inputs[2], num, numPriors,347_shareLocation, _numLocClasses, _backgroundLabelId,348_codeType, _varianceEncodedInTarget, _clip,349allDecodedBBoxes);350if (!ret)351return false;352}353354size_t numKept = 0;355std::vector<std::map<int, std::vector<int> > > allIndices;356for (int i = 0; i < num; ++i)357{358numKept += processDetections_(allDecodedBBoxes[i], allConfidenceScores[i], allIndices);359}360361if (numKept == 0)362{363// Set confidences to zeros.364Range ranges[] = {Range::all(), Range::all(), Range::all(), Range(2, 3)};365if (use_half)366{367std::vector<UMat> orig_outputs;368outs.getUMatVector(orig_outputs);369orig_outputs[0](ranges).setTo(0);370} else371outputs[0](ranges).setTo(0);372return true;373}374int outputShape[] = {1, 1, (int)numKept, 7};375UMat umat = UMat(4, outputShape, CV_32F);376{377Mat mat = umat.getMat(ACCESS_WRITE);378float* outputsData = mat.ptr<float>();379380size_t count = 0;381for (int i = 0; i < num; ++i)382{383count += outputDetections_(i, &outputsData[count * 7],384allDecodedBBoxes[i], allConfidenceScores[i],385allIndices[i], _groupByClasses);386}387CV_Assert(count == numKept);388}389390if (use_half)391{392UMat half_umat;393convertFp16(umat, half_umat);394395std::vector<UMat> orig_outputs;396outs.getUMatVector(orig_outputs);397orig_outputs.clear();398orig_outputs.push_back(half_umat);399outs.assign(orig_outputs);400} else {401outputs.clear();402outputs.push_back(umat);403outs.assign(outputs);404}405406return true;407}408#endif409410void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE411{412CV_TRACE_FUNCTION();413CV_TRACE_ARG_VALUE(name, "name", name.c_str());414415if (_bboxesNormalized)416{417CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),418forward_ocl(inputs_arr, outputs_arr, internals_arr))419}420if (inputs_arr.depth() == CV_16S)421{422forward_fallback(inputs_arr, outputs_arr, internals_arr);423return;424}425426std::vector<Mat> 
inputs, outputs;427inputs_arr.getMatVector(inputs);428outputs_arr.getMatVector(outputs);429430std::vector<LabelBBox> allDecodedBBoxes;431std::vector<Mat> allConfidenceScores;432433int num = inputs[0].size[0];434435// extract predictions from input layers436{437int numPriors = inputs[2].size[2] / 4;438439const float* locationData = inputs[0].ptr<float>();440const float* confidenceData = inputs[1].ptr<float>();441const float* priorData = inputs[2].ptr<float>();442443// Retrieve all location predictions444std::vector<LabelBBox> allLocationPredictions;445GetLocPredictions(locationData, num, numPriors, _numLocClasses,446_shareLocation, _locPredTransposed, allLocationPredictions);447448// Retrieve all confidences449GetConfidenceScores(confidenceData, num, numPriors, _numClasses, allConfidenceScores);450451// Retrieve all prior bboxes452std::vector<util::NormalizedBBox> priorBBoxes;453std::vector<std::vector<float> > priorVariances;454GetPriorBBoxes(priorData, numPriors, _bboxesNormalized, priorBBoxes, priorVariances);455456// Decode all loc predictions to bboxes457util::NormalizedBBox clipBounds;458if (_clip)459{460CV_Assert(_bboxesNormalized || inputs.size() >= 4);461clipBounds.xmin = clipBounds.ymin = 0.0f;462if (_bboxesNormalized)463clipBounds.xmax = clipBounds.ymax = 1.0f;464else465{466// Input image sizes;467CV_Assert(inputs[3].dims == 4);468clipBounds.xmax = inputs[3].size[3] - 1;469clipBounds.ymax = inputs[3].size[2] - 1;470}471}472DecodeBBoxesAll(allLocationPredictions, priorBBoxes, priorVariances, num,473_shareLocation, _numLocClasses, _backgroundLabelId,474_codeType, _varianceEncodedInTarget, _clip, clipBounds,475_bboxesNormalized, allDecodedBBoxes);476}477478size_t numKept = 0;479std::vector<std::map<int, std::vector<int> > > allIndices;480for (int i = 0; i < num; ++i)481{482numKept += processDetections_(allDecodedBBoxes[i], allConfidenceScores[i], allIndices);483}484485if (numKept == 0)486{487// Set confidences to zeros.488Range ranges[] = {Range::all(), 
Range::all(), Range::all(), Range(2, 3)};489outputs[0](ranges).setTo(0);490return;491}492int outputShape[] = {1, 1, (int)numKept, 7};493outputs[0].create(4, outputShape, CV_32F);494float* outputsData = outputs[0].ptr<float>();495496size_t count = 0;497for (int i = 0; i < num; ++i)498{499count += outputDetections_(i, &outputsData[count * 7],500allDecodedBBoxes[i], allConfidenceScores[i],501allIndices[i], _groupByClasses);502}503CV_Assert(count == numKept);504// Sync results back due changed output shape.505outputs_arr.assign(outputs);506}507508size_t outputDetections_(509const int i, float* outputsData,510const LabelBBox& decodeBBoxes, Mat& confidenceScores,511const std::map<int, std::vector<int> >& indicesMap,512bool groupByClasses513)514{515std::vector<int> dstIndices;516std::vector<std::pair<float, int> > allScores;517for (std::map<int, std::vector<int> >::const_iterator it = indicesMap.begin(); it != indicesMap.end(); ++it)518{519int label = it->first;520if (confidenceScores.rows <= label)521CV_Error_(cv::Error::StsError, ("Could not find confidence predictions for label %d", label));522const std::vector<float>& scores = confidenceScores.row(label);523const std::vector<int>& indices = it->second;524525const int numAllScores = allScores.size();526allScores.reserve(numAllScores + indices.size());527for (size_t j = 0; j < indices.size(); ++j)528{529allScores.push_back(std::make_pair(scores[indices[j]], numAllScores + j));530}531}532if (!groupByClasses)533std::sort(allScores.begin(), allScores.end(), util::SortScorePairDescend<int>);534535dstIndices.resize(allScores.size());536for (size_t j = 0; j < dstIndices.size(); ++j)537{538dstIndices[allScores[j].second] = j;539}540541size_t count = 0;542for (std::map<int, std::vector<int> >::const_iterator it = indicesMap.begin(); it != indicesMap.end(); ++it)543{544int label = it->first;545if (confidenceScores.rows <= label)546CV_Error_(cv::Error::StsError, ("Could not find confidence predictions for label %d", 
label));547const std::vector<float>& scores = confidenceScores.row(label);548int locLabel = _shareLocation ? -1 : label;549LabelBBox::const_iterator label_bboxes = decodeBBoxes.find(locLabel);550if (label_bboxes == decodeBBoxes.end())551CV_Error_(cv::Error::StsError, ("Could not find location predictions for label %d", locLabel));552const std::vector<int>& indices = it->second;553554for (size_t j = 0; j < indices.size(); ++j, ++count)555{556int idx = indices[j];557int dstIdx = dstIndices[count];558const util::NormalizedBBox& decode_bbox = label_bboxes->second[idx];559outputsData[dstIdx * 7] = i;560outputsData[dstIdx * 7 + 1] = label;561outputsData[dstIdx * 7 + 2] = scores[idx];562outputsData[dstIdx * 7 + 3] = decode_bbox.xmin;563outputsData[dstIdx * 7 + 4] = decode_bbox.ymin;564outputsData[dstIdx * 7 + 5] = decode_bbox.xmax;565outputsData[dstIdx * 7 + 6] = decode_bbox.ymax;566}567}568return count;569}570571size_t processDetections_(572const LabelBBox& decodeBBoxes, Mat& confidenceScores,573std::vector<std::map<int, std::vector<int> > >& allIndices574)575{576std::map<int, std::vector<int> > indices;577size_t numDetections = 0;578for (int c = 0; c < (int)_numClasses; ++c)579{580if (c == _backgroundLabelId)581continue; // Ignore background class.582if (c >= confidenceScores.rows)583CV_Error_(cv::Error::StsError, ("Could not find confidence predictions for label %d", c));584585const std::vector<float> scores = confidenceScores.row(c);586int label = _shareLocation ? 
-1 : c;587588LabelBBox::const_iterator label_bboxes = decodeBBoxes.find(label);589if (label_bboxes == decodeBBoxes.end())590CV_Error_(cv::Error::StsError, ("Could not find location predictions for label %d", label));591if (_bboxesNormalized)592NMSFast_(label_bboxes->second, scores, _confidenceThreshold, _nmsThreshold, 1.0, _topK,593indices[c], util::caffe_norm_box_overlap);594else595NMSFast_(label_bboxes->second, scores, _confidenceThreshold, _nmsThreshold, 1.0, _topK,596indices[c], util::caffe_box_overlap);597numDetections += indices[c].size();598}599if (_keepTopK > -1 && numDetections > (size_t)_keepTopK)600{601std::vector<std::pair<float, std::pair<int, int> > > scoreIndexPairs;602for (std::map<int, std::vector<int> >::iterator it = indices.begin();603it != indices.end(); ++it)604{605int label = it->first;606const std::vector<int>& labelIndices = it->second;607if (label >= confidenceScores.rows)608CV_Error_(cv::Error::StsError, ("Could not find location predictions for label %d", label));609const std::vector<float>& scores = confidenceScores.row(label);610for (size_t j = 0; j < labelIndices.size(); ++j)611{612size_t idx = labelIndices[j];613CV_Assert(idx < scores.size());614scoreIndexPairs.push_back(std::make_pair(scores[idx], std::make_pair(label, idx)));615}616}617// Keep outputs k results per image.618std::sort(scoreIndexPairs.begin(), scoreIndexPairs.end(),619util::SortScorePairDescend<std::pair<int, int> >);620scoreIndexPairs.resize(_keepTopK);621622std::map<int, std::vector<int> > newIndices;623for (size_t j = 0; j < scoreIndexPairs.size(); ++j)624{625int label = scoreIndexPairs[j].second.first;626int idx = scoreIndexPairs[j].second.second;627newIndices[label].push_back(idx);628}629allIndices.push_back(newIndices);630return (size_t)_keepTopK;631}632else633{634allIndices.push_back(indices);635return numDetections;636}637}638639640// **************************************************************641// Utility functions642// 
**************************************************************643644// Compute bbox size645static float BBoxSize(const util::NormalizedBBox& bbox, bool normalized)646{647if (bbox.xmax < bbox.xmin || bbox.ymax < bbox.ymin)648{649return 0; // If bbox is invalid (e.g. xmax < xmin or ymax < ymin), return 0.650}651else652{653if (bbox.has_size())654{655return bbox.size();656}657else658{659float width = bbox.xmax - bbox.xmin;660float height = bbox.ymax - bbox.ymin;661if (normalized)662{663return width * height;664}665else666{667// If bbox is not within range [0, 1].668return (width + 1) * (height + 1);669}670}671}672}673674675// Decode a bbox according to a prior bbox676template<bool variance_encoded_in_target>677static void DecodeBBox(678const util::NormalizedBBox& prior_bbox, const std::vector<float>& prior_variance,679const cv::String& code_type,680const bool clip_bbox, const util::NormalizedBBox& clip_bounds,681const bool normalized_bbox, const util::NormalizedBBox& bbox,682util::NormalizedBBox& decode_bbox)683{684float bbox_xmin = variance_encoded_in_target ? bbox.xmin : prior_variance[0] * bbox.xmin;685float bbox_ymin = variance_encoded_in_target ? bbox.ymin : prior_variance[1] * bbox.ymin;686float bbox_xmax = variance_encoded_in_target ? bbox.xmax : prior_variance[2] * bbox.xmax;687float bbox_ymax = variance_encoded_in_target ? 
bbox.ymax : prior_variance[3] * bbox.ymax;688if (code_type == "CORNER")689{690decode_bbox.xmin = prior_bbox.xmin + bbox_xmin;691decode_bbox.ymin = prior_bbox.ymin + bbox_ymin;692decode_bbox.xmax = prior_bbox.xmax + bbox_xmax;693decode_bbox.ymax = prior_bbox.ymax + bbox_ymax;694}695else if (code_type == "CENTER_SIZE")696{697float prior_width = prior_bbox.xmax - prior_bbox.xmin;698float prior_height = prior_bbox.ymax - prior_bbox.ymin;699if (!normalized_bbox)700{701prior_width += 1.0f;702prior_height += 1.0f;703}704CV_Assert(prior_width > 0);705CV_Assert(prior_height > 0);706float prior_center_x = prior_bbox.xmin + prior_width * .5;707float prior_center_y = prior_bbox.ymin + prior_height * .5;708709float decode_bbox_center_x, decode_bbox_center_y;710float decode_bbox_width, decode_bbox_height;711decode_bbox_center_x = bbox_xmin * prior_width + prior_center_x;712decode_bbox_center_y = bbox_ymin * prior_height + prior_center_y;713decode_bbox_width = exp(bbox_xmax) * prior_width;714decode_bbox_height = exp(bbox_ymax) * prior_height;715decode_bbox.xmin = decode_bbox_center_x - decode_bbox_width * .5;716decode_bbox.ymin = decode_bbox_center_y - decode_bbox_height * .5;717decode_bbox.xmax = decode_bbox_center_x + decode_bbox_width * .5;718decode_bbox.ymax = decode_bbox_center_y + decode_bbox_height * .5;719}720else721CV_Error(Error::StsBadArg, "Unknown type.");722723if (clip_bbox)724{725// Clip the util::NormalizedBBox.726decode_bbox.xmin = std::max(std::min(decode_bbox.xmin, clip_bounds.xmax), clip_bounds.xmin);727decode_bbox.ymin = std::max(std::min(decode_bbox.ymin, clip_bounds.ymax), clip_bounds.ymin);728decode_bbox.xmax = std::max(std::min(decode_bbox.xmax, clip_bounds.xmax), clip_bounds.xmin);729decode_bbox.ymax = std::max(std::min(decode_bbox.ymax, clip_bounds.ymax), clip_bounds.ymin);730}731decode_bbox.clear_size();732decode_bbox.set_size(BBoxSize(decode_bbox, normalized_bbox));733}734735// Decode a set of bboxes according to a set of prior bboxes736static void 
DecodeBBoxes(737const std::vector<util::NormalizedBBox>& prior_bboxes,738const std::vector<std::vector<float> >& prior_variances,739const cv::String& code_type, const bool variance_encoded_in_target,740const bool clip_bbox, const util::NormalizedBBox& clip_bounds,741const bool normalized_bbox, const std::vector<util::NormalizedBBox>& bboxes,742std::vector<util::NormalizedBBox>& decode_bboxes)743{744CV_Assert(prior_bboxes.size() == prior_variances.size());745CV_Assert(prior_bboxes.size() == bboxes.size());746size_t num_bboxes = prior_bboxes.size();747CV_Assert(num_bboxes == 0 || prior_variances[0].size() == 4);748decode_bboxes.clear(); decode_bboxes.resize(num_bboxes);749if(variance_encoded_in_target)750{751for (int i = 0; i < num_bboxes; ++i)752DecodeBBox<true>(prior_bboxes[i], prior_variances[i], code_type,753clip_bbox, clip_bounds, normalized_bbox,754bboxes[i], decode_bboxes[i]);755}756else757{758for (int i = 0; i < num_bboxes; ++i)759DecodeBBox<false>(prior_bboxes[i], prior_variances[i], code_type,760clip_bbox, clip_bounds, normalized_bbox,761bboxes[i], decode_bboxes[i]);762}763}764765// Decode all bboxes in a batch766static void DecodeBBoxesAll(const std::vector<LabelBBox>& all_loc_preds,767const std::vector<util::NormalizedBBox>& prior_bboxes,768const std::vector<std::vector<float> >& prior_variances,769const int num, const bool share_location,770const int num_loc_classes, const int background_label_id,771const cv::String& code_type, const bool variance_encoded_in_target,772const bool clip, const util::NormalizedBBox& clip_bounds,773const bool normalized_bbox, std::vector<LabelBBox>& all_decode_bboxes)774{775CV_Assert(all_loc_preds.size() == num);776all_decode_bboxes.clear();777all_decode_bboxes.resize(num);778for (int i = 0; i < num; ++i)779{780// Decode predictions into bboxes.781const LabelBBox& loc_preds = all_loc_preds[i];782LabelBBox& decode_bboxes = all_decode_bboxes[i];783for (int c = 0; c < num_loc_classes; ++c)784{785int label = share_location ? 
-1 : c;786if (label == background_label_id)787continue; // Ignore background class.788LabelBBox::const_iterator label_loc_preds = loc_preds.find(label);789if (label_loc_preds == loc_preds.end())790CV_Error_(cv::Error::StsError, ("Could not find location predictions for label %d", label));791DecodeBBoxes(prior_bboxes, prior_variances,792code_type, variance_encoded_in_target, clip, clip_bounds,793normalized_bbox, label_loc_preds->second, decode_bboxes[label]);794}795}796}797798// Get prior bounding boxes from prior_data799// prior_data: 1 x 2 x num_priors * 4 x 1 blob.800// num_priors: number of priors.801// prior_bboxes: stores all the prior bboxes in the format of util::NormalizedBBox.802// prior_variances: stores all the variances needed by prior bboxes.803static void GetPriorBBoxes(const float* priorData, const int& numPriors,804bool normalized_bbox, std::vector<util::NormalizedBBox>& priorBBoxes,805std::vector<std::vector<float> >& priorVariances)806{807priorBBoxes.clear(); priorBBoxes.resize(numPriors);808priorVariances.clear(); priorVariances.resize(numPriors);809for (int i = 0; i < numPriors; ++i)810{811int startIdx = i * 4;812util::NormalizedBBox& bbox = priorBBoxes[i];813bbox.xmin = priorData[startIdx];814bbox.ymin = priorData[startIdx + 1];815bbox.xmax = priorData[startIdx + 2];816bbox.ymax = priorData[startIdx + 3];817bbox.set_size(BBoxSize(bbox, normalized_bbox));818}819820for (int i = 0; i < numPriors; ++i)821{822int startIdx = (numPriors + i) * 4;823// not needed here: priorVariances[i].clear();824for (int j = 0; j < 4; ++j)825{826priorVariances[i].push_back(priorData[startIdx + j]);827}828}829}830831// Get location predictions from loc_data.832// loc_data: num x num_preds_per_class * num_loc_classes * 4 blob.833// num: the number of images.834// num_preds_per_class: number of predictions per class.835// num_loc_classes: number of location classes. 
It is 1 if share_location is836// true; and is equal to number of classes needed to predict otherwise.837// share_location: if true, all classes share the same location prediction.838// loc_pred_transposed: if true, represent four bounding box values as839// [y,x,height,width] or [x,y,width,height] otherwise.840// loc_preds: stores the location prediction, where each item contains841// location prediction for an image.842static void GetLocPredictions(const float* locData, const int num,843const int numPredsPerClass, const int numLocClasses,844const bool shareLocation, const bool locPredTransposed,845std::vector<LabelBBox>& locPreds)846{847locPreds.clear();848if (shareLocation)849{850CV_Assert(numLocClasses == 1);851}852locPreds.resize(num);853for (int i = 0; i < num; ++i, locData += numPredsPerClass * numLocClasses * 4)854{855LabelBBox& labelBBox = locPreds[i];856for (int p = 0; p < numPredsPerClass; ++p)857{858int startIdx = p * numLocClasses * 4;859for (int c = 0; c < numLocClasses; ++c)860{861int label = shareLocation ? 
-1 : c;862if (labelBBox.find(label) == labelBBox.end())863{864labelBBox[label].resize(numPredsPerClass);865}866util::NormalizedBBox& bbox = labelBBox[label][p];867if (locPredTransposed)868{869bbox.ymin = locData[startIdx + c * 4];870bbox.xmin = locData[startIdx + c * 4 + 1];871bbox.ymax = locData[startIdx + c * 4 + 2];872bbox.xmax = locData[startIdx + c * 4 + 3];873}874else875{876bbox.xmin = locData[startIdx + c * 4];877bbox.ymin = locData[startIdx + c * 4 + 1];878bbox.xmax = locData[startIdx + c * 4 + 2];879bbox.ymax = locData[startIdx + c * 4 + 3];880}881}882}883}884}885886// Get confidence predictions from conf_data.887// conf_data: num x num_preds_per_class * num_classes blob.888// num: the number of images.889// num_preds_per_class: number of predictions per class.890// num_classes: number of classes.891// conf_preds: stores the confidence prediction, where each item contains892// confidence prediction for an image.893static void GetConfidenceScores(const float* confData, const int num,894const int numPredsPerClass, const int numClasses,895std::vector<Mat>& confPreds)896{897int shape[] = { numClasses, numPredsPerClass };898for (int i = 0; i < num; i++)899confPreds.push_back(Mat(2, shape, CV_32F));900901for (int i = 0; i < num; ++i, confData += numPredsPerClass * numClasses)902{903Mat labelScores = confPreds[i];904for (int c = 0; c < numClasses; ++c)905{906for (int p = 0; p < numPredsPerClass; ++p)907{908labelScores.at<float>(c, p) = confData[p * numClasses + c];909}910}911}912}913914// Compute the jaccard (intersection over union IoU) overlap between two bboxes.915template<bool normalized>916static float JaccardOverlap(const util::NormalizedBBox& bbox1,917const util::NormalizedBBox& bbox2)918{919util::NormalizedBBox intersect_bbox;920intersect_bbox.xmin = std::max(bbox1.xmin, bbox2.xmin);921intersect_bbox.ymin = std::max(bbox1.ymin, bbox2.ymin);922intersect_bbox.xmax = std::min(bbox1.xmax, bbox2.xmax);923intersect_bbox.ymax = std::min(bbox1.ymax, 
bbox2.ymax);924925float intersect_size = BBoxSize(intersect_bbox, normalized);926if (intersect_size > 0)927{928float bbox1_size = BBoxSize(bbox1, normalized);929float bbox2_size = BBoxSize(bbox2, normalized);930return intersect_size / (bbox1_size + bbox2_size - intersect_size);931}932else933{934return 0.;935}936}937938virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE939{940#ifdef HAVE_INF_ENGINE941InferenceEngine::LayerParams lp;942lp.name = name;943lp.type = "DetectionOutput";944lp.precision = InferenceEngine::Precision::FP32;945std::shared_ptr<InferenceEngine::CNNLayer> ieLayer(new InferenceEngine::CNNLayer(lp));946947ieLayer->params["num_classes"] = format("%d", _numClasses);948ieLayer->params["share_location"] = _shareLocation ? "1" : "0";949ieLayer->params["background_label_id"] = format("%d", _backgroundLabelId);950ieLayer->params["nms_threshold"] = format("%f", _nmsThreshold);951ieLayer->params["top_k"] = format("%d", _topK);952ieLayer->params["keep_top_k"] = format("%d", _keepTopK);953ieLayer->params["eta"] = "1.0";954ieLayer->params["confidence_threshold"] = format("%f", _confidenceThreshold);955ieLayer->params["variance_encoded_in_target"] = _varianceEncodedInTarget ? "1" : "0";956ieLayer->params["code_type"] = "caffe.PriorBoxParameter." 
+ _codeType;957return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));958#endif // HAVE_INF_ENGINE959return Ptr<BackendNode>();960}961};962963float util::caffe_box_overlap(const util::NormalizedBBox& a, const util::NormalizedBBox& b)964{965return DetectionOutputLayerImpl::JaccardOverlap<false>(a, b);966}967968float util::caffe_norm_box_overlap(const util::NormalizedBBox& a, const util::NormalizedBBox& b)969{970return DetectionOutputLayerImpl::JaccardOverlap<true>(a, b);971}972973const std::string DetectionOutputLayerImpl::_layerName = std::string("DetectionOutput");974975Ptr<DetectionOutputLayer> DetectionOutputLayer::create(const LayerParams ¶ms)976{977return Ptr<DetectionOutputLayer>(new DetectionOutputLayerImpl(params));978}979980}981}982983984