CoCalc -- detection_output

GitHub Repository: Tetragramm/opencv
Path: blob/master/modules/dnn/src/layers/detection_output_layer.cpp
¹⁶³³⁷ views
1
/*M ///////////////////////////////////////////////////////////////////////////////////////
2
//
3
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4
//
5
//  By downloading, copying, installing or using the software you agree to this license.
6
//  If you do not agree to this license, do not download, install,
7
//  copy or use the software.
8
//
9
//
10
//                           License Agreement
11
//                For Open Source Computer Vision Library
12
//
13
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14
// Copyright (C) 2017, Intel Corporation, all rights reserved.
15
// Third party copyrights are property of their respective owners.
16
//
17
// Redistribution and use in source and binary forms, with or without modification,
18
// are permitted provided that the following conditions are met:
19
//
20
//   * Redistribution's of source code must retain the above copyright notice,
21
//     this list of conditions and the following disclaimer.
22
//
23
//   * Redistribution's in binary form must reproduce the above copyright notice,
24
//     this list of conditions and the following disclaimer in the documentation
25
//     and/or other materials provided with the distribution.
26
//
27
//   * The name of the copyright holders may not be used to endorse or promote products
28
//     derived from this software without specific prior written permission.
29
//
30
// This software is provided by the copyright holders and contributors "as is" and
31
// any express or implied warranties, including, but not limited to, the implied
32
// warranties of merchantability and fitness for a particular purpose are disclaimed.
33
// In no event shall the Intel Corporation or contributors be liable for any direct,
34
// indirect, incidental, special, exemplary, or consequential damages
35
// (including, but not limited to, procurement of substitute goods or services;
36
// loss of use, data, or profits; or business interruption) however caused
37
// and on any theory of liability, whether in contract, strict liability,
38
// or tort (including negligence or otherwise) arising in any way out of
39
// the use of this software, even if advised of the possibility of such damage.
40
//
41
//M*/
42

43
#include "../precomp.hpp"
44
#include "layers_common.hpp"
45
#include "../op_inf_engine.hpp"
46
#include <float.h>
47
#include <string>
48
#include "../nms.inl.hpp"
49

50
#ifdef HAVE_OPENCL
51
#include "opencl_kernels_dnn.hpp"
52
#endif
53

54
namespace cv
55
{
56
namespace dnn
57
{
58

59
namespace util
60
{
61

62
class NormalizedBBox
63
{
64
public:
65
    float xmin, ymin, xmax, ymax;
66

67
    NormalizedBBox()
68
        : xmin(0), ymin(0), xmax(0), ymax(0), has_size_(false), size_(0) {}
69

70
    float size() const { return size_; }
71

72
    bool has_size() const { return has_size_; }
73

74
    void set_size(float value) { size_ = value; has_size_ = true; }
75

76
    void clear_size() { size_ = 0; has_size_ = false; }
77

78
private:
79
    bool has_size_;
80
    float size_;
81
};
82

83
template <typename T>
84
static inline bool SortScorePairDescend(const std::pair<float, T>& pair1,
85
                          const std::pair<float, T>& pair2)
86
{
87
    return pair1.first > pair2.first;
88
}
89

90
static inline float caffe_box_overlap(const util::NormalizedBBox& a, const util::NormalizedBBox& b);
91

92
static inline float caffe_norm_box_overlap(const util::NormalizedBBox& a, const util::NormalizedBBox& b);
93

94
} // namespace
95

96
class DetectionOutputLayerImpl CV_FINAL : public DetectionOutputLayer
97
{
98
public:
99
    unsigned _numClasses;
100
    bool _shareLocation;
101
    int _numLocClasses;
102

103
    int _backgroundLabelId;
104

105
    cv::String _codeType;
106

107
    bool _varianceEncodedInTarget;
108
    int _keepTopK;
109
    float _confidenceThreshold;
110

111
    float _nmsThreshold;
112
    int _topK;
113
    // Whenever predicted bounding boxes are represented in YXHW instead of XYWH layout.
114
    bool _locPredTransposed;
115
    // It's true whenever predicted bounding boxes and proposals are normalized to [0, 1].
116
    bool _bboxesNormalized;
117
    bool _clip;
118
    bool _groupByClasses;
119

120
    enum { _numAxes = 4 };
121
    static const std::string _layerName;
122

123
    typedef std::map<int, std::vector<util::NormalizedBBox> > LabelBBox;
124

125
    bool getParameterDict(const LayerParams &params,
126
                          const std::string &parameterName,
127
                          DictValue& result)
128
    {
129
        if (!params.has(parameterName))
130
        {
131
            return false;
132
        }
133

134
        result = params.get(parameterName);
135
        return true;
136
    }
137

138
    template<typename T>
139
    T getParameter(const LayerParams &params,
140
                   const std::string &parameterName,
141
                   const size_t &idx=0,
142
                   const bool required=true,
143
                   const T& defaultValue=T())
144
    {
145
        DictValue dictValue;
146
        bool success = getParameterDict(params, parameterName, dictValue);
147
        if(!success)
148
        {
149
            if(required)
150
            {
151
                std::string message = _layerName;
152
                message += " layer parameter does not contain ";
153
                message += parameterName;
154
                message += " parameter.";
155
                CV_Error(Error::StsBadArg, message);
156
            }
157
            else
158
            {
159
                return defaultValue;
160
            }
161
        }
162
        return dictValue.get<T>(idx);
163
    }
164

165
    void getCodeType(const LayerParams &params)
166
    {
167
        String codeTypeString = toLowerCase(params.get<String>("code_type"));
168
        if (codeTypeString == "center_size")
169
            _codeType = "CENTER_SIZE";
170
        else
171
            _codeType = "CORNER";
172
    }
173

174
    DetectionOutputLayerImpl(const LayerParams &params)
175
    {
176
        _numClasses = getParameter<unsigned>(params, "num_classes");
177
        _shareLocation = getParameter<bool>(params, "share_location");
178
        _numLocClasses = _shareLocation ? 1 : _numClasses;
179
        _backgroundLabelId = getParameter<int>(params, "background_label_id");
180
        _varianceEncodedInTarget = getParameter<bool>(params, "variance_encoded_in_target", 0, false, false);
181
        _keepTopK = getParameter<int>(params, "keep_top_k");
182
        _confidenceThreshold = getParameter<float>(params, "confidence_threshold", 0, false, -FLT_MAX);
183
        _topK = getParameter<int>(params, "top_k", 0, false, -1);
184
        _locPredTransposed = getParameter<bool>(params, "loc_pred_transposed", 0, false, false);
185
        _bboxesNormalized = getParameter<bool>(params, "normalized_bbox", 0, false, true);
186
        _clip = getParameter<bool>(params, "clip", 0, false, false);
187
        _groupByClasses = getParameter<bool>(params, "group_by_classes", 0, false, true);
188

189
        getCodeType(params);
190

191
        // Parameters used in nms.
192
        _nmsThreshold = getParameter<float>(params, "nms_threshold");
193
        CV_Assert(_nmsThreshold > 0.);
194

195
        setParamsFrom(params);
196
    }
197

198
    virtual bool supportBackend(int backendId) CV_OVERRIDE
199
    {
200
        return backendId == DNN_BACKEND_OPENCV ||
201
               backendId == DNN_BACKEND_INFERENCE_ENGINE && !_locPredTransposed && _bboxesNormalized && !_clip;
202
    }
203

204
    bool getMemoryShapes(const std::vector<MatShape> &inputs,
205
                         const int requiredOutputs,
206
                         std::vector<MatShape> &outputs,
207
                         std::vector<MatShape> &internals) const CV_OVERRIDE
208
    {
209
        CV_Assert(inputs.size() >= 3);
210
        CV_Assert(inputs[0][0] == inputs[1][0]);
211

212
        int numPriors = inputs[2][2] / 4;
213
        CV_Assert((numPriors * _numLocClasses * 4) == total(inputs[0], 1));
214
        CV_Assert(int(numPriors * _numClasses) == total(inputs[1], 1));
215
        CV_Assert(inputs[2][1] == 1 + (int)(!_varianceEncodedInTarget));
216

217
        // num() and channels() are 1.
218
        // Since the number of bboxes to be kept is unknown before nms, we manually
219
        // set it to maximal number of detections, [keep_top_k] parameter.
220
        // Each row is a 7 dimension std::vector, which stores
221
        // [image_id, label, confidence, xmin, ymin, xmax, ymax]
222
        outputs.resize(1, shape(1, 1, _keepTopK, 7));
223

224
        return false;
225
    }
226

227
#ifdef HAVE_OPENCL
228
    // Decode all bboxes in a batch
229
    bool ocl_DecodeBBoxesAll(UMat& loc_mat, UMat& prior_mat,
230
                             const int num, const int numPriors, const bool share_location,
231
                             const int num_loc_classes, const int background_label_id,
232
                             const cv::String& code_type, const bool variance_encoded_in_target,
233
                             const bool clip, std::vector<LabelBBox>& all_decode_bboxes)
234
    {
235
        UMat outmat = UMat(loc_mat.dims, loc_mat.size, CV_32F);
236
        size_t nthreads = loc_mat.total();
237
        String kernel_name;
238

239
        if (code_type == "CORNER")
240
            kernel_name = "DecodeBBoxesCORNER";
241
        else if (code_type == "CENTER_SIZE")
242
            kernel_name = "DecodeBBoxesCENTER_SIZE";
243
        else
244
            return false;
245

246
        for (int i = 0; i < num; ++i)
247
        {
248
            ocl::Kernel kernel(kernel_name.c_str(), ocl::dnn::detection_output_oclsrc);
249
            kernel.set(0, (int)nthreads);
250
            kernel.set(1, ocl::KernelArg::PtrReadOnly(loc_mat));
251
            kernel.set(2, ocl::KernelArg::PtrReadOnly(prior_mat));
252
            kernel.set(3, (int)variance_encoded_in_target);
253
            kernel.set(4, (int)numPriors);
254
            kernel.set(5, (int)share_location);
255
            kernel.set(6, (int)num_loc_classes);
256
            kernel.set(7, (int)background_label_id);
257
            kernel.set(8, (int)clip);
258
            kernel.set(9, (int)_locPredTransposed);
259
            kernel.set(10, ocl::KernelArg::PtrWriteOnly(outmat));
260

261
            if (!kernel.run(1, &nthreads, NULL, false))
262
                return false;
263
        }
264

265
        all_decode_bboxes.clear();
266
        all_decode_bboxes.resize(num);
267
        {
268
            Mat mat = outmat.getMat(ACCESS_READ);
269
            const float* decode_data = mat.ptr<float>();
270
            for (int i = 0; i < num; ++i)
271
            {
272
                LabelBBox& decode_bboxes = all_decode_bboxes[i];
273
                for (int c = 0; c < num_loc_classes; ++c)
274
                {
275
                    int label = share_location ? -1 : c;
276
                    decode_bboxes[label].resize(numPriors);
277
                    for (int p = 0; p < numPriors; ++p)
278
                    {
279
                        int startIdx = p * num_loc_classes * 4;
280
                        util::NormalizedBBox& bbox = decode_bboxes[label][p];
281
                        bbox.xmin = decode_data[startIdx + c * 4];
282
                        bbox.ymin = decode_data[startIdx + c * 4 + 1];
283
                        bbox.xmax = decode_data[startIdx + c * 4 + 2];
284
                        bbox.ymax = decode_data[startIdx + c * 4 + 3];
285
                    }
286
                }
287
            }
288
        }
289
        return true;
290
    }
291

292
    void ocl_GetConfidenceScores(const UMat& inp1, const int num,
293
                                 const int numPredsPerClass, const int numClasses,
294
                                 std::vector<Mat>& confPreds)
295
    {
296
        int shape[] = { numClasses, numPredsPerClass };
297
        for (int i = 0; i < num; i++)
298
            confPreds.push_back(Mat(2, shape, CV_32F));
299

300
        shape[0] = num * numPredsPerClass;
301
        shape[1] = inp1.total() / shape[0];
302
        UMat umat = inp1.reshape(1, 2, &shape[0]);
303
        for (int i = 0; i < num; ++i)
304
        {
305
            Range ranges[] = { Range(i * numPredsPerClass, (i + 1) * numPredsPerClass), Range::all() };
306
            transpose(umat(ranges), confPreds[i]);
307
        }
308
    }
309

310
    bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
311
    {
312
        std::vector<UMat> inputs;
313
        std::vector<UMat> outputs;
314

315
        bool use_half = (inps.depth() == CV_16S);
316
        if (use_half)
317
        {
318
            std::vector<UMat> orig_inputs;
319
            std::vector<UMat> orig_outputs;
320

321
            inps.getUMatVector(orig_inputs);
322
            outs.getUMatVector(orig_outputs);
323

324
            inputs.resize(orig_inputs.size());
325
            for (size_t i = 0; i < orig_inputs.size(); i++)
326
                convertFp16(orig_inputs[i], inputs[i]);
327
        }
328
        else
329
        {
330
            inps.getUMatVector(inputs);
331
            outs.getUMatVector(outputs);
332
        }
333

334
        std::vector<LabelBBox> allDecodedBBoxes;
335
        std::vector<Mat> allConfidenceScores;
336

337
        int num = inputs[0].size[0];
338

339
        // extract predictions from input layers
340
        {
341
            int numPriors = inputs[2].size[2] / 4;
342

343
            // Retrieve all confidences
344
            ocl_GetConfidenceScores(inputs[1], num, numPriors, _numClasses, allConfidenceScores);
345

346
            // Decode all loc predictions to bboxes
347
            bool ret = ocl_DecodeBBoxesAll(inputs[0], inputs[2], num, numPriors,
348
                                           _shareLocation, _numLocClasses, _backgroundLabelId,
349
                                           _codeType, _varianceEncodedInTarget, _clip,
350
                                           allDecodedBBoxes);
351
            if (!ret)
352
                return false;
353
        }
354

355
        size_t numKept = 0;
356
        std::vector<std::map<int, std::vector<int> > > allIndices;
357
        for (int i = 0; i < num; ++i)
358
        {
359
            numKept += processDetections_(allDecodedBBoxes[i], allConfidenceScores[i], allIndices);
360
        }
361

362
        if (numKept == 0)
363
        {
364
            // Set confidences to zeros.
365
            Range ranges[] = {Range::all(), Range::all(), Range::all(), Range(2, 3)};
366
            if (use_half)
367
            {
368
                std::vector<UMat> orig_outputs;
369
                outs.getUMatVector(orig_outputs);
370
                orig_outputs[0](ranges).setTo(0);
371
            } else
372
                outputs[0](ranges).setTo(0);
373
            return true;
374
        }
375
        int outputShape[] = {1, 1, (int)numKept, 7};
376
        UMat umat = UMat(4, outputShape, CV_32F);
377
        {
378
            Mat mat = umat.getMat(ACCESS_WRITE);
379
            float* outputsData = mat.ptr<float>();
380

381
            size_t count = 0;
382
            for (int i = 0; i < num; ++i)
383
            {
384
                count += outputDetections_(i, &outputsData[count * 7],
385
                                           allDecodedBBoxes[i], allConfidenceScores[i],
386
                                           allIndices[i], _groupByClasses);
387
            }
388
            CV_Assert(count == numKept);
389
        }
390

391
        if (use_half)
392
        {
393
            UMat half_umat;
394
            convertFp16(umat, half_umat);
395

396
            std::vector<UMat> orig_outputs;
397
            outs.getUMatVector(orig_outputs);
398
            orig_outputs.clear();
399
            orig_outputs.push_back(half_umat);
400
            outs.assign(orig_outputs);
401
        } else {
402
            outputs.clear();
403
            outputs.push_back(umat);
404
            outs.assign(outputs);
405
        }
406

407
        return true;
408
    }
409
#endif
410

411
    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
412
    {
413
        CV_TRACE_FUNCTION();
414
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
415

416
        if (_bboxesNormalized)
417
        {
418
            CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
419
                       forward_ocl(inputs_arr, outputs_arr, internals_arr))
420
        }
421
        if (inputs_arr.depth() == CV_16S)
422
        {
423
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
424
            return;
425
        }
426

427
        std::vector<Mat> inputs, outputs;
428
        inputs_arr.getMatVector(inputs);
429
        outputs_arr.getMatVector(outputs);
430

431
        std::vector<LabelBBox> allDecodedBBoxes;
432
        std::vector<Mat> allConfidenceScores;
433

434
        int num = inputs[0].size[0];
435

436
        // extract predictions from input layers
437
        {
438
            int numPriors = inputs[2].size[2] / 4;
439

440
            const float* locationData = inputs[0].ptr<float>();
441
            const float* confidenceData = inputs[1].ptr<float>();
442
            const float* priorData = inputs[2].ptr<float>();
443

444
            // Retrieve all location predictions
445
            std::vector<LabelBBox> allLocationPredictions;
446
            GetLocPredictions(locationData, num, numPriors, _numLocClasses,
447
                              _shareLocation, _locPredTransposed, allLocationPredictions);
448

449
            // Retrieve all confidences
450
            GetConfidenceScores(confidenceData, num, numPriors, _numClasses, allConfidenceScores);
451

452
            // Retrieve all prior bboxes
453
            std::vector<util::NormalizedBBox> priorBBoxes;
454
            std::vector<std::vector<float> > priorVariances;
455
            GetPriorBBoxes(priorData, numPriors, _bboxesNormalized, priorBBoxes, priorVariances);
456

457
            // Decode all loc predictions to bboxes
458
            util::NormalizedBBox clipBounds;
459
            if (_clip)
460
            {
461
                CV_Assert(_bboxesNormalized || inputs.size() >= 4);
462
                clipBounds.xmin = clipBounds.ymin = 0.0f;
463
                if (_bboxesNormalized)
464
                    clipBounds.xmax = clipBounds.ymax = 1.0f;
465
                else
466
                {
467
                    // Input image sizes;
468
                    CV_Assert(inputs[3].dims == 4);
469
                    clipBounds.xmax = inputs[3].size[3] - 1;
470
                    clipBounds.ymax = inputs[3].size[2] - 1;
471
                }
472
            }
473
            DecodeBBoxesAll(allLocationPredictions, priorBBoxes, priorVariances, num,
474
                            _shareLocation, _numLocClasses, _backgroundLabelId,
475
                            _codeType, _varianceEncodedInTarget, _clip, clipBounds,
476
                            _bboxesNormalized, allDecodedBBoxes);
477
        }
478

479
        size_t numKept = 0;
480
        std::vector<std::map<int, std::vector<int> > > allIndices;
481
        for (int i = 0; i < num; ++i)
482
        {
483
            numKept += processDetections_(allDecodedBBoxes[i], allConfidenceScores[i], allIndices);
484
        }
485

486
        if (numKept == 0)
487
        {
488
            // Set confidences to zeros.
489
            Range ranges[] = {Range::all(), Range::all(), Range::all(), Range(2, 3)};
490
            outputs[0](ranges).setTo(0);
491
            return;
492
        }
493
        int outputShape[] = {1, 1, (int)numKept, 7};
494
        outputs[0].create(4, outputShape, CV_32F);
495
        float* outputsData = outputs[0].ptr<float>();
496

497
        size_t count = 0;
498
        for (int i = 0; i < num; ++i)
499
        {
500
            count += outputDetections_(i, &outputsData[count * 7],
501
                                       allDecodedBBoxes[i], allConfidenceScores[i],
502
                                       allIndices[i], _groupByClasses);
503
        }
504
        CV_Assert(count == numKept);
505
        // Sync results back due changed output shape.
506
        outputs_arr.assign(outputs);
507
    }
508

509
    size_t outputDetections_(
510
            const int i, float* outputsData,
511
            const LabelBBox& decodeBBoxes, Mat& confidenceScores,
512
            const std::map<int, std::vector<int> >& indicesMap,
513
            bool groupByClasses
514
    )
515
    {
516
        std::vector<int> dstIndices;
517
        std::vector<std::pair<float, int> > allScores;
518
        for (std::map<int, std::vector<int> >::const_iterator it = indicesMap.begin(); it != indicesMap.end(); ++it)
519
        {
520
            int label = it->first;
521
            if (confidenceScores.rows <= label)
522
                CV_Error_(cv::Error::StsError, ("Could not find confidence predictions for label %d", label));
523
            const std::vector<float>& scores = confidenceScores.row(label);
524
            const std::vector<int>& indices = it->second;
525

526
            const int numAllScores = allScores.size();
527
            allScores.reserve(numAllScores + indices.size());
528
            for (size_t j = 0; j < indices.size(); ++j)
529
            {
530
                allScores.push_back(std::make_pair(scores[indices[j]], numAllScores + j));
531
            }
532
        }
533
        if (!groupByClasses)
534
            std::sort(allScores.begin(), allScores.end(), util::SortScorePairDescend<int>);
535

536
        dstIndices.resize(allScores.size());
537
        for (size_t j = 0; j < dstIndices.size(); ++j)
538
        {
539
            dstIndices[allScores[j].second] = j;
540
        }
541

542
        size_t count = 0;
543
        for (std::map<int, std::vector<int> >::const_iterator it = indicesMap.begin(); it != indicesMap.end(); ++it)
544
        {
545
            int label = it->first;
546
            if (confidenceScores.rows <= label)
547
                CV_Error_(cv::Error::StsError, ("Could not find confidence predictions for label %d", label));
548
            const std::vector<float>& scores = confidenceScores.row(label);
549
            int locLabel = _shareLocation ? -1 : label;
550
            LabelBBox::const_iterator label_bboxes = decodeBBoxes.find(locLabel);
551
            if (label_bboxes == decodeBBoxes.end())
552
                CV_Error_(cv::Error::StsError, ("Could not find location predictions for label %d", locLabel));
553
            const std::vector<int>& indices = it->second;
554

555
            for (size_t j = 0; j < indices.size(); ++j, ++count)
556
            {
557
                int idx = indices[j];
558
                int dstIdx = dstIndices[count];
559
                const util::NormalizedBBox& decode_bbox = label_bboxes->second[idx];
560
                outputsData[dstIdx * 7] = i;
561
                outputsData[dstIdx * 7 + 1] = label;
562
                outputsData[dstIdx * 7 + 2] = scores[idx];
563
                outputsData[dstIdx * 7 + 3] = decode_bbox.xmin;
564
                outputsData[dstIdx * 7 + 4] = decode_bbox.ymin;
565
                outputsData[dstIdx * 7 + 5] = decode_bbox.xmax;
566
                outputsData[dstIdx * 7 + 6] = decode_bbox.ymax;
567
            }
568
        }
569
        return count;
570
    }
571

572
    size_t processDetections_(
573
            const LabelBBox& decodeBBoxes, Mat& confidenceScores,
574
            std::vector<std::map<int, std::vector<int> > >& allIndices
575
    )
576
    {
577
        std::map<int, std::vector<int> > indices;
578
        size_t numDetections = 0;
579
        for (int c = 0; c < (int)_numClasses; ++c)
580
        {
581
            if (c == _backgroundLabelId)
582
                continue; // Ignore background class.
583
            if (c >= confidenceScores.rows)
584
                CV_Error_(cv::Error::StsError, ("Could not find confidence predictions for label %d", c));
585

586
            const std::vector<float> scores = confidenceScores.row(c);
587
            int label = _shareLocation ? -1 : c;
588

589
            LabelBBox::const_iterator label_bboxes = decodeBBoxes.find(label);
590
            if (label_bboxes == decodeBBoxes.end())
591
                CV_Error_(cv::Error::StsError, ("Could not find location predictions for label %d", label));
592
            if (_bboxesNormalized)
593
                NMSFast_(label_bboxes->second, scores, _confidenceThreshold, _nmsThreshold, 1.0, _topK,
594
                         indices[c], util::caffe_norm_box_overlap);
595
            else
596
                NMSFast_(label_bboxes->second, scores, _confidenceThreshold, _nmsThreshold, 1.0, _topK,
597
                         indices[c], util::caffe_box_overlap);
598
            numDetections += indices[c].size();
599
        }
600
        if (_keepTopK > -1 && numDetections > (size_t)_keepTopK)
601
        {
602
            std::vector<std::pair<float, std::pair<int, int> > > scoreIndexPairs;
603
            for (std::map<int, std::vector<int> >::iterator it = indices.begin();
604
                 it != indices.end(); ++it)
605
            {
606
                int label = it->first;
607
                const std::vector<int>& labelIndices = it->second;
608
                if (label >= confidenceScores.rows)
609
                    CV_Error_(cv::Error::StsError, ("Could not find location predictions for label %d", label));
610
                const std::vector<float>& scores = confidenceScores.row(label);
611
                for (size_t j = 0; j < labelIndices.size(); ++j)
612
                {
613
                    size_t idx = labelIndices[j];
614
                    CV_Assert(idx < scores.size());
615
                    scoreIndexPairs.push_back(std::make_pair(scores[idx], std::make_pair(label, idx)));
616
                }
617
            }
618
            // Keep outputs k results per image.
619
            std::sort(scoreIndexPairs.begin(), scoreIndexPairs.end(),
620
                      util::SortScorePairDescend<std::pair<int, int> >);
621
            scoreIndexPairs.resize(_keepTopK);
622

623
            std::map<int, std::vector<int> > newIndices;
624
            for (size_t j = 0; j < scoreIndexPairs.size(); ++j)
625
            {
626
                int label = scoreIndexPairs[j].second.first;
627
                int idx = scoreIndexPairs[j].second.second;
628
                newIndices[label].push_back(idx);
629
            }
630
            allIndices.push_back(newIndices);
631
            return (size_t)_keepTopK;
632
        }
633
        else
634
        {
635
            allIndices.push_back(indices);
636
            return numDetections;
637
        }
638
    }
639

640

641
    // **************************************************************
642
    // Utility functions
643
    // **************************************************************
644

645
    // Compute bbox size
646
    static float BBoxSize(const util::NormalizedBBox& bbox, bool normalized)
647
    {
648
        if (bbox.xmax < bbox.xmin || bbox.ymax < bbox.ymin)
649
        {
650
            return 0; // If bbox is invalid (e.g. xmax < xmin or ymax < ymin), return 0.
651
        }
652
        else
653
        {
654
            if (bbox.has_size())
655
            {
656
                return bbox.size();
657
            }
658
            else
659
            {
660
                float width = bbox.xmax - bbox.xmin;
661
                float height = bbox.ymax - bbox.ymin;
662
                if (normalized)
663
                {
664
                    return width * height;
665
                }
666
                else
667
                {
668
                    // If bbox is not within range [0, 1].
669
                    return (width + 1) * (height + 1);
670
                }
671
            }
672
        }
673
    }
674

675

676
    // Decode a bbox according to a prior bbox
677
    template<bool variance_encoded_in_target>
678
    static void DecodeBBox(
679
        const util::NormalizedBBox& prior_bbox, const std::vector<float>& prior_variance,
680
        const cv::String& code_type,
681
        const bool clip_bbox, const util::NormalizedBBox& clip_bounds,
682
        const bool normalized_bbox, const util::NormalizedBBox& bbox,
683
        util::NormalizedBBox& decode_bbox)
684
    {
685
        float bbox_xmin = variance_encoded_in_target ? bbox.xmin : prior_variance[0] * bbox.xmin;
686
        float bbox_ymin = variance_encoded_in_target ? bbox.ymin : prior_variance[1] * bbox.ymin;
687
        float bbox_xmax = variance_encoded_in_target ? bbox.xmax : prior_variance[2] * bbox.xmax;
688
        float bbox_ymax = variance_encoded_in_target ? bbox.ymax : prior_variance[3] * bbox.ymax;
689
        if (code_type == "CORNER")
690
        {
691
            decode_bbox.xmin = prior_bbox.xmin + bbox_xmin;
692
            decode_bbox.ymin = prior_bbox.ymin + bbox_ymin;
693
            decode_bbox.xmax = prior_bbox.xmax + bbox_xmax;
694
            decode_bbox.ymax = prior_bbox.ymax + bbox_ymax;
695
        }
696
        else if (code_type == "CENTER_SIZE")
697
        {
698
            float prior_width = prior_bbox.xmax - prior_bbox.xmin;
699
            float prior_height = prior_bbox.ymax - prior_bbox.ymin;
700
            if (!normalized_bbox)
701
            {
702
                prior_width += 1.0f;
703
                prior_height += 1.0f;
704
            }
705
            CV_Assert(prior_width > 0);
706
            CV_Assert(prior_height > 0);
707
            float prior_center_x = prior_bbox.xmin + prior_width * .5;
708
            float prior_center_y = prior_bbox.ymin + prior_height * .5;
709

710
            float decode_bbox_center_x, decode_bbox_center_y;
711
            float decode_bbox_width, decode_bbox_height;
712
            decode_bbox_center_x = bbox_xmin * prior_width + prior_center_x;
713
            decode_bbox_center_y = bbox_ymin * prior_height + prior_center_y;
714
            decode_bbox_width = exp(bbox_xmax) * prior_width;
715
            decode_bbox_height = exp(bbox_ymax) * prior_height;
716
            decode_bbox.xmin = decode_bbox_center_x - decode_bbox_width * .5;
717
            decode_bbox.ymin = decode_bbox_center_y - decode_bbox_height * .5;
718
            decode_bbox.xmax = decode_bbox_center_x + decode_bbox_width * .5;
719
            decode_bbox.ymax = decode_bbox_center_y + decode_bbox_height * .5;
720
        }
721
        else
722
            CV_Error(Error::StsBadArg, "Unknown type.");
723

724
        if (clip_bbox)
725
        {
726
            // Clip the util::NormalizedBBox.
727
            decode_bbox.xmin = std::max(std::min(decode_bbox.xmin, clip_bounds.xmax), clip_bounds.xmin);
728
            decode_bbox.ymin = std::max(std::min(decode_bbox.ymin, clip_bounds.ymax), clip_bounds.ymin);
729
            decode_bbox.xmax = std::max(std::min(decode_bbox.xmax, clip_bounds.xmax), clip_bounds.xmin);
730
            decode_bbox.ymax = std::max(std::min(decode_bbox.ymax, clip_bounds.ymax), clip_bounds.ymin);
731
        }
732
        decode_bbox.clear_size();
733
        decode_bbox.set_size(BBoxSize(decode_bbox, normalized_bbox));
734
    }
735

736
    // Decode a set of bboxes according to a set of prior bboxes
737
    static void DecodeBBoxes(
738
        const std::vector<util::NormalizedBBox>& prior_bboxes,
739
        const std::vector<std::vector<float> >& prior_variances,
740
        const cv::String& code_type, const bool variance_encoded_in_target,
741
        const bool clip_bbox, const util::NormalizedBBox& clip_bounds,
742
        const bool normalized_bbox, const std::vector<util::NormalizedBBox>& bboxes,
743
        std::vector<util::NormalizedBBox>& decode_bboxes)
744
    {
745
        CV_Assert(prior_bboxes.size() == prior_variances.size());
746
        CV_Assert(prior_bboxes.size() == bboxes.size());
747
        size_t num_bboxes = prior_bboxes.size();
748
        CV_Assert(num_bboxes == 0 || prior_variances[0].size() == 4);
749
        decode_bboxes.clear(); decode_bboxes.resize(num_bboxes);
750
        if(variance_encoded_in_target)
751
        {
752
            for (int i = 0; i < num_bboxes; ++i)
753
                DecodeBBox<true>(prior_bboxes[i], prior_variances[i], code_type,
754
                                 clip_bbox, clip_bounds, normalized_bbox,
755
                                 bboxes[i], decode_bboxes[i]);
756
        }
757
        else
758
        {
759
            for (int i = 0; i < num_bboxes; ++i)
760
                DecodeBBox<false>(prior_bboxes[i], prior_variances[i], code_type,
761
                                  clip_bbox, clip_bounds, normalized_bbox,
762
                                  bboxes[i], decode_bboxes[i]);
763
        }
764
    }
765

766
    // Decode all bboxes in a batch
767
    static void DecodeBBoxesAll(const std::vector<LabelBBox>& all_loc_preds,
768
        const std::vector<util::NormalizedBBox>& prior_bboxes,
769
        const std::vector<std::vector<float> >& prior_variances,
770
        const int num, const bool share_location,
771
        const int num_loc_classes, const int background_label_id,
772
        const cv::String& code_type, const bool variance_encoded_in_target,
773
        const bool clip, const util::NormalizedBBox& clip_bounds,
774
        const bool normalized_bbox, std::vector<LabelBBox>& all_decode_bboxes)
775
    {
776
        CV_Assert(all_loc_preds.size() == num);
777
        all_decode_bboxes.clear();
778
        all_decode_bboxes.resize(num);
779
        for (int i = 0; i < num; ++i)
780
        {
781
            // Decode predictions into bboxes.
782
            const LabelBBox& loc_preds = all_loc_preds[i];
783
            LabelBBox& decode_bboxes = all_decode_bboxes[i];
784
            for (int c = 0; c < num_loc_classes; ++c)
785
            {
786
                int label = share_location ? -1 : c;
787
                if (label == background_label_id)
788
                    continue; // Ignore background class.
789
                LabelBBox::const_iterator label_loc_preds = loc_preds.find(label);
790
                if (label_loc_preds == loc_preds.end())
791
                    CV_Error_(cv::Error::StsError, ("Could not find location predictions for label %d", label));
792
                DecodeBBoxes(prior_bboxes, prior_variances,
793
                             code_type, variance_encoded_in_target, clip, clip_bounds,
794
                             normalized_bbox, label_loc_preds->second, decode_bboxes[label]);
795
            }
796
        }
797
    }
798

799
    // Get prior bounding boxes from prior_data
800
    //    prior_data: 1 x 2 x num_priors * 4 x 1 blob.
801
    //    num_priors: number of priors.
802
    //    prior_bboxes: stores all the prior bboxes in the format of util::NormalizedBBox.
803
    //    prior_variances: stores all the variances needed by prior bboxes.
804
    static void GetPriorBBoxes(const float* priorData, const int& numPriors,
805
                        bool normalized_bbox, std::vector<util::NormalizedBBox>& priorBBoxes,
806
                        std::vector<std::vector<float> >& priorVariances)
807
    {
808
        priorBBoxes.clear(); priorBBoxes.resize(numPriors);
809
        priorVariances.clear(); priorVariances.resize(numPriors);
810
        for (int i = 0; i < numPriors; ++i)
811
        {
812
            int startIdx = i * 4;
813
            util::NormalizedBBox& bbox = priorBBoxes[i];
814
            bbox.xmin = priorData[startIdx];
815
            bbox.ymin = priorData[startIdx + 1];
816
            bbox.xmax = priorData[startIdx + 2];
817
            bbox.ymax = priorData[startIdx + 3];
818
            bbox.set_size(BBoxSize(bbox, normalized_bbox));
819
        }
820

821
        for (int i = 0; i < numPriors; ++i)
822
        {
823
            int startIdx = (numPriors + i) * 4;
824
            // not needed here: priorVariances[i].clear();
825
            for (int j = 0; j < 4; ++j)
826
            {
827
                priorVariances[i].push_back(priorData[startIdx + j]);
828
            }
829
        }
830
    }
831

832
    // Get location predictions from loc_data.
833
    //    loc_data: num x num_preds_per_class * num_loc_classes * 4 blob.
834
    //    num: the number of images.
835
    //    num_preds_per_class: number of predictions per class.
836
    //    num_loc_classes: number of location classes. It is 1 if share_location is
837
    //      true; and is equal to number of classes needed to predict otherwise.
838
    //    share_location: if true, all classes share the same location prediction.
839
    //    loc_pred_transposed: if true, represent four bounding box values as
840
    //                         [y,x,height,width] or [x,y,width,height] otherwise.
841
    //    loc_preds: stores the location prediction, where each item contains
842
    //      location prediction for an image.
843
    static void GetLocPredictions(const float* locData, const int num,
844
                           const int numPredsPerClass, const int numLocClasses,
845
                           const bool shareLocation, const bool locPredTransposed,
846
                           std::vector<LabelBBox>& locPreds)
847
    {
848
        locPreds.clear();
849
        if (shareLocation)
850
        {
851
            CV_Assert(numLocClasses == 1);
852
        }
853
        locPreds.resize(num);
854
        for (int i = 0; i < num; ++i, locData += numPredsPerClass * numLocClasses * 4)
855
        {
856
            LabelBBox& labelBBox = locPreds[i];
857
            for (int p = 0; p < numPredsPerClass; ++p)
858
            {
859
                int startIdx = p * numLocClasses * 4;
860
                for (int c = 0; c < numLocClasses; ++c)
861
                {
862
                    int label = shareLocation ? -1 : c;
863
                    if (labelBBox.find(label) == labelBBox.end())
864
                    {
865
                        labelBBox[label].resize(numPredsPerClass);
866
                    }
867
                    util::NormalizedBBox& bbox = labelBBox[label][p];
868
                    if (locPredTransposed)
869
                    {
870
                        bbox.ymin = locData[startIdx + c * 4];
871
                        bbox.xmin = locData[startIdx + c * 4 + 1];
872
                        bbox.ymax = locData[startIdx + c * 4 + 2];
873
                        bbox.xmax = locData[startIdx + c * 4 + 3];
874
                    }
875
                    else
876
                    {
877
                        bbox.xmin = locData[startIdx + c * 4];
878
                        bbox.ymin = locData[startIdx + c * 4 + 1];
879
                        bbox.xmax = locData[startIdx + c * 4 + 2];
880
                        bbox.ymax = locData[startIdx + c * 4 + 3];
881
                    }
882
                }
883
            }
884
        }
885
    }
886

887
    // Get confidence predictions from conf_data.
888
    //    conf_data: num x num_preds_per_class * num_classes blob.
889
    //    num: the number of images.
890
    //    num_preds_per_class: number of predictions per class.
891
    //    num_classes: number of classes.
892
    //    conf_preds: stores the confidence prediction, where each item contains
893
    //      confidence prediction for an image.
894
    static void GetConfidenceScores(const float* confData, const int num,
895
                             const int numPredsPerClass, const int numClasses,
896
                             std::vector<Mat>& confPreds)
897
    {
898
        int shape[] = { numClasses, numPredsPerClass };
899
        for (int i = 0; i < num; i++)
900
            confPreds.push_back(Mat(2, shape, CV_32F));
901

902
        for (int i = 0; i < num; ++i, confData += numPredsPerClass * numClasses)
903
        {
904
            Mat labelScores = confPreds[i];
905
            for (int c = 0; c < numClasses; ++c)
906
            {
907
                for (int p = 0; p < numPredsPerClass; ++p)
908
                {
909
                    labelScores.at<float>(c, p) = confData[p * numClasses + c];
910
                }
911
            }
912
        }
913
    }
914

915
    // Compute the jaccard (intersection over union IoU) overlap between two bboxes.
916
    template<bool normalized>
917
    static float JaccardOverlap(const util::NormalizedBBox& bbox1,
918
                         const util::NormalizedBBox& bbox2)
919
    {
920
        util::NormalizedBBox intersect_bbox;
921
        intersect_bbox.xmin = std::max(bbox1.xmin, bbox2.xmin);
922
        intersect_bbox.ymin = std::max(bbox1.ymin, bbox2.ymin);
923
        intersect_bbox.xmax = std::min(bbox1.xmax, bbox2.xmax);
924
        intersect_bbox.ymax = std::min(bbox1.ymax, bbox2.ymax);
925

926
        float intersect_size = BBoxSize(intersect_bbox, normalized);
927
        if (intersect_size > 0)
928
        {
929
            float bbox1_size = BBoxSize(bbox1, normalized);
930
            float bbox2_size = BBoxSize(bbox2, normalized);
931
            return intersect_size / (bbox1_size + bbox2_size - intersect_size);
932
        }
933
        else
934
        {
935
            return 0.;
936
        }
937
    }
938

939
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
940
    {
941
#ifdef HAVE_INF_ENGINE
942
        InferenceEngine::LayerParams lp;
943
        lp.name = name;
944
        lp.type = "DetectionOutput";
945
        lp.precision = InferenceEngine::Precision::FP32;
946
        std::shared_ptr<InferenceEngine::CNNLayer> ieLayer(new InferenceEngine::CNNLayer(lp));
947

948
        ieLayer->params["num_classes"] = format("%d", _numClasses);
949
        ieLayer->params["share_location"] = _shareLocation ? "1" : "0";
950
        ieLayer->params["background_label_id"] = format("%d", _backgroundLabelId);
951
        ieLayer->params["nms_threshold"] = format("%f", _nmsThreshold);
952
        ieLayer->params["top_k"] = format("%d", _topK);
953
        ieLayer->params["keep_top_k"] = format("%d", _keepTopK);
954
        ieLayer->params["eta"] = "1.0";
955
        ieLayer->params["confidence_threshold"] = format("%f", _confidenceThreshold);
956
        ieLayer->params["variance_encoded_in_target"] = _varianceEncodedInTarget ? "1" : "0";
957
        ieLayer->params["code_type"] = "caffe.PriorBoxParameter." + _codeType;
958
        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
959
#endif  // HAVE_INF_ENGINE
960
        return Ptr<BackendNode>();
961
    }
962
};
963

964
float util::caffe_box_overlap(const util::NormalizedBBox& a, const util::NormalizedBBox& b)
965
{
966
    return DetectionOutputLayerImpl::JaccardOverlap<false>(a, b);
967
}
968

969
float util::caffe_norm_box_overlap(const util::NormalizedBBox& a, const util::NormalizedBBox& b)
970
{
971
    return DetectionOutputLayerImpl::JaccardOverlap<true>(a, b);
972
}
973

974
const std::string DetectionOutputLayerImpl::_layerName = std::string("DetectionOutput");
975

976
Ptr<DetectionOutputLayer> DetectionOutputLayer::create(const LayerParams &params)
977
{
978
    return Ptr<DetectionOutputLayer>(new DetectionOutputLayerImpl(params));
979
}
980

981
}
982
}
983

984
Product

Resources

Company