CoCalc -- pooling

GitHub Repository: Tetragramm/opencv
Path: blob/master/modules/dnn/src/layers/pooling_layer.cpp
¹⁶³³⁷ views
1
/*M///////////////////////////////////////////////////////////////////////////////////////
2
//
3
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4
//
5
//  By downloading, copying, installing or using the software you agree to this license.
6
//  If you do not agree to this license, do not download, install,
7
//  copy or use the software.
8
//
9
//
10
//                           License Agreement
11
//                For Open Source Computer Vision Library
12
//
13
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14
// Copyright (C) 2017, Intel Corporation, all rights reserved.
15
// Third party copyrights are property of their respective owners.
16
//
17
// Redistribution and use in source and binary forms, with or without modification,
18
// are permitted provided that the following conditions are met:
19
//
20
//   * Redistribution's of source code must retain the above copyright notice,
21
//     this list of conditions and the following disclaimer.
22
//
23
//   * Redistribution's in binary form must reproduce the above copyright notice,
24
//     this list of conditions and the following disclaimer in the documentation
25
//     and/or other materials provided with the distribution.
26
//
27
//   * The name of the copyright holders may not be used to endorse or promote products
28
//     derived from this software without specific prior written permission.
29
//
30
// This software is provided by the copyright holders and contributors "as is" and
31
// any express or implied warranties, including, but not limited to, the implied
32
// warranties of merchantability and fitness for a particular purpose are disclaimed.
33
// In no event shall the Intel Corporation or contributors be liable for any direct,
34
// indirect, incidental, special, exemplary, or consequential damages
35
// (including, but not limited to, procurement of substitute goods or services;
36
// loss of use, data, or profits; or business interruption) however caused
37
// and on any theory of liability, whether in contract, strict liability,
38
// or tort (including negligence or otherwise) arising in any way out of
39
// the use of this software, even if advised of the possibility of such damage.
40
//
41
//M*/
42

43
#include "../precomp.hpp"
44
#include "layers_common.hpp"
45
#include "opencv2/core/hal/intrin.hpp"
46
#include "../op_halide.hpp"
47
#include "../op_inf_engine.hpp"
48
#include "../op_vkcom.hpp"
49
#include <float.h>
50
#include <algorithm>
51
using std::max;
52
using std::min;
53

54
#ifdef HAVE_OPENCL
55
#include "opencl_kernels_dnn.hpp"
56
using namespace cv::dnn::ocl4dnn;
57
#endif
58

59
namespace cv
60
{
61
namespace dnn
62
{
63
// Rounds a floating-point ROI coordinate to an integer: the value is shifted
// by half a unit away from zero and then truncated by the integer conversion.
static inline int roundRoiSize(float v)
{
    float shifted = v;
    if (v >= 0.f)
        shifted += 0.5f;
    else
        shifted -= 0.5f;
    return (int)shifted;
}
67

68
class PoolingLayerImpl CV_FINAL : public PoolingLayer
69
{
70
public:
71
    PoolingLayerImpl(const LayerParams& params)
72
    {
73
        computeMaxIdx = true;
74
        globalPooling = false;
75
        stride = Size(1, 1);
76

77
        if (params.has("pool") || params.has("kernel_size") ||
78
            params.has("kernel_w") || params.has("kernel_h"))
79
        {
80
            String pool = toLowerCase(params.get<String>("pool", "max"));
81
            if (pool == "max")
82
                type = MAX;
83
            else if (pool == "ave")
84
                type = AVE;
85
            else if (pool == "stochastic")
86
                type = STOCHASTIC;
87
            else
88
                CV_Error(Error::StsBadArg, "Unknown pooling type \"" + pool + "\"");
89

90
            getPoolingKernelParams(params, kernel.height, kernel.width, globalPooling,
91
                                   pad_t, pad_l, pad_b, pad_r, stride.height, stride.width, padMode);
92

93
            pad.width = pad_l;
94
            pad.height = pad_t;
95
        }
96
        else if (params.has("pooled_w") || params.has("pooled_h"))
97
        {
98
            type = ROI;
99
            computeMaxIdx = false;
100
            pooledSize.width = params.get<uint32_t>("pooled_w", 1);
101
            pooledSize.height = params.get<uint32_t>("pooled_h", 1);
102
        }
103
        else if (params.has("output_dim") && params.has("group_size"))
104
        {
105
            type = PSROI;
106
            pooledSize.width = params.get<int>("group_size");
107
            pooledSize.height = pooledSize.width;
108
            psRoiOutChannels = params.get<int>("output_dim");
109
        }
110
        else
111
            CV_Error(Error::StsBadArg, "Cannot determine pooling type");
112
        setParamsFrom(params);
113
        ceilMode = params.get<bool>("ceil_mode", true);
114
        spatialScale = params.get<float>("spatial_scale", 1);
115
        avePoolPaddedArea = params.get<bool>("ave_pool_padded_area", true);
116
    }
117

118
#ifdef HAVE_OPENCL
119
    Ptr<OCL4DNNPool<float> > poolOp;
120
#endif
121

122
    /**
     * Adapts the layer to the actual blob sizes.
     *
     * For global pooling the kernel is set to the spatial size of the first
     * input. The paddings are then recomputed by getConvPoolPaddings() from the
     * input/output sizes, and the cached OpenCL operation (if any) is released
     * so that forward_ocl() builds a new one.
     *
     * @param inputs_arr  input blobs; the first one must have at least 4 dimensions.
     * @param outputs_arr output blobs; the first one must have at least 4 dimensions.
     */
    void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
    {
        std::vector<Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);

        // Element [0] of both vectors and axes 2/3 of both blobs are read below,
        // so reject anything that would make those accesses out of range.
        CV_Assert(!inputs.empty());
        CV_Assert(!outputs.empty());
        CV_Assert(inputs[0].dims >= 4 && outputs[0].dims >= 4);

        // cv::Size is (width, height): axis 3 is the width, axis 2 the height.
        cv::Size inp(inputs[0].size[3], inputs[0].size[2]),
                out(outputs[0].size[3], outputs[0].size[2]);

        if(globalPooling)
        {
            kernel = inp;
        }

        getConvPoolPaddings(inp, out, kernel, stride, padMode, Size(1, 1), pad_t, pad_l, pad_b, pad_r);
        pad.width = pad_l;
        pad.height = pad_t;

#ifdef HAVE_OPENCL
        poolOp.release();
#endif
    }
146

147
    /**
     * Reports whether this layer can run on the given backend.
     *
     *  - Inference Engine: MAX and AVE only on the Myriad target, everything
     *    except STOCHASTIC on other targets.
     *  - OpenCV: always supported.
     *  - Halide (when available): MAX, or AVE with all four paddings equal to zero.
     *  - Vulkan (when available): MAX and AVE.
     */
    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
        {
            return (preferableTarget == DNN_TARGET_MYRIAD) ? (type == MAX || type == AVE)
                                                           : (type != STOCHASTIC);
        }

        if (backendId == DNN_BACKEND_OPENCV)
            return true;

        if (backendId == DNN_BACKEND_HALIDE && haveHalide())
        {
            const bool halideOk = (type == MAX) ||
                                  (type == AVE && !pad_t && !pad_l && !pad_b && !pad_r);
            if (halideOk)
                return true;
        }

        return (backendId == DNN_BACKEND_VKCOM) && haveVulkan() &&
               (type == MAX || type == AVE);
    }
163

164
#ifdef HAVE_OPENCL
165
    /**
     * OpenCL implementation of MAX / AVE / STOCHASTIC pooling.
     *
     * The OCL4DNN pooling operation is created on first use from the shapes of
     * the first input and output and is then kept in poolOp. For MAX pooling
     * every input owns two consecutive outputs (values followed by mask);
     * otherwise there is one output per input.
     *
     * @return false as soon as the OpenCL operation fails for some input,
     *         true when all inputs were processed.
     */
    bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, InputArrayOfArrays internals)
    {
        const bool use_half = (inps.depth() == CV_16S);

        std::vector<UMat> inputs, outputs;
        inps.getUMatVector(inputs);
        outs.getUMatVector(outputs);

        if (poolOp.empty())
        {
            OCL4DNNPoolConfig config;

            config.in_shape = shape(inputs[0]);
            config.out_shape = shape(outputs[0]);
            config.kernel = kernel;
            config.stride = stride;
            config.pad_l = pad_l;
            config.pad_t = pad_t;
            config.pad_r = pad_r;
            config.pad_b = pad_b;
            config.channels = inputs[0].size[1];

            if (type == MAX)
                config.pool_method = LIBDNN_POOLING_METHOD_MAX;
            else if (type == AVE)
                config.pool_method = LIBDNN_POOLING_METHOD_AVE;
            else
                config.pool_method = LIBDNN_POOLING_METHOD_STO;

            config.avePoolPaddedArea = avePoolPaddedArea;
            config.computeMaxIdx = computeMaxIdx;
            config.use_half = use_half;
            poolOp = Ptr<OCL4DNNPool<float> >(new OCL4DNNPool<float>(config));
        }

        const bool withMask = (type == MAX);
        // Distance between the value outputs of two consecutive inputs.
        const size_t outStep = withMask ? 2 : 1;

        for (size_t i = 0; i < inputs.size(); i++)
        {
            UMat& srcBlob = inputs[i];
            UMat& dstBlob = outputs[outStep * i];

            // The mask stays empty unless MAX pooling is performed.
            UMat maskBlob;
            if (withMask)
                maskBlob = outputs[2 * i + 1];

            CV_Assert(srcBlob.offset == 0 && dstBlob.offset == 0);

            const bool done = poolOp->Forward(srcBlob, dstBlob, maskBlob);
            if (!done)
                return false;
        }
        return true;
    }
210
#endif
211

212
    /**
     * Runs the layer.
     *
     * MAX / AVE / STOCHASTIC pooling is first offered to the OpenCL path
     * (through CV_OCL_RUN). Inputs of depth CV_16S are delegated to
     * forward_fallback(). Otherwise the CPU implementation matching the
     * pooling type is called; a type without a CPU implementation raises
     * Error::StsNotImplemented.
     */
    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        if (type == MAX || type == AVE || type == STOCHASTIC)
        {
            CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                       forward_ocl(inputs_arr, outputs_arr, internals_arr))
        }

        if (inputs_arr.depth() == CV_16S)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }

        std::vector<Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);

        if (type == MAX)
        {
            // One input; outputs are the pooled values and the mask.
            CV_Assert_N(inputs.size() == 1, outputs.size() == 2);
            maxPooling(inputs[0], outputs[0], outputs[1]);
        }
        else if (type == AVE)
        {
            CV_Assert_N(inputs.size() == 1, outputs.size() == 1);
            avePooling(inputs[0], outputs[0]);
        }
        else if (type == ROI || type == PSROI)
        {
            // The second input holds the regions of interest.
            CV_Assert_N(inputs.size() == 2, outputs.size() == 1);
            roiPooling(inputs[0], inputs[1], outputs[0]);
        }
        else
        {
            CV_Error(Error::StsNotImplemented, "Not implemented");
        }
    }
251

252
    /**
     * Creates the Vulkan backend node for this layer.
     *
     * The kernel, top/left padding and stride are passed as {height, width}
     * pairs. An empty padMode selects the Caffe padding mode, "VALID" and
     * "SAME" select the corresponding modes, and any other value raises
     * Error::StsError. Without HAVE_VULKAN an empty node is returned.
     */
    virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
    {
#ifdef HAVE_VULKAN
        int filter_size[2] = {kernel.height, kernel.width};
        int pad_size[2] = {pad.height, pad.width};
        int stride_size[2] = {stride.height, stride.width};

        vkcom::PoolType pool_type;
        if (type == MAX)
            pool_type = vkcom::kPoolTypeMax;
        else if (type == AVE)
            pool_type = vkcom::kPoolTypeAvg;
        else
            pool_type = vkcom::kPoolTypeNum;

        int padding_mode;
        if (padMode.empty())
            padding_mode = vkcom::kPaddingModeCaffe;
        else if (padMode == "VALID")
            padding_mode = vkcom::kPaddingModeValid;
        else if (padMode == "SAME")
            padding_mode = vkcom::kPaddingModeSame;
        else
            CV_Error(Error::StsError, "Unsupported padding mode " + padMode);

        std::shared_ptr<vkcom::OpBase> op(new vkcom::OpPool(filter_size, pad_size,
                                                            stride_size, padding_mode,
                                                            pool_type, avePoolPaddedArea));
        return Ptr<BackendNode>(new VkComBackendNode(inputs, op));
#else
        return Ptr<BackendNode>();
#endif
    }
286

287
    /**
     * Creates the Halide backend node: MAX and AVE pooling are forwarded to
     * their dedicated builders, any other type yields an empty node.
     */
    virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
    {
        switch (type)
        {
            case MAX:
                return initMaxPoolingHalide(inputs);
            case AVE:
                return initAvePoolingHalide(inputs);
            default:
                return Ptr<BackendNode>();
        }
    }
296

297
    /**
     * Creates the Inference Engine backend node for this layer.
     *
     *  - MAX / AVE: a "Pooling" layer carrying kernel, stride, begin/end
     *    paddings, rounding type and the exclude-pad flag (set for AVE pooling
     *    with "SAME" padding mode).
     *  - ROI: a "ROIPooling" layer with pooled_w, pooled_h and spatial_scale.
     *  - PSROI: a "PSROIPooling" layer with output_dim, group_size and spatial_scale.
     * Other types raise Error::StsNotImplemented. Without HAVE_INF_ENGINE an
     * empty node is returned.
     */
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
    {
#ifdef HAVE_INF_ENGINE
        InferenceEngine::LayerParams lp;
        lp.name = name;
        lp.precision = InferenceEngine::Precision::FP32;

        std::shared_ptr<InferenceEngine::CNNLayer> ieLayer;
        if (type == MAX || type == AVE)
        {
            lp.type = "Pooling";
            // Take ownership right away: the configuration calls below may throw,
            // and a raw pointer would leak the layer in that case.
            std::shared_ptr<InferenceEngine::PoolingLayer> poolLayer(new InferenceEngine::PoolingLayer(lp));
#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2018R3)
            poolLayer->_kernel.insert(InferenceEngine::X_AXIS, kernel.width);
            poolLayer->_kernel.insert(InferenceEngine::Y_AXIS, kernel.height);
            poolLayer->_stride.insert(InferenceEngine::X_AXIS, stride.width);
            poolLayer->_stride.insert(InferenceEngine::Y_AXIS, stride.height);
            poolLayer->_padding.insert(InferenceEngine::X_AXIS, pad_l);
            poolLayer->_padding.insert(InferenceEngine::Y_AXIS, pad_t);
            poolLayer->_pads_end.insert(InferenceEngine::X_AXIS, pad_r);
            poolLayer->_pads_end.insert(InferenceEngine::Y_AXIS, pad_b);
#else
            poolLayer->_kernel_x = kernel.width;
            poolLayer->_kernel_y = kernel.height;
            poolLayer->_stride_x = stride.width;
            poolLayer->_stride_y = stride.height;
            poolLayer->_padding_x = pad_l;
            poolLayer->_padding_y = pad_t;
            // The end paddings are passed as string parameters in this API version.
            poolLayer->params["pad-r"] = format("%d", pad_r);
            poolLayer->params["pad-b"] = format("%d", pad_b);
#endif
            poolLayer->_exclude_pad = type == AVE && padMode == "SAME";
            poolLayer->params["rounding-type"] = ceilMode ? "ceil" : "floor";
            poolLayer->_type = type == MAX ? InferenceEngine::PoolingLayer::PoolType::MAX :
                                             InferenceEngine::PoolingLayer::PoolType::AVG;
            ieLayer = poolLayer;
        }
        else if (type == ROI)
        {
            lp.type = "ROIPooling";
            ieLayer = std::shared_ptr<InferenceEngine::CNNLayer>(new InferenceEngine::CNNLayer(lp));
            ieLayer->params["pooled_w"] = format("%d", pooledSize.width);
            ieLayer->params["pooled_h"] = format("%d", pooledSize.height);
            ieLayer->params["spatial_scale"] = format("%f", spatialScale);
        }
        else if (type == PSROI)
        {
            lp.type = "PSROIPooling";
            ieLayer = std::shared_ptr<InferenceEngine::CNNLayer>(new InferenceEngine::CNNLayer(lp));
            ieLayer->params["output_dim"] = format("%d", psRoiOutChannels);
            // pooledSize is square for PSROI, so its width is the group size.
            ieLayer->params["group_size"] = format("%d", pooledSize.width);
            ieLayer->params["spatial_scale"] = format("%f", spatialScale);
        }
        else
            CV_Error(Error::StsNotImplemented, "Unsupported pooling type");

        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
#endif  // HAVE_INF_ENGINE
        return Ptr<BackendNode>();
    }
357

358

359
    class PoolingInvoker : public ParallelLoopBody
360
    {
361
    public:
362
        const Mat* src, *rois;
363
        Mat *dst, *mask;
364
        Size kernel, stride;
365
        int pad_l, pad_t, pad_r, pad_b;
366
        bool avePoolPaddedArea;
367
        int nstripes;
368
        bool computeMaxIdx;
369
        std::vector<int> ofsbuf;
370
        int poolingType;
371
        float spatialScale;
372

373
        PoolingInvoker() : src(0), rois(0), dst(0), mask(0), pad_l(0), pad_t(0), pad_r(0), pad_b(0),
374
                           avePoolPaddedArea(false), nstripes(0),
375
                           computeMaxIdx(0), poolingType(MAX), spatialScale(0) {}
376

377
        static void run(const Mat& src, const Mat& rois, Mat& dst, Mat& mask, Size kernel,
378
                        Size stride, int pad_l, int pad_t, int pad_r, int pad_b, bool avePoolPaddedArea, int poolingType, float spatialScale,
379
                        bool computeMaxIdx, int nstripes)
380
        {
381
            CV_Assert_N(
382
                      src.isContinuous(), dst.isContinuous(),
383
                      src.type() == CV_32F, src.type() == dst.type(),
384
                      src.dims == 4, dst.dims == 4,
385
                      ((poolingType == ROI || poolingType == PSROI) && dst.size[0] ==rois.size[0] || src.size[0] == dst.size[0]),
386
                       poolingType == PSROI || src.size[1] == dst.size[1],
387
                      (mask.empty() || (mask.type() == src.type() && mask.size == dst.size)));
388

389
            PoolingInvoker p;
390

391
            p.src = &src;
392
            p.rois = &rois;
393
            p.dst = &dst;
394
            p.mask = &mask;
395
            p.kernel = kernel;
396
            p.stride = stride;
397
            p.pad_l = pad_l;
398
            p.pad_t = pad_t;
399
            p.pad_r = pad_r;
400
            p.pad_b = pad_b;
401
            p.avePoolPaddedArea = avePoolPaddedArea;
402
            p.nstripes = nstripes;
403
            p.computeMaxIdx = computeMaxIdx;
404
            p.poolingType = poolingType;
405
            p.spatialScale = spatialScale;
406

407
            if( !computeMaxIdx )
408
            {
409
                p.ofsbuf.resize(kernel.width*kernel.height);
410
                for( int i = 0; i < kernel.height; i++ )
411
                    for( int j = 0; j < kernel.width; j++ )
412
                        p.ofsbuf[i*kernel.width + j] = src.size[3]*i + j;
413
            }
414

415
            parallel_for_(Range(0, nstripes), p, nstripes);
416
        }
417

418
        void operator()(const Range& r) const CV_OVERRIDE
419
        {
420
            int channels = dst->size[1], width = dst->size[3], height = dst->size[2];
421
            int inp_width = src->size[3], inp_height = src->size[2];
422
            size_t total = dst->total();
423
            size_t stripeSize = (total + nstripes - 1)/nstripes;
424
            size_t stripeStart = r.start*stripeSize;
425
            size_t stripeEnd = std::min(r.end*stripeSize, total);
426
            int kernel_w = kernel.width, kernel_h = kernel.height;
427
            int stride_w = stride.width, stride_h = stride.height;
428
            bool compMaxIdx = computeMaxIdx;
429

430
#if CV_SIMD128
431
            const int* ofsptr = ofsbuf.empty() ? 0 : (const int*)&ofsbuf[0];
432
            if (poolingType == MAX && !compMaxIdx && !ofsptr)
433
                CV_Error(Error::StsBadArg, "ofsbuf should be initialized in this mode");
434
            v_float32x4 idx00(0.f, (float)stride_w, (float)(stride_w*2), (float)(stride_w*3));
435
            v_float32x4 ones = v_setall_f32(1.f);
436
            v_float32x4 idx_delta = v_setall_f32((float)(inp_width - kernel_w));
437
#endif
438

439
            for( size_t ofs0 = stripeStart; ofs0 < stripeEnd; )
440
            {
441
                size_t ofs = ofs0;
442
                int x0 = (int)(ofs % width);
443
                ofs /= width;
444
                int y0 = (int)(ofs % height);
445
                ofs /= height;
446
                int c = (int)(ofs % channels);
447
                int n = (int)(ofs / channels);
448
                int ystart, yend;
449

450
                const float *srcData = 0;
451
                if (poolingType == ROI)
452
                {
453
                    const float *roisData = rois->ptr<float>(n);
454
                    int ystartROI = roundRoiSize(roisData[2] * spatialScale);
455
                    int yendROI = roundRoiSize(roisData[4] * spatialScale);
456
                    int roiHeight = std::max(yendROI - ystartROI + 1, 1);
457
                    float roiRatio = (float)roiHeight / height;
458

459
                    ystart = ystartROI + y0 * roiRatio;
460
                    yend = ystartROI + std::ceil((y0 + 1) * roiRatio);
461

462
                    CV_Assert(roisData[0] < src->size[0]);
463
                    srcData = src->ptr<float>(roisData[0], c);
464
                }
465
                else if (poolingType == PSROI)
466
                {
467
                    const float *roisData = rois->ptr<float>(n);
468
                    float ystartROI = roundRoiSize(roisData[2]) * spatialScale;
469
                    float yendROI = roundRoiSize(roisData[4] + 1) * spatialScale;
470
                    float roiHeight = std::max(yendROI - ystartROI, 0.1f);
471
                    float roiRatio = roiHeight / height;
472

473
                    ystart = (int)std::floor(ystartROI + y0 * roiRatio);
474
                    yend = (int)std::ceil(ystartROI + (y0 + 1) * roiRatio);
475
                }
476
                else
477
                {
478
                    ystart = y0 * stride_h - pad_t;
479
                    yend = min(ystart + kernel_h, inp_height + pad_b);
480
                    srcData = src->ptr<float>(n, c);
481
                }
482
                int ydelta = yend - ystart;
483
                ystart = max(ystart, 0);
484
                yend = min(yend, inp_height);
485
                float *dstData = dst->ptr<float>(n, c, y0);
486
                float *dstMaskData = mask->data ? mask->ptr<float>(n, c, y0) : 0;
487

488
                int delta = std::min((int)(stripeEnd - ofs0), width - x0);
489
                ofs0 += delta;
490
                int x1 = x0 + delta;
491

492
                if( poolingType == MAX)
493
                    for( ; x0 < x1; x0++ )
494
                    {
495
                        int xstart = x0 * stride_w - pad_l;
496
                        int xend = min(xstart + kernel_w, inp_width);
497
                        xstart = max(xstart, 0);
498
                        if (xstart >= xend || ystart >= yend)
499
                        {
500
                            dstData[x0] = 0;
501
                            if (compMaxIdx && dstMaskData)
502
                                dstMaskData[x0] = -1;
503
                            continue;
504
                        }
505
#if CV_SIMD128
506
                        if( xstart > 0 && x0 + 7 < x1 && (x0 + 7) * stride_w - pad_l + kernel_w < inp_width )
507
                        {
508
                            if( compMaxIdx )
509
                            {
510
                                v_float32x4 max_val0 = v_setall_f32(-FLT_MAX);
511
                                v_float32x4 max_val1 = max_val0;
512
                                v_float32x4 max_idx0 = v_setall_f32(-1.f);
513
                                v_float32x4 max_idx1 = max_idx0;
514
                                int index0 = ystart * inp_width + xstart;
515
                                v_float32x4 idx0 = idx00 + v_setall_f32((float)index0);
516
                                v_float32x4 idx1 = idx0 + v_setall_f32((float)(stride_w*4));
517

518
                                for (int y = ystart; y < yend; ++y)
519
                                {
520
                                    for (int x = xstart; x < xend; ++x, idx0 += ones, idx1 += ones)
521
                                    {
522
                                        const int index = y * inp_width + x;
523
                                        v_float32x4 v0(srcData[index], srcData[index + stride_w],
524
                                                       srcData[index + stride_w*2], srcData[index + stride_w*3]);
525
                                        v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
526
                                                       srcData[index + stride_w*6], srcData[index + stride_w*7]);
527
                                        max_idx0 = v_select(v0 > max_val0, idx0, max_idx0);
528
                                        max_idx1 = v_select(v1 > max_val1, idx1, max_idx1);
529
                                        max_val0 = v_max(max_val0, v0);
530
                                        max_val1 = v_max(max_val1, v1);
531
                                    }
532
                                    idx0 += idx_delta;
533
                                    idx1 += idx_delta;
534
                                }
535
                                v_store(dstData + x0, max_val0);
536
                                v_store(dstData + x0 + 4, max_val1);
537
                                if (dstMaskData)
538
                                {
539
                                    v_store(dstMaskData + x0, max_idx0);
540
                                    v_store(dstMaskData + x0 + 4, max_idx1);
541
                                }
542
                                x0 += 7;
543
                            }
544
                            else
545
                            {
546
                                v_float32x4 max_val0 = v_setall_f32(-FLT_MAX);
547
                                v_float32x4 max_val1 = max_val0;
548

549
                                if( yend - ystart == kernel_h )
550
                                {
551
                                    const float* srcData1 = srcData + ystart*inp_width + xstart;
552
                                    if( stride_w == 1 )
553
                                        for (int k = 0; k < kernel_w*kernel_h; k++)
554
                                        {
555
                                            int index = ofsptr[k];
556
                                            v_float32x4 v0 = v_load(srcData1 + index);
557
                                            v_float32x4 v1 = v_load(srcData1 + index + 4);
558
                                            max_val0 = v_max(max_val0, v0);
559
                                            max_val1 = v_max(max_val1, v1);
560
                                        }
561
                                    else if( stride_w == 2 )
562
                                        for (int k = 0; k < kernel_w*kernel_h; k++)
563
                                        {
564
                                            int index = ofsptr[k];
565
                                            v_float32x4 v0, v1, dummy;
566
                                            v_load_deinterleave(srcData1 + index, v0, dummy);     // f0  f2  f4  f6  ,f1  f3  f5  f7
567
                                            v_load_deinterleave(srcData1 + index + 8, v1, dummy); // f8  f10 f12 f14 ,f9  f11 f13 f15
568
                                            max_val0 = v_max(max_val0, v0);
569
                                            max_val1 = v_max(max_val1, v1);
570
                                        }
571
                                    else
572
                                        for (int k = 0; k < kernel_w*kernel_h; k++)
573
                                        {
574
                                            int index = ofsptr[k];
575
                                            v_float32x4 v0(srcData1[index], srcData1[index + stride_w],
576
                                                           srcData1[index + stride_w*2], srcData1[index + stride_w*3]);
577
                                            v_float32x4 v1(srcData1[index + stride_w*4], srcData1[index + stride_w*5],
578
                                                           srcData1[index + stride_w*6], srcData1[index + stride_w*7]);
579
                                            max_val0 = v_max(max_val0, v0);
580
                                            max_val1 = v_max(max_val1, v1);
581
                                        }
582
                                }
583
                                else
584
                                {
585
                                    for (int y = ystart; y < yend; ++y)
586
                                    {
587
                                        for (int x = xstart; x < xend; ++x)
588
                                        {
589
                                            const int index = y * inp_width + x;
590
                                            v_float32x4 v0(srcData[index], srcData[index + stride_w],
591
                                                           srcData[index + stride_w*2], srcData[index + stride_w*3]);
592
                                            v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
593
                                                           srcData[index + stride_w*6], srcData[index + stride_w*7]);
594
                                            max_val0 = v_max(max_val0, v0);
595
                                            max_val1 = v_max(max_val1, v1);
596
                                        }
597
                                    }
598
                                }
599
                                v_store(dstData + x0, max_val0);
600
                                v_store(dstData + x0 + 4, max_val1);
601
                                x0 += 7;
602
                            }
603
                        }
604
                        else
605
#endif
606
                        {
607
                            float max_val = -FLT_MAX;
608
                            if( compMaxIdx )
609
                            {
610
                                int max_index = -1;
611
                                for (int y = ystart; y < yend; ++y)
612
                                    for (int x = xstart; x < xend; ++x)
613
                                    {
614
                                        const int index = y * inp_width + x;
615
                                        float val = srcData[index];
616
                                        if (val > max_val)
617
                                        {
618
                                            max_val = val;
619
                                            max_index = index;
620
                                        }
621
                                    }
622

623
                                dstData[x0] = max_val;
624
                                if (dstMaskData)
625
                                    dstMaskData[x0] = max_index;
626
                            }
627
                            else
628
                            {
629
                                for (int y = ystart; y < yend; ++y)
630
                                    for (int x = xstart; x < xend; ++x)
631
                                    {
632
                                        const int index = y * inp_width + x;
633
                                        float val = srcData[index];
634
                                        max_val = std::max(max_val, val);
635
                                    }
636

637
                                dstData[x0] = max_val;
638
                            }
639
                        }
640
                    }
641
                else if (poolingType == AVE)
642
                {
643
                    for( ; x0 < x1; x0++ )
644
                    {
645
                        int xstart = x0 * stride_w - pad_l;
646
                        int xend = min(xstart + kernel_w, inp_width + pad_r);
647
                        int xdelta = xend - xstart;
648
                        xstart = max(xstart, 0);
649
                        xend = min(xend, inp_width);
650
                        float inv_kernel_area = avePoolPaddedArea ? xdelta * ydelta : ((yend - ystart) * (xend - xstart));
651
                        inv_kernel_area = 1.0 / inv_kernel_area;
652
#if CV_SIMD128
653
                        if( xstart > 0 && x0 + 7 < x1 && (x0 + 7) * stride_w - pad_l + kernel_w < inp_width )
654
                        {
655
                            v_float32x4 sum_val0 = v_setzero_f32(), sum_val1 = v_setzero_f32();
656
                            v_float32x4 ikarea = v_setall_f32(inv_kernel_area);
657

658
                            for (int y = ystart; y < yend; ++y)
659
                            {
660
                                for (int x = xstart; x < xend; ++x)
661
                                {
662
                                    const int index = y * inp_width + x;
663
                                    v_float32x4 v0(srcData[index], srcData[index + stride_w],
664
                                                   srcData[index + stride_w*2], srcData[index + stride_w*3]);
665
                                    v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
666
                                                   srcData[index + stride_w*6], srcData[index + stride_w*7]);
667
                                    sum_val0 += v0;
668
                                    sum_val1 += v1;
669
                                }
670
                            }
671
                            v_store(dstData + x0, sum_val0*ikarea);
672
                            v_store(dstData + x0 + 4, sum_val1*ikarea);
673
                            x0 += 7;
674
                        }
675
                        else
676
#endif
677
                        {
678
                            float sum_val = 0.f;
679
                            for (int y = ystart; y < yend; ++y)
680
                                for (int x = xstart; x < xend; ++x)
681
                                {
682
                                    const int index = y * inp_width + x;
683
                                    float val = srcData[index];
684
                                    sum_val += val;
685
                                }
686

687
                            dstData[x0] = sum_val*inv_kernel_area;
688
                        }
689
                    }
690
                }
691
                else if (poolingType == ROI)
692
                {
693
                    const float *roisData = rois->ptr<float>(n);
694
                    int xstartROI = roundRoiSize(roisData[1] * spatialScale);
695
                    int xendROI = roundRoiSize(roisData[3] * spatialScale);
696
                    int roiWidth = std::max(xendROI - xstartROI + 1, 1);
697
                    float roiRatio = (float)roiWidth / width;
698
                    for( ; x0 < x1; x0++ )
699
                    {
700
                        int xstart = xstartROI + x0 * roiRatio;
701
                        int xend = xstartROI + std::ceil((x0 + 1) * roiRatio);
702
                        xstart = max(xstart, 0);
703
                        xend = min(xend, inp_width);
704
                        if (xstart >= xend || ystart >= yend)
705
                        {
706
                            dstData[x0] = 0;
707
                            if (compMaxIdx && dstMaskData)
708
                                dstMaskData[x0] = -1;
709
                            continue;
710
                        }
711
                        float max_val = -FLT_MAX;
712
                        for (int y = ystart; y < yend; ++y)
713
                            for (int x = xstart; x < xend; ++x)
714
                            {
715
                                const int index = y * inp_width + x;
716
                                float val = srcData[index];
717
                                max_val = std::max(max_val, val);
718
                            }
719
                        dstData[x0] = max_val;
720
                    }
721
                }
722
                else  // PSROI
723
                {
724
                    const float *roisData = rois->ptr<float>(n);
725
                    CV_Assert(roisData[0] < src->size[0]);
726
                    float xstartROI = roundRoiSize(roisData[1]) * spatialScale;
727
                    float xendROI = roundRoiSize(roisData[3] + 1) * spatialScale;
728
                    float roiWidth = std::max(xendROI - xstartROI, 0.1f);
729
                    float roiRatio = roiWidth / width;
730
                    for( ; x0 < x1; x0++ )
731
                    {
732
                        int xstart = (int)std::floor(xstartROI + x0 * roiRatio);
733
                        int xend = (int)std::ceil(xstartROI + (x0 + 1) * roiRatio);
734
                        xstart = max(xstart, 0);
735
                        xend = min(xend, inp_width);
736
                        if (xstart >= xend || ystart >= yend)
737
                        {
738
                            dstData[x0] = 0;
739
                            continue;
740
                        }
741

742
                        srcData = src->ptr<float>(roisData[0], (c * height + y0) * width + x0);
743
                        float sum_val = 0.f;
744
                        for (int y = ystart; y < yend; ++y)
745
                            for (int x = xstart; x < xend; ++x)
746
                            {
747
                                const int index = y * inp_width + x;
748
                                float val = srcData[index];
749
                                sum_val += val;
750
                            }
751
                        dstData[x0] = sum_val / ((yend - ystart) * (xend - xstart));
752
                    }
753
                }
754
            }
755
        }
756
    };
757

758
    // Entry point for the max-pooling path: hands src/dst/mask and the layer's
    // kernel, stride, padding and mode settings to PoolingInvoker::run.
    // No ROI blob exists on this path, so an empty Mat is passed in its place.
    void maxPooling(Mat &src, Mat &dst, Mat &mask)
    {
        Mat unusedRois;
        const int stripeCount = getNumThreads();
        PoolingInvoker::run(src, unusedRois, dst, mask,
                            kernel, stride,
                            pad_l, pad_t, pad_r, pad_b,
                            avePoolPaddedArea, type, spatialScale, computeMaxIdx,
                            stripeCount);
    }
764

765
    // Entry point for the average-pooling path: forwards src/dst and the
    // layer's kernel, stride, padding and mode settings to PoolingInvoker::run.
    // Neither a ROI blob nor an index mask exists here, so empty Mats are passed.
    void avePooling(Mat &src, Mat &dst)
    {
        Mat unusedRois;
        Mat unusedMask;
        const int stripeCount = getNumThreads();
        PoolingInvoker::run(src, unusedRois, dst, unusedMask,
                            kernel, stride,
                            pad_l, pad_t, pad_r, pad_b,
                            avePoolPaddedArea, type, spatialScale, computeMaxIdx,
                            stripeCount);
    }
771

772
    void roiPooling(const Mat &src, const Mat &rois, Mat &dst)
773
    {
774
        const int nstripes = getNumThreads();
775
        Mat mask;
776
        PoolingInvoker::run(src, rois, dst, mask, kernel, stride, pad_l, pad_t, pad_r, pad_b, avePoolPaddedArea, type, spatialScale, computeMaxIdx, nstripes);
777
    }
778

779
    // Builds the Halide function for max pooling over inputs[0].
    // The function has two outputs per element: the window maximum and the
    // flat (row * width + col) input index of that maximum, cast to float.
    // Returns an empty node when built without HAVE_HALIDE.
    virtual Ptr<BackendNode> initMaxPoolingHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
    {
#ifdef HAVE_HALIDE
        Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
        const int inWidth = inputBuffer.width();
        const int inHeight = inputBuffer.height();
        const int lastCol = inWidth - 1;
        const int lastRow = inHeight - 1;

        Halide::Var x("x"), y("y"), c("c"), n("n");
        Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
        Halide::RDom r(0, kernel.width, 0, kernel.height);

        // With left/top padding the source coordinate can fall below zero, so
        // it is clamped on both sides; otherwise only the upper bound is applied.
        const bool hasLeadingPad = pad_l || pad_t;

        // Source column for output column x and a horizontal offset inside the kernel.
        auto srcCol = [&](const Halide::Expr &kernelOffset) -> Halide::Expr
        {
            if (hasLeadingPad)
                return clamp(x * stride.width + kernelOffset - pad_l, 0, lastCol);
            return min(x * stride.width + kernelOffset, lastCol);
        };
        // Source row for output row y and a vertical offset inside the kernel.
        auto srcRow = [&](const Halide::Expr &kernelOffset) -> Halide::Expr
        {
            if (hasLeadingPad)
                return clamp(y * stride.height + kernelOffset - pad_t, 0, lastRow);
            return min(y * stride.height + kernelOffset, lastRow);
        };

        Halide::Expr kx = srcCol(r.x);
        Halide::Expr ky = srcRow(r.y);

        // Halide::argmax returns tuple (r.x, r.y, max).
        Halide::Tuple res = argmax(inputBuffer(kx, ky, c, n));

        // Map the argmax offsets (each in [0, kernel_size)) back to a flat
        // index into the input plane.
        Halide::Expr max_index = srcRow(res[1]) * inWidth + srcCol(res[0]);

        top(x, y, c, n) = { res[2], Halide::cast<float>(max_index) };
        return Ptr<BackendNode>(new HalideBackendNode(top));
#endif  // HAVE_HALIDE
        return Ptr<BackendNode>();
    }
823

824
    // Builds the Halide function for average pooling over inputs[0].
    // Only configurations where the kernel tiles the input exactly are
    // supported; anything else raises StsNotImplemented.
    // Returns an empty node when built without HAVE_HALIDE.
    virtual Ptr<BackendNode> initAvePoolingHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
    {
#ifdef HAVE_HALIDE
        Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);

        const int inW = inputBuffer.width();
        const int inH = inputBuffer.height();

        // A non-zero remainder means the last window would be partial.
        const bool hasPartialKernel = (inW - kernel.width) % stride.width != 0 ||
                                      (inH - kernel.height) % stride.height != 0;
        if (hasPartialKernel)
        {
            CV_Error(cv::Error::StsNotImplemented,
                     "Halide backend for average pooling with partial "
                     "kernels is not implemented");
        }

        // Every window is full-sized here, so one constant factor turns the sum
        // into a mean.
        const float invKernelArea = 1.0f / (kernel.width * kernel.height);

        Halide::Var x("x"), y("y"), c("c"), n("n");
        Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
        Halide::RDom r(0, kernel.width, 0, kernel.height);

        Halide::Expr windowSample = inputBuffer(x * stride.width + r.x,
                                                y * stride.height + r.y, c, n);
        top(x, y, c, n) = sum(windowSample) * invKernelArea;
        return Ptr<BackendNode>(new HalideBackendNode(top));
#endif  // HAVE_HALIDE
        return Ptr<BackendNode>();
    }
849

850
    virtual void applyHalideScheduler(Ptr<BackendNode>& node,
851
                                      const std::vector<Mat*> &inputs,
852
                                      const std::vector<Mat> &outputs,
853
                                      int targetId) const CV_OVERRIDE
854
    {
855
#ifdef  HAVE_HALIDE
856
        if (targetId != DNN_TARGET_CPU)
857
        {
858
            Layer::applyHalideScheduler(node, inputs, outputs, targetId);
859
            return;
860
        }
861
        Halide::Var x("x"), y("y"), c("c"), n("n"), tile("tile"),
862
                    xi("xi"), yi("yi"), ci("ci"), xo("xo"), yo("yo"), co("co");
863
        Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();
864

865
        int outW, outH, outC, outN;
866
        getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);
867

868
        if (outW < 8 || outH < 8)
869
        {
870
            if (outC > 8)
871
                top.split(c, co, ci, 8)
872
                   .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
873
                   .parallel(tile)
874
                   .vectorize(ci);
875
            else
876
            {
877
                top.fuse(y, c, tile).fuse(n, tile, tile)
878
                   .parallel(tile);
879
                if (outW > 1)
880
                    top.vectorize(x);
881
            }
882
        }
883
        else
884
        {
885
            if (outC > 8)
886
                top.split(x, xo, xi, 8).split(y, yo, yi, 8).split(c, co, ci, 8)
887
                   .fuse(xo, yo, tile).fuse(co, tile, tile).fuse(n, tile, tile)
888
                   .parallel(tile)
889
                   .vectorize(xi);
890
            else
891
                top.split(x, xo, xi, 8).split(y, yo, yi, 8)
892
                   .fuse(xo, yo, tile).fuse(c, tile, tile).fuse(n, tile, tile)
893
                   .parallel(tile)
894
                   .vectorize(xi);
895
        }
896
#endif  // HAVE_HALIDE
897
    }
898

899
    // Computes the shape(s) of the layer output from the input shapes.
    //
    // inputs[0] is read at indices 0..3, with [2] taken as height and [3] as
    // width. For ROI and PSROI pooling a second input is required and
    // inputs[1][0] supplies the number of proposals. A single 4-D shape is
    // built and stored in `outputs` twice when type == MAX and once otherwise.
    // `requiredOutputs` and `internals` are not used. Always returns false.
    //
    // Raises (via CV_Assert) when no input is given, when inputs[0] has fewer
    // than 4 dimensions, when the ROI input is missing or has no dimensions,
    // when the padded output size is inconsistent, or when the PSROI channel
    // count does not match psRoiOutChannels * pooled width * pooled height.
    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        CV_Assert(inputs.size() != 0);
        // Indices 0..3 of inputs[0] are read below; reject shorter shapes
        // instead of reading past the end of the shape vector.
        CV_Assert(inputs[0].size() >= 4);
        Size in(inputs[0][3], inputs[0][2]), out;

        if (globalPooling)
        {
            out.height = 1;
            out.width = 1;
        }
        else if (type == ROI || type == PSROI)
        {
            out.height = pooledSize.height;
            out.width = pooledSize.width;
        }
        else if (padMode.empty())
        {
            // Explicit padding: number of stride steps that fit, rounded up or
            // down depending on ceilMode, plus the first window.
            float height = (float)(in.height + pad_t + pad_b - kernel.height) / stride.height;
            float width = (float)(in.width + pad_l + pad_r - kernel.width) / stride.width;
            out.height = 1 + (ceilMode ? ceil(height) : floor(height));
            out.width = 1 + (ceilMode ? ceil(width) : floor(width));

            if (pad_r || pad_b)
            {
                // If we have padding, ensure that the last pooling starts strictly
                // inside the image (instead of at the padding); otherwise clip the last.
                if ((out.height - 1) * stride.height >= in.height + pad_b)
                    --out.height;
                if ((out.width - 1) * stride.width >= in.width + pad_r)
                    --out.width;
                CV_Assert((out.height - 1) * stride.height < in.height + pad_b);
                CV_Assert((out.width - 1) * stride.width < in.width + pad_r);
            }
        }
        else
        {
            // Named padding mode: the output size comes from the shared helper.
            getConvPoolOutParams(in, kernel, stride, padMode, Size(1, 1), out);
        }

        int dims[] = {inputs[0][0], inputs[0][1], out.height, out.width};
        if (type == ROI)
        {
            CV_Assert(inputs.size() == 2);
            CV_Assert(!inputs[1].empty());  // inputs[1][0] is read next
            dims[0] = inputs[1][0];  // Number of proposals.
        }
        else if (type == PSROI)
        {
            CV_Assert(inputs.size() == 2);
            CV_Assert(!inputs[1].empty());  // inputs[1][0] is read next
            CV_Assert(psRoiOutChannels * pooledSize.width * pooledSize.height == inputs[0][1]);
            dims[0] = inputs[1][0];  // Number of proposals.
            dims[1] = psRoiOutChannels;
        }
        // MAX pooling gets two outputs of the same shape.
        // NOTE(review): the second is presumably the max-index mask; confirm against forward().
        outputs.assign(type == MAX ? 2 : 1, shape(dims, 4));

        return false;
    }
958

959
    // Estimates the operation count of the layer from its output shapes.
    //
    // For MAX pooling only even-indexed outputs are counted, at kernel-area
    // operations per output element. For every other mode each output element
    // costs kernel-area + 1 operations. `inputs` is not used.
    //
    // The sum is accumulated in int64 and each operand is widened before
    // multiplying, so neither the product nor the running total is limited to
    // 32 bits (a plain `long` accumulator is 32-bit on LLP64 platforms).
    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                           const std::vector<MatShape> &outputs) const CV_OVERRIDE
    {
        CV_UNUSED(inputs); // suppress unused variable warning

        const int64 kernelArea = kernel.area();
        int64 flops = 0;

        for (size_t i = 0; i < outputs.size(); i++)
        {
            if (type == MAX)
            {
                // Odd-indexed outputs are skipped.
                if (i % 2 == 0)
                    flops += static_cast<int64>(total(outputs[i])) * kernelArea;
            }
            else
            {
                flops += static_cast<int64>(total(outputs[i])) * (kernelArea + 1);
            }
        }
        return flops;
    }
979
private:
980
    // Pooling mode of the layer. The `type` member holds one of these values;
    // it is passed to PoolingInvoker::run and checked in getMemoryShapes and
    // getFLOPS. Enumerators take the default values 0..4 in declaration order.
    enum Type
981
    {
982
        MAX,   // Window maximum; getMemoryShapes emits two outputs for this mode
983
        AVE,   // Window average
984
        STOCHASTIC,  // NOTE(review): no handling for this mode is visible in this part of the file
985
        ROI,   // RoI pooling, https://arxiv.org/pdf/1504.08083.pdf
986
        PSROI  // Position-sensitive RoI pooling, https://arxiv.org/pdf/1605.06409.pdf
987
    };
988
};
989

990
// Factory for the pooling layer: constructs a PoolingLayerImpl from the given
// parameters and returns it through the public PoolingLayer interface.
Ptr<PoolingLayer> PoolingLayer::create(const LayerParams& params)
{
    Ptr<PoolingLayer> layer(new PoolingLayerImpl(params));
    return layer;
}
994

995
}
996
}
997

998
Product

Resources

Company