// Source: modules/dnn/src/layers/fully_connected_layer.cpp (OpenCV)
/*M///////////////////////////////////////////////////////////////////////////////////////1//2// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.3//4// By downloading, copying, installing or using the software you agree to this license.5// If you do not agree to this license, do not download, install,6// copy or use the software.7//8//9// License Agreement10// For Open Source Computer Vision Library11//12// Copyright (C) 2013, OpenCV Foundation, all rights reserved.13// Copyright (C) 2017, Intel Corporation, all rights reserved.14// Third party copyrights are property of their respective owners.15//16// Redistribution and use in source and binary forms, with or without modification,17// are permitted provided that the following conditions are met:18//19// * Redistribution's of source code must retain the above copyright notice,20// this list of conditions and the following disclaimer.21//22// * Redistribution's in binary form must reproduce the above copyright notice,23// this list of conditions and the following disclaimer in the documentation24// and/or other materials provided with the distribution.25//26// * The name of the copyright holders may not be used to endorse or promote products27// derived from this software without specific prior written permission.28//29// This software is provided by the copyright holders and contributors "as is" and30// any express or implied warranties, including, but not limited to, the implied31// warranties of merchantability and fitness for a particular purpose are disclaimed.32// In no event shall the Intel Corporation or contributors be liable for any direct,33// indirect, incidental, special, exemplary, or consequential damages34// (including, but not limited to, procurement of substitute goods or services;35// loss of use, data, or profits; or business interruption) however caused36// and on any theory of liability, whether in contract, strict liability,37// or tort (including negligence or otherwise) 
arising in any way out of38// the use of this software, even if advised of the possibility of such damage.39//40//M*/4142#include "../precomp.hpp"43#include "layers_common.hpp"44#include "../op_halide.hpp"45#include "../op_inf_engine.hpp"46#include <opencv2/dnn/shape_utils.hpp>4748#ifdef HAVE_OPENCL49#include "opencl_kernels_dnn.hpp"50using namespace cv::dnn::ocl4dnn;51#endif5253namespace cv54{55namespace dnn56{5758class FullyConnectedLayerImpl CV_FINAL : public InnerProductLayer59{60public:61enum { VEC_ALIGN = 8 };6263#ifdef HAVE_OPENCL64Ptr<OCL4DNNInnerProduct<float> > innerProductOp;65std::vector<UMat> umat_blobs;66std::vector<UMat> half_blobs;67#endif6869FullyConnectedLayerImpl(const LayerParams& params)70{71setParamsFrom(params);72CV_Assert(1 <= blobs.size() && blobs.size() <= 2);7374int numOutput = params.get<int>("num_output");75int innerSize = (int)blobs[0].total() / numOutput;76bias = params.get<bool>("bias_term", true);77axis = params.get<int>("axis", 1);7879CV_Assert(blobs[0].dims >= 2 && (size_t)(innerSize * numOutput) == blobs[0].total());80CV_Assert(!bias || (blobs.size() == 2 && (size_t)numOutput == blobs[1].total()));8182weightsMat = blobs[0] = blobs[0].reshape(1, numOutput);83int vecsize = weightsMat.cols;84if( vecsize % VEC_ALIGN != 0 )85{86int vecsize_aligned = (int)alignSize(vecsize, VEC_ALIGN);87Mat weightsBuf(weightsMat.rows, vecsize_aligned, weightsMat.type());88Mat wpadding = weightsBuf.colRange(vecsize, vecsize_aligned);89wpadding.setTo(Scalar::all(0.));90weightsMat = weightsBuf.colRange(0, vecsize);91blobs[0].copyTo(weightsMat);92}9394if (bias)95biasMat = blobs[1] = blobs[1].reshape(1, 1);96else97biasMat = Mat::zeros(1, numOutput, weightsMat.type());98}99100bool getMemoryShapes(const std::vector<MatShape> &inputs,101const int requiredOutputs,102std::vector<MatShape> &outputs,103std::vector<MatShape> &) const CV_OVERRIDE104{105CV_Assert(inputs.size() == 1);106CV_Assert(1 <= blobs.size() && blobs.size() <= 2);107CV_Assert(blobs[0].dims == 
2);108109int cAxis = clamp(axis, inputs[0]);110int numOutput = blobs[0].size[0];111MatShape outShape(cAxis + 1);112for (int i = 0; i < cAxis; ++i)113outShape[i] = inputs[0][i];114outShape.back() = numOutput;115116outputs.resize(inputs.size(), outShape);117118CV_Assert(!bias || (size_t)numOutput == blobs[1].total());119return false;120}121122virtual bool supportBackend(int backendId) CV_OVERRIDE123{124return backendId == DNN_BACKEND_OPENCV ||125backendId == DNN_BACKEND_HALIDE && haveHalide() && axis == 1 ||126backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine() && axis == 1;127}128129virtual bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE130{131if (activ.empty() || layer.empty())132{133activ = layer;134return !activ.empty();135}136else137return false;138}139140class FullyConnected : public ParallelLoopBody141{142public:143FullyConnected() : srcMat(0), weights(0), biasMat(0), activ(0), dstMat(0), nstripes(0), useAVX(false), useAVX2(false), useAVX512(false) {}144145static void run(const Mat& srcMat, const Mat& weights, const Mat& biasMat,146Mat& dstMat, const ActivationLayer* activ, int nstripes)147{148CV_Assert( srcMat.dims == 2 && srcMat.cols == weights.cols &&149dstMat.rows == srcMat.rows && dstMat.cols == weights.rows &&150srcMat.type() == weights.type() && weights.type() == dstMat.type() &&151srcMat.type() == CV_32F &&152(biasMat.empty() || (biasMat.type() == srcMat.type() &&153biasMat.isContinuous() && (int)biasMat.total() == dstMat.cols)) );154155FullyConnected p;156157p.srcMat = &srcMat;158p.weights = &weights;159p.biasMat = &biasMat;160p.dstMat = &dstMat;161p.nstripes = nstripes;162p.activ = activ;163p.useAVX = checkHardwareSupport(CPU_AVX);164p.useAVX2 = checkHardwareSupport(CPU_AVX2);165p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX;166167parallel_for_(Range(0, nstripes), p, nstripes);168}169170void operator()(const Range& r) const CV_OVERRIDE171{172int valign = FullyConnectedLayerImpl::VEC_ALIGN;173int nsamples = srcMat->rows;174int 
nw0 = weights->rows;175int k, vecsize = srcMat->cols;176int vecsize_aligned = (int)alignSize(vecsize, VEC_ALIGN);177size_t total = (size_t)nsamples*nw0;178size_t stripeSize = (total + nstripes - 1)/nstripes;179size_t stripeStart = r.start*stripeSize;180size_t stripeEnd = r.end == nstripes ? total : std::min(r.end*stripeSize, total);181size_t wstep = weights->step1();182AutoBuffer<float> srcbuf(vecsize_aligned + valign);183float* sptr = alignPtr(srcbuf.data(), (int)(valign*sizeof(float)));184185for( k = vecsize; k < vecsize_aligned; k++ )186sptr[k] = 0.f;187188for( size_t ofs = stripeStart; ofs < stripeEnd; )189{190int sampleIdx = (int)(ofs / nw0);191int delta = (int)(ofs - (size_t)sampleIdx*nw0);192const float* sptr_ = srcMat->ptr<float>(sampleIdx);193const float* wptr = weights->ptr<float>(delta);194float* dptr = dstMat->ptr<float>(sampleIdx) + delta;195const float* biasptr = biasMat->ptr<float>() + delta;196int nw = std::min(nw0 - delta, (int)(stripeEnd - ofs));197198memcpy(sptr, sptr_, vecsize*sizeof(sptr[0]));199200#if CV_TRY_AVX512_SKX201if( useAVX512 )202opt_AVX512_SKX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);203else204#endif205#if CV_TRY_AVX2206if( useAVX2 )207opt_AVX2::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);208else209#endif210#if CV_TRY_AVX211if( useAVX )212opt_AVX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);213else214#endif215{216int i = 0;217218#if CV_SIMD128219for( ; i <= nw - 4; i += 4, wptr += 4*wstep )220{221v_float32x4 vs0 = v_setall_f32(0.f), vs1 = v_setall_f32(0.f);222v_float32x4 vs2 = v_setall_f32(0.f), vs3 = v_setall_f32(0.f);223224for( k = 0; k < vecsize; k += 4 )225{226v_float32x4 v = v_load_aligned(sptr + k);227vs0 += v*v_load_aligned(wptr + k);228vs1 += v*v_load_aligned(wptr + wstep + k);229vs2 += v*v_load_aligned(wptr + wstep*2 + k);230vs3 += v*v_load_aligned(wptr + wstep*3 + k);231}232233v_float32x4 s = v_reduce_sum4(vs0, vs1, vs2, vs3);234s += v_load(biasptr + i);235v_store(dptr + i, 
s);236}237#endif238239for( ; i < nw; i++, wptr += wstep )240{241float s0=biasptr[i];242243for( k = 0; k < vecsize; k++ )244{245float v = sptr[k];246s0 += v*wptr[k];247}248dptr[i] = s0;249}250}251252if(activ)253activ->forwardSlice(dptr, dptr, 1, 1, delta, delta + nw);254255ofs += nw;256}257}258259const Mat *srcMat, *weights, *biasMat;260const ActivationLayer* activ;261Mat* dstMat;262int nstripes;263bool useAVX;264bool useAVX2;265bool useAVX512;266};267268#ifdef HAVE_OPENCL269virtual void finalize(InputArrayOfArrays, OutputArrayOfArrays) CV_OVERRIDE270{271innerProductOp.release();272umat_blobs.clear();273half_blobs.clear();274}275276bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, InputArrayOfArrays internals)277{278std::vector<UMat> inputs;279std::vector<UMat> outputs;280281bool use_half = (inps.depth() == CV_16S);282inps.getUMatVector(inputs);283outs.getUMatVector(outputs);284285int axisCan = clamp(axis, inputs[0].dims);286int numOutput = blobs[0].size[0];287int innerSize = blobs[0].size[1];288int outerSize = total(shape(inputs[0]), 0, axisCan);289bool ret = true;290291if (innerProductOp.empty())292{293size_t n = blobs.size();294umat_blobs.resize(n);295for (int i = 0; i < n; i++) blobs[i].copyTo(umat_blobs[i]);296297OCL4DNNInnerProductConfig config;298config.num_output = numOutput;299config.bias_term = bias;300config.M = outerSize;301config.K = innerSize;302config.use_half = use_half;303304if (use_half)305{306half_blobs.resize(umat_blobs.size());307for (int i = 0; i < umat_blobs.size(); i++)308{309if (!umat_blobs[i].empty())310convertFp16(umat_blobs[i], half_blobs[i]);311}312}313314innerProductOp = Ptr<OCL4DNNInnerProduct<float> >(new OCL4DNNInnerProduct<float>(config));315}316317for (size_t i = 0; i < inputs.size(); i++)318{319MatShape inshape, outshape;320inshape = shape(outerSize, innerSize);321outshape = shape(outerSize, numOutput);322323UMat srcMat, dstMat;324srcMat = inputs[i].reshape(1, inshape.size(), &inshape[0]);325dstMat = 
outputs[i].reshape(1, outshape.size(), &outshape[0]);326327if (!innerProductOp->Forward(srcMat, (use_half) ? half_blobs[0] : umat_blobs[0],328(bias) ? (use_half ? half_blobs[1] : umat_blobs[1]) : UMat(),329dstMat))330{331ret = false;332break;333}334335if (!use_half && bias && (outerSize > 1))336{337UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());338UMat& biases = umat_blobs[1];339cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0);340}341}342343if (ret) return true;344345UMat& weights = umat_blobs[0];346for (size_t i = 0; i < inputs.size(); i++)347{348MatShape inshape, outshape;349inshape = shape(outerSize, innerSize);350outshape = shape(outerSize, numOutput);351352UMat srcMat, dstMat, srcMat_fp32, dstMat_fp32;353srcMat = inputs[i].reshape(1, inshape.size(), &inshape[0]);354dstMat = outputs[i].reshape(1, outshape.size(), &outshape[0]);355356if (use_half)357{358convertFp16(srcMat, srcMat_fp32);359convertFp16(dstMat, dstMat_fp32);360}361else362{363srcMat_fp32 = srcMat;364dstMat_fp32 = dstMat;365}366367cv::gemm(srcMat_fp32, weights, 1, noArray(), 0, dstMat_fp32, GEMM_2_T);368369if (bias)370{371UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());372UMat& biases = umat_blobs[1];373cv::gemm(biasOnesMat, biases, 1, dstMat_fp32, 1, dstMat_fp32, 0);374}375if (use_half)376{377convertFp16(srcMat_fp32, srcMat);378convertFp16(dstMat_fp32, dstMat);379}380}381382return true;383}384#endif385386void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE387{388CV_TRACE_FUNCTION();389CV_TRACE_ARG_VALUE(name, "name", name.c_str());390391CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),392forward_ocl(inputs_arr, outputs_arr, internals_arr))393394if (inputs_arr.depth() == CV_16S)395{396forward_fallback(inputs_arr, outputs_arr, internals_arr);397return;398}399400std::vector<Mat> input, output;401inputs_arr.getMatVector(input);402outputs_arr.getMatVector(output);403404int axisCan = 
clamp(axis, input[0].dims);405int outerSize = input[0].total(0, axisCan);406407for (size_t i = 0; i < input.size(); i++)408{409Mat srcMat = input[i].reshape(1, outerSize);410Mat dstMat = output[i].reshape(1, outerSize);411412const int nstripes = getNumThreads();413FullyConnected::run(srcMat, weightsMat, biasMat, dstMat, activ.get(), nstripes);414}415}416417virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE418{419#ifdef HAVE_HALIDE420int inW, inH, inC, inN, outC = blobs[0].size[0];421Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);422getCanonicalSize(inputBuffer, &inW, &inH, &inC, &inN);423auto weights = wrapToHalideBuffer(blobs[0], {inW, inH, inC, outC});424425Halide::Var x("x"), y("y"), c("c"), n("n");426Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));427Halide::RDom r(0, inW, 0, inH, 0, inC);428Halide::Expr topExpr = sum(inputBuffer(r.x, r.y, r.z, n) *429weights(r.x, r.y, r.z, c));430if (bias)431{432Halide::Buffer<float> bias = wrapToHalideBuffer(blobs[1], {outC});433topExpr += bias(c);434}435top(x, y, c, n) = topExpr;436return Ptr<BackendNode>(new HalideBackendNode(top));437#endif // HAVE_HALIDE438return Ptr<BackendNode>();439}440441virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE442{443#ifdef HAVE_INF_ENGINE444InferenceEngine::LayerParams lp;445lp.name = name;446lp.type = "FullyConnected";447lp.precision = InferenceEngine::Precision::FP32;448std::shared_ptr<InferenceEngine::FullyConnectedLayer> ieLayer(new InferenceEngine::FullyConnectedLayer(lp));449450ieLayer->_out_num = blobs[0].size[0];451ieLayer->_weights = wrapToInfEngineBlob(blobs[0], {(size_t)blobs[0].size[0], (size_t)blobs[0].size[1], 1, 1}, InferenceEngine::Layout::OIHW);452if (blobs.size() > 1)453ieLayer->_biases = wrapToInfEngineBlob(blobs[1], {(size_t)ieLayer->_out_num}, InferenceEngine::Layout::C);454return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));455#endif // 
HAVE_INF_ENGINE456return Ptr<BackendNode>();457}458459virtual int64 getFLOPS(const std::vector<MatShape> &inputs,460const std::vector<MatShape> &outputs) const CV_OVERRIDE461{462CV_UNUSED(inputs); // suppress unused variable warning463long flops = 0;464465int innerSize = blobs[0].size[1];466for(int i = 0; i < outputs.size(); i++)467{468flops += CV_BIG_INT(3)*innerSize*total(outputs[i]);469}470471return flops;472473}474475bool bias;476Mat weightsMat, biasMat;477Ptr<ActivationLayer> activ;478};479480Ptr<InnerProductLayer> InnerProductLayer::create(const LayerParams& params)481{482return Ptr<InnerProductLayer>(new FullyConnectedLayerImpl(params));483}484485}486}487488489