Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/modules/dnn/src/layers/fully_connected_layer.cpp
16337 views
1
/*M///////////////////////////////////////////////////////////////////////////////////////
2
//
3
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4
//
5
// By downloading, copying, installing or using the software you agree to this license.
6
// If you do not agree to this license, do not download, install,
7
// copy or use the software.
8
//
9
//
10
// License Agreement
11
// For Open Source Computer Vision Library
12
//
13
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14
// Copyright (C) 2017, Intel Corporation, all rights reserved.
15
// Third party copyrights are property of their respective owners.
16
//
17
// Redistribution and use in source and binary forms, with or without modification,
18
// are permitted provided that the following conditions are met:
19
//
20
// * Redistribution's of source code must retain the above copyright notice,
21
// this list of conditions and the following disclaimer.
22
//
23
// * Redistribution's in binary form must reproduce the above copyright notice,
24
// this list of conditions and the following disclaimer in the documentation
25
// and/or other materials provided with the distribution.
26
//
27
// * The name of the copyright holders may not be used to endorse or promote products
28
// derived from this software without specific prior written permission.
29
//
30
// This software is provided by the copyright holders and contributors "as is" and
31
// any express or implied warranties, including, but not limited to, the implied
32
// warranties of merchantability and fitness for a particular purpose are disclaimed.
33
// In no event shall the Intel Corporation or contributors be liable for any direct,
34
// indirect, incidental, special, exemplary, or consequential damages
35
// (including, but not limited to, procurement of substitute goods or services;
36
// loss of use, data, or profits; or business interruption) however caused
37
// and on any theory of liability, whether in contract, strict liability,
38
// or tort (including negligence or otherwise) arising in any way out of
39
// the use of this software, even if advised of the possibility of such damage.
40
//
41
//M*/
42
43
#include "../precomp.hpp"
44
#include "layers_common.hpp"
45
#include "../op_halide.hpp"
46
#include "../op_inf_engine.hpp"
47
#include <opencv2/dnn/shape_utils.hpp>
48
49
#ifdef HAVE_OPENCL
50
#include "opencl_kernels_dnn.hpp"
51
using namespace cv::dnn::ocl4dnn;
52
#endif
53
54
namespace cv
55
{
56
namespace dnn
57
{
58
59
class FullyConnectedLayerImpl CV_FINAL : public InnerProductLayer
60
{
61
public:
62
enum { VEC_ALIGN = 8 };
63
64
#ifdef HAVE_OPENCL
65
Ptr<OCL4DNNInnerProduct<float> > innerProductOp;
66
std::vector<UMat> umat_blobs;
67
std::vector<UMat> half_blobs;
68
#endif
69
70
FullyConnectedLayerImpl(const LayerParams& params)
71
{
72
setParamsFrom(params);
73
CV_Assert(1 <= blobs.size() && blobs.size() <= 2);
74
75
int numOutput = params.get<int>("num_output");
76
int innerSize = (int)blobs[0].total() / numOutput;
77
bias = params.get<bool>("bias_term", true);
78
axis = params.get<int>("axis", 1);
79
80
CV_Assert(blobs[0].dims >= 2 && (size_t)(innerSize * numOutput) == blobs[0].total());
81
CV_Assert(!bias || (blobs.size() == 2 && (size_t)numOutput == blobs[1].total()));
82
83
weightsMat = blobs[0] = blobs[0].reshape(1, numOutput);
84
int vecsize = weightsMat.cols;
85
if( vecsize % VEC_ALIGN != 0 )
86
{
87
int vecsize_aligned = (int)alignSize(vecsize, VEC_ALIGN);
88
Mat weightsBuf(weightsMat.rows, vecsize_aligned, weightsMat.type());
89
Mat wpadding = weightsBuf.colRange(vecsize, vecsize_aligned);
90
wpadding.setTo(Scalar::all(0.));
91
weightsMat = weightsBuf.colRange(0, vecsize);
92
blobs[0].copyTo(weightsMat);
93
}
94
95
if (bias)
96
biasMat = blobs[1] = blobs[1].reshape(1, 1);
97
else
98
biasMat = Mat::zeros(1, numOutput, weightsMat.type());
99
}
100
101
/**
 * Derives the output shape from the single input shape.
 *
 * The output keeps the input dimensions that precede the (canonical) axis and
 * appends one final dimension equal to the number of weight rows.
 *
 * @return always false.
 */
bool getMemoryShapes(const std::vector<MatShape> &inputs,
                     const int requiredOutputs,
                     std::vector<MatShape> &outputs,
                     std::vector<MatShape> &) const CV_OVERRIDE
{
    CV_Assert(inputs.size() == 1);
    CV_Assert(1 <= blobs.size() && blobs.size() <= 2);
    CV_Assert(blobs[0].dims == 2);

    const MatShape& inShape = inputs[0];
    const int canonicalAxis = clamp(axis, inShape);
    const int outChannels = blobs[0].size[0];

    // canonicalAxis leading dims copied from the input, then the output count.
    MatShape outShape(canonicalAxis + 1);
    int d = 0;
    while (d < canonicalAxis)
    {
        outShape[d] = inShape[d];
        ++d;
    }
    outShape.back() = outChannels;

    outputs.resize(inputs.size(), outShape);

    // With a bias term, the bias blob must provide one value per output.
    CV_Assert(!bias || (size_t)outChannels == blobs[1].total());
    return false;
}
122
123
/**
 * Reports whether this layer can run on the given backend.
 *
 * The default OpenCV backend is always supported. Halide and the Inference
 * Engine are supported only when available and when the layer's axis is 1.
 */
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
    if (backendId == DNN_BACKEND_OPENCV)
        return true;

    if (backendId == DNN_BACKEND_HALIDE)
        return haveHalide() && axis == 1;

    if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
        return haveInfEngine() && axis == 1;

    return false;
}
129
130
/**
 * Attaches (or detaches) an activation to be applied to this layer's output.
 *
 * The request is refused when an activation is already attached and another
 * non-empty one is offered. Otherwise the stored activation is replaced by
 * @p layer (which may be empty, clearing it).
 *
 * @return true when a non-empty activation is stored after the call.
 */
virtual bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
{
    const bool alreadyFused = !activ.empty();
    const bool offeringNew = !layer.empty();
    if (alreadyFused && offeringNew)
        return false;

    activ = layer;
    return !activ.empty();
}
140
141
class FullyConnected : public ParallelLoopBody
142
{
143
public:
144
FullyConnected() : srcMat(0), weights(0), biasMat(0), activ(0), dstMat(0), nstripes(0), useAVX(false), useAVX2(false), useAVX512(false) {}
145
146
static void run(const Mat& srcMat, const Mat& weights, const Mat& biasMat,
147
Mat& dstMat, const ActivationLayer* activ, int nstripes)
148
{
149
CV_Assert( srcMat.dims == 2 && srcMat.cols == weights.cols &&
150
dstMat.rows == srcMat.rows && dstMat.cols == weights.rows &&
151
srcMat.type() == weights.type() && weights.type() == dstMat.type() &&
152
srcMat.type() == CV_32F &&
153
(biasMat.empty() || (biasMat.type() == srcMat.type() &&
154
biasMat.isContinuous() && (int)biasMat.total() == dstMat.cols)) );
155
156
FullyConnected p;
157
158
p.srcMat = &srcMat;
159
p.weights = &weights;
160
p.biasMat = &biasMat;
161
p.dstMat = &dstMat;
162
p.nstripes = nstripes;
163
p.activ = activ;
164
p.useAVX = checkHardwareSupport(CPU_AVX);
165
p.useAVX2 = checkHardwareSupport(CPU_AVX2);
166
p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX;
167
168
parallel_for_(Range(0, nstripes), p, nstripes);
169
}
170
171
void operator()(const Range& r) const CV_OVERRIDE
172
{
173
int valign = FullyConnectedLayerImpl::VEC_ALIGN;
174
int nsamples = srcMat->rows;
175
int nw0 = weights->rows;
176
int k, vecsize = srcMat->cols;
177
int vecsize_aligned = (int)alignSize(vecsize, VEC_ALIGN);
178
size_t total = (size_t)nsamples*nw0;
179
size_t stripeSize = (total + nstripes - 1)/nstripes;
180
size_t stripeStart = r.start*stripeSize;
181
size_t stripeEnd = r.end == nstripes ? total : std::min(r.end*stripeSize, total);
182
size_t wstep = weights->step1();
183
AutoBuffer<float> srcbuf(vecsize_aligned + valign);
184
float* sptr = alignPtr(srcbuf.data(), (int)(valign*sizeof(float)));
185
186
for( k = vecsize; k < vecsize_aligned; k++ )
187
sptr[k] = 0.f;
188
189
for( size_t ofs = stripeStart; ofs < stripeEnd; )
190
{
191
int sampleIdx = (int)(ofs / nw0);
192
int delta = (int)(ofs - (size_t)sampleIdx*nw0);
193
const float* sptr_ = srcMat->ptr<float>(sampleIdx);
194
const float* wptr = weights->ptr<float>(delta);
195
float* dptr = dstMat->ptr<float>(sampleIdx) + delta;
196
const float* biasptr = biasMat->ptr<float>() + delta;
197
int nw = std::min(nw0 - delta, (int)(stripeEnd - ofs));
198
199
memcpy(sptr, sptr_, vecsize*sizeof(sptr[0]));
200
201
#if CV_TRY_AVX512_SKX
202
if( useAVX512 )
203
opt_AVX512_SKX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
204
else
205
#endif
206
#if CV_TRY_AVX2
207
if( useAVX2 )
208
opt_AVX2::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
209
else
210
#endif
211
#if CV_TRY_AVX
212
if( useAVX )
213
opt_AVX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
214
else
215
#endif
216
{
217
int i = 0;
218
219
#if CV_SIMD128
220
for( ; i <= nw - 4; i += 4, wptr += 4*wstep )
221
{
222
v_float32x4 vs0 = v_setall_f32(0.f), vs1 = v_setall_f32(0.f);
223
v_float32x4 vs2 = v_setall_f32(0.f), vs3 = v_setall_f32(0.f);
224
225
for( k = 0; k < vecsize; k += 4 )
226
{
227
v_float32x4 v = v_load_aligned(sptr + k);
228
vs0 += v*v_load_aligned(wptr + k);
229
vs1 += v*v_load_aligned(wptr + wstep + k);
230
vs2 += v*v_load_aligned(wptr + wstep*2 + k);
231
vs3 += v*v_load_aligned(wptr + wstep*3 + k);
232
}
233
234
v_float32x4 s = v_reduce_sum4(vs0, vs1, vs2, vs3);
235
s += v_load(biasptr + i);
236
v_store(dptr + i, s);
237
}
238
#endif
239
240
for( ; i < nw; i++, wptr += wstep )
241
{
242
float s0=biasptr[i];
243
244
for( k = 0; k < vecsize; k++ )
245
{
246
float v = sptr[k];
247
s0 += v*wptr[k];
248
}
249
dptr[i] = s0;
250
}
251
}
252
253
if(activ)
254
activ->forwardSlice(dptr, dptr, 1, 1, delta, delta + nw);
255
256
ofs += nw;
257
}
258
}
259
260
const Mat *srcMat, *weights, *biasMat;
261
const ActivationLayer* activ;
262
Mat* dstMat;
263
int nstripes;
264
bool useAVX;
265
bool useAVX2;
266
bool useAVX512;
267
};
268
269
#ifdef HAVE_OPENCL
270
/**
 * Discards the cached OpenCL state: the inner-product operator and the
 * device-side copies of the blobs. forward_ocl() rebuilds them when it finds
 * the operator empty.
 */
virtual void finalize(InputArrayOfArrays, OutputArrayOfArrays) CV_OVERRIDE
{
    innerProductOp.release();

    half_blobs.clear();
    umat_blobs.clear();
}
276
277
/**
 * OpenCL forward pass.
 *
 * First tries the cached OCL4DNNInnerProduct operator for every input. If any
 * call to its Forward() fails, all inputs are recomputed with cv::gemm
 * (converting through fp32 when the data is stored as CV_16S half floats).
 *
 * @return always true.
 */
bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, InputArrayOfArrays internals)
{
    std::vector<UMat> inputs;
    std::vector<UMat> outputs;

    // CV_16S depth marks half-precision data.
    const bool use_half = (inps.depth() == CV_16S);
    inps.getUMatVector(inputs);
    outs.getUMatVector(outputs);

    const int axisCan = clamp(axis, inputs[0].dims);
    const int numOutput = blobs[0].size[0];
    const int innerSize = blobs[0].size[1];
    const int outerSize = total(shape(inputs[0]), 0, axisCan);

    // Every input/output is viewed as a 2D matrix of these shapes.
    const MatShape inshape = shape(outerSize, innerSize);
    const MatShape outshape = shape(outerSize, numOutput);

    if (innerProductOp.empty())
    {
        // Lazily upload the blobs and create the operator.
        const size_t blobCount = blobs.size();
        umat_blobs.resize(blobCount);
        for (size_t b = 0; b < blobCount; b++)
            blobs[b].copyTo(umat_blobs[b]);

        OCL4DNNInnerProductConfig config;
        config.num_output = numOutput;
        config.bias_term = bias;
        config.M = outerSize;
        config.K = innerSize;
        config.use_half = use_half;

        if (use_half)
        {
            half_blobs.resize(umat_blobs.size());
            for (size_t b = 0; b < umat_blobs.size(); b++)
            {
                if (umat_blobs[b].empty())
                    continue;
                convertFp16(umat_blobs[b], half_blobs[b]);
            }
        }

        innerProductOp = Ptr<OCL4DNNInnerProduct<float> >(new OCL4DNNInnerProduct<float>(config));
    }

    bool operatorSucceeded = true;
    for (size_t idx = 0; idx < inputs.size(); idx++)
    {
        UMat srcMat = inputs[idx].reshape(1, inshape.size(), &inshape[0]);
        UMat dstMat = outputs[idx].reshape(1, outshape.size(), &outshape[0]);

        if (!innerProductOp->Forward(srcMat, (use_half) ? half_blobs[0] : umat_blobs[0],
                                     (bias) ? (use_half ? half_blobs[1] : umat_blobs[1]) : UMat(),
                                     dstMat))
        {
            operatorSucceeded = false;
            break;
        }

        // fp32 with more than one row: add the bias here as ones(M,1) * bias.
        // NOTE(review): presumably Forward() applies the bias itself in the
        // remaining cases - confirm against OCL4DNNInnerProduct.
        if (!use_half && bias && (outerSize > 1))
        {
            UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());
            UMat& biases = umat_blobs[1];
            cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0);
        }
    }

    if (operatorSucceeded)
        return true;

    // Fallback: recompute every input with plain gemm on the fp32 weights.
    UMat& weights = umat_blobs[0];
    for (size_t idx = 0; idx < inputs.size(); idx++)
    {
        UMat srcMat = inputs[idx].reshape(1, inshape.size(), &inshape[0]);
        UMat dstMat = outputs[idx].reshape(1, outshape.size(), &outshape[0]);
        UMat srcMat_fp32, dstMat_fp32;

        if (use_half)
        {
            convertFp16(srcMat, srcMat_fp32);
            convertFp16(dstMat, dstMat_fp32);
        }
        else
        {
            srcMat_fp32 = srcMat;
            dstMat_fp32 = dstMat;
        }

        // dst = src * weights^T
        cv::gemm(srcMat_fp32, weights, 1, noArray(), 0, dstMat_fp32, GEMM_2_T);

        if (bias)
        {
            UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());
            UMat& biases = umat_blobs[1];
            cv::gemm(biasOnesMat, biases, 1, dstMat_fp32, 1, dstMat_fp32, 0);
        }

        if (use_half)
        {
            convertFp16(srcMat_fp32, srcMat);
            convertFp16(dstMat_fp32, dstMat);
        }
    }

    return true;
}
385
#endif
386
387
/**
 * Forward pass entry point.
 *
 * The OpenCL implementation is attempted first when the preferable target is
 * an OpenCL one. CV_16S inputs are delegated to forward_fallback(). Otherwise
 * each input is viewed as an outerSize-row matrix and processed by
 * FullyConnected::run() with the (possibly padded) weights and the bias row.
 */
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
    CV_TRACE_FUNCTION();
    CV_TRACE_ARG_VALUE(name, "name", name.c_str());

    CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
               forward_ocl(inputs_arr, outputs_arr, internals_arr))

    const bool halfPrecisionInput = (inputs_arr.depth() == CV_16S);
    if (halfPrecisionInput)
    {
        forward_fallback(inputs_arr, outputs_arr, internals_arr);
        return;
    }

    std::vector<Mat> input, output;
    inputs_arr.getMatVector(input);
    outputs_arr.getMatVector(output);

    // Number of rows: product of the dimensions preceding the canonical axis.
    int axisCan = clamp(axis, input[0].dims);
    int outerSize = input[0].total(0, axisCan);

    const size_t inputCount = input.size();
    for (size_t idx = 0; idx != inputCount; ++idx)
    {
        Mat src2d = input[idx].reshape(1, outerSize);
        Mat dst2d = output[idx].reshape(1, outerSize);

        const int nstripes = getNumThreads();
        FullyConnected::run(src2d, weightsMat, biasMat, dst2d, activ.get(), nstripes);
    }
}
417
418
/**
 * Builds the Halide function for this layer.
 *
 * The output at (c, n) is the sum over the input's width, height and channels
 * of input * weights, plus the bias for output c when a bias term is enabled.
 * Returns an empty node when built without Halide.
 */
virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
{
#ifdef HAVE_HALIDE
    int inW, inH, inC, inN, outC = blobs[0].size[0];
    Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
    getCanonicalSize(inputBuffer, &inW, &inH, &inC, &inN);
    auto weightsBuffer = wrapToHalideBuffer(blobs[0], {inW, inH, inC, outC});

    Halide::Var x("x"), y("y"), c("c"), n("n");
    Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));

    // Reduction domain covering one full input sample.
    Halide::RDom rdom(0, inW, 0, inH, 0, inC);
    Halide::Expr topExpr = sum(inputBuffer(rdom.x, rdom.y, rdom.z, n) *
                               weightsBuffer(rdom.x, rdom.y, rdom.z, c));
    if (bias)
    {
        Halide::Buffer<float> biasBuffer = wrapToHalideBuffer(blobs[1], {outC});
        topExpr += biasBuffer(c);
    }
    top(x, y, c, n) = topExpr;
    return Ptr<BackendNode>(new HalideBackendNode(top));
#endif  // HAVE_HALIDE
    return Ptr<BackendNode>();
}
441
442
/**
 * Builds the Inference Engine "FullyConnected" layer for this node.
 *
 * The weights are wrapped as an OIHW blob of shape (rows, cols, 1, 1); the
 * biases are attached only when a second blob exists. Returns an empty node
 * when built without the Inference Engine.
 */
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
{
#ifdef HAVE_INF_ENGINE
    InferenceEngine::LayerParams layerParams;
    layerParams.name = name;
    layerParams.type = "FullyConnected";
    layerParams.precision = InferenceEngine::Precision::FP32;

    std::shared_ptr<InferenceEngine::FullyConnectedLayer> fcLayer(new InferenceEngine::FullyConnectedLayer(layerParams));

    fcLayer->_out_num = blobs[0].size[0];
    fcLayer->_weights = wrapToInfEngineBlob(blobs[0], {(size_t)blobs[0].size[0], (size_t)blobs[0].size[1], 1, 1}, InferenceEngine::Layout::OIHW);

    const bool hasBiasBlob = blobs.size() > 1;
    if (hasBiasBlob)
        fcLayer->_biases = wrapToInfEngineBlob(blobs[1], {(size_t)fcLayer->_out_num}, InferenceEngine::Layout::C);

    return Ptr<BackendNode>(new InfEngineBackendNode(fcLayer));
#endif  // HAVE_INF_ENGINE
    return Ptr<BackendNode>();
}
459
460
/**
 * Estimates the floating-point operation count of the layer.
 *
 * For every output blob the estimate is 3 * innerSize * (number of output
 * elements), where innerSize is the column count of the weight matrix.
 *
 * @param inputs  Unused.
 * @param outputs Shapes of the output blobs.
 * @return the summed estimate over all outputs.
 */
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                       const std::vector<MatShape> &outputs) const CV_OVERRIDE
{
    CV_UNUSED(inputs); // suppress unused variable warning

    // Accumulate in the 64-bit return type: `long` is only 32 bits wide on
    // LLP64 platforms and would overflow for large layers.
    int64 flops = 0;

    const int innerSize = blobs[0].size[1];
    for (size_t i = 0; i < outputs.size(); i++)
    {
        flops += CV_BIG_INT(3)*innerSize*total(outputs[i]);
    }

    return flops;
}
475
476
bool bias;
477
Mat weightsMat, biasMat;
478
Ptr<ActivationLayer> activ;
479
};
480
481
/** Factory: creates the fully connected implementation for the given parameters. */
Ptr<InnerProductLayer> InnerProductLayer::create(const LayerParams& params)
{
    Ptr<InnerProductLayer> layer(new FullyConnectedLayerImpl(params));
    return layer;
}
485
486
}
487
}
488
489