// Source: modules/dnn/src/layers/fully_connected_layer.cpp (OpenCV)
/*M///////////////////////////////////////////////////////////////////////////////////////1//2// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.3//4// By downloading, copying, installing or using the software you agree to this license.5// If you do not agree to this license, do not download, install,6// copy or use the software.7//8//9// License Agreement10// For Open Source Computer Vision Library11//12// Copyright (C) 2013, OpenCV Foundation, all rights reserved.13// Copyright (C) 2017, Intel Corporation, all rights reserved.14// Third party copyrights are property of their respective owners.15//16// Redistribution and use in source and binary forms, with or without modification,17// are permitted provided that the following conditions are met:18//19// * Redistribution's of source code must retain the above copyright notice,20// this list of conditions and the following disclaimer.21//22// * Redistribution's in binary form must reproduce the above copyright notice,23// this list of conditions and the following disclaimer in the documentation24// and/or other materials provided with the distribution.25//26// * The name of the copyright holders may not be used to endorse or promote products27// derived from this software without specific prior written permission.28//29// This software is provided by the copyright holders and contributors "as is" and30// any express or implied warranties, including, but not limited to, the implied31// warranties of merchantability and fitness for a particular purpose are disclaimed.32// In no event shall the Intel Corporation or contributors be liable for any direct,33// indirect, incidental, special, exemplary, or consequential damages34// (including, but not limited to, procurement of substitute goods or services;35// loss of use, data, or profits; or business interruption) however caused36// and on any theory of liability, whether in contract, strict liability,37// or tort (including negligence or otherwise) 
arising in any way out of38// the use of this software, even if advised of the possibility of such damage.39//40//M*/4142#include "../precomp.hpp"43#include "layers_common.hpp"44#include "../op_halide.hpp"45#include "../op_inf_engine.hpp"46#include <opencv2/dnn/shape_utils.hpp>4748#ifdef HAVE_OPENCL49#include "opencl_kernels_dnn.hpp"50using namespace cv::dnn::ocl4dnn;51#endif5253namespace cv54{55namespace dnn56{5758class FullyConnectedLayerImpl CV_FINAL : public InnerProductLayer59{60public:61enum { VEC_ALIGN = 8 };6263#ifdef HAVE_OPENCL64Ptr<OCL4DNNInnerProduct<float> > innerProductOp;65std::vector<UMat> umat_blobs;66std::vector<UMat> half_blobs;67#endif6869FullyConnectedLayerImpl(const LayerParams& params)70{71setParamsFrom(params);72CV_Assert(1 <= blobs.size() && blobs.size() <= 2);7374int numOutput = params.get<int>("num_output");75int innerSize = (int)blobs[0].total() / numOutput;76bias = params.get<bool>("bias_term", true);77axis = params.get<int>("axis", 1);7879CV_Assert(blobs[0].dims >= 2 && (size_t)(innerSize * numOutput) == blobs[0].total());80CV_Assert(!bias || (blobs.size() == 2 && (size_t)numOutput == blobs[1].total()));8182weightsMat = blobs[0] = blobs[0].reshape(1, numOutput);83int vecsize = weightsMat.cols;84if( vecsize % VEC_ALIGN != 0 )85{86int vecsize_aligned = (int)alignSize(vecsize, VEC_ALIGN);87Mat weightsBuf(weightsMat.rows, vecsize_aligned, weightsMat.type());88Mat wpadding = weightsBuf.colRange(vecsize, vecsize_aligned);89wpadding.setTo(Scalar::all(0.));90weightsMat = weightsBuf.colRange(0, vecsize);91blobs[0].copyTo(weightsMat);92}9394if (bias)95biasMat = blobs[1] = blobs[1].reshape(1, 1);96else97biasMat = Mat::zeros(1, numOutput, weightsMat.type());98}99100bool getMemoryShapes(const std::vector<MatShape> &inputs,101const int requiredOutputs,102std::vector<MatShape> &outputs,103std::vector<MatShape> &) const CV_OVERRIDE104{105CV_Assert(inputs.size() == 1);106CV_Assert(1 <= blobs.size() && blobs.size() <= 2);107CV_Assert(blobs[0].dims == 
2);108109int cAxis = clamp(axis, inputs[0]);110int numOutput = blobs[0].size[0];111MatShape outShape(cAxis + 1);112for (int i = 0; i < cAxis; ++i)113outShape[i] = inputs[0][i];114outShape.back() = numOutput;115116outputs.resize(inputs.size(), outShape);117118CV_Assert(!bias || (size_t)numOutput == blobs[1].total());119return false;120}121122virtual bool supportBackend(int backendId) CV_OVERRIDE123{124return backendId == DNN_BACKEND_OPENCV ||125backendId == DNN_BACKEND_HALIDE && haveHalide() && axis == 1 ||126backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine() && axis == 1;127}128129virtual bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE130{131if (activ.empty() || layer.empty())132{133activ = layer;134return !activ.empty();135}136else137return false;138}139140class FullyConnected : public ParallelLoopBody141{142public:143FullyConnected() : srcMat(0), weights(0), biasMat(0), activ(0), dstMat(0), nstripes(0), useAVX(false), useAVX2(false), useAVX512(false) {}144145static void run(const Mat& srcMat, const Mat& weights, const Mat& biasMat,146Mat& dstMat, const ActivationLayer* activ, int nstripes)147{148CV_Assert( srcMat.dims == 2 && srcMat.cols == weights.cols &&149dstMat.rows == srcMat.rows && dstMat.cols == weights.rows &&150srcMat.type() == weights.type() && weights.type() == dstMat.type() &&151srcMat.type() == CV_32F &&152(biasMat.empty() || (biasMat.type() == srcMat.type() &&153biasMat.isContinuous() && (int)biasMat.total() == dstMat.cols)) );154155FullyConnected p;156157p.srcMat = &srcMat;158p.weights = &weights;159p.biasMat = &biasMat;160p.dstMat = &dstMat;161p.nstripes = nstripes;162p.activ = activ;163p.useAVX = checkHardwareSupport(CPU_AVX);164p.useAVX2 = checkHardwareSupport(CPU_AVX2);165p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX;166167parallel_for_(Range(0, nstripes), p, nstripes);168}169170void operator()(const Range& r) const CV_OVERRIDE171{172int valign = FullyConnectedLayerImpl::VEC_ALIGN;173int nsamples = srcMat->rows;174int 
nw0 = weights->rows;175int k, vecsize = srcMat->cols;176int vecsize_aligned = (int)alignSize(vecsize, VEC_ALIGN);177size_t total = (size_t)nsamples*nw0;178size_t stripeSize = (total + nstripes - 1)/nstripes;179size_t stripeStart = r.start*stripeSize;180size_t stripeEnd = r.end == nstripes ? total : std::min(r.end*stripeSize, total);181size_t wstep = weights->step1();182AutoBuffer<float> srcbuf(vecsize_aligned + valign);183float* sptr = alignPtr(srcbuf.data(), (int)(valign*sizeof(float)));184185for( k = vecsize; k < vecsize_aligned; k++ )186sptr[k] = 0.f;187188for( size_t ofs = stripeStart; ofs < stripeEnd; )189{190int sampleIdx = (int)(ofs / nw0);191int delta = (int)(ofs - (size_t)sampleIdx*nw0);192const float* sptr_ = srcMat->ptr<float>(sampleIdx);193const float* wptr = weights->ptr<float>(delta);194float* dptr = dstMat->ptr<float>(sampleIdx) + delta;195const float* biasptr = biasMat->ptr<float>() + delta;196int nw = std::min(nw0 - delta, (int)(stripeEnd - ofs));197198memcpy(sptr, sptr_, vecsize*sizeof(sptr[0]));199200#if CV_TRY_AVX512_SKX201if( useAVX512 )202opt_AVX512_SKX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);203else204#endif205#if CV_TRY_AVX2206if( useAVX2 )207opt_AVX2::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);208else209#endif210#if CV_TRY_AVX211if( useAVX )212opt_AVX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);213else214#endif215{216int i = 0;217218#if CV_SIMD128219for( ; i <= nw - 4; i += 4, wptr += 4*wstep )220{221v_float32x4 vs0 = v_setall_f32(0.f), vs1 = v_setall_f32(0.f);222v_float32x4 vs2 = v_setall_f32(0.f), vs3 = v_setall_f32(0.f);223224for( k = 0; k < vecsize; k += 4 )225{226v_float32x4 v = v_load_aligned(sptr + k);227vs0 += v*v_load_aligned(wptr + k);228vs1 += v*v_load_aligned(wptr + wstep + k);229vs2 += v*v_load_aligned(wptr + wstep*2 + k);230vs3 += v*v_load_aligned(wptr + wstep*3 + k);231}232233v_float32x4 s = v_reduce_sum4(vs0, vs1, vs2, vs3);234s += v_load(biasptr + i);235v_store(dptr + i, 
s);236}237#endif238239for( ; i < nw; i++, wptr += wstep )240{241float s0=biasptr[i];242243for( k = 0; k < vecsize; k++ )244{245float v = sptr[k];246s0 += v*wptr[k];247}248dptr[i] = s0;249}250}251252if(activ)253activ->forwardSlice(dptr, dptr, 1, 1, delta, delta + nw);254255ofs += nw;256}257}258259const Mat *srcMat, *weights, *biasMat;260const ActivationLayer* activ;261Mat* dstMat;262int nstripes;263bool useAVX;264bool useAVX2;265bool useAVX512;266};267268#ifdef HAVE_OPENCL269virtual void finalize(InputArrayOfArrays, OutputArrayOfArrays) CV_OVERRIDE270{271innerProductOp.release();272umat_blobs.clear();273half_blobs.clear();274}275276bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, InputArrayOfArrays internals)277{278std::vector<UMat> inputs;279std::vector<UMat> outputs;280281bool use_half = (inps.depth() == CV_16S);282inps.getUMatVector(inputs);283outs.getUMatVector(outputs);284285int axisCan = clamp(axis, inputs[0].dims);286int numOutput = blobs[0].size[0];287int innerSize = blobs[0].size[1];288int outerSize = total(shape(inputs[0]), 0, axisCan);289bool ret = true;290291if (innerProductOp.empty())292{293size_t n = blobs.size();294umat_blobs.resize(n);295for (int i = 0; i < n; i++) blobs[i].copyTo(umat_blobs[i]);296297OCL4DNNInnerProductConfig config;298config.num_output = numOutput;299config.bias_term = bias;300config.M = outerSize;301config.K = innerSize;302config.use_half = use_half;303304if (use_half)305{306half_blobs.resize(umat_blobs.size());307for (int i = 0; i < umat_blobs.size(); i++)308{309if (!umat_blobs[i].empty())310convertFp16(umat_blobs[i], half_blobs[i]);311}312}313314innerProductOp = Ptr<OCL4DNNInnerProduct<float> >(new OCL4DNNInnerProduct<float>(config));315}316317for (size_t i = 0; i < inputs.size(); i++)318{319MatShape inshape, outshape;320inshape = shape(outerSize, innerSize);321outshape = shape(outerSize, numOutput);322323UMat srcMat, dstMat;324srcMat = inputs[i].reshape(1, inshape.size(), &inshape[0]);325dstMat = 
outputs[i].reshape(1, outshape.size(), &outshape[0]);326327if (!innerProductOp->Forward(srcMat, (use_half) ? half_blobs[0] : umat_blobs[0],328(bias) ? (use_half ? half_blobs[1] : umat_blobs[1]) : UMat(),329dstMat))330{331ret = false;332break;333}334335if (!use_half && bias && (outerSize > 1))336{337UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());338UMat& biases = umat_blobs[1];339cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0);340}341}342343if (ret) return true;344345UMat& weights = umat_blobs[0];346for (size_t i = 0; i < inputs.size(); i++)347{348MatShape inshape, outshape;349inshape = shape(outerSize, innerSize);350outshape = shape(outerSize, numOutput);351352UMat srcMat, dstMat, srcMat_fp32, dstMat_fp32;353srcMat = inputs[i].reshape(1, inshape.size(), &inshape[0]);354dstMat = outputs[i].reshape(1, outshape.size(), &outshape[0]);355356if (use_half)357{358convertFp16(srcMat, srcMat_fp32);359convertFp16(dstMat, dstMat_fp32);360}361else362{363srcMat_fp32 = srcMat;364dstMat_fp32 = dstMat;365}366367cv::gemm(srcMat_fp32, weights, 1, noArray(), 0, dstMat_fp32, GEMM_2_T);368369if (bias)370{371UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());372UMat& biases = umat_blobs[1];373cv::gemm(biasOnesMat, biases, 1, dstMat_fp32, 1, dstMat_fp32, 0);374}375if (use_half)376{377convertFp16(srcMat_fp32, srcMat);378convertFp16(dstMat_fp32, dstMat);379}380}381382return true;383}384#endif385386void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE387{388CV_TRACE_FUNCTION();389CV_TRACE_ARG_VALUE(name, "name", name.c_str());390391CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),392forward_ocl(inputs_arr, outputs_arr, internals_arr))393394if (inputs_arr.depth() == CV_16S)395{396forward_fallback(inputs_arr, outputs_arr, internals_arr);397return;398}399400std::vector<Mat> input, output;401inputs_arr.getMatVector(input);402outputs_arr.getMatVector(output);403404int axisCan = 
clamp(axis, input[0].dims);405int outerSize = input[0].total(0, axisCan);406407for (size_t i = 0; i < input.size(); i++)408{409Mat srcMat = input[i].reshape(1, outerSize);410Mat dstMat = output[i].reshape(1, outerSize);411412const int nstripes = getNumThreads();413FullyConnected::run(srcMat, weightsMat, biasMat, dstMat, activ.get(), nstripes);414}415}416417virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE418{419#ifdef HAVE_HALIDE420int inW, inH, inC, inN, outC = blobs[0].size[0];421Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);422getCanonicalSize(inputBuffer, &inW, &inH, &inC, &inN);423auto weights = wrapToHalideBuffer(blobs[0], {inW, inH, inC, outC});424425Halide::Var x("x"), y("y"), c("c"), n("n");426Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));427Halide::RDom r(0, inW, 0, inH, 0, inC);428Halide::Expr topExpr = sum(inputBuffer(r.x, r.y, r.z, n) *429weights(r.x, r.y, r.z, c));430if (bias)431{432Halide::Buffer<float> bias = wrapToHalideBuffer(blobs[1], {outC});433topExpr += bias(c);434}435top(x, y, c, n) = topExpr;436return Ptr<BackendNode>(new HalideBackendNode(top));437#endif // HAVE_HALIDE438return Ptr<BackendNode>();439}440441virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE442{443#ifdef HAVE_INF_ENGINE444InferenceEngine::LayerParams lp;445lp.name = name;446lp.type = "FullyConnected";447lp.precision = InferenceEngine::Precision::FP32;448std::shared_ptr<InferenceEngine::FullyConnectedLayer> ieLayer(new InferenceEngine::FullyConnectedLayer(lp));449450ieLayer->_out_num = blobs[0].size[0];451ieLayer->_weights = wrapToInfEngineBlob(blobs[0], {(size_t)blobs[0].size[0], (size_t)blobs[0].size[1], 1, 1}, InferenceEngine::Layout::OIHW);452if (blobs.size() > 1)453ieLayer->_biases = wrapToInfEngineBlob(blobs[1], {(size_t)ieLayer->_out_num}, InferenceEngine::Layout::C);454return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));455#endif // 
HAVE_INF_ENGINE456return Ptr<BackendNode>();457}458459virtual int64 getFLOPS(const std::vector<MatShape> &inputs,460const std::vector<MatShape> &outputs) const CV_OVERRIDE461{462CV_UNUSED(inputs); // suppress unused variable warning463long flops = 0;464465int innerSize = blobs[0].size[1];466for(int i = 0; i < outputs.size(); i++)467{468flops += CV_BIG_INT(3)*innerSize*total(outputs[i]);469}470471return flops;472473}474475bool bias;476Mat weightsMat, biasMat;477Ptr<ActivationLayer> activ;478};479480Ptr<InnerProductLayer> InnerProductLayer::create(const LayerParams& params)481{482return Ptr<InnerProductLayer>(new FullyConnectedLayerImpl(params));483}484485}486}487488489