// Path: blob/master/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
// 16348 views
/*M///////////////////////////////////////////////////////////////////////////////////////1//2// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.3//4// By downloading, copying, installing or using the software you agree to this license.5// If you do not agree to this license, do not download, install,6// copy or use the software.7//8//9// License Agreement10// For Open Source Computer Vision Library11//12// Copyright (C) 2017, Intel Corporation, all rights reserved.13// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.14// Third party copyrights are property of their respective owners.15//16// Redistribution and use in source and binary forms, with or without modification,17// are permitted provided that the following conditions are met:18//19// * Redistribution's of source code must retain the above copyright notice,20// this list of conditions and the following disclaimer.21//22// * Redistribution's in binary form must reproduce the above copyright notice,23// this list of conditions and the following disclaimer in the documentation24// and/or other materials provided with the distribution.25//26// * The name of the copyright holders may not be used to endorse or promote products27// derived from this software without specific prior written permission.28//29// This software is provided by the copyright holders and contributors "as is" and30// any express or implied warranties, including, but not limited to, the implied31// warranties of merchantability and fitness for a particular purpose are disclaimed.32// In no event shall the Intel Corporation or contributors be liable for any direct,33// indirect, incidental, special, exemplary, or consequential damages34// (including, but not limited to, procurement of substitute goods or services;35// loss of use, data, or profits; or business interruption) however caused36// and on any theory of liability, whether in contract, strict liability,37// or tort (including negligence or 
otherwise) arising in any way out of38// the use of this software, even if advised of the possibility of such damage.39//40//M*/4142#ifndef _OPENCV_LIBDNN_HPP_43#define _OPENCV_LIBDNN_HPP_44#include <iomanip>45#include <map>46#include <memory>47#include <string>48#include <vector>49#include "common.hpp"5051namespace cv { namespace dnn { namespace ocl4dnn {5253struct OCL4DNNConvConfig54{55OCL4DNNConvConfig() :56kernel(1, 1),57pad(0, 0),58stride(1, 1),59dilation(1, 1),60group(1),61bias_term(false),62use_half(false)63{}64MatShape in_shape;65MatShape out_shape;66Size kernel;67Size pad;68Size stride;69Size dilation;70int group; // = 1;71bool bias_term; // = false;72bool use_half; // = false;73};7475typedef enum {76OCL4DNN_CONV_FUSED_ACTIV_NONE = 0,77OCL4DNN_CONV_FUSED_ACTIV_RELU = 1,78OCL4DNN_CONV_FUSED_ACTIV_PRELU = 2,79OCL4DNN_CONV_FUSED_ACTIV_POWER = 3,80OCL4DNN_CONV_FUSED_ACTIV_TANH = 4,81OCL4DNN_CONV_FUSED_ACTIV_RELU6 = 582} ocl4dnnFusedActiv_t;8384template<typename Dtype>85class OCL4DNNConvSpatial86{87public:88explicit OCL4DNNConvSpatial(OCL4DNNConvConfig config);89~OCL4DNNConvSpatial();90bool Forward(const UMat& bottom_data,91const UMat& bottom_data2,92const UMat& weight,93const UMat& bias,94UMat& top_data, int32_t batch_size);95void setActivReLU(bool fuse_activ, float slope);96void setActivPReLU(bool fuse_activ, std::vector<float> &slope);97void setActivPower(bool fuse_activ, float power);98void setActivTanh(bool fuse_activ);99void setActivReLU6(bool fuse_activ, float min, float max);100void setBias(bool bias_term);101102private:103struct kernelConfig104{105std::string kernelName;106float executionTime;107size_t local_work_size[3];108size_t global_work_size[3];109int32_t workItem_output[3];110bool verified;111bool tested;112bool swizzle_weights;113bool use_null_local;114int32_t kernelType;115116kernelConfig()117{}118119kernelConfig(const std::string& name, const size_t* global_size, const size_t* local_size,120const int32_t* workItem,121bool swizzle,122int32_t 
type = 0)123: executionTime(0)124{125kernelName = name;126for (int32_t x = 0; x < 3; x++)127{128local_work_size[x] = local_size ? local_size[x] : 1;129global_work_size[x] = global_size[x];130workItem_output[x] = workItem[x];131}132swizzle_weights = swizzle;133use_null_local = local_size == NULL;134verified = false;135tested = false;136kernelType = type;137}138};139140struct tunerParam141{142int kernelType;143int blockWidth;144int blockHeight;145int blockDepth;146147tunerParam(int type, int w, int h, int d)148{149kernelType = type;150blockWidth = w;151blockHeight= h;152blockDepth = d;153}154};155156inline void addDef(const char* name)157{158options_ << " -D " << name;159}160161inline void addDef(const char* name, const int value)162{163options_ << " -D " << name << "=" << value;164}165166inline void addDef(const char* name, const float value)167{168options_ << " -D " << name << "=(float)" << value;169}170171inline void addDef(const char* name, const double value)172{173options_ << " -D " << name << "=(double)" << value;174}175176inline void addDef(const char* name, const char* value)177{178options_ << " -D " << name << "=" << value;179}180181void useFirstAvailable(const UMat &bottom,182UMat &top,183const UMat &weight,184const UMat &bias,185int32_t numImages,186UMat &verifyTop);187void setupKernel();188void collectCommonInformation();189void setupKernelDetails(int32_t kernelType,190int32_t blockM,191int32_t blockK,192int32_t blockN);193194ocl::Program compileKernel();195typedef std::map<std::string, ocl::Program> phash_t;196phash_t phash;197void calculateBenchmark(const UMat &bottom, UMat &verifyTop,198const UMat &weight, const UMat &bias,199int32_t numImages);200201202void setupConvolution(const UMat &bottom,203UMat &top,204const UMat &weight,205const UMat &bias,206int32_t numImags,207UMat &verifyTop);208bool createConvolutionKernel(int32_t kernelType,209int32_t blockWidth,210int32_t blockHeight,211int32_t blockDepth);212bool createIDLFKernel(int32_t 
blockWidth,213int32_t blockHeight,214int32_t blockDepth);215bool createBasicKernel(int32_t blockWidth,216int32_t blockHeight,217int32_t blockDepth);218bool createGEMMLikeConvKernel(int32_t blockWidth,219int32_t blockHeight,220int32_t blockDepth);221bool createDWConvKernel(int32_t blockWidth,222int32_t blockHeight,223int32_t blockDepth);224void CreateSubBuffer(const UMat& buffer, UMat& sub_buffer,225int32_t offset, int32_t size, bool write_only);226bool convolve(const UMat &bottom, UMat &top,227const UMat &weight, const UMat &bias,228int32_t numImages,229kernelConfig* config);230float timedConvolve(const UMat &bottom, UMat &top,231const UMat &weight, const UMat &bias,232int32_t numImages, kernelConfig* config);233234bool verifyResult(const UMat &bottom,235UMat &top,236const UMat &weight,237const UMat &bias,238int32_t numImages,239kernelConfig* config,240UMat &verifyTop);241242bool swizzleWeight(const UMat &weight,243int32_t swizzled_factor,244bool interleave = false);245246void generateKey();247std::string generateSpecificKey(int32_t type, int32_t blockWidth,248int32_t blockHeight,249int32_t blockDepth);250void cacheTunedConfig();251bool loadTunedConfig();252253void saveTunedConfig();254bool loadCachedConfig();255256void unloadProgram(const std::string& kernelName);257void prepareKernel(const UMat &bottom, UMat &top,258const UMat &weight, const UMat &bias,259int32_t numImages);260bool setupKernelByConfig(int x, int y, int z, int type,261int lx, int ly, int lz,262bool swizzle, bool nullLocal);263void generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems);264void generate_dwconv_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,265int blockM, int blockK, int blockN);266void generate_gemmlike_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,267int blockM, int blockK, int blockN);268void generate_idlf_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,269int blockM, int blockK, int simd_size);270void 
setFusionDefine(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise);271void setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx);272273int32_t group_;274bool bias_term_;275UMat swizzled_weights_umat;276UMat weights_half;277UMat bias_half;278UMat bottom_data2_;279280int32_t bottom_index_;281int32_t output_h_;282int32_t output_w_;283int32_t kernel_h_;284int32_t kernel_w_;285int32_t height_;286int32_t width_;287int32_t pad_h_;288int32_t pad_w_;289int32_t pad_bottom_;290int32_t pad_right_;291int32_t stride_h_;292int32_t stride_w_;293int32_t dilation_h_;294int32_t dilation_w_;295296/// M_ is the channel dimension of the output for a single group, which is the297/// leading dimension of the filter matrix.298int32_t M_;299300bool tuned_;301bool dwconv_;302303std::string key_, key_sanitized_;304std::string short_key_;305std::string kernel_name_;306std::string cache_path_;307bool use_cache_path_; // true if cache_path_ directory exists308bool run_auto_tuning_;309bool force_auto_tuning_;310int32_t kernel_index_;311std::vector< cv::Ptr<kernelConfig> > kernelQueue;312cv::Ptr<kernelConfig> bestKernelConfig;313314int32_t bottom_dim_;315int32_t top_dim_;316int32_t num_;317int32_t channels_;318int32_t num_output_;319320int32_t kernelType_;321int32_t blockM_;322int32_t blockK_;323int32_t blockN_;324std::stringstream options_;325cv::ocl::ProgramSource src_;326int32_t prev_kernel_type_;327float negative_slope_;328float min_value_;329float max_value_;330UMat negative_slope_umat_;331ocl4dnnFusedActiv_t fused_activ_;332float power_;333bool fused_eltwise_;334bool use_half_;335};336337typedef enum {338LIBDNN_POOLING_METHOD_MAX = 0,339LIBDNN_POOLING_METHOD_AVE = 1,340LIBDNN_POOLING_METHOD_STO = 2341} ocl4dnnPoolingMethod_t;342343struct OCL4DNNPoolConfig344{345OCL4DNNPoolConfig() :346kernel(1, 1),347pad_l(0), pad_t(0), pad_r(0), pad_b(0),348stride(1, 1),349dilation(1, 
1),350channels(0),351pool_method(LIBDNN_POOLING_METHOD_MAX),352global_pooling(false),353avePoolPaddedArea(true),354computeMaxIdx(true),355use_half(false)356{}357MatShape in_shape;358MatShape out_shape;359Size kernel;360int pad_l, pad_t, pad_r, pad_b;361Size stride;362Size dilation;363364int channels;365ocl4dnnPoolingMethod_t pool_method; // = LIBDNN_POOLING_METHOD_MAX;366bool global_pooling; // = false;367bool avePoolPaddedArea;368bool computeMaxIdx;369bool use_half;370};371372template<typename Dtype>373class OCL4DNNPool374{375public:376explicit OCL4DNNPool(OCL4DNNPoolConfig config);377~OCL4DNNPool();378bool Forward(const UMat& bottom_data,379UMat& top_data,380UMat& top_mask);381private:382// Pooling parameters383std::vector<int32_t> stride_;384std::vector<int32_t> kernel_shape_;385std::vector<int32_t> im_in_shape_;386std::vector<int32_t> im_out_shape_;387388ocl4dnnPoolingMethod_t pool_method_;389int32_t count_;390int32_t channels_;391int32_t kernel_h_;392int32_t kernel_w_;393int32_t stride_h_;394int32_t stride_w_;395int32_t pad_t_;396int32_t pad_l_;397int32_t pad_b_;398int32_t pad_r_;399int32_t height_;400int32_t width_;401int32_t pooled_height_;402int32_t pooled_width_;403bool avePoolPaddedArea;404bool computeMaxIdx;405bool use_half;406};407408struct OCL4DNNInnerProductConfig409{410OCL4DNNInnerProductConfig() :411num_output(0), M(0), K(0),412bias_term(false), transpose(false), phase_test(true), use_half(false)413{}414int num_output;415int M;416int K;417bool bias_term;418bool transpose; // = false;419bool phase_test; // = true;420bool use_half; // = false;421};422423template<typename Dtype>424class OCL4DNNInnerProduct425{426public:427explicit OCL4DNNInnerProduct(OCL4DNNInnerProductConfig config);428~OCL4DNNInnerProduct();429bool Forward(const UMat& bottom_data,430const UMat& weight,431const UMat& bias,432UMat& top_data);433private:434OCL4DNNInnerProductConfig config_;435int32_t axis_;436int32_t num_output_;437int32_t M_;438int32_t N_;439int32_t K_;440bool 
bias_term_;441bool transpose_;442bool image_copied_;443bool phase_test_;444bool use_half_;445};446447typedef enum {448LRNParameter_NormRegion_ACROSS_CHANNELS = 0,449LRNParameter_NormRegion_WITHIN_CHANNEL = 1450} LRNParameter_NormRegion_WITHIN_CHANNEL_t;451452struct OCL4DNNLRNConfig453{454OCL4DNNLRNConfig() :455lrn_type(LRNParameter_NormRegion_ACROSS_CHANNELS),456phase_test(true),457local_size(0), alpha(0.f), beta(0.f), k(0.f), norm_by_size(false),458batch_size(0), channels(0), height(0), width(0), use_half(false)459{}460MatShape in_shape;461LRNParameter_NormRegion_WITHIN_CHANNEL_t lrn_type;462bool phase_test; // = true;463int local_size;464float alpha;465float beta;466float k;467bool norm_by_size;468int32_t batch_size;469int32_t channels;470int32_t height;471int32_t width;472bool use_half;473};474475template<typename Dtype>476class OCL4DNNLRN477{478public:479explicit OCL4DNNLRN(OCL4DNNLRNConfig config);480bool Forward(const UMat& bottom_data, UMat& top_data);481482private:483bool crossChannelForward(const UMat& bottom_data, UMat& top_data);484LRNParameter_NormRegion_WITHIN_CHANNEL_t lrn_type_;485bool phase_test_;486int32_t size_;487Dtype alpha_;488Dtype beta_;489Dtype k_;490int32_t num_;491int32_t channels_;492int32_t height_;493int32_t width_;494bool norm_by_size_;495bool use_half_;496};497498struct OCL4DNNSoftmaxConfig499{500OCL4DNNSoftmaxConfig() : axis(0), channels(0), logsoftmax(false), use_half(false)501{}502MatShape in_shape;503int axis;504int channels;505bool logsoftmax;506bool use_half;507};508509template<typename Dtype>510class OCL4DNNSoftmax511{512public:513explicit OCL4DNNSoftmax(OCL4DNNSoftmaxConfig config);514~OCL4DNNSoftmax();515bool Forward(const UMat& bottom_data, UMat& top_data);516517private:518int32_t softmax_axis_;519int32_t inner_num_;520int32_t outer_num_;521int32_t channels_;522int32_t count_;523bool use_slm_;524bool log_softmax_;525UMat scale_data_;526bool use_half_;527};528529}}} // namespace cv::dnn::ocl4dnn530531#endif532533534