CoCalc -- ocl4dnn.hpp

GitHub Repository: Tetragramm/opencv
Path: blob/master/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
¹⁶³⁴⁸ views
1
/*M///////////////////////////////////////////////////////////////////////////////////////
2
//
3
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4
//
5
//  By downloading, copying, installing or using the software you agree to this license.
6
//  If you do not agree to this license, do not download, install,
7
//  copy or use the software.
8
//
9
//
10
//                           License Agreement
11
//                For Open Source Computer Vision Library
12
//
13
// Copyright (C) 2017, Intel Corporation, all rights reserved.
14
// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
15
// Third party copyrights are property of their respective owners.
16
//
17
// Redistribution and use in source and binary forms, with or without modification,
18
// are permitted provided that the following conditions are met:
19
//
20
//   * Redistribution's of source code must retain the above copyright notice,
21
//     this list of conditions and the following disclaimer.
22
//
23
//   * Redistribution's in binary form must reproduce the above copyright notice,
24
//     this list of conditions and the following disclaimer in the documentation
25
//     and/or other materials provided with the distribution.
26
//
27
//   * The name of the copyright holders may not be used to endorse or promote products
28
//     derived from this software without specific prior written permission.
29
//
30
// This software is provided by the copyright holders and contributors "as is" and
31
// any express or implied warranties, including, but not limited to, the implied
32
// warranties of merchantability and fitness for a particular purpose are disclaimed.
33
// In no event shall the Intel Corporation or contributors be liable for any direct,
34
// indirect, incidental, special, exemplary, or consequential damages
35
// (including, but not limited to, procurement of substitute goods or services;
36
// loss of use, data, or profits; or business interruption) however caused
37
// and on any theory of liability, whether in contract, strict liability,
38
// or tort (including negligence or otherwise) arising in any way out of
39
// the use of this software, even if advised of the possibility of such damage.
40
//
41
//M*/
42

43
#ifndef _OPENCV_LIBDNN_HPP_
44
#define _OPENCV_LIBDNN_HPP_
45
#include <iomanip>
46
#include <map>
47
#include <memory>
48
#include <string>
49
#include <vector>
50
#include "common.hpp"
51

52
namespace cv { namespace dnn { namespace ocl4dnn {
53

54
struct OCL4DNNConvConfig
55
{
56
    OCL4DNNConvConfig() :
57
        kernel(1, 1),
58
        pad(0, 0),
59
        stride(1, 1),
60
        dilation(1, 1),
61
        group(1),
62
        bias_term(false),
63
        use_half(false)
64
    {}
65
    MatShape in_shape;
66
    MatShape out_shape;
67
    Size kernel;
68
    Size pad;
69
    Size stride;
70
    Size dilation;
71
    int group; // = 1;
72
    bool bias_term; // = false;
73
    bool use_half; // = false;
74
};
75

76
/**
 * Selector for the activation fused into the convolution kernel.
 * Passed to OCL4DNNConvSpatial::setFusionDefine() / setFusionArg().
 * The numeric values are spelled out explicitly and must stay stable.
 */
typedef enum {
    OCL4DNN_CONV_FUSED_ACTIV_NONE  = 0,  // no fused activation
    OCL4DNN_CONV_FUSED_ACTIV_RELU  = 1,
    OCL4DNN_CONV_FUSED_ACTIV_PRELU = 2,
    OCL4DNN_CONV_FUSED_ACTIV_POWER = 3,
    OCL4DNN_CONV_FUSED_ACTIV_TANH  = 4,
    OCL4DNN_CONV_FUSED_ACTIV_RELU6 = 5
} ocl4dnnFusedActiv_t;
84

85
template<typename Dtype>
86
class OCL4DNNConvSpatial
87
{
88
    public:
89
        explicit OCL4DNNConvSpatial(OCL4DNNConvConfig config);
90
        ~OCL4DNNConvSpatial();
91
        bool Forward(const UMat& bottom_data,
92
                     const UMat& bottom_data2,
93
                     const UMat& weight,
94
                     const UMat& bias,
95
                     UMat& top_data, int32_t batch_size);
96
        void setActivReLU(bool fuse_activ, float slope);
97
        void setActivPReLU(bool fuse_activ, std::vector<float> &slope);
98
        void setActivPower(bool fuse_activ, float power);
99
        void setActivTanh(bool fuse_activ);
100
        void setActivReLU6(bool fuse_activ, float min, float max);
101
        void setBias(bool bias_term);
102

103
    private:
104
        struct kernelConfig
105
        {
106
            std::string kernelName;
107
            float executionTime;
108
            size_t local_work_size[3];
109
            size_t global_work_size[3];
110
            int32_t workItem_output[3];
111
            bool verified;
112
            bool tested;
113
            bool swizzle_weights;
114
            bool use_null_local;
115
            int32_t kernelType;
116

117
            kernelConfig()
118
            {}
119

120
            kernelConfig(const std::string& name, const size_t* global_size, const size_t* local_size,
121
                         const int32_t* workItem,
122
                         bool swizzle,
123
                         int32_t type = 0)
124
                : executionTime(0)
125
            {
126
                kernelName = name;
127
                for (int32_t x = 0; x < 3; x++)
128
                {
129
                    local_work_size[x] = local_size ? local_size[x] : 1;
130
                    global_work_size[x] = global_size[x];
131
                    workItem_output[x] = workItem[x];
132
                }
133
                swizzle_weights = swizzle;
134
                use_null_local = local_size == NULL;
135
                verified = false;
136
                tested = false;
137
                kernelType = type;
138
            }
139
        };
140

141
        struct tunerParam
142
        {
143
           int kernelType;
144
           int blockWidth;
145
           int blockHeight;
146
           int blockDepth;
147

148
           tunerParam(int type, int w, int h, int d)
149
           {
150
               kernelType = type;
151
               blockWidth = w;
152
               blockHeight= h;
153
               blockDepth = d;
154
           }
155
        };
156

157
        inline void addDef(const char* name)
158
        {
159
            options_ << " -D " << name;
160
        }
161

162
        inline void addDef(const char* name, const int value)
163
        {
164
            options_ << " -D " << name << "=" << value;
165
        }
166

167
        inline void addDef(const char* name, const float value)
168
        {
169
            options_ << " -D " << name << "=(float)" << value;
170
        }
171

172
        inline void addDef(const char* name, const double value)
173
        {
174
            options_ << " -D " << name << "=(double)" << value;
175
        }
176

177
        inline void addDef(const char* name, const char* value)
178
        {
179
            options_ << " -D " << name << "=" << value;
180
        }
181

182
        void useFirstAvailable(const UMat &bottom,
183
                               UMat &top,
184
                               const UMat &weight,
185
                               const UMat &bias,
186
                               int32_t numImages,
187
                               UMat &verifyTop);
188
        void setupKernel();
189
        void collectCommonInformation();
190
        void setupKernelDetails(int32_t kernelType,
191
                                int32_t blockM,
192
                                int32_t blockK,
193
                                int32_t blockN);
194

195
        ocl::Program compileKernel();
196
        typedef std::map<std::string, ocl::Program> phash_t;
197
        phash_t phash;
198
        void calculateBenchmark(const UMat &bottom, UMat &verifyTop,
199
                                const UMat &weight, const UMat &bias,
200
                                int32_t numImages);
201

202

203
        void setupConvolution(const UMat &bottom,
204
                              UMat &top,
205
                              const UMat &weight,
206
                              const UMat &bias,
207
                              int32_t numImags,
208
                              UMat &verifyTop);
209
        bool createConvolutionKernel(int32_t kernelType,
210
                                     int32_t blockWidth,
211
                                     int32_t blockHeight,
212
                                     int32_t blockDepth);
213
        bool createIDLFKernel(int32_t blockWidth,
214
                              int32_t blockHeight,
215
                              int32_t blockDepth);
216
        bool createBasicKernel(int32_t blockWidth,
217
                               int32_t blockHeight,
218
                               int32_t blockDepth);
219
        bool createGEMMLikeConvKernel(int32_t blockWidth,
220
                                      int32_t blockHeight,
221
                                      int32_t blockDepth);
222
        bool createDWConvKernel(int32_t blockWidth,
223
                                int32_t blockHeight,
224
                                int32_t blockDepth);
225
        void CreateSubBuffer(const UMat& buffer, UMat& sub_buffer,
226
                             int32_t offset, int32_t size, bool write_only);
227
        bool convolve(const UMat &bottom, UMat &top,
228
                      const UMat &weight, const UMat &bias,
229
                      int32_t numImages,
230
                      kernelConfig* config);
231
        float timedConvolve(const UMat &bottom, UMat &top,
232
                            const UMat &weight, const UMat &bias,
233
                            int32_t numImages, kernelConfig* config);
234

235
        bool verifyResult(const UMat &bottom,
236
                          UMat &top,
237
                          const UMat &weight,
238
                          const UMat &bias,
239
                          int32_t numImages,
240
                          kernelConfig* config,
241
                          UMat &verifyTop);
242

243
        bool swizzleWeight(const UMat &weight,
244
                           int32_t swizzled_factor,
245
                           bool interleave = false);
246

247
        void generateKey();
248
        std::string generateSpecificKey(int32_t type, int32_t blockWidth,
249
                                          int32_t blockHeight,
250
                                          int32_t blockDepth);
251
        void cacheTunedConfig();
252
        bool loadTunedConfig();
253

254
        void saveTunedConfig();
255
        bool loadCachedConfig();
256

257
        void unloadProgram(const std::string& kernelName);
258
        void prepareKernel(const UMat &bottom, UMat &top,
259
                           const UMat &weight, const UMat &bias,
260
                           int32_t numImages);
261
        bool setupKernelByConfig(int x, int y, int z, int type,
262
                                 int lx, int ly, int lz,
263
                                 bool swizzle, bool nullLocal);
264
        void generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems);
265
        void generate_dwconv_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
266
                                        int blockM, int blockK, int blockN);
267
        void generate_gemmlike_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
268
                                          int blockM, int blockK, int blockN);
269
        void generate_idlf_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
270
                                      int blockM, int blockK, int simd_size);
271
        void setFusionDefine(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise);
272
        void setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx);
273

274
        int32_t group_;
275
        bool bias_term_;
276
        UMat swizzled_weights_umat;
277
        UMat weights_half;
278
        UMat bias_half;
279
        UMat bottom_data2_;
280

281
        int32_t bottom_index_;
282
        int32_t output_h_;
283
        int32_t output_w_;
284
        int32_t kernel_h_;
285
        int32_t kernel_w_;
286
        int32_t height_;
287
        int32_t width_;
288
        int32_t pad_h_;
289
        int32_t pad_w_;
290
        int32_t pad_bottom_;
291
        int32_t pad_right_;
292
        int32_t stride_h_;
293
        int32_t stride_w_;
294
        int32_t dilation_h_;
295
        int32_t dilation_w_;
296

297
        /// M_ is the channel dimension of the output for a single group, which is the
298
        /// leading dimension of the filter matrix.
299
        int32_t M_;
300

301
        bool tuned_;
302
        bool dwconv_;
303

304
        std::string key_, key_sanitized_;
305
        std::string short_key_;
306
        std::string kernel_name_;
307
        std::string cache_path_;
308
        bool use_cache_path_; // true if cache_path_ directory exists
309
        bool run_auto_tuning_;
310
        bool force_auto_tuning_;
311
        int32_t kernel_index_;
312
        std::vector< cv::Ptr<kernelConfig> > kernelQueue;
313
        cv::Ptr<kernelConfig> bestKernelConfig;
314

315
        int32_t bottom_dim_;
316
        int32_t top_dim_;
317
        int32_t num_;
318
        int32_t channels_;
319
        int32_t num_output_;
320

321
        int32_t kernelType_;
322
        int32_t blockM_;
323
        int32_t blockK_;
324
        int32_t blockN_;
325
        std::stringstream options_;
326
        cv::ocl::ProgramSource src_;
327
        int32_t prev_kernel_type_;
328
        float negative_slope_;
329
        float min_value_;
330
        float max_value_;
331
        UMat negative_slope_umat_;
332
        ocl4dnnFusedActiv_t fused_activ_;
333
        float power_;
334
        bool fused_eltwise_;
335
        bool use_half_;
336
};
337

338
/**
 * Pooling method selector stored in OCL4DNNPoolConfig::pool_method.
 * The numeric values are spelled out explicitly and must stay stable.
 */
typedef enum {
    LIBDNN_POOLING_METHOD_MAX = 0,  // default used by OCL4DNNPoolConfig
    LIBDNN_POOLING_METHOD_AVE = 1,
    LIBDNN_POOLING_METHOD_STO = 2
} ocl4dnnPoolingMethod_t;
343

344
struct OCL4DNNPoolConfig
345
{
346
    OCL4DNNPoolConfig() :
347
        kernel(1, 1),
348
        pad_l(0), pad_t(0), pad_r(0), pad_b(0),
349
        stride(1, 1),
350
        dilation(1, 1),
351
        channels(0),
352
        pool_method(LIBDNN_POOLING_METHOD_MAX),
353
        global_pooling(false),
354
        avePoolPaddedArea(true),
355
        computeMaxIdx(true),
356
        use_half(false)
357
    {}
358
    MatShape in_shape;
359
    MatShape out_shape;
360
    Size kernel;
361
    int pad_l, pad_t, pad_r, pad_b;
362
    Size stride;
363
    Size dilation;
364

365
    int channels;
366
    ocl4dnnPoolingMethod_t pool_method; // = LIBDNN_POOLING_METHOD_MAX;
367
    bool global_pooling; // = false;
368
    bool avePoolPaddedArea;
369
    bool computeMaxIdx;
370
    bool use_half;
371
};
372

373
template<typename Dtype>
374
class OCL4DNNPool
375
{
376
    public:
377
        explicit OCL4DNNPool(OCL4DNNPoolConfig config);
378
        ~OCL4DNNPool();
379
        bool Forward(const UMat& bottom_data,
380
                     UMat& top_data,
381
                     UMat& top_mask);
382
    private:
383
        // Pooling parameters
384
        std::vector<int32_t> stride_;
385
        std::vector<int32_t> kernel_shape_;
386
        std::vector<int32_t> im_in_shape_;
387
        std::vector<int32_t> im_out_shape_;
388

389
        ocl4dnnPoolingMethod_t pool_method_;
390
        int32_t count_;
391
        int32_t channels_;
392
        int32_t kernel_h_;
393
        int32_t kernel_w_;
394
        int32_t stride_h_;
395
        int32_t stride_w_;
396
        int32_t pad_t_;
397
        int32_t pad_l_;
398
        int32_t pad_b_;
399
        int32_t pad_r_;
400
        int32_t height_;
401
        int32_t width_;
402
        int32_t pooled_height_;
403
        int32_t pooled_width_;
404
        bool avePoolPaddedArea;
405
        bool computeMaxIdx;
406
        bool use_half;
407
};
408

409
/**
 * Parameter bundle for the inner-product primitive; OCL4DNNInnerProduct's
 * constructor takes one of these by value.
 *
 * A default-constructed instance has all sizes set to 0, phase_test set to
 * true and every other flag set to false.
 */
struct OCL4DNNInnerProductConfig
{
    OCL4DNNInnerProductConfig()
        : num_output(0)
        , M(0)
        , K(0)
        , bias_term(false)
        , transpose(false)
        , phase_test(true)
        , use_half(false)
    {
    }

    int num_output;   // defaults to 0
    int M;            // defaults to 0
    int K;            // defaults to 0
    bool bias_term;   // defaults to false
    bool transpose;   // defaults to false
    bool phase_test;  // defaults to true
    bool use_half;    // defaults to false
};
423

424
template<typename Dtype>
425
class OCL4DNNInnerProduct
426
{
427
    public:
428
        explicit OCL4DNNInnerProduct(OCL4DNNInnerProductConfig config);
429
        ~OCL4DNNInnerProduct();
430
        bool Forward(const UMat& bottom_data,
431
                     const UMat& weight,
432
                     const UMat& bias,
433
                     UMat& top_data);
434
    private:
435
        OCL4DNNInnerProductConfig config_;
436
        int32_t axis_;
437
        int32_t num_output_;
438
        int32_t M_;
439
        int32_t N_;
440
        int32_t K_;
441
        bool bias_term_;
442
        bool transpose_;
443
        bool image_copied_;
444
        bool phase_test_;
445
        bool use_half_;
446
};
447

448
/**
 * Normalisation-region selector stored in OCL4DNNLRNConfig::lrn_type.
 * The numeric values are spelled out explicitly and must stay stable.
 */
typedef enum {
    LRNParameter_NormRegion_ACROSS_CHANNELS = 0,  // default used by OCL4DNNLRNConfig
    LRNParameter_NormRegion_WITHIN_CHANNEL  = 1
} LRNParameter_NormRegion_WITHIN_CHANNEL_t;
452

453
struct OCL4DNNLRNConfig
454
{
455
    OCL4DNNLRNConfig() :
456
        lrn_type(LRNParameter_NormRegion_ACROSS_CHANNELS),
457
        phase_test(true),
458
        local_size(0), alpha(0.f), beta(0.f), k(0.f), norm_by_size(false),
459
        batch_size(0), channels(0), height(0), width(0), use_half(false)
460
    {}
461
    MatShape in_shape;
462
    LRNParameter_NormRegion_WITHIN_CHANNEL_t lrn_type;
463
    bool phase_test; // = true;
464
    int local_size;
465
    float alpha;
466
    float beta;
467
    float k;
468
    bool norm_by_size;
469
    int32_t batch_size;
470
    int32_t channels;
471
    int32_t height;
472
    int32_t width;
473
    bool use_half;
474
};
475

476
template<typename Dtype>
477
class OCL4DNNLRN
478
{
479
    public:
480
        explicit OCL4DNNLRN(OCL4DNNLRNConfig config);
481
        bool Forward(const UMat& bottom_data, UMat& top_data);
482

483
    private:
484
        bool crossChannelForward(const UMat& bottom_data, UMat& top_data);
485
        LRNParameter_NormRegion_WITHIN_CHANNEL_t lrn_type_;
486
        bool phase_test_;
487
        int32_t size_;
488
        Dtype alpha_;
489
        Dtype beta_;
490
        Dtype k_;
491
        int32_t num_;
492
        int32_t channels_;
493
        int32_t height_;
494
        int32_t width_;
495
        bool norm_by_size_;
496
        bool use_half_;
497
};
498

499
struct OCL4DNNSoftmaxConfig
500
{
501
    OCL4DNNSoftmaxConfig() : axis(0), channels(0), logsoftmax(false), use_half(false)
502
    {}
503
    MatShape in_shape;
504
    int axis;
505
    int channels;
506
    bool logsoftmax;
507
    bool use_half;
508
};
509

510
template<typename Dtype>
511
class OCL4DNNSoftmax
512
{
513
    public:
514
        explicit OCL4DNNSoftmax(OCL4DNNSoftmaxConfig config);
515
        ~OCL4DNNSoftmax();
516
        bool Forward(const UMat& bottom_data, UMat& top_data);
517

518
    private:
519
        int32_t softmax_axis_;
520
        int32_t inner_num_;
521
        int32_t outer_num_;
522
        int32_t channels_;
523
        int32_t count_;
524
        bool use_slm_;
525
        bool log_softmax_;
526
        UMat scale_data_;
527
        bool use_half_;
528
};
529

530
}}} // namespace cv::dnn::ocl4dnn
531

532
#endif
533

534
Product

Resources

Company