CoCalc -- conv_layer

GitHub Repository: Tetragramm/opencv
Path: blob/master/modules/dnn/src/opencl/conv_layer_spatial.cl
¹⁶³³⁷ views
1
/*M///////////////////////////////////////////////////////////////////////////////////////
2
//
3
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4
//
5
//  By downloading, copying, installing or using the software you agree to this license.
6
//  If you do not agree to this license, do not download, install,
7
//  copy or use the software.
8
//
9
//
10
//                           License Agreement
11
//                For Open Source Computer Vision Library
12
//
13
// Copyright (C) 2017, Intel Corporation, all rights reserved.
14
// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
15
// Third party copyrights are property of their respective owners.
16
//
17
// Redistribution and use in source and binary forms, with or without modification,
18
// are permitted provided that the following conditions are met:
19
//
20
//   * Redistribution's of source code must retain the above copyright notice,
21
//     this list of conditions and the following disclaimer.
22
//
23
//   * Redistribution's in binary form must reproduce the above copyright notice,
24
//     this list of conditions and the following disclaimer in the documentation
25
//     and/or other materials provided with the distribution.
26
//
27
//   * The name of the copyright holders may not be used to endorse or promote products
28
//     derived from this software without specific prior written permission.
29
//
30
// This software is provided by the copyright holders and contributors "as is" and
31
// any express or implied warranties, including, but not limited to, the implied
32
// warranties of merchantability and fitness for a particular purpose are disclaimed.
33
// In no event shall the Intel Corporation or contributors be liable for any direct,
34
// indirect, incidental, special, exemplary, or consequential damages
35
// (including, but not limited to, procurement of substitute goods or services;
36
// loss of use, data, or profits; or business interruption) however caused
37
// and on any theory of liability, whether in contract, strict liability,
38
// or tort (including negligence or otherwise) arising in any way out of
39
// the use of this software, even if advised of the possibility of such damage.
40
//
41
//M*/
42

43
#if defined(cl_khr_fp16)
44
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
45
#endif
46

47
#define KERNEL_ARG_DTYPE float
48
#define TYPE_FLOAT  1
49
#define TYPE_HALF   2
50

51
#if defined(FUSED_CONV_RELU)
52
#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (negative_slope)))
53
#define FUSED_ARG KERNEL_ARG_DTYPE negative_slope,
54
#elif defined(FUSED_CONV_PRELU)
55
#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (negative_slope[c])))
56
#define FUSED_ARG __global const KERNEL_ARG_DTYPE* negative_slope,
57
#elif defined(FUSED_CONV_POWER)
58
#define ACTIVATION_RELU_FUNCTION(x, c) pow(x, (Dtype)power)
59
#define FUSED_ARG KERNEL_ARG_DTYPE power,
60
#elif defined(FUSED_CONV_TANH)
61
#define ACTIVATION_RELU_FUNCTION(x, c) tanh(x)
62
#define FUSED_ARG
63
#elif defined(FUSED_CONV_RELU6)
64
#define ACTIVATION_RELU_FUNCTION(x, c) (clamp((Dtype)(x), (Dtype)min_value, (Dtype)max_value))
65
#define FUSED_ARG KERNEL_ARG_DTYPE min_value, KERNEL_ARG_DTYPE max_value,
66
#else
67
#define ACTIVATION_RELU_FUNCTION(x, c) (x)
68
#define FUSED_ARG
69
#endif
70

71
#ifdef FUSED_CONV_ELTWISE
72
#define ACTIVATION_FUNCTION(_dst_, _offset_, _data_, _channel_) do { \
73
    const Dtype _x_ = eltwise_data[(_offset_)] + (_data_); \
74
    (_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \
75
} while(0)
76
#define ELTWISE_DATA_ARG __global Dtype* eltwise_data,
77
#else
78
#define ACTIVATION_FUNCTION(_dst_, _offset_, _data_, _channel_) do { \
79
    const Dtype _x_ = (_data_); \
80
    (_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \
81
} while(0)
82
#define ELTWISE_DATA_ARG
83
#endif
84

85
#if APPLY_BIAS
86
#define BIAS_KERNEL_ARG __global Dtype * biases_base,
87
#else
88
#define BIAS_KERNEL_ARG
89
#endif
90

91
#define __CAT(x, y) x##y
92
#define CAT(x, y) __CAT(x, y)
93
#define LOOP0(VAR, STMT)
94
#define LOOP1(VAR, STMT) (STMT); (VAR)++;
95
#define LOOP2(VAR, STMT) LOOP1(VAR, STMT); (STMT); (VAR)++;
96
#define LOOP3(VAR, STMT) LOOP2(VAR, STMT); (STMT); (VAR)++;
97
#define LOOP4(VAR, STMT) LOOP3(VAR, STMT); (STMT); (VAR)++;
98
#define LOOP5(VAR, STMT) LOOP4(VAR, STMT); (STMT); (VAR)++;
99
#define LOOP6(VAR, STMT) LOOP5(VAR, STMT); (STMT); (VAR)++;
100
#define LOOP7(VAR, STMT) LOOP6(VAR, STMT); (STMT); (VAR)++;
101
#define LOOP8(VAR, STMT) LOOP7(VAR, STMT); (STMT); (VAR)++;
102
#define LOOP9(VAR, STMT) LOOP8(VAR, STMT); (STMT); (VAR)++;
103
#define LOOP10(VAR, STMT) LOOP9(VAR, STMT); (STMT); (VAR)++;
104
#define LOOP11(VAR, STMT) LOOP10(VAR, STMT); (STMT); (VAR)++;
105
#define LOOP12(VAR, STMT) LOOP11(VAR, STMT); (STMT); (VAR)++;
106
#define LOOP13(VAR, STMT) LOOP12(VAR, STMT); (STMT); (VAR)++;
107
#define LOOP14(VAR, STMT) LOOP13(VAR, STMT); (STMT); (VAR)++;
108
#define LOOP15(VAR, STMT) LOOP14(VAR, STMT); (STMT); (VAR)++;
109
#define LOOP16(VAR, STMT) LOOP15(VAR, STMT); (STMT); (VAR)++;
110
#define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT))
111

112
#if defined(convolve_simd) || defined(Conv_Interleaved)
113
#if TYPE == TYPE_HALF
114
#define INT_TYPE ushort
115
#define INT_TYPE2 ushort2
116
#define INT_TYPE4 ushort4
117
#define INT_TYPE8 ushort8
118
#define SUB_GROUP_BLOCK_READ2 intel_sub_group_block_read_us2
119
#define SUB_GROUP_BLOCK_READ4 intel_sub_group_block_read_us4
120
#define SUB_GROUP_BLOCK_READ8 intel_sub_group_block_read_us8
121
#define SUB_GROUP_BLOCK_READ intel_sub_group_block_read_us
122
#else
123
#define INT_TYPE uint
124
#define INT_TYPE2 uint2
125
#define INT_TYPE4 uint4
126
#define INT_TYPE8 uint8
127
#define SUB_GROUP_BLOCK_READ2 intel_sub_group_block_read2
128
#define SUB_GROUP_BLOCK_READ4 intel_sub_group_block_read4
129
#define SUB_GROUP_BLOCK_READ8 intel_sub_group_block_read8
130
#define SUB_GROUP_BLOCK_READ intel_sub_group_block_read
131
#endif
132
#endif
133

134
#ifdef KERNEL_BASIC
135

136
__kernel void ConvolveBasic(
137
    ELTWISE_DATA_ARG
138
    FUSED_ARG
139
    __global Dtype* image_data,
140
    int image_offset,
141
    __global Dtype* kernel_data,
142
    int kernel_offset,
143
    __global Dtype* bias,
144
    const int bias_offset,
145
    __global Dtype* convolved_image_base,
146
    const int convolved_image_base_offset,
147
    const int convolved_image_offset,
148
    const ushort input_width,
149
    const ushort input_height,
150
    const ushort output_width,
151
    const ushort output_height,
152
    const ushort pad_w,
153
    const ushort pad_h
154
)
155
{
156
    __global Dtype* convolved_image = convolved_image_base + convolved_image_base_offset;
157
    const int outputX = get_global_id(0);
158
    const int outputY = get_global_id(1);
159
    const int kernelNum = get_global_id(2) * ZPAR;
160
    if (outputX < output_width && outputY < output_height)
161
    {
162
        Dtype sum[ZPAR];
163
        for (int kern = 0; kern < ZPAR; kern++)
164
        {
165
            sum[kern] = 0.0f;
166
        }
167
        const int org_y = outputY * STRIDE_Y - pad_h;
168
        const int org_x = outputX * STRIDE_X - pad_w;
169
        const int currentKernelOffset = kernel_offset + kernelNum*KERNEL_HEIGHT*KERNEL_WIDTH*CHANNELS;
170
#if APPLY_BIAS
171
        const int biasIndex = bias_offset + kernelNum;
172
#endif
173
        const int local_image_offset = org_y * input_width + org_x;
174
        const int imageSize = input_width * input_height;
175
        __global Dtype* image_dataPtr = (image_data + (image_offset + local_image_offset));
176
        __global Dtype* kernel_dataPtr = (kernel_data + (currentKernelOffset));
177
        for (int c = 0; c < CHANNELS; c++)
178
        {
179
            for (int y = 0; y < KERNEL_HEIGHT; y++)
180
            {
181
                for (int x = 0; x < KERNEL_WIDTH; x++)
182
                {
183
                    int y_ = org_y + y * DILATION_Y;
184
                    int x_ = org_x + x * DILATION_X;
185
                    if (!(y_ >= 0 && y_ < input_height && x_ >= 0 && x_ < input_width))
186
                    {
187
                        continue;
188
                    }
189
                    for (int kern = 0; kern < ZPAR; kern++)
190
                    {
191
                        sum[kern] += image_dataPtr[x * DILATION_X] * kernel_dataPtr[kern*KERNEL_HEIGHT*KERNEL_WIDTH*CHANNELS + x];
192
                    }
193
                }
194
                image_dataPtr += input_width * DILATION_Y;
195
                kernel_dataPtr += KERNEL_WIDTH;
196
            }
197
            image_dataPtr += imageSize - input_width*KERNEL_HEIGHT*DILATION_Y;
198
        }
199

200
        for (int kern = 0; kern < ZPAR; kern++)
201
        {
202
            if (kernelNum + kern < OUTPUT_Z)
203
            {
204
                int offset = convolved_image_offset + (kernelNum+kern)*output_height*output_width + outputY*output_width + outputX;
205
#if APPLY_BIAS
206
                ACTIVATION_FUNCTION(convolved_image, offset, sum[kern] + bias[biasIndex + kern], biasIndex + kern);
207
#else
208
                ACTIVATION_FUNCTION(convolved_image, offset, sum[kern], biasIndex + kern);
209
#endif
210
            }
211
        }
212
    }
213
}
214

215
#elif defined KERNEL_IDLF
216

217
// Each work-item computes a OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT region of one output map.
218
// Each work-group (which will be mapped to 1 SIMD16/SIMD8 EU thread) will compute 16/8 different feature maps, but each feature map is for the same region of the input image.
219
// NDRange:  (output_width+pad)/ OUT_BLOCK_WIDTH, (output_height+pad)/OUT_BLOCK_HEIGHT, NUM_FILTERS/OUT_BLOCK_DEPTH
220

221
// NOTE: for beignet this reqd_work_group_size does not guarantee that SIMD16 mode will be used, the compiler could choose to use two SIMD8 threads, and if that happens the code will break.
222
__attribute__((reqd_work_group_size(1, 1, SIMD_SIZE)))
223
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
224
__kernel void
225
convolve_simd(
226
    ELTWISE_DATA_ARG
227
    FUSED_ARG
228
    __global Dtype* inputs,
229
    __global Dtype* weights,
230
    BIAS_KERNEL_ARG
231
    __global Dtype* outputs_base,
232
    const int outputs_offset,
233
    const ushort input_width,
234
    const ushort input_height,
235
    const ushort output_width,
236
    const ushort output_height)
237
{
238
  __global Dtype* outputs = outputs_base + outputs_offset;
239
  unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH;  // oc = Output Column
240
  unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row
241
  unsigned int fm = get_global_id(2);                    // fm = Feature Map = od = Output Depth
242
  unsigned int fmg = get_group_id(2);
243
  unsigned int lid = get_local_id(2);
244

245
  Dtype out[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT] = { 0.0f };
246

247
  // find weights address of given neuron (lid is index)
248
  unsigned int weight_addr = (fmg % FILTERS_IN_GROUP) *
249
                             INPUT_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE + lid;
250

251
  unsigned int num_in_batch = fm / ALIGNED_NUM_FILTERS;
252

253
  unsigned int input_batch_offset = num_in_batch * INPUT_PITCH * TOTAL_INPUT_DEPTH_SIZE;
254

255
  int curr_y = or * STRIDE_Y;
256
  int curr_x = oc * STRIDE_X + lid;
257

258
  int in_addr = input_batch_offset
259
                +  (curr_y - INPUT_PAD_H) * INPUT_WIDTH          // y tile offset
260
                +   curr_x - INPUT_PAD_W;                        // x tile offset
261

262
  const int in_limit = (get_global_size(2) / ALIGNED_NUM_FILTERS) * TOTAL_INPUT_DEPTH_SIZE * INPUT_PITCH - 1;
263

264
  Dtype in_buf[INVEC_SIZE];
265

266
  for(int kd = 0; kd < INPUT_DEPTH; kd++)
267
  {
268
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
269
    const bool cx_out_of_range = !(curr_x >= INPUT_PAD_W && curr_x < INPUT_WIDTH + INPUT_PAD_W);
270
    int in_offset = in_addr;
271
    __attribute__((opencl_unroll_hint(INVEC_SIZE)))
272
    for (int reg = 0; reg < INVEC_SIZE; reg++, in_offset += INPUT_WIDTH)
273
    {
274
      Dtype input = inputs[clamp(in_offset, 0, in_limit)];
275
      int cy = curr_y + reg;
276
      in_buf[reg] = (cx_out_of_range || cy < INPUT_PAD_H || cy >= INPUT_HEIGHT + INPUT_PAD_H) ? 0 : input;
277
    }
278
#else
279
    int in_offset = in_addr;
280
    __attribute__((opencl_unroll_hint(INVEC_SIZE)))
281
    for (int reg = 0; reg < INVEC_SIZE; reg++, in_offset += INPUT_WIDTH)
282
    {
283
      in_buf[reg] = inputs[min(in_offset, in_limit)];
284
    }
285
#endif
286

287
    in_addr += INPUT_PITCH;
288

289
#define BLOCK_IN(n, c) intel_sub_group_shuffle(in_buf[n], (c))
290

291
    int kr = 0;  // kr = Kernel Row
292
    LOOP(KERNEL_HEIGHT, kr,// LOOP is a macro that unrolls the loop.
293
    {
294
        int kc = 0;  // kc = Kernel Column
295
        LOOP(KERNEL_WIDTH, kc,
296
        {
297
            Dtype weight_value = weights[weight_addr];
298
            weight_addr += SIMD_SIZE;
299
            for (int br=0; br < OUT_BLOCK_HEIGHT; br++)
300
            {
301
                for(int bc=0; bc < OUT_BLOCK_WIDTH; bc++)
302
                {
303
                    Dtype input = BLOCK_IN((br * STRIDE_Y + kr * DILATION_Y), bc * STRIDE_X + kc * DILATION_X);
304
                    out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_value, input, out[br * OUT_BLOCK_WIDTH + bc]);
305
                }
306
            }
307
        });
308
    });
309
  }
310

311
  fm = fm % ALIGNED_NUM_FILTERS;
312

313
#if LEFT_FILTERS > 0
314
  if (fm < NUM_FILTERS)
315
#endif
316
  {
317
    unsigned int out_addr = (num_in_batch * TOTAL_OUTPUT_DEPTH + fm) * OUTPUT_PITCH;
318
    out_addr += or * output_width + oc;
319
    // we need this address calculation for biases because we support views and batching
320
#if APPLY_BIAS
321
    Dtype bias = biases_base[fm];
322
#else
323
    Dtype bias = 0;
324
#endif
325

326
    for(unsigned int r = 0; r < OUT_BLOCK_HEIGHT; r++)
327
    {
328
      if (r + or >= output_height) break;
329
      for(unsigned int c = 0; c < OUT_BLOCK_WIDTH; c++)
330
      {
331
        if (c + oc >= output_width) break;
332
        // this does a scattered write to SIMD_SIZE different feature maps,
333
        // so that data within one map is contiguous, thus ready for input to next layer.
334
        ACTIVATION_FUNCTION(outputs, out_addr + r * output_width + c, bias + out[r * OUT_BLOCK_WIDTH + c], fm);
335
      }
336
    }
337
  }
338
}
339

340
#elif defined KERNEL_GEMM_LIKE
341

342
#if APPLY_BIAS
343
#define SUBGROUP_GET_BIAS(k, i) intel_sub_group_shuffle(bias[k], i)
344
#else
345
#define SUBGROUP_GET_BIAS(k, i) ((Dtype)0)
346
#endif
347

348
#ifdef Conv_Interleaved
349
typedef struct float1 { float s0; } float1;
350
typedef struct float5 { float s0; float s1; float s2; float s3; float s4; } float5;
351
typedef struct float6 { float s0; float s1; float s2; float s3; float s4; float s5; } float6;
352
typedef struct float7 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; } float7;
353
typedef struct float9 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; float s7; float s8; } float9;
354
typedef struct float10 { float s0; float s1; float s2; float s3; float s4; float s5;
355
                         float s6; float s7; float s8; float s9;} float10;
356
typedef struct float11 { float s0; float s1; float s2; float s3; float s4; float s5;
357
                         float s6; float s7; float s8; float s9; float sa;} float11;
358
typedef struct float12 { float s0; float s1; float s2; float s3; float s4; float s5;
359
                         float s6; float s7; float s8; float s9; float sa; float sb; } float12;
360
typedef struct float13 { float s0; float s1; float s2; float s3; float s4; float s5;
361
                         float s6; float s7; float s8; float s9; float sa; float sb; float sc;} float13;
362
typedef struct float14 { float s0; float s1; float s2; float s3; float s4; float s5;
363
                         float s6; float s7; float s8; float s9; float sa; float sb; float sc; float sd; } float14;
364
typedef struct float15 { float s0; float s1; float s2; float s3; float s4; float s5;
365
                         float s6; float s7; float s8; float s9; float sa; float sb; float sc; float sd; float se; } float15;
366
typedef struct float0 { float s0; } float0; //never used but makes compiler happy.
367

368
typedef struct half1 { half s0; } half1;
369
typedef struct half5 { half s0; half s1; half s2; half s3; half s4; } half5;
370
typedef struct half6 { half s0; half s1; half s2; half s3; half s4; half s5; } half6;
371
typedef struct half7 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; } half7;
372
typedef struct half9 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; half s8; } half9;
373
typedef struct half10 { half s0; half s1; half s2; half s3; half s4; half s5;
374
                        half s6; half s7; half s8; half s9; } half10;
375
typedef struct half11 { half s0; half s1; half s2; half s3; half s4; half s5;
376
                        half s6; half s7; half s8; half s9; half sa; } half11;
377
typedef struct half12 { half s0; half s1; half s2; half s3; half s4; half s5;
378
                        half s6; half s7; half s8; half s9; half sa; half sb; } half12;
379
typedef struct half13 { half s0; half s1; half s2; half s3; half s4; half s5;
380
                        half s6; half s7; half s8; half s9; half sa; half sb; half sc; } half13;
381
typedef struct half14 { half s0; half s1; half s2; half s3; half s4; half s5;
382
                        half s6; half s7; half s8; half s9; half sa; half sb; half sc; half sd; } half14;
383
typedef struct half15 { half s0; half s1; half s2; half s3; half s4; half s5;
384
                        half s6; half s7; half s8; half s9; half sa; half sb; half sc; half sd; half se; } half15;
385
typedef struct half0 { half s0; } half0; //never used but makes compiler happy.
386

387
#define OUT_PITCH_X output_width
388
#define ROW_PITCH input_width
389

390
#define GEMM_LIKE_KERNEL_ARGS     \
391
    ELTWISE_DATA_ARG              \
392
    FUSED_ARG                     \
393
    const __global Dtype *src0,   \
394
    const __global Dtype *src1,   \
395
    BIAS_KERNEL_ARG               \
396
    __global Dtype *dst_base,     \
397
    const int dst_offset,         \
398
    const ushort input_width,     \
399
    const ushort input_height,    \
400
    const ushort output_width,    \
401
    const ushort output_height,   \
402
    const int out_pitch_y,     \
403
    const int out_pitch_z,     \
404
    const int aligned_input_size, \
405
    const int slice_pitch
406
#endif
407

408
#ifdef GEMM_LIKE_CONV_32_1
409
//////////////////////////////////////////////////////////////////////////////
410
// Conv_Interleaved_32_1_flex
411
//
412
// Convolution: each workitem computes 1 patch x 32 filters worth of output
413
// data.  Kernel's inner loop works on a single tile consisting of one
414
// row from each patch and the filter data corresponding to that row.  Filter
415
// matrix is interleaved to reduce GRF bank conflicts.  Patches are walked
416
// by rows and then by slices.  Relies on sub_group extension for block
417
// reads and SIMD broadcast.  Allows flexible sizing of TILE width (TILE_N)
418
// by dynamically selecting one of two code paths: one uses TILE_N = 32 and
419
// the other uses TILE_N = 8, 16, or 24.
420
#define TILE_M          1
421
#define TILE_K          KERNEL_WIDTH
422
#define TILE_N          32
423

424
__attribute__((intel_reqd_sub_group_size(8)))
425
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
426
{
427
    __global Dtype *dst = dst_base + dst_offset;
428
    const int group_x = get_group_id(0);
429
    const int group_y = get_group_id(1);
430
    const int global_x = get_global_id(0);
431
    const int global_y = get_global_id(1);
432
    const int global_z = get_global_id(2);
433
    int interleaved_y;
434
    int kernel_y;
435
    int kernel_idx;
436

437
#define DOT_PRODUCT_8( _result, _rowA, colB )    \
438
    {   \
439
        _result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 );  \
440
        _result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 );  \
441
        _result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 );  \
442
        _result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 );  \
443
        _result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 );  \
444
        _result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 );  \
445
        _result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 );  \
446
        _result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 );  \
447
    }
448
    typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;
449

450
    // True for all threads if filter_width is multiple of TILE_N
451
    // else, true for all but right-most column of threads.
452
    if( TILE_N_LAST == 0 || global_x < WIDTH1 / TILE_N )
453
    {
454
        // Result ctile (*dst) is M rows x N columns
455
        // LWG size is 1x8.  Thus each thread calculates 8*M rows x N cols of ctile.
456
        Dtype8  blockC00 = 0.f;
457
        Dtype8  blockC10 = 0.f;
458
        Dtype8  blockC20 = 0.f;
459
        Dtype8  blockC30 = 0.f;
460

461
        // Src0 (patch input) is directly used as atile.
462
        // Each work item points to the start of a different patch.
463
        // atile is M rows x K columns.
464
        int curr_x = ( global_y % output_width ) * STRIDE_X;
465
        int curr_y = ( global_y / output_width ) * STRIDE_Y;
466
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
467
        int saved_y = curr_y;
468
#endif
469
        const __global Dtype *src0_read = src0
470
          + aligned_input_size * global_z           // batch offset
471
          + (curr_y - INPUT_PAD_H) * ROW_PITCH      // y offset
472
          + (curr_x - INPUT_PAD_W);                 // x offset
473

474
        // Src1 (filter) is directly used as btile.
475
        // It starts at the top of src1 and walks down.
476
        // btile is K rows x N columns.
477
        const __global Dtype *src1_read = src1 + ( global_x * TILE_N  * 2);
478

479
        // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
480
        // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
481
        // and KERNEL_WIDTH/2 rows of interleaved filter.
482
        int patch_depth = 0;
483
        do
484
        {
485
            int patch_row = 0;
486
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
487
            curr_y = saved_y;
488
#endif
489

490
            do
491
            {
492
                // Load atile and btile.
493
                // Kernel data is partially interleaved.  Every 2 rows are interleaved at Dtype8 granularity.
494
                // The exception is that if KERNEL_WIDTH is odd the last row is not interleaved.  The non
495
                // interleaved row is padded with zero to ensure same size as interleaved rows. This
496
                // interleaving is done to ensure 0% GDR bank conflicts.  For example, this is how the
497
                // kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3.
498
                // (0, 0) (8, 0) (16, 0) (24, 0) ...       (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) ..
499
                // (0, 1) (8, 1) (16, 1) (24, 1) ... =>    (0, 2) (8, 2) (16, 2) (24, 2) ...
500
                // (0, 2) (8, 2) (16, 2) (24, 2) ...       ...
501
                // ...
502
                const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
503

504
#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
505
  #if KERNEL_WIDTH == 3
506
                Dtype_t blockA00 = vload3(0, src0_read);
507
                Dtype*  pblockA00 = (Dtype*)(&blockA00);
508
  #else
509
                Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[  0  ];
510
                Dtype*  pblockA00 = (Dtype*)(&blockA00);
511
  #endif
512
#else
513
                Dtype_t blockA00;
514
                Dtype*  pblockA00 = (Dtype*)(&blockA00);
515
                int pos = 0;
516
                LOOP(KERNEL_WIDTH, pos,
517
                {
518
                  if (curr_y >= INPUT_PAD_H &&
519
                      curr_y < input_height + INPUT_PAD_H &&
520
                      curr_x + pos * DILATION_X >= INPUT_PAD_W &&
521
                      curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)
522
                    pblockA00[pos] = src0_read[pos * DILATION_X];
523
                  else
524
                    pblockA00[pos] = 0;
525
                })
526
                curr_y += DILATION_Y;
527
#endif
528
                src0_read += (ROW_PITCH * DILATION_Y);
529

530
                Dtype blockB00[KERNEL_WIDTH*4];
531
                Dtype8* p8BlockB00 = (Dtype8*)blockB00;
532
                Dtype4* p4BlockB00 = (Dtype4*)blockB00;
533
                Dtype*  pBlockB00 =  (Dtype* )blockB00;
534

535
                interleaved_y = 0;
536
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
537
                {
538
                    p8BlockB00[interleaved_y] = as_Dtype8( SUB_GROUP_BLOCK_READ8( (const __global INT_TYPE *)src1_read ) );
539
                    src1_read += WIDTH1 * 2;
540
                } )
541
                if ( kernel_width_is_odd )
542
                {
543
                    p4BlockB00[KERNEL_WIDTH - 1] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE *)src1_read ) );
544
                    src1_read += WIDTH1 * 2;
545
                }
546

547
                // Perform MADs
548
                kernel_idx = 0;
549
                interleaved_y = 0;
550
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
551
                {
552
                    kernel_y = interleaved_y * 2;
553
                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
554
                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
555
                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
556
                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
557
                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
558
                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
559
                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
560
                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
561
                } )
562
                    kernel_y = interleaved_y * 2;
563
                if ( kernel_width_is_odd )
564
                {
565
                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
566
                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
567
                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
568
                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
569
                }
570
            }
571

572
            //while( ++patch_row < 1 ); //debug
573
            while( ++patch_row < KERNEL_HEIGHT );
574

575
            // reset to start of next slice of patch
576
            src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y);
577
        }
578
        //while ( ++patch_depth < 1 ); //debug
579
        while ( ++patch_depth < INPUT_DEPTH );
580

581
        // Dst resembles a cube of width x height x (output channel * batches).  Each tile writes:
582
        // (SIMD * TILE_M) x 1 x TILE_N.  Partial writes most likely generated if padding used.
583
        int out_offset = global_z * out_pitch_z                                        // batch offset
584
         + ( group_x * TILE_N ) * out_pitch_y                                          // channel offset
585
         + ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X  // y offset
586
         + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT;                // x offset
587

588
        __global Dtype *out = dst + out_offset;
589
#if APPLY_BIAS
590
        Dtype bias[4];
591
        Dtype4 *bias_vec;
592
        bias_vec = (Dtype4*)bias;
593
        *bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
594
        if (group_x > 0xFFFFFFFEul) {
595
          dst[0] = bias[0] + bias[1] + bias[2] + bias[3];
596
        }
597
#else
598
        const Dtype bias[4] = {0, 0, 0, 0};
599
#endif
600
        if (global_y * TILE_M < output_width * output_height )
601
        {
602
            for (int i = 0; i < 8; i++)
603
            {
604
            ACTIVATION_FUNCTION(dst, out_offset + ( 0 + i ) * out_pitch_y, blockC00[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);
605
            ACTIVATION_FUNCTION(dst, out_offset + ( 8 + i ) * out_pitch_y, blockC10[i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + 8 + i);
606
            ACTIVATION_FUNCTION(dst, out_offset + ( 16 + i ) * out_pitch_y, blockC20[i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + 16 + i);
607
            ACTIVATION_FUNCTION(dst, out_offset + ( 24 + i ) * out_pitch_y, blockC30[i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + 24 + i);
608
            }
609
        }
610
    }
611
#if TILE_N_LAST > 0
612
    else
613
    {
614

615
        // Result ctile (*dst) is M rows x N columns
616
        // LWG size is 1x8.  Thus each thread calculates 8*M rows x N cols of ctile.
617
        int i = 0;
618
        Dtype8  blockC[TILE_N_LAST_DIV8];
619
        LOOP(TILE_N_LAST_DIV8, i,
620
        {
621
            blockC[i] = 0.f;
622
        } )
623

624
        // Src0 (patch input) is directly used as atile.
625
        // Each work item points to the start of a different patch.
626
        // atile is M rows x K columns.
627
        int curr_x = ( global_y % output_width ) * STRIDE_X;
628
        int curr_y = ( global_y / output_width ) * STRIDE_Y;
629
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
630
        int saved_y = curr_y;
631
#endif
632
        const __global Dtype *src0_read = src0
633
          + aligned_input_size * global_z           // batch offset
634
          + (curr_y - INPUT_PAD_H) * ROW_PITCH      // y offset
635
          + (curr_x - INPUT_PAD_W);                 // x offset
636

637
        // Src1 (filter) is directly used as btile.
638
        // It starts at the top of src1 and walks down.
639
        // btile is K rows x N columns.
640
        const __global Dtype *src1_read = src1 + ( global_x * TILE_N  * 2);
641

642
        // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
643
        // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
644
        // and KERNEL_WIDTH/2 rows of interleaved filter.
645
        int patch_depth = 0;
646
        do
647
        {
648
            int patch_row = 0;
649
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
650
            curr_y = saved_y;
651
#endif
652
            do
653
            {
654
                // Load atile and interleaved btile.
655
                const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
656
#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
657
                Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[  0  ];
658
                Dtype*  pblockA00 = (Dtype*)(&blockA00);
659
#else
660
                Dtype_t blockA00;
661
                Dtype*  pblockA00 = (Dtype*)(&blockA00);
662
                int pos = 0;
663
                LOOP(KERNEL_WIDTH, pos,
664
                {
665
                  if (curr_y >= INPUT_PAD_H &&
666
                      curr_y < input_height + INPUT_PAD_H &&
667
                      curr_x + pos * DILATION_X >= INPUT_PAD_W &&
668
                      curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)
669
                    pblockA00[pos] = src0_read[pos * DILATION_X];
670
                  else
671
                    pblockA00[pos] = 0;
672
                })
673
                curr_y += DILATION_Y;
674
#endif
675
                src0_read += (ROW_PITCH * DILATION_Y);
676
                Dtype blockB[KERNEL_WIDTH * TILE_N_LAST_DIV8];
677

678
                interleaved_y = 0;
679
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
680
                {
681
#if TILE_N_LAST_DIV8 == 1
682
                    Dtype2* p2BlockB = (Dtype2* )blockB;
683
                    p2BlockB[interleaved_y] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
684
#elif TILE_N_LAST_DIV8 == 2
685
                    Dtype4* p4BlockB = (Dtype4* )blockB;
686
                    p4BlockB[interleaved_y] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
687
#elif TILE_N_LAST_DIV8 == 3
688
                    //TODO: broken.  No block_read6
689
                    Dtype6* p6BlockB = (Dtype6* )blockB;
690
                    (*((Dtype8*)(&p6BlockB[interleaved_y]))).s0123 = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
691
                    (*((Dtype8*)(&p6BlockB[interleaved_y]))).s45 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)(src1_read + 4 * 8) ) );
692
#endif
693
                    src1_read += WIDTH1 * 2;
694
                } )
695
                if ( kernel_width_is_odd )
696
                {
697
#if TILE_N_LAST_DIV8 == 1
698
                    Dtype* pBlockB = (Dtype* )blockB;
699
                    pBlockB[KERNEL_WIDTH - 1] = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*)src1_read ) );
700
#elif TILE_N_LAST_DIV8 == 2
701
                    Dtype2* p2BlockB = (Dtype2* )blockB;
702
                    p2BlockB[KERNEL_WIDTH - 1] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
703
#elif TILE_N_LAST_DIV8 == 3
704
                    Dtype3* p3BlockB = (Dtype3* )blockB;
705
                    p3BlockB[KERNEL_WIDTH - 1].s01 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
706
                    p3BlockB[KERNEL_WIDTH - 1].s2 = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*) (src1_read + 2 * 8) ) );
707
#endif
708
                    src1_read += WIDTH1 * 2;
709
                }
710

711
                // Perform MADs
712
                Dtype* pBlockB = (Dtype*)blockB;
713
                kernel_idx = 0;
714
                interleaved_y = 0;
715
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
716
                {
717
                    kernel_y = interleaved_y * 2;
718
                    DOT_PRODUCT_8( blockC[0], pblockA00[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
719
                    DOT_PRODUCT_8( blockC[0], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
720
#if TILE_N_LAST_DIV8 >= 2
721
                    DOT_PRODUCT_8( blockC[1], pblockA00[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
722
                    DOT_PRODUCT_8( blockC[1], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
723
#if TILE_N_LAST_DIV8 >= 3
724
                    DOT_PRODUCT_8( blockC[2], pblockA00[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
725
                    DOT_PRODUCT_8( blockC[2], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
726
#endif
727
#endif
728
                } )
729
                    kernel_y = interleaved_y * 2;
730
                if ( kernel_width_is_odd )
731
                {
732
                    DOT_PRODUCT_8( blockC[0], pblockA00[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
733
#if TILE_N_LAST_DIV8 >= 2
734
                    DOT_PRODUCT_8( blockC[1], pblockA00[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
735
#if TILE_N_LAST_DIV8 >= 3
736
                    DOT_PRODUCT_8( blockC[2], pblockA00[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
737
#endif
738
#endif
739
                }
740
            }
741

742
            //while( ++patch_row < 1 ); //debug
743
            while( ++patch_row < KERNEL_HEIGHT );
744

745
            // reset to start of next slice of patch
746
            src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
747
        }
748
        //while ( ++patch_depth < 1 );  //debug
749
        while ( ++patch_depth < INPUT_DEPTH );
750

751
        // Dst resembles a cube of width x height x (output channel * batches).  Each tile writes:
752
        // (SIMD * TILE_M) x 1 x TILE_N.  Partial writes most likely generated if padding used.
753
        int out_offset = global_z * out_pitch_z                                        // batch offset
754
         + ( group_x * TILE_N ) * out_pitch_y                                          // channel offset
755
         + ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X  // y offset
756
         + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT;                // x offset
757
        __global Dtype *out = dst + out_offset;
758
#if APPLY_BIAS
759
        Dtype bias[4];
760
        Dtype4 *bias_vec;
761
        bias_vec = (Dtype4*)bias;
762
        *bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
763
        if (group_x > 0xFFFFFFFEul) {
764
          dst[0] = bias[0] + bias[1] + bias[2] + bias[3];
765
        }
766
#else
767
        const Dtype bias[4] = {0, 0, 0, 0};
768
#endif
769

770
        if (global_y * TILE_M < output_width * output_height )
771
        {
772
            for (int i = 0; i < 8; i++)
773
            {
774
                if ( TILE_N_LAST_DIV8 > 0 )
775
                {
776
                  ACTIVATION_FUNCTION(dst, out_offset + ( 0+i) * out_pitch_y, blockC[0][i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);
777
                }
778
                if ( TILE_N_LAST_DIV8 > 1 )
779
                {
780
                  ACTIVATION_FUNCTION(dst, out_offset + ( 8+i) * out_pitch_y, blockC[1][i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 8);
781
                }
782
                if ( TILE_N_LAST_DIV8 > 2 )
783
                {
784
                  ACTIVATION_FUNCTION(dst, out_offset + (16+i) * out_pitch_y, blockC[2][i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + i + 16);
785
                }
786
                if ( TILE_N_LAST_DIV8 > 3 )
787
                {
788
                  ACTIVATION_FUNCTION(dst, out_offset + (24+i) * out_pitch_y, blockC[3][i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + i + 24);
789
                }
790
            }
791
        }
792
    }
793
#endif
794
}
795
#endif
796
#ifdef GEMM_LIKE_CONV_32_2
797

798
//////////////////////////////////////////////////////////////////////////////
799
// Conv_Interleaved_32_2_flex
800
//
801
// Convolution: each workitem computes 1 patch x 32 filters worth of output
802
// data.  Kernel's inner loop works on a single tile consisting of one
803
// row from each patch and the filter data corresponding to that row.  Filter
804
// matrix is interleaved to reduce GRF bank conflicts.  Patches are walked
805
// by rows and then by slices.  Relies on sub_group extension for block
806
// reads and SIMD broadcast.  Allows flexible sizing of TILE width (TILE_N)
807
// by dynamically selecting one of two code paths: one uses TILE_N = 32 and
808
// the other uses TILE_N = 8, 16, or 24.
809
#define TILE_M          2
810
#define TILE_K          KERNEL_WIDTH
811
#define TILE_N          32
812

813
__attribute__((intel_reqd_sub_group_size(8)))
814
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
815
{
816
    __global Dtype *dst = dst_base + dst_offset;
817
    const int group_x = get_group_id(0);
818
    const int group_y = get_group_id(1);
819
    const int global_x = get_global_id(0);
820
    const int global_y = get_global_id(1);
821
    const int global_z = get_global_id(2);
822
    int interleaved_y;
823
    int kernel_y;
824
    int kernel_idx;
825

826
#define DOT_PRODUCT_8( _result, _rowA, colB )    \
827
    {   \
828
        _result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 );  \
829
        _result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 );  \
830
        _result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 );  \
831
        _result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 );  \
832
        _result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 );  \
833
        _result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 );  \
834
        _result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 );  \
835
        _result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 );  \
836
    }
837
        typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;
838

839
    // True for all threads if filter_width is multiple of TILE_N
840
    // else, true for all but right-most column of threads.
841
    if( TILE_N_LAST == 0 || global_x < WIDTH1 / TILE_N )
842
    {
843
        // Result ctile (*dst) is M rows x N columns
844
        // LWG size is 1x8.  Thus each thread calculates 8*M rows x N cols of ctile.
845
        Dtype8  blockC00 = 0.f;
846
        Dtype8  blockC10 = 0.f;
847
        Dtype8  blockC20 = 0.f;
848
        Dtype8  blockC30 = 0.f;
849
        Dtype8  blockC01 = 0.f;
850
        Dtype8  blockC11 = 0.f;
851
        Dtype8  blockC21 = 0.f;
852
        Dtype8  blockC31 = 0.f;
853

854
        // Src0 (patch input) is directly used as atile.
855
        // Each work item points to the start of a different patch.
856
        // atile is M rows x K columns.
857
        int curr_x0 = ( ( global_y * TILE_M + 0 ) % output_width ) * STRIDE_X;
858
        int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X;
859
        int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y;
860
        int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y;
861
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
862
        int saved_y0 = curr_y0;
863
        int saved_y1 = curr_y1;
864
#endif
865
        const __global Dtype *src0_read0 = src0
866
         + aligned_input_size * global_z         // batch offset
867
         + (curr_y0 - INPUT_PAD_H) * ROW_PITCH   // y offset
868
         + curr_x0 - INPUT_PAD_W;                // x offset
869
        const __global Dtype *src0_read1 = src0
870
         + aligned_input_size * global_z         // batch offset
871
         + (curr_y1 - INPUT_PAD_H) * ROW_PITCH   // y offset
872
         + curr_x1 - INPUT_PAD_W;                // x offset
873

874
        // Src1 (filter) is directly used as btile.
875
        // It starts at the top of src1 and walks down.
876
        // btile is K rows x N columns.
877
        const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);
878

879
        // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
880
        // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
881
        // and KERNEL_WIDTH/2 rows of interleaved filter.
882
        int patch_depth = 0;
883
        do
884
        {
885
            int patch_row = 0;
886
            do
887
            {
888
                // Load atile and btile.
889
                // Kernel data is partially interleaved.  Every 2 rows are interleaved at Dtype8 granularity.
890
                // The exception is that if KERNEL_WIDTH is odd the last row is not interleaved.  The non
891
                // interleaved row is padded with zero to ensure same size as interleaved rows. This
892
                // interleaving is done to ensure 0% GDR bank conflicts.  For example, this is how the
893
                // kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3.
894
                // (0, 0) (8, 0) (16, 0) (24, 0) ...       (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) ..
895
                // (0, 1) (8, 1) (16, 1) (24, 1) ... =>    (0, 2) (8, 2) (16, 2) (24, 2) ...
896
                // (0, 2) (8, 2) (16, 2) (24, 2) ...       ...
897
                // ...
898
                const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
899
#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
900
  #if KERNEL_WIDTH == 3
901
                Dtype_t blockA00 = vload3(0, src0_read0); src0_read0 += ROW_PITCH;
902
                Dtype_t blockA01 = vload3(0, src0_read1); src0_read1 += ROW_PITCH;
903
                Dtype*  pblockA00 = (Dtype*)(&blockA00);
904
                Dtype*  pblockA01 = (Dtype*)(&blockA01);
905
  #else
906
                Dtype_t blockA00 = { (Dtype)0.f };
907
                Dtype_t blockA01 = { (Dtype)0.f };
908
                Dtype*  pblockA00 = (Dtype*)(&blockA00);
909
                Dtype*  pblockA01 = (Dtype*)(&blockA01);
910
                int pos = 0;
911
                LOOP(KERNEL_WIDTH, pos,
912
                {
913
                  if (curr_x0 + pos < input_width)
914
                    pblockA00[pos] = src0_read0[pos];
915

916
                  if (curr_x1 + pos < input_width)
917
                    pblockA01[pos] = src0_read1[pos];
918
                })
919
                src0_read0 += ROW_PITCH;
920
                src0_read1 += ROW_PITCH;
921
  #endif
922
#else
923
                Dtype_t blockA00;
924
                Dtype*  pblockA00 = (Dtype*)(&blockA00);
925
                int pos = 0;
926
                LOOP(KERNEL_WIDTH, pos,
927
                {
928
                  if (curr_y0 >= INPUT_PAD_H &&
929
                      curr_y0 < input_height + INPUT_PAD_H &&
930
                      curr_x0 + pos * DILATION_X >= INPUT_PAD_W &&
931
                      curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)
932
                    pblockA00[pos] = src0_read0[pos * DILATION_X];
933
                  else
934
                    pblockA00[pos] = 0;
935
                })
936
                curr_y0 += DILATION_Y;
937
                Dtype_t blockA01;
938
                Dtype*  pblockA01 = (Dtype*)(&blockA01);
939
                pos = 0;
940
                LOOP(KERNEL_WIDTH, pos,
941
                {
942
                  if (curr_y1 >= INPUT_PAD_H &&
943
                      curr_y1 < input_height + INPUT_PAD_H &&
944
                      curr_x1 + pos * DILATION_X >= INPUT_PAD_W &&
945
                      curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)
946
                    pblockA01[pos] = src0_read1[pos * DILATION_X];
947
                  else
948
                    pblockA01[pos] = 0;
949
                })
950
                curr_y1 += DILATION_Y;
951
                src0_read0 += (ROW_PITCH * DILATION_Y);
952
                src0_read1 += (ROW_PITCH * DILATION_Y);
953
#endif
954
                Dtype blockB00[KERNEL_WIDTH*4];
955
                Dtype8* p8BlockB00 = (Dtype8*)blockB00;
956
                Dtype4* p4BlockB00 = (Dtype4*)blockB00;
957
                Dtype*  pBlockB00 =  (Dtype* )blockB00;
958

959
                interleaved_y = 0;
960
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
961
                {
962
                    p8BlockB00[interleaved_y] = as_Dtype8( SUB_GROUP_BLOCK_READ8( (const __global INT_TYPE*)src1_read ) );
963
                    src1_read += WIDTH1 * 2;
964
                } )
965
                if ( kernel_width_is_odd )
966
                {
967
                    p4BlockB00[KERNEL_WIDTH - 1] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
968
                    src1_read += WIDTH1 * 2;
969
                }
970
                // Perform MADs
971
                kernel_idx = 0;
972
                interleaved_y = 0;
973
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
974
                {
975
                    kernel_y = interleaved_y * 2;
976
                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
977
                    DOT_PRODUCT_8( blockC01, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
978
                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
979
                    DOT_PRODUCT_8( blockC01, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
980
                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
981
                    DOT_PRODUCT_8( blockC11, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
982
                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
983
                    DOT_PRODUCT_8( blockC11, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
984
                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
985
                    DOT_PRODUCT_8( blockC21, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
986
                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
987
                    DOT_PRODUCT_8( blockC21, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
988
                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
989
                    DOT_PRODUCT_8( blockC31, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
990
                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
991
                    DOT_PRODUCT_8( blockC31, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
992
                } )
993
                if ( kernel_width_is_odd )
994
                {
995
                    kernel_y = interleaved_y * 2;
996
                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] );
997
                    DOT_PRODUCT_8( blockC01, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
998
                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] );
999
                    DOT_PRODUCT_8( blockC11, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
1000
                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y], pBlockB00[kernel_idx] );
1001
                    DOT_PRODUCT_8( blockC21, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
1002
                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y], pBlockB00[kernel_idx] );
1003
                    DOT_PRODUCT_8( blockC31, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
1004
                }
1005
            }
1006

1007
            //while( ++patch_row < 1 ); //debug
1008
            while( ++patch_row < KERNEL_HEIGHT );
1009
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
1010
            curr_y0 = saved_y0;
1011
            curr_y1 = saved_y1;
1012
#endif
1013
            // reset to start of next slice of patch
1014
            src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
1015
            src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
1016
        }
1017
        //while ( ++patch_depth < 1 );  //debug
1018
        while ( ++patch_depth < INPUT_DEPTH );
1019

1020
        // Dst resembles a cube of width x height x (output channel * batches).  Each tile writes:
1021
        // (SIMD * TILE_M) x 1 x TILE_N.  Partial writes most likely generated if padding used.
1022
        int out0_offset = global_z * out_pitch_z                                           // batch offset
1023
         + ( group_x * TILE_N ) * out_pitch_y                                              // channel offset
1024
         + ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
1025
         + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT;                // x offset
1026
        int out1_offset = global_z * out_pitch_z                                           // batch offset
1027
         + ( group_x * TILE_N ) * out_pitch_y                                              // channel offset
1028
         + ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
1029
         + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT;                // x offset
1030

1031
#if APPLY_BIAS
1032
        Dtype bias[4];
1033
        Dtype4 *bias_vec;
1034
        bias_vec = (Dtype4*)bias;
1035
        *bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
1036
        if (group_x > 0xFFFFFFFEul) {
1037
          dst[0] = bias[0] + bias[1] + bias[2] + bias[3];
1038
        }
1039
#else
1040
        const Dtype bias[4] = {0, 0, 0, 0};
1041
#endif
1042

1043
        if( global_y * TILE_M < output_width * output_height )
1044
        {
1045
            for( int i = 0; i < 8; i++ )
1046
            {
1047
                ACTIVATION_FUNCTION(dst, out0_offset + ( 0+i) * out_pitch_y, blockC00[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);
1048
                ACTIVATION_FUNCTION(dst, out0_offset + ( 8+i) * out_pitch_y, blockC10[i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 8);
1049
                ACTIVATION_FUNCTION(dst, out0_offset + (16+i) * out_pitch_y, blockC20[i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + i + 16);
1050
                ACTIVATION_FUNCTION(dst, out0_offset + (24+i) * out_pitch_y, blockC30[i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + i + 24);
1051
            }
1052
        }
1053
        if( global_y * TILE_M + 1 < output_width * output_height )
1054
        {
1055
            for( int i = 0; i < 8; i++ )
1056
            {
1057
                ACTIVATION_FUNCTION(dst, out1_offset + ( 0+i) * out_pitch_y, blockC01[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);
1058
                ACTIVATION_FUNCTION(dst, out1_offset + ( 8+i) * out_pitch_y, blockC11[i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 8);
1059
                ACTIVATION_FUNCTION(dst, out1_offset + (16+i) * out_pitch_y, blockC21[i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + i + 16);
1060
                ACTIVATION_FUNCTION(dst, out1_offset + (24+i) * out_pitch_y, blockC31[i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + i + 24);
1061
            }
1062
        }
1063
    }
1064
#if TILE_N_LAST > 0
1065
    else
1066
    {
1067

1068
        // Result ctile (*dst) is M rows x N columns
1069
        // LWG size is 1x8.  Thus each thread calculates 8*M rows x N cols of ctile.
1070
        int i = 0;
1071
        Dtype8  blockC0[TILE_N_LAST_DIV8];
1072
        Dtype8  blockC1[TILE_N_LAST_DIV8];
1073
        LOOP(TILE_N_LAST_DIV8, i,
1074
        {
1075
            blockC0[i] = 0.f;
1076
            blockC1[i] = 0.f;
1077
        } )
1078

1079
        // Src0 (patch input) is directly used as atile.
1080
        // Each work item points to the start of a different patch.
1081
        // atile is M rows x K columns.
1082
        int curr_x0 = ( ( global_y * TILE_M + 0 ) % output_width ) * STRIDE_X;
1083
        int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X;
1084
        int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y;
1085
        int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y;
1086
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
1087
        int saved_y0 = curr_y0;
1088
        int saved_y1 = curr_y1;
1089
#endif
1090
        const __global Dtype *src0_read0 = src0
1091
         + aligned_input_size * global_z         // batch offset
1092
         + (curr_y0 - INPUT_PAD_H) * ROW_PITCH   // y offset
1093
         + curr_x0 - INPUT_PAD_W;                // x offset
1094
        const __global Dtype *src0_read1 = src0
1095
         + aligned_input_size * global_z         // batch offset
1096
         + (curr_y1 - INPUT_PAD_H) * ROW_PITCH   // y offset
1097
         + curr_x1 - INPUT_PAD_W;                // x offset
1098

1099
        // Src1 (filter) is directly used as btile.
1100
        // It starts at the top of src1 and walks down.
1101
        // btile is K rows x N columns.
1102
        const __global Dtype *src1_read = src1 + ( global_x * TILE_N  * 2);
1103

1104
        // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
1105
        // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
1106
        // and KERNEL_WIDTH/2 rows of interleaved filter.
1107
        int patch_depth = 0;
1108
        do
1109
        {
1110
            int patch_row = 0;
1111
            do
1112
            {
1113
                // Load atile and interleaved btile.
1114
                const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
1115
#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
1116
                Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[  0  ]; src0_read0 += ROW_PITCH;
1117
                Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[  0  ]; src0_read1 += ROW_PITCH;
1118
                Dtype*  pblockA00 = (Dtype*)(&blockA00);
1119
                Dtype*  pblockA01 = (Dtype*)(&blockA01);
1120
#else
1121
                Dtype_t blockA00;
1122
                Dtype*  pblockA00 = (Dtype*)(&blockA00);
1123
                int pos = 0;
1124
                LOOP(KERNEL_WIDTH, pos,
1125
                {
1126
                  if (curr_y0 >= INPUT_PAD_H &&
1127
                      curr_y0 < input_height + INPUT_PAD_H &&
1128
                      curr_x0 + pos * DILATION_X >= INPUT_PAD_W &&
1129
                      curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)
1130
                    pblockA00[pos] = src0_read0[pos * DILATION_X];
1131
                  else
1132
                    pblockA00[pos] = 0;
1133
                })
1134
                curr_y0 += DILATION_Y;
1135
                Dtype_t blockA01;
1136
                Dtype*  pblockA01 = (Dtype*)(&blockA01);
1137
                pos = 0;
1138
                LOOP(KERNEL_WIDTH, pos,
1139
                {
1140
                  if (curr_y1 >= INPUT_PAD_H &&
1141
                      curr_y1 < input_height + INPUT_PAD_H &&
1142
                      curr_x1 + pos * DILATION_X >= INPUT_PAD_W &&
1143
                      curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)
1144
                    pblockA01[pos] = src0_read1[pos * DILATION_X];
1145
                  else
1146
                    pblockA01[pos] = 0;
1147
                })
1148
                curr_y1 += DILATION_Y;
1149
                src0_read0 += (ROW_PITCH * DILATION_Y);
1150
                src0_read1 += (ROW_PITCH * DILATION_Y);
1151
#endif
1152
                Dtype blockB[KERNEL_WIDTH * TILE_N_LAST_DIV8];
1153

1154
                interleaved_y = 0;
1155
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
1156
                {
1157
#if TILE_N_LAST_DIV8 == 1
1158
                    Dtype2* p2BlockB = (Dtype2* )blockB;
1159
                    p2BlockB[interleaved_y] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
1160
#elif TILE_N_LAST_DIV8 == 2
1161
                    Dtype4* p4BlockB = (Dtype4* )blockB;
1162
                    p4BlockB[interleaved_y] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
1163
#elif TILE_N_LAST_DIV8 == 3
1164
                    //TODO: broken.  No block_read6
1165
                    Dtype6* p6BlockB = (Dtype6* )blockB;
1166
                    (*((Dtype8*)(&p6BlockB[interleaved_y]))).s0123 = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
1167
                    (*((Dtype8*)(&p6BlockB[interleaved_y]))).s45 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)(src1_read + 4 * 8) ) );
1168
#endif
1169
                    src1_read += WIDTH1 * 2;
1170
                } )
1171
                if ( kernel_width_is_odd )
1172
                {
1173
#if TILE_N_LAST_DIV8 == 1
1174
                    Dtype* pBlockB = (Dtype* )blockB;
1175
                    pBlockB[KERNEL_WIDTH - 1] = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*)src1_read ) );
1176
#elif TILE_N_LAST_DIV8 == 2
1177
                    Dtype2* p2BlockB = (Dtype2* )blockB;
1178
                    p2BlockB[KERNEL_WIDTH - 1] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
1179
#elif TILE_N_LAST_DIV8 == 3
1180
                    Dtype3* p3BlockB = (Dtype3* )blockB;
1181
                    p3BlockB[KERNEL_WIDTH - 1].s01 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
1182
                    p3BlockB[KERNEL_WIDTH - 1].s2 = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*) (src1_read + 8) ) );
1183
#endif
1184
                    src1_read += WIDTH1 * 2;
1185
                }
1186

1187
                // Perform MADs
1188
                Dtype* pBlockB = (Dtype*)blockB;
1189
                kernel_idx = 0;
1190
                interleaved_y = 0;
1191
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
1192
                {
1193
                    kernel_y = interleaved_y * 2;
1194
                    DOT_PRODUCT_8( blockC0[0], pblockA00[kernel_y    ], pBlockB[kernel_idx] );
1195
                    DOT_PRODUCT_8( blockC1[0], pblockA01[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
1196
                    DOT_PRODUCT_8( blockC0[0], pblockA00[kernel_y + 1], pBlockB[kernel_idx] );
1197
                    DOT_PRODUCT_8( blockC1[0], pblockA01[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
1198
#if TILE_N_LAST_DIV8 >= 2
1199
                    DOT_PRODUCT_8( blockC0[1], pblockA00[kernel_y    ], pBlockB[kernel_idx] );
1200
                    DOT_PRODUCT_8( blockC1[1], pblockA01[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
1201
                    DOT_PRODUCT_8( blockC0[1], pblockA00[kernel_y + 1], pBlockB[kernel_idx] );
1202
                    DOT_PRODUCT_8( blockC1[1], pblockA01[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
1203
#if TILE_N_LAST_DIV8 >= 3
1204
                    DOT_PRODUCT_8( blockC0[2], pblockA00[kernel_y    ], pBlockB[kernel_idx] );
1205
                    DOT_PRODUCT_8( blockC1[2], pblockA01[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
1206
                    DOT_PRODUCT_8( blockC0[2], pblockA00[kernel_y + 1], pBlockB[kernel_idx] );
1207
                    DOT_PRODUCT_8( blockC1[2], pblockA01[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
1208
#endif
1209
#endif
1210
                } )
1211
                    kernel_y = interleaved_y * 2;
1212
                if ( kernel_width_is_odd )
1213
                {
1214
                    DOT_PRODUCT_8( blockC0[0], pblockA00[kernel_y], pBlockB[kernel_idx] );
1215
                    DOT_PRODUCT_8( blockC1[0], pblockA01[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
1216
#if TILE_N_LAST_DIV8 >= 2
1217
                    DOT_PRODUCT_8( blockC0[1], pblockA00[kernel_y], pBlockB[kernel_idx] );
1218
                    DOT_PRODUCT_8( blockC1[1], pblockA01[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
1219
#if TILE_N_LAST_DIV8 >= 3
1220
                    DOT_PRODUCT_8( blockC0[2], pblockA00[kernel_y], pBlockB[kernel_idx] );
1221
                    DOT_PRODUCT_8( blockC1[2], pblockA01[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
1222
#endif
1223
#endif
1224
                }
1225
            }
1226

1227
            //while( ++patch_row < 1 ); //debug
1228
            while( ++patch_row < KERNEL_HEIGHT );
1229
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
1230
            curr_y0 = saved_y0;
1231
            curr_y1 = saved_y1;
1232
#endif
1233
            // reset to start of next slice of patch
1234
            src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
1235
            src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
1236
        }
1237
        //while ( ++patch_depth < 1 );  //debug
1238
        while ( ++patch_depth < INPUT_DEPTH );
1239

1240
        // Dst resembles a cube of width x height x (output channel * batches).  Each tile writes:
1241
        // (SIMD * TILE_M) x 1 x TILE_N.  Partial writes most likely generated if padding used.
1242
        int out0_offset = global_z * out_pitch_z                                           // batch offset
1243
         + ( group_x * TILE_N ) * out_pitch_y                                              // channel offset
1244
         + ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
1245
         + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT;                // x offset
1246
        int out1_offset = global_z * out_pitch_z                                           // batch offset
1247
         + ( group_x * TILE_N ) * out_pitch_y                                              // channel offset
1248
         + ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
1249
         + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT;                // x offset
1250
        __global Dtype *out1 = dst + out1_offset;
1251

1252
#if APPLY_BIAS
1253
        Dtype bias[4];
1254
        Dtype4 *bias_vec;
1255
        bias_vec = (Dtype4*)bias;
1256
        *bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
1257
        if (group_x > 0xFFFFFFFEul) {
1258
          dst[0] = bias[0] + bias[1] + bias[2] + bias[3];
1259
        }
1260
#else
1261
        const Dtype bias[4] = {0, 0, 0, 0};
1262
#endif
1263
        if( global_y * TILE_M < output_width * output_height )
1264
        {
1265
            for( int i = 0; i < 8; i++ )
1266
            {
1267
                if ( TILE_N_LAST_DIV8 > 0 )
1268
                {
1269
                  ACTIVATION_FUNCTION(dst, out0_offset + ( 0+i) * out_pitch_y, blockC0[0][i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);
1270
                }
1271
                if ( TILE_N_LAST_DIV8 > 1 )
1272
                {
1273
                  ACTIVATION_FUNCTION(dst, out0_offset + ( 8+i) * out_pitch_y, blockC0[1][i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 8);
1274
                }
1275
                if ( TILE_N_LAST_DIV8 > 2 )
1276
                {
1277
                  ACTIVATION_FUNCTION(dst, out0_offset + (16+i) * out_pitch_y, blockC0[2][i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + i + 16);
1278
                }
1279
                if ( TILE_N_LAST_DIV8 > 3 )
1280
                {
1281
                  ACTIVATION_FUNCTION(dst, out0_offset + (24+i) * out_pitch_y, blockC0[3][i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + i + 24);
1282
                }
1283
            }
1284
        }
1285
        if( global_y * TILE_M + 1 < output_width * output_height )
1286
        {
1287
            for( int i = 0; i < 8; i++ )
1288
            {
1289
                if ( TILE_N_LAST_DIV8 > 0 )
1290
                {
1291
                  ACTIVATION_FUNCTION(dst, out1_offset + ( 0+i) * out_pitch_y, blockC1[0][i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);
1292
                }
1293
                if ( TILE_N_LAST_DIV8 > 1 )
1294
                {
1295
                  ACTIVATION_FUNCTION(dst, out1_offset + ( 8+i) * out_pitch_y, blockC1[1][i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 8);
1296
                }
1297
                if ( TILE_N_LAST_DIV8 > 2 )
1298
                {
1299
                  ACTIVATION_FUNCTION(dst, out1_offset + (16+i) * out_pitch_y, blockC1[2][i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + i + 16);
1300
                }
1301
                if ( TILE_N_LAST_DIV8 > 3 )
1302
                {
1303
                  ACTIVATION_FUNCTION(dst, out1_offset + (24+i) * out_pitch_y, blockC1[3][i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + i + 24);
1304
                }
1305
            }
1306
        }
1307
    }
1308
#endif
1309
}
1310
#endif
1311

1312
#if defined(GEMM_LIKE_CONV_32_2_SIMD16) || defined(GEMM_LIKE_CONV_32_1_SIMD16)
1313
#define INTERLEAVED_SIMD16_OUTPUT(_out_, _offset_,  _m_) do {\
1314
    if (global_y * TILE_M < output_width * output_height ) \
1315
    { \
1316
      if ( ( OUT_DEPTH % TILE_N ) == 0 ) {\
1317
        for (int i = 0; i < 16; i++) \
1318
        { \
1319
          ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \
1320
          ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_ [i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 16); \
1321
        } \
1322
      } \
1323
      else if( ( OUT_DEPTH % 16 ) == 0 ) { \
1324
        if ( ( global_x + 1 ) < get_global_size(0) ) { \
1325
          for ( int i = 0; i < 16; i++ ) \
1326
          { \
1327
            ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \
1328
            ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_ [i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 16); \
1329
          } \
1330
        } \
1331
        else { \
1332
          for (int i = 0; i < 16; i++) \
1333
          { \
1334
            ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \
1335
          } \
1336
        } \
1337
      } \
1338
      else { \
1339
        if ( ( global_x + 1 ) < get_global_size(0) ) \
1340
        { \
1341
          for ( int i = 0; i < 16; i++ ) \
1342
          { \
1343
            ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \
1344
            ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_[i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 16); \
1345
          } \
1346
        } \
1347
        else { \
1348
          if ( (OUT_DEPTH % TILE_N) > 16 ) { \
1349
            for (int i = 0; i < 16 ; i++) \
1350
            { \
1351
              ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \
1352
            } \
1353
            for (int i = 0; i < OUT_DEPTH % 16 ; i++) \
1354
            { \
1355
              ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_[i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 16); \
1356
            } \
1357
          } \
1358
          else { \
1359
            for (int i = 0; i < OUT_DEPTH % 16 ; i++) \
1360
            { \
1361
              ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \
1362
            } \
1363
          } \
1364
        } \
1365
      } \
1366
    } \
1367
 }while(0)
1368
#endif
1369

1370
#ifdef GEMM_LIKE_CONV_32_1_SIMD16
1371
#define TILE_M          1
1372
#define TILE_K          KERNEL_WIDTH
1373
#define TILE_N          32
1374

1375
__attribute__((intel_reqd_sub_group_size(16)))
1376
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
1377
{
1378
    __global Dtype *dst = dst_base + dst_offset;
1379
    const int group_x = get_group_id(0);
1380
    const int group_y = get_group_id(1);
1381
    const int global_x = get_global_id(0);
1382
    const int global_y = get_global_id(1);
1383
    const int global_z = get_global_id(2);
1384
    int interleaved_y;
1385
    int kernel_y;
1386
    int kernel_idx;
1387

1388
    // Result ctile (*dst) is M rows x N columns
1389
    // LWG size is 1x16.  Thus each thread calculates 16*M rows x N cols of ctile.
1390
    Dtype16  blockC00 = 0.f;
1391
    Dtype16  blockC10 = 0.f;
1392

1393
    // Src0 (patch input) is directly used as atile.
1394
    // Each work item points to the start of a different patch.
1395
    // atile is M rows x K columns.
1396
    int curr_x = ( global_y % output_width ) * STRIDE_X;
1397
    int curr_y = ( global_y / output_width ) * STRIDE_Y;
1398
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
1399
    int saved_y = curr_y;
1400
#endif
1401
    const __global Dtype *src0_read = src0
1402
     + aligned_input_size * global_z           // batch offset
1403
     + (curr_y - INPUT_PAD_H) * ROW_PITCH      // y offset
1404
     + curr_x - INPUT_PAD_W;                   // x offset
1405
     const __global Dtype *src0_read_orig = src0_read;
1406

1407
    // Src1 (filter) is directly used as btile.
1408
    // It starts at the top of src1 and walks down.
1409
    // btile is K rows x N columns.
1410
    const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2 );
1411

1412
#define DOT_PRODUCT_16( _result, _rowA, colB )    \
1413
    {   \
1414
        _result.s0 = mad( _rowA, sub_group_broadcast( colB,  0 ), _result.s0 );  \
1415
        _result.s1 = mad( _rowA, sub_group_broadcast( colB,  1 ), _result.s1 );  \
1416
        _result.s2 = mad( _rowA, sub_group_broadcast( colB,  2 ), _result.s2 );  \
1417
        _result.s3 = mad( _rowA, sub_group_broadcast( colB,  3 ), _result.s3 );  \
1418
        _result.s4 = mad( _rowA, sub_group_broadcast( colB,  4 ), _result.s4 );  \
1419
        _result.s5 = mad( _rowA, sub_group_broadcast( colB,  5 ), _result.s5 );  \
1420
        _result.s6 = mad( _rowA, sub_group_broadcast( colB,  6 ), _result.s6 );  \
1421
        _result.s7 = mad( _rowA, sub_group_broadcast( colB,  7 ), _result.s7 );  \
1422
        _result.s8 = mad( _rowA, sub_group_broadcast( colB,  8 ), _result.s8 );  \
1423
        _result.s9 = mad( _rowA, sub_group_broadcast( colB,  9 ), _result.s9 );  \
1424
        _result.sa = mad( _rowA, sub_group_broadcast( colB, 10 ), _result.sa );  \
1425
        _result.sb = mad( _rowA, sub_group_broadcast( colB, 11 ), _result.sb );  \
1426
        _result.sc = mad( _rowA, sub_group_broadcast( colB, 12 ), _result.sc );  \
1427
        _result.sd = mad( _rowA, sub_group_broadcast( colB, 13 ), _result.sd );  \
1428
        _result.se = mad( _rowA, sub_group_broadcast( colB, 14 ), _result.se );  \
1429
        _result.sf = mad( _rowA, sub_group_broadcast( colB, 15 ), _result.sf );  \
1430
    }
1431
    typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;
1432
    // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
1433
    // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
1434
    // and KERNEL_WIDTH/2 rows of interleaved filter.
1435
    int patch_depth = 0;
1436
    __attribute__((opencl_unroll_hint(1)))
1437
    do
1438
    {
1439
        int patch_row = 0;
1440
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
1441
        curr_y = saved_y;
1442
#endif
1443
        __attribute__((opencl_unroll_hint(1)))
1444
        do
1445
        {
1446
            // Load atile and btile.
1447
            // Kernel data is partially interleaved.  Every 2 rows are interleaved at Dtype16 granularity.
1448
            // The exception is that if KERNEL_WIDTH is odd the last row is not interleaved.  The non
1449
            // interleaved row is padded with zero to ensure same size as interleaved rows. This
1450
            // interleaving is done to ensure 0% GDR bank conflicts.  For example, this is how the
1451
            // kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3.
1452
            // (0, 0) (16, 0) (32, 0) (48, 0) ...     (0, 0) ( 0, 1) (16, 0) ( 0, 1) (32, 0) (0, 1) (48, 0) ...
1453
            // (0, 1) (16, 1) (32, 1) (48, 1) ... =>  (0, 2) (16, 2) (32, 2) (48, 2) ...
1454
            // (0, 2) (16, 2) (32, 2) (48, 2) ...     ...
1455
            // ...
1456
            const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
1457

1458
#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
1459
  #if KERNEL_WIDTH == 3
1460
            Dtype_t blockA00 = vload3(0, src0_read);
1461
            Dtype*  pblockA00 = (Dtype*)(&blockA00);
1462
  #else
1463
            Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[  0  ];
1464
            Dtype*  pblockA00 = (Dtype*)(&blockA00);
1465
  #endif
1466
#else
1467
            Dtype_t blockA00;
1468
            Dtype*  pblockA00 = (Dtype*)(&blockA00);
1469
            int pos = 0;
1470
            LOOP(KERNEL_WIDTH, pos,
1471
            {
1472
              if (curr_y >= INPUT_PAD_H &&
1473
                  curr_y < input_height + INPUT_PAD_H &&
1474
                  curr_x + pos * DILATION_X >= INPUT_PAD_W &&
1475
                  curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)
1476
                pblockA00[pos] = src0_read[pos * DILATION_X];
1477
              else
1478
                pblockA00[pos] = 0;
1479
            })
1480
            curr_y += DILATION_Y;
1481
#endif
1482
            src0_read += ROW_PITCH * DILATION_Y;
1483
            INT_TYPE blockB00[KERNEL_WIDTH * 2];
1484
            INT_TYPE4* p4BlockB00 = (INT_TYPE4*)blockB00;
1485
            INT_TYPE2* p2BlockB00 = (INT_TYPE2*)blockB00;
1486
            Dtype* pBlockB00  = (Dtype*)blockB00;
1487
            interleaved_y = 0;
1488
            LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
1489
            {
1490
                p4BlockB00[interleaved_y] = SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read );
1491
                src1_read += WIDTH1 * 2;
1492
            } )
1493
            if ( kernel_width_is_odd )
1494
            {
1495
                p2BlockB00[KERNEL_WIDTH - 1] = SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read );
1496
                src1_read += WIDTH1 * 2;
1497
            }
1498

1499
            // Perform MADs
1500
            kernel_idx = 0;
1501
            interleaved_y = 0;
1502
            LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
1503
            {
1504
                kernel_y = interleaved_y * 2;
1505
                DOT_PRODUCT_16( blockC00, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
1506
                DOT_PRODUCT_16( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
1507
                DOT_PRODUCT_16( blockC10, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
1508
                DOT_PRODUCT_16( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
1509
            } )
1510
            if ( kernel_width_is_odd )
1511
            {
1512
                kernel_y = interleaved_y * 2;
1513
                DOT_PRODUCT_16( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
1514
                DOT_PRODUCT_16( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
1515
            }
1516
        }
1517

1518
        //while( ++patch_row < 1 ); //debug
1519
        while( ++patch_row < KERNEL_HEIGHT );
1520

1521
        // reset to start of next slice of patch
1522
        src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
1523
    }
1524
    //while ( ++patch_depth < 1 );  //debug
1525
    while ( ++patch_depth < INPUT_DEPTH );
1526

1527
    // Dst resembles a cube of width x height x (output channel * batches).  Each tile writes:
1528
    // (SIMD * TILE_M) x 1 x TILE_N.  Partial writes most likely generated if padding used.
1529
    int out_offset = global_z * out_pitch_z                                        // batch offset
1530
     + ( group_x * TILE_N ) * out_pitch_y                                          // channel offset
1531
     + ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X  // y offset
1532
     + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT;                // x offset
1533
    __global Dtype *out = dst + out_offset;
1534

1535
#if APPLY_BIAS
1536
    Dtype bias[2];
1537
    Dtype2 *bias_vec;
1538
    bias_vec = (Dtype2*)bias;
1539
    *bias_vec = as_Dtype2(SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)biases_base + group_x * TILE_N));
1540
    if (group_x > 0xFFFFFFFEul) {
1541
      dst[0] = bias[0] + bias[1];
1542
    }
1543
#else
1544
    const Dtype bias[2] = {0, 0};
1545
#endif
1546
    INTERLEAVED_SIMD16_OUTPUT(dst, out_offset, 0);
1547
}
1548
#endif
1549

1550
#ifdef GEMM_LIKE_CONV_32_2_SIMD16
1551

1552
//////////////////////////////////////////////////////////////////////////////
1553
// Conv_Interleaved_32_2_SIMD16
1554
//
1555
// Convolution: each workitem computes 1 patch x 32 filters worth of output
1556
// data.
1557
#define TILE_M          2
1558
#define TILE_K          KERNEL_WIDTH
1559
#define TILE_N          32
1560

1561
__attribute__((intel_reqd_sub_group_size(16)))
1562
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
1563
{
1564
    __global Dtype *dst = dst_base + dst_offset;
1565
    const int group_x = get_group_id(0);
1566
    const int group_y = get_group_id(1);
1567
    const int global_x = get_global_id(0);
1568
    const int global_y = get_global_id(1);
1569
    const int global_z = get_global_id(2);
1570
    int interleaved_y;
1571
    int kernel_y;
1572
    int kernel_idx;
1573
#define DOT_PRODUCT_16( _result, _rowA, colB )    \
1574
    {   \
1575
        _result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 );  \
1576
        _result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 );  \
1577
        _result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 );  \
1578
        _result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 );  \
1579
        _result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 );  \
1580
        _result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 );  \
1581
        _result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 );  \
1582
        _result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 );  \
1583
        _result.s8 = mad( _rowA, sub_group_broadcast( colB, 8 ), _result.s8 );  \
1584
        _result.s9 = mad( _rowA, sub_group_broadcast( colB, 9 ), _result.s9 );  \
1585
        _result.sa = mad( _rowA, sub_group_broadcast( colB, 10 ), _result.sa );  \
1586
        _result.sb = mad( _rowA, sub_group_broadcast( colB, 11 ), _result.sb );  \
1587
        _result.sc = mad( _rowA, sub_group_broadcast( colB, 12 ), _result.sc );  \
1588
        _result.sd = mad( _rowA, sub_group_broadcast( colB, 13 ), _result.sd );  \
1589
        _result.se = mad( _rowA, sub_group_broadcast( colB, 14 ), _result.se );  \
1590
        _result.sf = mad( _rowA, sub_group_broadcast( colB, 15 ), _result.sf );  \
1591
    }
1592
        typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;
1593

1594
    // True for all threads if filter_width is multiple of TILE_N
1595
    // else, true for all but right-most column of threads.
1596
    {
1597
        // Result ctile (*dst) is M rows x N columns
1598
        // LWG size is 1x8.  Thus each thread calculates 8*M rows x N cols of ctile.
1599
        Dtype16  blockC00 = 0.f;
1600
        Dtype16  blockC10 = 0.f;
1601
        Dtype16  blockC01 = 0.f;
1602
        Dtype16  blockC11 = 0.f;
1603

1604
        // Src0 (patch input) is directly used as atile.
1605
        // Each work item points to the start of a different patch.
1606
        // atile is M rows x K columns.
1607
        int curr_x0 = ( ( global_y * TILE_M + 0 ) % output_width ) * STRIDE_X;
1608
        int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X;
1609
        int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y;
1610
        int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y;
1611
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
1612
        int saved_y0 = curr_y0;
1613
        int saved_y1 = curr_y1;
1614
#endif
1615
        const __global Dtype *src0_read0 = src0
1616
         + aligned_input_size * global_z         // batch offset
1617
         + (curr_y0 - INPUT_PAD_H) * ROW_PITCH   // y offset
1618
         + curr_x0 - INPUT_PAD_W;                // x offset
1619
        const __global Dtype *src0_read1 = src0
1620
         + aligned_input_size * global_z         // batch offset
1621
         + (curr_y1 - INPUT_PAD_H) * ROW_PITCH   // y offset
1622
         + curr_x1 - INPUT_PAD_W;                // x offset
1623

1624
        // Src1 (filter) is directly used as btile.
1625
        // It starts at the top of src1 and walks down.
1626
        // btile is K rows x N columns.
1627
        const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);
1628

1629
        // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
1630
        // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
1631
        // and KERNEL_WIDTH/2 rows of interleaved filter.
1632
        int patch_depth = 0;
1633
        do
1634
        {
1635
            int patch_row = 0;
1636
            do
1637
            {
1638
                // Load atile and btile.
1639
                // Kernel data is partially interleaved.  Every 2 rows are interleaved at Dtype8 granularity.
1640
                // The exception is that if KERNEL_WIDTH is odd the last row is not interleaved.  The non
1641
                // interleaved row is padded with zero to ensure same size as interleaved rows. This
1642
                // interleaving is done to ensure 0% GDR bank conflicts.  For example, this is how the
1643
                // kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3.
1644
                // (0, 0) (8, 0) (16, 0) (24, 0) ...       (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) ..
1645
                // (0, 1) (8, 1) (16, 1) (24, 1) ... =>    (0, 2) (8, 2) (16, 2) (24, 2) ...
1646
                // (0, 2) (8, 2) (16, 2) (24, 2) ...       ...
1647
                // ...
1648
                const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
1649
#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
1650
                Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[  0  ]; src0_read0 += ROW_PITCH;
1651
                Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[  0  ]; src0_read1 += ROW_PITCH;
1652
                Dtype*  pblockA00 = (Dtype*)(&blockA00);
1653
                Dtype*  pblockA01 = (Dtype*)(&blockA01);
1654
#else
1655
                Dtype_t blockA00;
1656
                Dtype*  pblockA00 = (Dtype*)(&blockA00);
1657
                int pos = 0;
1658
                LOOP(KERNEL_WIDTH, pos,
1659
                {
1660
                  if (curr_y0 >= INPUT_PAD_H &&
1661
                      curr_y0 < input_height + INPUT_PAD_H &&
1662
                      curr_x0 + pos * DILATION_X >= INPUT_PAD_W &&
1663
                      curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)
1664
                    pblockA00[pos] = src0_read0[pos * DILATION_X];
1665
                  else
1666
                    pblockA00[pos] = 0;
1667
                })
1668
                curr_y0 += DILATION_Y;
1669
                Dtype_t blockA01;
1670
                Dtype*  pblockA01 = (Dtype*)(&blockA01);
1671
                pos = 0;
1672
                LOOP(KERNEL_WIDTH, pos,
1673
                {
1674
                  if (curr_y1 >= INPUT_PAD_H &&
1675
                      curr_y1 < input_height + INPUT_PAD_H &&
1676
                      curr_x1 + pos * DILATION_X >= INPUT_PAD_W &&
1677
                      curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)
1678
                    pblockA01[pos] = src0_read1[pos * DILATION_X];
1679
                  else
1680
                    pblockA01[pos] = 0;
1681
                })
1682
                curr_y1 += DILATION_Y;
1683
                src0_read0 += (ROW_PITCH * DILATION_Y);
1684
                src0_read1 += (ROW_PITCH * DILATION_Y);
1685
#endif
1686
                Dtype blockB00[KERNEL_WIDTH*2];
1687
                Dtype4* p4BlockB00 = (Dtype4*)blockB00;
1688
                Dtype2* p2BlockB00 = (Dtype2*)blockB00;
1689
                Dtype*  pBlockB00 =  (Dtype* )blockB00;
1690

1691
                interleaved_y = 0;
1692
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
1693
                {
1694
                    p4BlockB00[interleaved_y] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
1695
                    src1_read += WIDTH1 * 2;
1696
                } )
1697
                if ( kernel_width_is_odd )
1698
                {
1699
                    p2BlockB00[KERNEL_WIDTH - 1] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
1700
                    src1_read += WIDTH1 * 2;
1701
                }
1702
                // Perform MADs
1703
                kernel_idx = 0;
1704
                interleaved_y = 0;
1705
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
1706
                {
1707
                    kernel_y = interleaved_y * 2;
1708
                    DOT_PRODUCT_16( blockC00, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
1709
                    DOT_PRODUCT_16( blockC01, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
1710
                    DOT_PRODUCT_16( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
1711
                    DOT_PRODUCT_16( blockC01, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
1712
                    DOT_PRODUCT_16( blockC10, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
1713
                    DOT_PRODUCT_16( blockC11, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
1714
                    DOT_PRODUCT_16( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
1715
                    DOT_PRODUCT_16( blockC11, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
1716
                } )
1717
                if ( kernel_width_is_odd )
1718
                {
1719
                    kernel_y = interleaved_y * 2;
1720
                    DOT_PRODUCT_16( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] );
1721
                    DOT_PRODUCT_16( blockC01, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
1722
                    DOT_PRODUCT_16( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] );
1723
                    DOT_PRODUCT_16( blockC11, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
1724
                }
1725
            }
1726

1727
            //while( ++patch_row < 1 ); //debug
1728
            while( ++patch_row < KERNEL_HEIGHT );
1729
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
1730
            curr_y0 = saved_y0;
1731
            curr_y1 = saved_y1;
1732
#endif
1733
            // reset to start of next slice of patch
1734
            src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y);
1735
            src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y);
1736
        }
1737
        //while ( ++patch_depth < 1 );  //debug
1738
        while ( ++patch_depth < INPUT_DEPTH );
1739

1740
        // Dst resembles a cube of width x height x (output channel * batches).  Each tile writes:
1741
        // (SIMD * TILE_M) x 1 x TILE_N.  Partial writes most likely generated if padding used.
1742
        int out0_offset = global_z * out_pitch_z                                           // batch offset
1743
         + ( group_x * TILE_N ) * out_pitch_y                                              // channel offset
1744
         + ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
1745
         + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT;                // x offset
1746
        int out1_offset = global_z * out_pitch_z                                           // batch offset
1747
         + ( group_x * TILE_N ) * out_pitch_y                                              // channel offset
1748
         + ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
1749
         + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT;                // x offset
1750

1751
#if APPLY_BIAS
1752
        Dtype bias[2];
1753
        Dtype2 *bias_vec;
1754
        bias_vec = (Dtype2*)bias;
1755
        *bias_vec = as_Dtype2(SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)biases_base + group_x * TILE_N));
1756
        if (group_x > 0xFFFFFFFEul) {
1757
          dst[0] = bias[0] + bias[1];
1758
        }
1759
#else
1760
        const Dtype bias[2] = {0, 0};
1761
#endif
1762
        INTERLEAVED_SIMD16_OUTPUT(dst, out0_offset, 0);
1763
        INTERLEAVED_SIMD16_OUTPUT(dst, out1_offset, 1);
1764
    }
1765
}
1766
#endif
1767

1768
#elif defined KERNEL_DWCONV
1769

1770
__kernel void DWCONV(
1771
    ELTWISE_DATA_ARG
1772
    FUSED_ARG
1773
    __global Dtype* image_data,
1774
    __global Dtype* kernel_data,
1775
    BIAS_KERNEL_ARG
1776
    __global Dtype* convolved_image_base,
1777
    const int convolved_image_offset,
1778
    const ushort input_width,
1779
    const ushort input_height,
1780
    const ushort output_width,
1781
    const ushort output_height) {
1782
  __global Dtype* convolved_image = convolved_image_base + convolved_image_offset;
1783
  const int outputX = get_global_id(0);
1784
  const int outputY = get_global_id(1);
1785
  const int outputZ = get_global_id(2);
1786
  if(outputX < output_width && outputY < output_height)
1787
  {
1788
    Dtype sum = 0.;
1789

1790
    const int org_y = outputY * STRIDE_Y - INPUT_PAD_H;
1791
    const int org_x = outputX * STRIDE_X - INPUT_PAD_W;
1792
    const int currentKernelOffset = KERNEL_SIZE*(outputZ%CHANNELS);
1793
    const int biasIndex=outputZ%CHANNELS;
1794
    const int local_image_offset = org_y*input_width + org_x;
1795
    const int imageSize = input_width*input_height;
1796

1797
    __global Dtype* image_dataPtrFloat = (image_data + (imageSize*outputZ + local_image_offset));
1798
    __global Dtype* kernel_dataPtrFloat = (kernel_data + (currentKernelOffset));
1799

1800
    for(int y = 0; y < KERNEL_H; y++)
1801
    {
1802
      for(int x = 0; x < KERNEL_W; x++)
1803
      {
1804
        if(!(org_y + y * DILATION_Y >= 0 && org_y + y * DILATION_Y < input_height && org_x + x * DILATION_X >= 0 && org_x + x * DILATION_X < input_width))
1805
        {
1806
          continue;
1807
        }
1808
        sum += image_dataPtrFloat[x * DILATION_X] * kernel_dataPtrFloat[x];
1809
      }
1810
      image_dataPtrFloat += input_width * DILATION_Y;
1811
      kernel_dataPtrFloat += KERNEL_W;
1812
    }
1813

1814
    #if APPLY_BIAS
1815
    int offset = outputZ*output_height*output_width + outputY*output_width + outputX;
1816
    ACTIVATION_FUNCTION(convolved_image, offset, sum + biases_base[biasIndex], biasIndex);
1817
    #else
1818
    int offset = outputZ*output_height*output_width + outputY*output_width + outputX;
1819
    ACTIVATION_FUNCTION(convolved_image, offset, sum, biasIndex);
1820
    #endif
1821
  }
1822
}
1823
#endif // KERNEL_BASIC/IDLF/GEMM_LIKE/DWCONV
1824

1825
Product

Resources

Company