// OpenCV DNN: OpenCL spatial convolution kernels (conv_layer_spatial.cl).
1
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
// ---------------------------------------------------------------------------
// Configuration macros shared by every convolution kernel variant in this
// file. (Defect fixed: the scraped line-number gutter interleaved between
// source lines has been removed so the file compiles again.)
// ---------------------------------------------------------------------------

// Enable half-precision arithmetic where the device supports it.
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

// Scalar type used for fused-activation kernel arguments.
#define KERNEL_ARG_DTYPE float
#define TYPE_FLOAT 1
#define TYPE_HALF 2

// Fused activation selection. ACTIVATION_RELU_FUNCTION(x, c) applies the
// activation chosen at build time via a FUSED_CONV_* define (c is the output
// channel index, used only by PRELU for its per-channel slope); FUSED_ARG
// injects the extra kernel parameter(s) that activation needs.
#if defined(FUSED_CONV_RELU)
#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (negative_slope)))
#define FUSED_ARG KERNEL_ARG_DTYPE negative_slope,
#elif defined(FUSED_CONV_PRELU)
#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (negative_slope[c])))
#define FUSED_ARG __global const KERNEL_ARG_DTYPE* negative_slope,
#elif defined(FUSED_CONV_POWER)
#define ACTIVATION_RELU_FUNCTION(x, c) pow(x, (Dtype)power)
#define FUSED_ARG KERNEL_ARG_DTYPE power,
#elif defined(FUSED_CONV_TANH)
#define ACTIVATION_RELU_FUNCTION(x, c) tanh(x)
#define FUSED_ARG
#elif defined(FUSED_CONV_RELU6)
#define ACTIVATION_RELU_FUNCTION(x, c) (clamp((Dtype)(x), (Dtype)min_value, (Dtype)max_value))
#define FUSED_ARG KERNEL_ARG_DTYPE min_value, KERNEL_ARG_DTYPE max_value,
#else
#define ACTIVATION_RELU_FUNCTION(x, c) (x)
#define FUSED_ARG
#endif

// ACTIVATION_FUNCTION(dst, offset, data, channel): apply the fused activation
// to `data` and store the result to dst[offset]. When FUSED_CONV_ELTWISE is
// defined, an element-wise addend read from eltwise_data[offset] is folded in
// before the activation.
#ifdef FUSED_CONV_ELTWISE
#define ACTIVATION_FUNCTION(_dst_, _offset_, _data_, _channel_) do { \
    const Dtype _x_ = eltwise_data[(_offset_)] + (_data_); \
    (_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \
} while(0)
#define ELTWISE_DATA_ARG __global Dtype* eltwise_data,
#else
#define ACTIVATION_FUNCTION(_dst_, _offset_, _data_, _channel_) do { \
    const Dtype _x_ = (_data_); \
    (_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \
} while(0)
#define ELTWISE_DATA_ARG
#endif

// Optional bias pointer in kernel signatures.
#if APPLY_BIAS
#define BIAS_KERNEL_ARG __global Dtype * biases_base,
#else
#define BIAS_KERNEL_ARG
#endif

// LOOP(N, VAR, STMT): compile-time unrolled loop that executes STMT N times,
// incrementing VAR after each iteration. N must be an integer literal 0..16.
#define __CAT(x, y) x##y
#define CAT(x, y) __CAT(x, y)
#define LOOP0(VAR, STMT)
#define LOOP1(VAR, STMT) (STMT); (VAR)++;
#define LOOP2(VAR, STMT) LOOP1(VAR, STMT); (STMT); (VAR)++;
#define LOOP3(VAR, STMT) LOOP2(VAR, STMT); (STMT); (VAR)++;
#define LOOP4(VAR, STMT) LOOP3(VAR, STMT); (STMT); (VAR)++;
#define LOOP5(VAR, STMT) LOOP4(VAR, STMT); (STMT); (VAR)++;
#define LOOP6(VAR, STMT) LOOP5(VAR, STMT); (STMT); (VAR)++;
#define LOOP7(VAR, STMT) LOOP6(VAR, STMT); (STMT); (VAR)++;
#define LOOP8(VAR, STMT) LOOP7(VAR, STMT); (STMT); (VAR)++;
#define LOOP9(VAR, STMT) LOOP8(VAR, STMT); (STMT); (VAR)++;
#define LOOP10(VAR, STMT) LOOP9(VAR, STMT); (STMT); (VAR)++;
#define LOOP11(VAR, STMT) LOOP10(VAR, STMT); (STMT); (VAR)++;
#define LOOP12(VAR, STMT) LOOP11(VAR, STMT); (STMT); (VAR)++;
#define LOOP13(VAR, STMT) LOOP12(VAR, STMT); (STMT); (VAR)++;
#define LOOP14(VAR, STMT) LOOP13(VAR, STMT); (STMT); (VAR)++;
#define LOOP15(VAR, STMT) LOOP14(VAR, STMT); (STMT); (VAR)++;
#define LOOP16(VAR, STMT) LOOP15(VAR, STMT); (STMT); (VAR)++;
#define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT))

// Sub-group block-read aliases: 16-bit (ushort) reads when Dtype is half,
// 32-bit (uint) reads otherwise.
#if defined(convolve_simd) || defined(Conv_Interleaved)
#if TYPE == TYPE_HALF
#define INT_TYPE ushort
#define INT_TYPE2 ushort2
#define INT_TYPE4 ushort4
#define INT_TYPE8 ushort8
#define SUB_GROUP_BLOCK_READ2 intel_sub_group_block_read_us2
#define SUB_GROUP_BLOCK_READ4 intel_sub_group_block_read_us4
#define SUB_GROUP_BLOCK_READ8 intel_sub_group_block_read_us8
#define SUB_GROUP_BLOCK_READ intel_sub_group_block_read_us
#else
#define INT_TYPE uint
#define INT_TYPE2 uint2
#define INT_TYPE4 uint4
#define INT_TYPE8 uint8
#define SUB_GROUP_BLOCK_READ2 intel_sub_group_block_read2
#define SUB_GROUP_BLOCK_READ4 intel_sub_group_block_read4
#define SUB_GROUP_BLOCK_READ8 intel_sub_group_block_read8
#define SUB_GROUP_BLOCK_READ intel_sub_group_block_read
#endif
#endif
133
134
#ifdef KERNEL_BASIC

// Reference (non-vectorized) direct convolution. Each work-item computes one
// output pixel (outputX, outputY) for ZPAR consecutive output channels
// starting at kernelNum, walking all input channels and filter taps, skipping
// taps that fall in the zero padding around the image.
__kernel void ConvolveBasic(
    ELTWISE_DATA_ARG
    FUSED_ARG
    __global Dtype* image_data,
    int image_offset,
    __global Dtype* kernel_data,
    int kernel_offset,
    __global Dtype* bias,
    const int bias_offset,
    __global Dtype* convolved_image_base,
    const int convolved_image_base_offset,
    const int convolved_image_offset,
    const ushort input_width,
    const ushort input_height,
    const ushort output_width,
    const ushort output_height,
    const ushort pad_w,
    const ushort pad_h
)
{
    __global Dtype* convolved_image = convolved_image_base + convolved_image_base_offset;
    const int outputX = get_global_id(0);
    const int outputY = get_global_id(1);
    const int kernelNum = get_global_id(2) * ZPAR;
    if (outputX < output_width && outputY < output_height)
    {
        // One accumulator per output channel handled by this work-item.
        Dtype sum[ZPAR];
        for (int kern = 0; kern < ZPAR; kern++)
        {
            sum[kern] = 0.0f;
        }
        // Top-left corner of the receptive field in input coordinates
        // (may be negative inside the padding).
        const int org_y = outputY * STRIDE_Y - pad_h;
        const int org_x = outputX * STRIDE_X - pad_w;
        const int currentKernelOffset = kernel_offset + kernelNum*KERNEL_HEIGHT*KERNEL_WIDTH*CHANNELS;
#if APPLY_BIAS
        const int biasIndex = bias_offset + kernelNum;
#endif
        const int local_image_offset = org_y * input_width + org_x;
        const int imageSize = input_width * input_height;
        __global Dtype* image_dataPtr = (image_data + (image_offset + local_image_offset));
        __global Dtype* kernel_dataPtr = (kernel_data + (currentKernelOffset));
        for (int c = 0; c < CHANNELS; c++)
        {
            for (int y = 0; y < KERNEL_HEIGHT; y++)
            {
                for (int x = 0; x < KERNEL_WIDTH; x++)
                {
                    int y_ = org_y + y * DILATION_Y;
                    int x_ = org_x + x * DILATION_X;
                    // Skip taps that land in the zero padding outside the image.
                    if (!(y_ >= 0 && y_ < input_height && x_ >= 0 && x_ < input_width))
                    {
                        continue;
                    }
                    for (int kern = 0; kern < ZPAR; kern++)
                    {
                        sum[kern] += image_dataPtr[x * DILATION_X] * kernel_dataPtr[kern*KERNEL_HEIGHT*KERNEL_WIDTH*CHANNELS + x];
                    }
                }
                image_dataPtr += input_width * DILATION_Y;
                kernel_dataPtr += KERNEL_WIDTH;
            }
            // Advance to the same window in the next input channel.
            image_dataPtr += imageSize - input_width*KERNEL_HEIGHT*DILATION_Y;
        }

        for (int kern = 0; kern < ZPAR; kern++)
        {
            if (kernelNum + kern < OUTPUT_Z)
            {
                int offset = convolved_image_offset + (kernelNum+kern)*output_height*output_width + outputY*output_width + outputX;
#if APPLY_BIAS
                ACTIVATION_FUNCTION(convolved_image, offset, sum[kern] + bias[biasIndex + kern], biasIndex + kern);
#else
                // BUGFIX: biasIndex is only declared when APPLY_BIAS is set, so the
                // original `biasIndex + kern` channel argument failed to compile for
                // APPLY_BIAS=0 combined with FUSED_CONV_PRELU (the only fusion that
                // expands the channel argument). Use the output-channel index directly.
                ACTIVATION_FUNCTION(convolved_image, offset, sum[kern], kernelNum + kern);
#endif
            }
        }
    }
}
214
215
#elif defined KERNEL_IDLF

// Each work-item computes a OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT region of one output map.
// Each work-group (which will be mapped to 1 SIMD16/SIMD8 EU thread) will compute 16/8
// different feature maps, but each feature map is for the same region of the input image.
// NDRange: (output_width+pad)/OUT_BLOCK_WIDTH, (output_height+pad)/OUT_BLOCK_HEIGHT, NUM_FILTERS/OUT_BLOCK_DEPTH

// NOTE: for beignet this reqd_work_group_size does not guarantee that SIMD16 mode will be
// used; the compiler could choose to use two SIMD8 threads, and if that happens the code
// will break.
__attribute__((reqd_work_group_size(1, 1, SIMD_SIZE)))
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
__kernel void
convolve_simd(
    ELTWISE_DATA_ARG
    FUSED_ARG
    __global Dtype* inputs,
    __global Dtype* weights,
    BIAS_KERNEL_ARG
    __global Dtype* outputs_base,
    const int outputs_offset,
    const ushort input_width,
    const ushort input_height,
    const ushort output_width,
    const ushort output_height)
{
  __global Dtype* outputs = outputs_base + outputs_offset;
  unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH;  // oc = Output Column
  unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row
  unsigned int fm = get_global_id(2);                    // fm = Feature Map = od = Output Depth
  unsigned int fmg = get_group_id(2);
  unsigned int lid = get_local_id(2);

  Dtype out[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT] = { 0.0f };

  // find weights address of given neuron (lid is index)
  unsigned int weight_addr = (fmg % FILTERS_IN_GROUP) *
                             INPUT_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE + lid;

  unsigned int num_in_batch = fm / ALIGNED_NUM_FILTERS;

  unsigned int input_batch_offset = num_in_batch * INPUT_PITCH * TOTAL_INPUT_DEPTH_SIZE;

  int curr_y = or * STRIDE_Y;
  int curr_x = oc * STRIDE_X + lid;

  int in_addr = input_batch_offset
                + (curr_y - INPUT_PAD_H) * INPUT_WIDTH  // y tile offset
                + curr_x - INPUT_PAD_W;                 // x tile offset

  // Last valid index into `inputs`; out-of-range reads are clamped to it.
  const int in_limit = (get_global_size(2) / ALIGNED_NUM_FILTERS) * TOTAL_INPUT_DEPTH_SIZE * INPUT_PITCH - 1;

  Dtype in_buf[INVEC_SIZE];

  for(int kd = 0; kd < INPUT_DEPTH; kd++)
  {
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
    // Padded path: rows/columns that fall in the padding are loaded as zero.
    const bool cx_out_of_range = !(curr_x >= INPUT_PAD_W && curr_x < INPUT_WIDTH + INPUT_PAD_W);
    int in_offset = in_addr;
    __attribute__((opencl_unroll_hint(INVEC_SIZE)))
    for (int reg = 0; reg < INVEC_SIZE; reg++, in_offset += INPUT_WIDTH)
    {
      Dtype input = inputs[clamp(in_offset, 0, in_limit)];
      int cy = curr_y + reg;
      in_buf[reg] = (cx_out_of_range || cy < INPUT_PAD_H || cy >= INPUT_HEIGHT + INPUT_PAD_H) ? 0 : input;
    }
#else
    int in_offset = in_addr;
    __attribute__((opencl_unroll_hint(INVEC_SIZE)))
    for (int reg = 0; reg < INVEC_SIZE; reg++, in_offset += INPUT_WIDTH)
    {
      in_buf[reg] = inputs[min(in_offset, in_limit)];
    }
#endif

    in_addr += INPUT_PITCH;

// Read row n of the cached input tile from lane c of the sub-group.
#define BLOCK_IN(n, c) intel_sub_group_shuffle(in_buf[n], (c))

    int kr = 0; // kr = Kernel Row
    LOOP(KERNEL_HEIGHT, kr, // LOOP is a macro that unrolls the loop.
    {
      int kc = 0; // kc = Kernel Column
      LOOP(KERNEL_WIDTH, kc,
      {
        Dtype weight_value = weights[weight_addr];
        weight_addr += SIMD_SIZE;
        for (int br=0; br < OUT_BLOCK_HEIGHT; br++)
        {
          for(int bc=0; bc < OUT_BLOCK_WIDTH; bc++)
          {
            Dtype input = BLOCK_IN((br * STRIDE_Y + kr * DILATION_Y), bc * STRIDE_X + kc * DILATION_X);
            out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_value, input, out[br * OUT_BLOCK_WIDTH + bc]);
          }
        }
      });
    });
  }

  fm = fm % ALIGNED_NUM_FILTERS;

#if LEFT_FILTERS > 0
  if (fm < NUM_FILTERS)
#endif
  {
    unsigned int out_addr = (num_in_batch * TOTAL_OUTPUT_DEPTH + fm) * OUTPUT_PITCH;
    out_addr += or * output_width + oc;
    // we need this address calculation for biases because we support views and batching
#if APPLY_BIAS
    Dtype bias = biases_base[fm];
#else
    Dtype bias = 0;
#endif

    for(unsigned int r = 0; r < OUT_BLOCK_HEIGHT; r++)
    {
      if (r + or >= output_height) break;
      for(unsigned int c = 0; c < OUT_BLOCK_WIDTH; c++)
      {
        if (c + oc >= output_width) break;
        // this does a scattered write to SIMD_SIZE different feature maps,
        // so that data within one map is contiguous, thus ready for input to next layer.
        ACTIVATION_FUNCTION(outputs, out_addr + r * output_width + c, bias + out[r * OUT_BLOCK_WIDTH + c], fm);
      }
    }
  }
}
339
340
#elif defined KERNEL_GEMM_LIKE

// Broadcast element i of the per-thread bias block k across the sub-group.
#if APPLY_BIAS
#define SUBGROUP_GET_BIAS(k, i) intel_sub_group_shuffle(bias[k], i)
#else
#define SUBGROUP_GET_BIAS(k, i) ((Dtype)0)
#endif

#ifdef Conv_Interleaved
// Struct "vector" types for element counts OpenCL has no built-in vector for;
// selected via CAT(Dtype, KERNEL_WIDTH) to hold one row of an input patch.
typedef struct float1 { float s0; } float1;
typedef struct float5 { float s0; float s1; float s2; float s3; float s4; } float5;
typedef struct float6 { float s0; float s1; float s2; float s3; float s4; float s5; } float6;
typedef struct float7 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; } float7;
typedef struct float9 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; float s7; float s8; } float9;
typedef struct float10 { float s0; float s1; float s2; float s3; float s4; float s5;
                         float s6; float s7; float s8; float s9;} float10;
typedef struct float11 { float s0; float s1; float s2; float s3; float s4; float s5;
                         float s6; float s7; float s8; float s9; float sa;} float11;
typedef struct float12 { float s0; float s1; float s2; float s3; float s4; float s5;
                         float s6; float s7; float s8; float s9; float sa; float sb; } float12;
typedef struct float13 { float s0; float s1; float s2; float s3; float s4; float s5;
                         float s6; float s7; float s8; float s9; float sa; float sb; float sc;} float13;
typedef struct float14 { float s0; float s1; float s2; float s3; float s4; float s5;
                         float s6; float s7; float s8; float s9; float sa; float sb; float sc; float sd; } float14;
typedef struct float15 { float s0; float s1; float s2; float s3; float s4; float s5;
                         float s6; float s7; float s8; float s9; float sa; float sb; float sc; float sd; float se; } float15;
typedef struct float0 { float s0; } float0; //never used but makes compiler happy.

typedef struct half1 { half s0; } half1;
typedef struct half5 { half s0; half s1; half s2; half s3; half s4; } half5;
typedef struct half6 { half s0; half s1; half s2; half s3; half s4; half s5; } half6;
typedef struct half7 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; } half7;
typedef struct half9 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; half s8; } half9;
typedef struct half10 { half s0; half s1; half s2; half s3; half s4; half s5;
                        half s6; half s7; half s8; half s9; } half10;
typedef struct half11 { half s0; half s1; half s2; half s3; half s4; half s5;
                        half s6; half s7; half s8; half s9; half sa; } half11;
typedef struct half12 { half s0; half s1; half s2; half s3; half s4; half s5;
                        half s6; half s7; half s8; half s9; half sa; half sb; } half12;
typedef struct half13 { half s0; half s1; half s2; half s3; half s4; half s5;
                        half s6; half s7; half s8; half s9; half sa; half sb; half sc; } half13;
typedef struct half14 { half s0; half s1; half s2; half s3; half s4; half s5;
                        half s6; half s7; half s8; half s9; half sa; half sb; half sc; half sd; } half14;
typedef struct half15 { half s0; half s1; half s2; half s3; half s4; half s5;
                        half s6; half s7; half s8; half s9; half sa; half sb; half sc; half sd; half se; } half15;
typedef struct half0 { half s0; } half0; //never used but makes compiler happy.

#define OUT_PITCH_X output_width
#define ROW_PITCH input_width

// Common argument list shared by all GEMM-like convolution kernels below.
#define GEMM_LIKE_KERNEL_ARGS     \
    ELTWISE_DATA_ARG              \
    FUSED_ARG                     \
    const __global Dtype *src0,   \
    const __global Dtype *src1,   \
    BIAS_KERNEL_ARG               \
    __global Dtype *dst_base,     \
    const int dst_offset,         \
    const ushort input_width,     \
    const ushort input_height,    \
    const ushort output_width,    \
    const ushort output_height,   \
    const int out_pitch_y,        \
    const int out_pitch_z,        \
    const int aligned_input_size, \
    const int slice_pitch
#endif
407
408
#ifdef GEMM_LIKE_CONV_32_1
//////////////////////////////////////////////////////////////////////////////
// Conv_Interleaved_32_1_flex
//
// Convolution: each workitem computes 1 patch x 32 filters worth of output
// data.  Kernel's inner loop works on a single tile consisting of one
// row from each patch and the filter data corresponding to that row.  Filter
// matrix is interleaved to reduce GRF bank conflicts.  Patches are walked
// by rows and then by slices.  Relies on sub_group extension for block
// reads and SIMD broadcast.  Allows flexible sizing of TILE width (TILE_N)
// by dynamically selecting one of two code paths: one uses TILE_N = 32 and
// the other uses TILE_N = 8, 16, or 24.
#define TILE_M 1
#define TILE_K KERNEL_WIDTH
#define TILE_N 32

__attribute__((intel_reqd_sub_group_size(8)))
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
{
    __global Dtype *dst = dst_base + dst_offset;
    const int group_x = get_group_id(0);
    const int group_y = get_group_id(1);
    const int global_x = get_global_id(0);
    const int global_y = get_global_id(1);
    const int global_z = get_global_id(2);
    int interleaved_y;
    int kernel_y;
    int kernel_idx;

// Accumulate one row of A against 8 columns of B broadcast from the sub-group.
#define DOT_PRODUCT_8( _result, _rowA, colB )    \
    {   \
        _result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 );  \
        _result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 );  \
        _result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 );  \
        _result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 );  \
        _result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 );  \
        _result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 );  \
        _result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 );  \
        _result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 );  \
    }
    typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;

    // True for all threads if filter_width is multiple of TILE_N
    // else, true for all but right-most column of threads.
    if( TILE_N_LAST == 0 || global_x < WIDTH1 / TILE_N )
    {
        // Result ctile (*dst) is M rows x N columns
        // LWG size is 1x8.  Thus each thread calculates 8*M rows x N cols of ctile.
        Dtype8 blockC00 = 0.f;
        Dtype8 blockC10 = 0.f;
        Dtype8 blockC20 = 0.f;
        Dtype8 blockC30 = 0.f;

        // Src0 (patch input) is directly used as atile.
        // Each work item points to the start of a different patch.
        // atile is M rows x K columns.
        int curr_x = ( global_y % output_width ) * STRIDE_X;
        int curr_y = ( global_y / output_width ) * STRIDE_Y;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
        int saved_y = curr_y;
#endif
        const __global Dtype *src0_read = src0
          + aligned_input_size * global_z        // batch offset
          + (curr_y - INPUT_PAD_H) * ROW_PITCH   // y offset
          + (curr_x - INPUT_PAD_W);              // x offset

        // Src1 (filter) is directly used as btile.
        // It starts at the top of src1 and walks down.
        // btile is K rows x N columns.
        const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);

        // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
        // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
        // and KERNEL_WIDTH/2 rows of interleaved filter.
        int patch_depth = 0;
        do
        {
            int patch_row = 0;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
            curr_y = saved_y;
#endif

            do
            {
                // Load atile and btile.
                // Kernel data is partially interleaved.  Every 2 rows are interleaved at Dtype8 granularity.
                // The exception is that if KERNEL_WIDTH is odd the last row is not interleaved.  The non
                // interleaved row is padded with zero to ensure same size as interleaved rows. This
                // interleaving is done to ensure 0% GDR bank conflicts.  For example, this is how the
                // kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3.
                // (0, 0) (8, 0) (16, 0) (24, 0) ...       (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) ..
                // (0, 1) (8, 1) (16, 1) (24, 1) ... =>    (0, 2) (8, 2) (16, 2) (24, 2) ...
                // (0, 2) (8, 2) (16, 2) (24, 2) ...       ...
                // ...
                const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;

#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
#if KERNEL_WIDTH == 3
                Dtype_t blockA00 = vload3(0, src0_read);
                Dtype*  pblockA00 = (Dtype*)(&blockA00);
#else
                Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ];
                Dtype*  pblockA00 = (Dtype*)(&blockA00);
#endif
#else
                // Padded/dilated path: gather one patch row element by element,
                // substituting zero for taps that fall in the padding.
                Dtype_t blockA00;
                Dtype*  pblockA00 = (Dtype*)(&blockA00);
                int pos = 0;
                LOOP(KERNEL_WIDTH, pos,
                {
                    if (curr_y >= INPUT_PAD_H &&
                        curr_y < input_height + INPUT_PAD_H &&
                        curr_x + pos * DILATION_X >= INPUT_PAD_W &&
                        curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)
                        pblockA00[pos] = src0_read[pos * DILATION_X];
                    else
                        pblockA00[pos] = 0;
                })
                curr_y += DILATION_Y;
#endif
                src0_read += (ROW_PITCH * DILATION_Y);

                Dtype blockB00[KERNEL_WIDTH*4];
                Dtype8* p8BlockB00 = (Dtype8*)blockB00;
                Dtype4* p4BlockB00 = (Dtype4*)blockB00;
                Dtype*  pBlockB00 =  (Dtype* )blockB00;

                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
                    p8BlockB00[interleaved_y] = as_Dtype8( SUB_GROUP_BLOCK_READ8( (const __global INT_TYPE *)src1_read ) );
                    src1_read += WIDTH1 * 2;
                } )
                if ( kernel_width_is_odd )
                {
                    p4BlockB00[KERNEL_WIDTH - 1] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE *)src1_read ) );
                    src1_read += WIDTH1 * 2;
                }

                // Perform MADs
                kernel_idx = 0;
                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
                    kernel_y = interleaved_y * 2;
                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                } )
                kernel_y = interleaved_y * 2;
                if ( kernel_width_is_odd )
                {
                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                }
            }
            //while( ++patch_row < 1 ); //debug
            while( ++patch_row < KERNEL_HEIGHT );

            // reset to start of next slice of patch
            src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y);
        }
        //while ( ++patch_depth < 1 );  //debug
        while ( ++patch_depth < INPUT_DEPTH );

        // Dst resembles a cube of width x height x (output channel * batches).  Each tile writes:
        // (SIMD * TILE_M) x 1 x TILE_N.  Partial writes most likely generated if padding used.
        int out_offset = global_z * out_pitch_z                                        // batch offset
          + ( group_x * TILE_N ) * out_pitch_y                                         // channel offset
          + ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X // y offset
          + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT;               // x offset

        __global Dtype *out = dst + out_offset;
#if APPLY_BIAS
        Dtype bias[4];
        Dtype4 *bias_vec;
        bias_vec = (Dtype4*)bias;
        *bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
        // NOTE(review): this condition can never be true; it looks like a device to keep
        // the compiler from optimizing the bias block-read away — confirm before removing.
        if (group_x > 0xFFFFFFFEul) {
            dst[0] = bias[0] + bias[1] + bias[2] + bias[3];
        }
#else
        const Dtype bias[4] = {0, 0, 0, 0};
#endif
        if (global_y * TILE_M < output_width * output_height )
        {
            for (int i = 0; i < 8; i++)
            {
                ACTIVATION_FUNCTION(dst, out_offset + ( 0 + i ) * out_pitch_y, blockC00[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);
                ACTIVATION_FUNCTION(dst, out_offset + ( 8 + i ) * out_pitch_y, blockC10[i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + 8 + i);
                ACTIVATION_FUNCTION(dst, out_offset + ( 16 + i ) * out_pitch_y, blockC20[i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + 16 + i);
                ACTIVATION_FUNCTION(dst, out_offset + ( 24 + i ) * out_pitch_y, blockC30[i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + 24 + i);
            }
        }
    }
#if TILE_N_LAST > 0
    else
    {
        // Narrow path: right-most column of threads handles the remaining
        // TILE_N_LAST (8, 16, or 24) filters.
        // Result ctile (*dst) is M rows x N columns
        // LWG size is 1x8.  Thus each thread calculates 8*M rows x N cols of ctile.
        int i = 0;
        Dtype8 blockC[TILE_N_LAST_DIV8];
        LOOP(TILE_N_LAST_DIV8, i,
        {
            blockC[i] = 0.f;
        } )

        // Src0 (patch input) is directly used as atile.
        // Each work item points to the start of a different patch.
        // atile is M rows x K columns.
        int curr_x = ( global_y % output_width ) * STRIDE_X;
        int curr_y = ( global_y / output_width ) * STRIDE_Y;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
        int saved_y = curr_y;
#endif
        const __global Dtype *src0_read = src0
          + aligned_input_size * global_z        // batch offset
          + (curr_y - INPUT_PAD_H) * ROW_PITCH   // y offset
          + (curr_x - INPUT_PAD_W);              // x offset

        // Src1 (filter) is directly used as btile.
        // It starts at the top of src1 and walks down.
        // btile is K rows x N columns.
        const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);

        // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
        // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
        // and KERNEL_WIDTH/2 rows of interleaved filter.
        int patch_depth = 0;
        do
        {
            int patch_row = 0;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
            curr_y = saved_y;
#endif
            do
            {
                // Load atile and interleaved btile.
                const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
                Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ];
                Dtype*  pblockA00 = (Dtype*)(&blockA00);
#else
                Dtype_t blockA00;
                Dtype*  pblockA00 = (Dtype*)(&blockA00);
                int pos = 0;
                LOOP(KERNEL_WIDTH, pos,
                {
                    if (curr_y >= INPUT_PAD_H &&
                        curr_y < input_height + INPUT_PAD_H &&
                        curr_x + pos * DILATION_X >= INPUT_PAD_W &&
                        curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)
                        pblockA00[pos] = src0_read[pos * DILATION_X];
                    else
                        pblockA00[pos] = 0;
                })
                curr_y += DILATION_Y;
#endif
                src0_read += (ROW_PITCH * DILATION_Y);
                Dtype blockB[KERNEL_WIDTH * TILE_N_LAST_DIV8];

                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
#if TILE_N_LAST_DIV8 == 1
                    Dtype2* p2BlockB = (Dtype2* )blockB;
                    p2BlockB[interleaved_y] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
#elif TILE_N_LAST_DIV8 == 2
                    Dtype4* p4BlockB = (Dtype4* )blockB;
                    p4BlockB[interleaved_y] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
#elif TILE_N_LAST_DIV8 == 3
                    //TODO: broken.  No block_read6
                    Dtype6* p6BlockB = (Dtype6* )blockB;
                    (*((Dtype8*)(&p6BlockB[interleaved_y]))).s0123 = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
                    (*((Dtype8*)(&p6BlockB[interleaved_y]))).s45 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)(src1_read + 4 * 8) ) );
#endif
                    src1_read += WIDTH1 * 2;
                } )
                if ( kernel_width_is_odd )
                {
#if TILE_N_LAST_DIV8 == 1
                    Dtype* pBlockB = (Dtype* )blockB;
                    pBlockB[KERNEL_WIDTH - 1] = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*)src1_read ) );
#elif TILE_N_LAST_DIV8 == 2
                    Dtype2* p2BlockB = (Dtype2* )blockB;
                    p2BlockB[KERNEL_WIDTH - 1] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
#elif TILE_N_LAST_DIV8 == 3
                    Dtype3* p3BlockB = (Dtype3* )blockB;
                    p3BlockB[KERNEL_WIDTH - 1].s01 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
                    p3BlockB[KERNEL_WIDTH - 1].s2 = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*) (src1_read + 2 * 8) ) );
#endif
                    src1_read += WIDTH1 * 2;
                }

                // Perform MADs
                Dtype* pBlockB = (Dtype*)blockB;
                kernel_idx = 0;
                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
                    kernel_y = interleaved_y * 2;
                    DOT_PRODUCT_8( blockC[0], pblockA00[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC[0], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
#if TILE_N_LAST_DIV8 >= 2
                    DOT_PRODUCT_8( blockC[1], pblockA00[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC[1], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
#if TILE_N_LAST_DIV8 >= 3
                    DOT_PRODUCT_8( blockC[2], pblockA00[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC[2], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
#endif
#endif
                } )
                kernel_y = interleaved_y * 2;
                if ( kernel_width_is_odd )
                {
                    DOT_PRODUCT_8( blockC[0], pblockA00[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
#if TILE_N_LAST_DIV8 >= 2
                    DOT_PRODUCT_8( blockC[1], pblockA00[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
#if TILE_N_LAST_DIV8 >= 3
                    DOT_PRODUCT_8( blockC[2], pblockA00[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
#endif
#endif
                }
            }
            //while( ++patch_row < 1 ); //debug
            while( ++patch_row < KERNEL_HEIGHT );

            // reset to start of next slice of patch
            src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
        }
        //while ( ++patch_depth < 1 );  //debug
        while ( ++patch_depth < INPUT_DEPTH );

        // Dst resembles a cube of width x height x (output channel * batches).  Each tile writes:
        // (SIMD * TILE_M) x 1 x TILE_N.  Partial writes most likely generated if padding used.
        int out_offset = global_z * out_pitch_z                                        // batch offset
          + ( group_x * TILE_N ) * out_pitch_y                                         // channel offset
          + ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X // y offset
          + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT;               // x offset
        __global Dtype *out = dst + out_offset;
#if APPLY_BIAS
        Dtype bias[4];
        Dtype4 *bias_vec;
        bias_vec = (Dtype4*)bias;
        *bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
        // NOTE(review): this condition can never be true; it looks like a device to keep
        // the compiler from optimizing the bias block-read away — confirm before removing.
        if (group_x > 0xFFFFFFFEul) {
            dst[0] = bias[0] + bias[1] + bias[2] + bias[3];
        }
#else
        const Dtype bias[4] = {0, 0, 0, 0};
#endif

        if (global_y * TILE_M < output_width * output_height )
        {
            for (int i = 0; i < 8; i++)
            {
                if ( TILE_N_LAST_DIV8 > 0 )
                {
                    ACTIVATION_FUNCTION(dst, out_offset + ( 0+i) * out_pitch_y, blockC[0][i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);
                }
                if ( TILE_N_LAST_DIV8 > 1 )
                {
                    ACTIVATION_FUNCTION(dst, out_offset + ( 8+i) * out_pitch_y, blockC[1][i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 8);
                }
                if ( TILE_N_LAST_DIV8 > 2 )
                {
                    ACTIVATION_FUNCTION(dst, out_offset + (16+i) * out_pitch_y, blockC[2][i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + i + 16);
                }
                if ( TILE_N_LAST_DIV8 > 3 )
                {
                    ACTIVATION_FUNCTION(dst, out_offset + (24+i) * out_pitch_y, blockC[3][i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + i + 24);
                }
            }
        }
    }
#endif
}
#endif
796
#ifdef GEMM_LIKE_CONV_32_2

//////////////////////////////////////////////////////////////////////////////
// Conv_Interleaved_32_2_flex
//
// Convolution: each workitem computes 1 patch x 32 filters worth of output
// data. Kernel's inner loop works on a single tile consisting of one
// row from each patch and the filter data corresponding to that row. Filter
// matrix is interleaved to reduce GRF bank conflicts. Patches are walked
// by rows and then by slices. Relies on sub_group extension for block
// reads and SIMD broadcast. Allows flexible sizing of TILE width (TILE_N)
// by dynamically selecting one of two code paths: one uses TILE_N = 32 and
// the other uses TILE_N = 8, 16, or 24.
// TILE_M = 2: each workitem accumulates results for two consecutive output
// pixels (patch 0 and patch 1) per pass over the filter.
#define TILE_M 2
#define TILE_K KERNEL_WIDTH
#define TILE_N 32

__attribute__((intel_reqd_sub_group_size(8)))
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
{
    __global Dtype *dst = dst_base + dst_offset;
    const int group_x = get_group_id(0);
    const int group_y = get_group_id(1);
    const int global_x = get_global_id(0);
    const int global_y = get_global_id(1);
    const int global_z = get_global_id(2);
    int interleaved_y;
    int kernel_y;
    int kernel_idx;

    // Accumulate one input value (_rowA) against eight filter columns into a
    // Dtype8 accumulator. The eight column values live in colB across the
    // eight sub-group lanes, hence the sub_group_broadcast per component.
#define DOT_PRODUCT_8( _result, _rowA, colB ) \
    { \
        _result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); \
        _result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); \
        _result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); \
        _result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); \
        _result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); \
        _result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); \
        _result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); \
        _result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); \
    }
    typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;

    // True for all threads if filter_width is multiple of TILE_N
    // else, true for all but right-most column of threads.
    if( TILE_N_LAST == 0 || global_x < WIDTH1 / TILE_N )
    {
        // Result ctile (*dst) is M rows x N columns
        // LWG size is 1x8. Thus each thread calculates 8*M rows x N cols of ctile.
        // blockCmn holds filter group m (channels m*8 .. m*8+7) of patch n.
        Dtype8 blockC00 = 0.f;
        Dtype8 blockC10 = 0.f;
        Dtype8 blockC20 = 0.f;
        Dtype8 blockC30 = 0.f;
        Dtype8 blockC01 = 0.f;
        Dtype8 blockC11 = 0.f;
        Dtype8 blockC21 = 0.f;
        Dtype8 blockC31 = 0.f;

        // Src0 (patch input) is directly used as atile.
        // Each work item points to the start of a different patch.
        // atile is M rows x K columns.
        int curr_x0 = ( ( global_y * TILE_M + 0 ) % output_width ) * STRIDE_X;
        int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X;
        int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y;
        int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
        // Remember the starting row so it can be restored for each input slice.
        int saved_y0 = curr_y0;
        int saved_y1 = curr_y1;
#endif
        const __global Dtype *src0_read0 = src0
            + aligned_input_size * global_z         // batch offset
            + (curr_y0 - INPUT_PAD_H) * ROW_PITCH   // y offset
            + curr_x0 - INPUT_PAD_W;                // x offset
        const __global Dtype *src0_read1 = src0
            + aligned_input_size * global_z         // batch offset
            + (curr_y1 - INPUT_PAD_H) * ROW_PITCH   // y offset
            + curr_x1 - INPUT_PAD_W;                // x offset

        // Src1 (filter) is directly used as btile.
        // It starts at the top of src1 and walks down.
        // btile is K rows x N columns.
        const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);

        // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
        // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
        // and KERNEL_WIDTH/2 rows of interleaved filter.
        int patch_depth = 0;
        do
        {
            int patch_row = 0;
            do
            {
                // Load atile and btile.
                // Kernel data is partially interleaved. Every 2 rows are interleaved at Dtype8 granularity.
                // The exception is that if KERNEL_WIDTH is odd the last row is not interleaved. The non
                // interleaved row is padded with zero to ensure same size as interleaved rows. This
                // interleaving is done to ensure 0% GDR bank conflicts. For example, this is how the
                // kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3.
                // (0, 0) (8, 0) (16, 0) (24, 0) ...   (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) ..
                // (0, 1) (8, 1) (16, 1) (24, 1) ... => (0, 2) (8, 2) (16, 2) (24, 2) ...
                // (0, 2) (8, 2) (16, 2) (24, 2) ...   ...
                // ...
                const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
#if KERNEL_WIDTH == 3
                Dtype_t blockA00 = vload3(0, src0_read0); src0_read0 += ROW_PITCH;
                Dtype_t blockA01 = vload3(0, src0_read1); src0_read1 += ROW_PITCH;
                Dtype* pblockA00 = (Dtype*)(&blockA00);
                Dtype* pblockA01 = (Dtype*)(&blockA01);
#else
                // Generic width: gather one row per patch, zero-padding reads
                // that would run past the right edge of the input.
                Dtype_t blockA00 = { (Dtype)0.f };
                Dtype_t blockA01 = { (Dtype)0.f };
                Dtype* pblockA00 = (Dtype*)(&blockA00);
                Dtype* pblockA01 = (Dtype*)(&blockA01);
                int pos = 0;
                LOOP(KERNEL_WIDTH, pos,
                {
                    if (curr_x0 + pos < input_width)
                        pblockA00[pos] = src0_read0[pos];

                    if (curr_x1 + pos < input_width)
                        pblockA01[pos] = src0_read1[pos];
                })
                src0_read0 += ROW_PITCH;
                src0_read1 += ROW_PITCH;
#endif
#else
                // Padded/dilated path: fetch only pixels inside the valid
                // input region; everything else contributes zero.
                Dtype_t blockA00;
                Dtype* pblockA00 = (Dtype*)(&blockA00);
                int pos = 0;
                LOOP(KERNEL_WIDTH, pos,
                {
                    if (curr_y0 >= INPUT_PAD_H &&
                        curr_y0 < input_height + INPUT_PAD_H &&
                        curr_x0 + pos * DILATION_X >= INPUT_PAD_W &&
                        curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)
                        pblockA00[pos] = src0_read0[pos * DILATION_X];
                    else
                        pblockA00[pos] = 0;
                })
                curr_y0 += DILATION_Y;
                Dtype_t blockA01;
                Dtype* pblockA01 = (Dtype*)(&blockA01);
                pos = 0;
                LOOP(KERNEL_WIDTH, pos,
                {
                    if (curr_y1 >= INPUT_PAD_H &&
                        curr_y1 < input_height + INPUT_PAD_H &&
                        curr_x1 + pos * DILATION_X >= INPUT_PAD_W &&
                        curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)
                        pblockA01[pos] = src0_read1[pos * DILATION_X];
                    else
                        pblockA01[pos] = 0;
                })
                curr_y1 += DILATION_Y;
                src0_read0 += (ROW_PITCH * DILATION_Y);
                src0_read1 += (ROW_PITCH * DILATION_Y);
#endif
                Dtype blockB00[KERNEL_WIDTH*4];
                Dtype8* p8BlockB00 = (Dtype8*)blockB00;
                Dtype4* p4BlockB00 = (Dtype4*)blockB00;
                Dtype* pBlockB00 = (Dtype* )blockB00;

                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
                    p8BlockB00[interleaved_y] = as_Dtype8( SUB_GROUP_BLOCK_READ8( (const __global INT_TYPE*)src1_read ) );
                    src1_read += WIDTH1 * 2;
                } )
                if ( kernel_width_is_odd )
                {
                    p4BlockB00[KERNEL_WIDTH - 1] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
                    src1_read += WIDTH1 * 2;
                }
                // Perform MADs
                kernel_idx = 0;
                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
                    kernel_y = interleaved_y * 2;
                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC01, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC01, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC11, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC11, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC21, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC21, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC31, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC31, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                } )
                if ( kernel_width_is_odd )
                {
                    kernel_y = interleaved_y * 2;
                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC01, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC11, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC21, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC31, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                }
            }
            //while( ++patch_row < 1 ); //debug
            while( ++patch_row < KERNEL_HEIGHT );
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
            curr_y0 = saved_y0;
            curr_y1 = saved_y1;
#endif
            // reset to start of next slice of patch
            src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
            src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
        }
        //while ( ++patch_depth < 1 ); //debug
        while ( ++patch_depth < INPUT_DEPTH );

        // Dst resembles a cube of width x height x (output channel * batches). Each tile writes:
        // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used.
        int out0_offset = global_z * out_pitch_z                                                   // batch offset
            + ( group_x * TILE_N ) * out_pitch_y                                                   // channel offset
            + ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X      // y offset
            + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT;                     // x offset
        int out1_offset = global_z * out_pitch_z                                                   // batch offset
            + ( group_x * TILE_N ) * out_pitch_y                                                   // channel offset
            + ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X      // y offset
            + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT;                     // x offset

#if APPLY_BIAS
        Dtype bias[4];
        Dtype4 *bias_vec;
        bias_vec = (Dtype4*)bias;
        *bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
        // NOTE(review): this guard is never true for valid (non-negative)
        // group ids; presumably it only prevents the compiler from optimizing
        // away the bias block read -- confirm before removing.
        if (group_x > 0xFFFFFFFEul) {
            dst[0] = bias[0] + bias[1] + bias[2] + bias[3];
        }
#else
        const Dtype bias[4] = {0, 0, 0, 0};
#endif

        if( global_y * TILE_M < output_width * output_height )
        {
            for( int i = 0; i < 8; i++ )
            {
                ACTIVATION_FUNCTION(dst, out0_offset + ( 0+i) * out_pitch_y, blockC00[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);
                ACTIVATION_FUNCTION(dst, out0_offset + ( 8+i) * out_pitch_y, blockC10[i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 8);
                ACTIVATION_FUNCTION(dst, out0_offset + (16+i) * out_pitch_y, blockC20[i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + i + 16);
                ACTIVATION_FUNCTION(dst, out0_offset + (24+i) * out_pitch_y, blockC30[i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + i + 24);
            }
        }
        if( global_y * TILE_M + 1 < output_width * output_height )
        {
            for( int i = 0; i < 8; i++ )
            {
                ACTIVATION_FUNCTION(dst, out1_offset + ( 0+i) * out_pitch_y, blockC01[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);
                ACTIVATION_FUNCTION(dst, out1_offset + ( 8+i) * out_pitch_y, blockC11[i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 8);
                ACTIVATION_FUNCTION(dst, out1_offset + (16+i) * out_pitch_y, blockC21[i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + i + 16);
                ACTIVATION_FUNCTION(dst, out1_offset + (24+i) * out_pitch_y, blockC31[i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + i + 24);
            }
        }
    }
#if TILE_N_LAST > 0
    else
    {
        // Narrow-tail path: the right-most thread column handles only the
        // remaining TILE_N_LAST (8, 16, or 24) filters.

        // Result ctile (*dst) is M rows x N columns
        // LWG size is 1x8. Thus each thread calculates 8*M rows x N cols of ctile.
        int i = 0;
        Dtype8 blockC0[TILE_N_LAST_DIV8];
        Dtype8 blockC1[TILE_N_LAST_DIV8];
        LOOP(TILE_N_LAST_DIV8, i,
        {
            blockC0[i] = 0.f;
            blockC1[i] = 0.f;
        } )

        // Src0 (patch input) is directly used as atile.
        // Each work item points to the start of a different patch.
        // atile is M rows x K columns.
        int curr_x0 = ( ( global_y * TILE_M + 0 ) % output_width ) * STRIDE_X;
        int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X;
        int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y;
        int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
        int saved_y0 = curr_y0;
        int saved_y1 = curr_y1;
#endif
        const __global Dtype *src0_read0 = src0
            + aligned_input_size * global_z         // batch offset
            + (curr_y0 - INPUT_PAD_H) * ROW_PITCH   // y offset
            + curr_x0 - INPUT_PAD_W;                // x offset
        const __global Dtype *src0_read1 = src0
            + aligned_input_size * global_z         // batch offset
            + (curr_y1 - INPUT_PAD_H) * ROW_PITCH   // y offset
            + curr_x1 - INPUT_PAD_W;                // x offset

        // Src1 (filter) is directly used as btile.
        // It starts at the top of src1 and walks down.
        // btile is K rows x N columns.
        const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);

        // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
        // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
        // and KERNEL_WIDTH/2 rows of interleaved filter.
        int patch_depth = 0;
        do
        {
            int patch_row = 0;
            do
            {
                // Load atile and interleaved btile.
                const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
                Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH;
                Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH;
                Dtype* pblockA00 = (Dtype*)(&blockA00);
                Dtype* pblockA01 = (Dtype*)(&blockA01);
#else
                // Padded/dilated path: fetch only in-bounds pixels, zero-fill
                // the rest.
                Dtype_t blockA00;
                Dtype* pblockA00 = (Dtype*)(&blockA00);
                int pos = 0;
                LOOP(KERNEL_WIDTH, pos,
                {
                    if (curr_y0 >= INPUT_PAD_H &&
                        curr_y0 < input_height + INPUT_PAD_H &&
                        curr_x0 + pos * DILATION_X >= INPUT_PAD_W &&
                        curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)
                        pblockA00[pos] = src0_read0[pos * DILATION_X];
                    else
                        pblockA00[pos] = 0;
                })
                curr_y0 += DILATION_Y;
                Dtype_t blockA01;
                Dtype* pblockA01 = (Dtype*)(&blockA01);
                pos = 0;
                LOOP(KERNEL_WIDTH, pos,
                {
                    if (curr_y1 >= INPUT_PAD_H &&
                        curr_y1 < input_height + INPUT_PAD_H &&
                        curr_x1 + pos * DILATION_X >= INPUT_PAD_W &&
                        curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)
                        pblockA01[pos] = src0_read1[pos * DILATION_X];
                    else
                        pblockA01[pos] = 0;
                })
                curr_y1 += DILATION_Y;
                src0_read0 += (ROW_PITCH * DILATION_Y);
                src0_read1 += (ROW_PITCH * DILATION_Y);
#endif
                Dtype blockB[KERNEL_WIDTH * TILE_N_LAST_DIV8];

                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
#if TILE_N_LAST_DIV8 == 1
                    Dtype2* p2BlockB = (Dtype2* )blockB;
                    p2BlockB[interleaved_y] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
#elif TILE_N_LAST_DIV8 == 2
                    Dtype4* p4BlockB = (Dtype4* )blockB;
                    p4BlockB[interleaved_y] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
#elif TILE_N_LAST_DIV8 == 3
                    //TODO: broken. No block_read6
                    Dtype6* p6BlockB = (Dtype6* )blockB;
                    (*((Dtype8*)(&p6BlockB[interleaved_y]))).s0123 = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
                    (*((Dtype8*)(&p6BlockB[interleaved_y]))).s45 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)(src1_read + 4 * 8) ) );
#endif
                    src1_read += WIDTH1 * 2;
                } )
                if ( kernel_width_is_odd )
                {
#if TILE_N_LAST_DIV8 == 1
                    Dtype* pBlockB = (Dtype* )blockB;
                    pBlockB[KERNEL_WIDTH - 1] = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*)src1_read ) );
#elif TILE_N_LAST_DIV8 == 2
                    Dtype2* p2BlockB = (Dtype2* )blockB;
                    p2BlockB[KERNEL_WIDTH - 1] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
#elif TILE_N_LAST_DIV8 == 3
                    Dtype3* p3BlockB = (Dtype3* )blockB;
                    p3BlockB[KERNEL_WIDTH - 1].s01 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
                    // FIX: the .s01 block read above spans 2 * 8 = 16 filter
                    // values at SIMD8, so the third 8-wide column group starts
                    // at offset 2 * 8 (was "+ 8", which re-read the second
                    // group). Matches the TILE_N_LAST_DIV8 == 3 path of the
                    // TILE_M = 1 kernel.
                    p3BlockB[KERNEL_WIDTH - 1].s2 = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*) (src1_read + 2 * 8) ) );
#endif
                    src1_read += WIDTH1 * 2;
                }

                // Perform MADs
                Dtype* pBlockB = (Dtype*)blockB;
                kernel_idx = 0;
                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
                    kernel_y = interleaved_y * 2;
                    DOT_PRODUCT_8( blockC0[0], pblockA00[kernel_y    ], pBlockB[kernel_idx] );
                    DOT_PRODUCT_8( blockC1[0], pblockA01[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC0[0], pblockA00[kernel_y + 1], pBlockB[kernel_idx] );
                    DOT_PRODUCT_8( blockC1[0], pblockA01[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
#if TILE_N_LAST_DIV8 >= 2
                    DOT_PRODUCT_8( blockC0[1], pblockA00[kernel_y    ], pBlockB[kernel_idx] );
                    DOT_PRODUCT_8( blockC1[1], pblockA01[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC0[1], pblockA00[kernel_y + 1], pBlockB[kernel_idx] );
                    DOT_PRODUCT_8( blockC1[1], pblockA01[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
#if TILE_N_LAST_DIV8 >= 3
                    DOT_PRODUCT_8( blockC0[2], pblockA00[kernel_y    ], pBlockB[kernel_idx] );
                    DOT_PRODUCT_8( blockC1[2], pblockA01[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC0[2], pblockA00[kernel_y + 1], pBlockB[kernel_idx] );
                    DOT_PRODUCT_8( blockC1[2], pblockA01[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
#endif
#endif
                } )
                kernel_y = interleaved_y * 2;
                if ( kernel_width_is_odd )
                {
                    DOT_PRODUCT_8( blockC0[0], pblockA00[kernel_y], pBlockB[kernel_idx] );
                    DOT_PRODUCT_8( blockC1[0], pblockA01[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
#if TILE_N_LAST_DIV8 >= 2
                    DOT_PRODUCT_8( blockC0[1], pblockA00[kernel_y], pBlockB[kernel_idx] );
                    DOT_PRODUCT_8( blockC1[1], pblockA01[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
#if TILE_N_LAST_DIV8 >= 3
                    DOT_PRODUCT_8( blockC0[2], pblockA00[kernel_y], pBlockB[kernel_idx] );
                    DOT_PRODUCT_8( blockC1[2], pblockA01[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
#endif
#endif
                }
            }
            //while( ++patch_row < 1 ); //debug
            while( ++patch_row < KERNEL_HEIGHT );
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
            curr_y0 = saved_y0;
            curr_y1 = saved_y1;
#endif
            // reset to start of next slice of patch
            src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
            src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
        }
        //while ( ++patch_depth < 1 ); //debug
        while ( ++patch_depth < INPUT_DEPTH );

        // Dst resembles a cube of width x height x (output channel * batches). Each tile writes:
        // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used.
        int out0_offset = global_z * out_pitch_z                                                   // batch offset
            + ( group_x * TILE_N ) * out_pitch_y                                                   // channel offset
            + ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X      // y offset
            + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT;                     // x offset
        int out1_offset = global_z * out_pitch_z                                                   // batch offset
            + ( group_x * TILE_N ) * out_pitch_y                                                   // channel offset
            + ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X      // y offset
            + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT;                     // x offset

#if APPLY_BIAS
        Dtype bias[4];
        Dtype4 *bias_vec;
        bias_vec = (Dtype4*)bias;
        *bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
        // NOTE(review): guard is never true for valid group ids; presumably a
        // workaround to keep the bias read alive -- confirm before removing.
        if (group_x > 0xFFFFFFFEul) {
            dst[0] = bias[0] + bias[1] + bias[2] + bias[3];
        }
#else
        const Dtype bias[4] = {0, 0, 0, 0};
#endif
        if( global_y * TILE_M < output_width * output_height )
        {
            for( int i = 0; i < 8; i++ )
            {
                if ( TILE_N_LAST_DIV8 > 0 )
                {
                    ACTIVATION_FUNCTION(dst, out0_offset + ( 0+i) * out_pitch_y, blockC0[0][i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);
                }
                if ( TILE_N_LAST_DIV8 > 1 )
                {
                    ACTIVATION_FUNCTION(dst, out0_offset + ( 8+i) * out_pitch_y, blockC0[1][i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 8);
                }
                if ( TILE_N_LAST_DIV8 > 2 )
                {
                    ACTIVATION_FUNCTION(dst, out0_offset + (16+i) * out_pitch_y, blockC0[2][i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + i + 16);
                }
                if ( TILE_N_LAST_DIV8 > 3 )
                {
                    ACTIVATION_FUNCTION(dst, out0_offset + (24+i) * out_pitch_y, blockC0[3][i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + i + 24);
                }
            }
        }
        if( global_y * TILE_M + 1 < output_width * output_height )
        {
            for( int i = 0; i < 8; i++ )
            {
                if ( TILE_N_LAST_DIV8 > 0 )
                {
                    ACTIVATION_FUNCTION(dst, out1_offset + ( 0+i) * out_pitch_y, blockC1[0][i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);
                }
                if ( TILE_N_LAST_DIV8 > 1 )
                {
                    ACTIVATION_FUNCTION(dst, out1_offset + ( 8+i) * out_pitch_y, blockC1[1][i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 8);
                }
                if ( TILE_N_LAST_DIV8 > 2 )
                {
                    ACTIVATION_FUNCTION(dst, out1_offset + (16+i) * out_pitch_y, blockC1[2][i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + i + 16);
                }
                if ( TILE_N_LAST_DIV8 > 3 )
                {
                    ACTIVATION_FUNCTION(dst, out1_offset + (24+i) * out_pitch_y, blockC1[3][i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + i + 24);
                }
            }
        }
    }
#endif
}
#endif
1311
1312
#if defined(GEMM_LIKE_CONV_32_2_SIMD16) || defined(GEMM_LIKE_CONV_32_1_SIMD16)
1313
#define INTERLEAVED_SIMD16_OUTPUT(_out_, _offset_, _m_) do {\
1314
if (global_y * TILE_M < output_width * output_height ) \
1315
{ \
1316
if ( ( OUT_DEPTH % TILE_N ) == 0 ) {\
1317
for (int i = 0; i < 16; i++) \
1318
{ \
1319
ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \
1320
ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_ [i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 16); \
1321
} \
1322
} \
1323
else if( ( OUT_DEPTH % 16 ) == 0 ) { \
1324
if ( ( global_x + 1 ) < get_global_size(0) ) { \
1325
for ( int i = 0; i < 16; i++ ) \
1326
{ \
1327
ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \
1328
ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_ [i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 16); \
1329
} \
1330
} \
1331
else { \
1332
for (int i = 0; i < 16; i++) \
1333
{ \
1334
ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \
1335
} \
1336
} \
1337
} \
1338
else { \
1339
if ( ( global_x + 1 ) < get_global_size(0) ) \
1340
{ \
1341
for ( int i = 0; i < 16; i++ ) \
1342
{ \
1343
ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \
1344
ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_[i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 16); \
1345
} \
1346
} \
1347
else { \
1348
if ( (OUT_DEPTH % TILE_N) > 16 ) { \
1349
for (int i = 0; i < 16 ; i++) \
1350
{ \
1351
ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \
1352
} \
1353
for (int i = 0; i < OUT_DEPTH % 16 ; i++) \
1354
{ \
1355
ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_[i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 16); \
1356
} \
1357
} \
1358
else { \
1359
for (int i = 0; i < OUT_DEPTH % 16 ; i++) \
1360
{ \
1361
ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \
1362
} \
1363
} \
1364
} \
1365
} \
1366
} \
1367
}while(0)
1368
#endif
1369
1370
#ifdef GEMM_LIKE_CONV_32_1_SIMD16
1371
#define TILE_M 1
1372
#define TILE_K KERNEL_WIDTH
1373
#define TILE_N 32
1374
1375
__attribute__((intel_reqd_sub_group_size(16)))
1376
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
1377
{
1378
__global Dtype *dst = dst_base + dst_offset;
1379
const int group_x = get_group_id(0);
1380
const int group_y = get_group_id(1);
1381
const int global_x = get_global_id(0);
1382
const int global_y = get_global_id(1);
1383
const int global_z = get_global_id(2);
1384
int interleaved_y;
1385
int kernel_y;
1386
int kernel_idx;
1387
1388
// Result ctile (*dst) is M rows x N columns
1389
// LWG size is 1x16. Thus each thread calculates 16*M rows x N cols of ctile.
1390
Dtype16 blockC00 = 0.f;
1391
Dtype16 blockC10 = 0.f;
1392
1393
// Src0 (patch input) is directly used as atile.
1394
// Each work item points to the start of a different patch.
1395
// atile is M rows x K columns.
1396
int curr_x = ( global_y % output_width ) * STRIDE_X;
1397
int curr_y = ( global_y / output_width ) * STRIDE_Y;
1398
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
1399
int saved_y = curr_y;
1400
#endif
1401
const __global Dtype *src0_read = src0
1402
+ aligned_input_size * global_z // batch offset
1403
+ (curr_y - INPUT_PAD_H) * ROW_PITCH // y offset
1404
+ curr_x - INPUT_PAD_W; // x offset
1405
const __global Dtype *src0_read_orig = src0_read;
1406
1407
// Src1 (filter) is directly used as btile.
1408
// It starts at the top of src1 and walks down.
1409
// btile is K rows x N columns.
1410
const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2 );
1411
1412
#define DOT_PRODUCT_16( _result, _rowA, colB ) \
1413
{ \
1414
_result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); \
1415
_result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); \
1416
_result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); \
1417
_result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); \
1418
_result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); \
1419
_result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); \
1420
_result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); \
1421
_result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); \
1422
_result.s8 = mad( _rowA, sub_group_broadcast( colB, 8 ), _result.s8 ); \
1423
_result.s9 = mad( _rowA, sub_group_broadcast( colB, 9 ), _result.s9 ); \
1424
_result.sa = mad( _rowA, sub_group_broadcast( colB, 10 ), _result.sa ); \
1425
_result.sb = mad( _rowA, sub_group_broadcast( colB, 11 ), _result.sb ); \
1426
_result.sc = mad( _rowA, sub_group_broadcast( colB, 12 ), _result.sc ); \
1427
_result.sd = mad( _rowA, sub_group_broadcast( colB, 13 ), _result.sd ); \
1428
_result.se = mad( _rowA, sub_group_broadcast( colB, 14 ), _result.se ); \
1429
_result.sf = mad( _rowA, sub_group_broadcast( colB, 15 ), _result.sf ); \
1430
}
1431
typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;
1432
// Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
1433
// Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
1434
// and KERNEL_WIDTH/2 rows of interleaved filter.
1435
int patch_depth = 0;
1436
__attribute__((opencl_unroll_hint(1)))
1437
do
1438
{
1439
int patch_row = 0;
1440
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
1441
curr_y = saved_y;
1442
#endif
1443
__attribute__((opencl_unroll_hint(1)))
1444
do
1445
{
1446
// Load atile and btile.
1447
// Kernel data is partially interleaved. Every 2 rows are interleaved at Dtype16 granularity.
1448
// The exception is that if KERNEL_WIDTH is odd the last row is not interleaved. The non
1449
// interleaved row is padded with zero to ensure same size as interleaved rows. This
1450
// interleaving is done to ensure 0% GDR bank conflicts. For example, this is how the
1451
// kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3.
1452
// (0, 0) (16, 0) (32, 0) (48, 0) ... (0, 0) ( 0, 1) (16, 0) ( 0, 1) (32, 0) (0, 1) (48, 0) ...
1453
// (0, 1) (16, 1) (32, 1) (48, 1) ... => (0, 2) (16, 2) (32, 2) (48, 2) ...
1454
// (0, 2) (16, 2) (32, 2) (48, 2) ... ...
1455
// ...
1456
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
1457
1458
#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
1459
#if KERNEL_WIDTH == 3
1460
Dtype_t blockA00 = vload3(0, src0_read);
1461
Dtype* pblockA00 = (Dtype*)(&blockA00);
1462
#else
1463
Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ];
1464
Dtype* pblockA00 = (Dtype*)(&blockA00);
1465
#endif
1466
#else
1467
Dtype_t blockA00;
1468
Dtype* pblockA00 = (Dtype*)(&blockA00);
1469
int pos = 0;
1470
LOOP(KERNEL_WIDTH, pos,
1471
{
1472
if (curr_y >= INPUT_PAD_H &&
1473
curr_y < input_height + INPUT_PAD_H &&
1474
curr_x + pos * DILATION_X >= INPUT_PAD_W &&
1475
curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)
1476
pblockA00[pos] = src0_read[pos * DILATION_X];
1477
else
1478
pblockA00[pos] = 0;
1479
})
1480
curr_y += DILATION_Y;
1481
#endif
1482
src0_read += ROW_PITCH * DILATION_Y;
1483
INT_TYPE blockB00[KERNEL_WIDTH * 2];
1484
INT_TYPE4* p4BlockB00 = (INT_TYPE4*)blockB00;
1485
INT_TYPE2* p2BlockB00 = (INT_TYPE2*)blockB00;
1486
Dtype* pBlockB00 = (Dtype*)blockB00;
1487
interleaved_y = 0;
1488
LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
1489
{
1490
p4BlockB00[interleaved_y] = SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read );
1491
src1_read += WIDTH1 * 2;
1492
} )
1493
if ( kernel_width_is_odd )
1494
{
1495
p2BlockB00[KERNEL_WIDTH - 1] = SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read );
1496
src1_read += WIDTH1 * 2;
1497
}
1498
1499
// Perform MADs
1500
kernel_idx = 0;
1501
interleaved_y = 0;
1502
LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
1503
{
1504
kernel_y = interleaved_y * 2;
1505
DOT_PRODUCT_16( blockC00, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;
1506
DOT_PRODUCT_16( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
1507
DOT_PRODUCT_16( blockC10, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;
1508
DOT_PRODUCT_16( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
1509
} )
1510
if ( kernel_width_is_odd )
1511
{
1512
kernel_y = interleaved_y * 2;
1513
DOT_PRODUCT_16( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
1514
DOT_PRODUCT_16( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
1515
}
1516
}
1517
1518
//while( ++patch_row < 1 ); //debug
1519
while( ++patch_row < KERNEL_HEIGHT );
1520
1521
// reset to start of next slice of patch
1522
src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
1523
}
1524
//while ( ++patch_depth < 1 ); //debug
1525
while ( ++patch_depth < INPUT_DEPTH );
1526
1527
// Dst resembles a cube of width x height x (output channel * batches). Each tile writes:
1528
// (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used.
1529
int out_offset = global_z * out_pitch_z // batch offset
1530
+ ( group_x * TILE_N ) * out_pitch_y // channel offset
1531
+ ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X // y offset
1532
+ ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset
1533
__global Dtype *out = dst + out_offset;
1534
1535
#if APPLY_BIAS
1536
Dtype bias[2];
1537
Dtype2 *bias_vec;
1538
bias_vec = (Dtype2*)bias;
1539
*bias_vec = as_Dtype2(SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)biases_base + group_x * TILE_N));
1540
if (group_x > 0xFFFFFFFEul) {
1541
dst[0] = bias[0] + bias[1];
1542
}
1543
#else
1544
const Dtype bias[2] = {0, 0};
1545
#endif
1546
INTERLEAVED_SIMD16_OUTPUT(dst, out_offset, 0);
1547
}
1548
#endif
1549
1550
#ifdef GEMM_LIKE_CONV_32_2_SIMD16

//////////////////////////////////////////////////////////////////////////////
// Conv_Interleaved_32_2_SIMD16
//
// Convolution: each workitem computes TILE_M (= 2) patches x 32 filters worth
// of output data, using a SIMD16 sub-group to spread the 32 filter columns
// across lanes.
#define TILE_M 2
#define TILE_K KERNEL_WIDTH
#define TILE_N 32

__attribute__((intel_reqd_sub_group_size(16)))
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
{
    __global Dtype *dst = dst_base + dst_offset;
    const int group_x = get_group_id(0);
    const int group_y = get_group_id(1);
    const int global_x = get_global_id(0);
    const int global_y = get_global_id(1);
    const int global_z = get_global_id(2);
    int interleaved_y;
    int kernel_y;
    int kernel_idx;

    // Accumulate one row element of A (per-lane scalar) against 16 filter
    // coefficients held one-per-lane in colB: sub_group_broadcast(colB, i)
    // fetches lane i's value so every lane FMAs all 16 columns into _result.
#define DOT_PRODUCT_16( _result, _rowA, colB ) \
    { \
        _result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); \
        _result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); \
        _result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); \
        _result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); \
        _result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); \
        _result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); \
        _result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); \
        _result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); \
        _result.s8 = mad( _rowA, sub_group_broadcast( colB, 8 ), _result.s8 ); \
        _result.s9 = mad( _rowA, sub_group_broadcast( colB, 9 ), _result.s9 ); \
        _result.sa = mad( _rowA, sub_group_broadcast( colB, 10 ), _result.sa ); \
        _result.sb = mad( _rowA, sub_group_broadcast( colB, 11 ), _result.sb ); \
        _result.sc = mad( _rowA, sub_group_broadcast( colB, 12 ), _result.sc ); \
        _result.sd = mad( _rowA, sub_group_broadcast( colB, 13 ), _result.sd ); \
        _result.se = mad( _rowA, sub_group_broadcast( colB, 14 ), _result.se ); \
        _result.sf = mad( _rowA, sub_group_broadcast( colB, 15 ), _result.sf ); \
    }
    // Vector type holding one full kernel row of input (KERNEL_WIDTH elems).
    typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;

    // True for all threads if filter_width is multiple of TILE_N
    // else, true for all but right-most column of threads.
    {
        // Result ctile (*dst) is M rows x N columns.
        // NOTE(review): comment inherited from a SIMD8 variant mentioned
        // "LWG size is 1x8"; here a 16-lane sub-group computes TILE_M
        // output locations x TILE_N filters — confirm against dispatch code.
        Dtype16 blockC00 = 0.f;
        Dtype16 blockC10 = 0.f;
        Dtype16 blockC01 = 0.f;
        Dtype16 blockC11 = 0.f;

        // Src0 (patch input) is directly used as atile.
        // Each work item points to the start of a different patch.
        // atile is M rows x K columns.
        // (x0,y0)/(x1,y1): top-left input coords of the two output patches.
        int curr_x0 = ( ( global_y * TILE_M + 0 ) % output_width ) * STRIDE_X;
        int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X;
        int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y;
        int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
        // Padded/dilated path re-walks y per input slice; remember start rows.
        int saved_y0 = curr_y0;
        int saved_y1 = curr_y1;
#endif
        const __global Dtype *src0_read0 = src0
         + aligned_input_size * global_z                            // batch offset
         + (curr_y0 - INPUT_PAD_H) * ROW_PITCH                      // y offset
         + curr_x0 - INPUT_PAD_W;                                   // x offset
        const __global Dtype *src0_read1 = src0
         + aligned_input_size * global_z                            // batch offset
         + (curr_y1 - INPUT_PAD_H) * ROW_PITCH                      // y offset
         + curr_x1 - INPUT_PAD_W;                                   // x offset

        // Src1 (filter) is directly used as btile.
        // It starts at the top of src1 and walks down.
        // btile is K rows x N columns.
        const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);

        // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
        // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
        // and KERNEL_WIDTH/2 rows of interleaved filter.
        int patch_depth = 0;
        do
        {
            int patch_row = 0;
            do
            {
                // Load atile and btile.
                // Kernel data is partially interleaved.  Every 2 rows are interleaved at Dtype8 granularity.
                // The exception is that if KERNEL_WIDTH is odd the last row is not interleaved.  The non
                // interleaved row is padded with zero to ensure same size as interleaved rows. This
                // interleaving is done to ensure 0% GDR bank conflicts.  For example, this is how the
                // kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3.
                // (0, 0) (8, 0) (16, 0) (24, 0) ...       (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) ..
                // (0, 1) (8, 1) (16, 1) (24, 1) ... =>    (0, 2) (8, 2) (16, 2) (24, 2) ...
                // (0, 2) (8, 2) (16, 2) (24, 2) ...       ...
                // ...
                const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
                // Fast path: no padding/dilation, rows are contiguous vector loads.
                Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[  0  ]; src0_read0 += ROW_PITCH;
                Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[  0  ]; src0_read1 += ROW_PITCH;
                Dtype* pblockA00 = (Dtype*)(&blockA00);
                Dtype* pblockA01 = (Dtype*)(&blockA01);
#else
                // Slow path: gather element-by-element, substituting 0 for
                // positions that fall in the padded border.
                Dtype_t blockA00;
                Dtype* pblockA00 = (Dtype*)(&blockA00);
                int pos = 0;
                LOOP(KERNEL_WIDTH, pos,
                {
                    if (curr_y0 >= INPUT_PAD_H &&
                        curr_y0 < input_height + INPUT_PAD_H &&
                        curr_x0 + pos * DILATION_X >= INPUT_PAD_W &&
                        curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)
                        pblockA00[pos] = src0_read0[pos * DILATION_X];
                    else
                        pblockA00[pos] = 0;
                })
                curr_y0 += DILATION_Y;
                Dtype_t blockA01;
                Dtype* pblockA01 = (Dtype*)(&blockA01);
                pos = 0;
                LOOP(KERNEL_WIDTH, pos,
                {
                    if (curr_y1 >= INPUT_PAD_H &&
                        curr_y1 < input_height + INPUT_PAD_H &&
                        curr_x1 + pos * DILATION_X >= INPUT_PAD_W &&
                        curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)
                        pblockA01[pos] = src0_read1[pos * DILATION_X];
                    else
                        pblockA01[pos] = 0;
                })
                curr_y1 += DILATION_Y;
                src0_read0 += (ROW_PITCH * DILATION_Y);
                src0_read1 += (ROW_PITCH * DILATION_Y);
#endif
                // Filter tile: 2 rows' worth of interleaved coefficients,
                // viewed as Dtype4/Dtype2/Dtype for block reads and MADs.
                Dtype blockB00[KERNEL_WIDTH*2];
                Dtype4* p4BlockB00 = (Dtype4*)blockB00;
                Dtype2* p2BlockB00 = (Dtype2*)blockB00;
                Dtype*  pBlockB00 =  (Dtype* )blockB00;

                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
                    p4BlockB00[interleaved_y] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
                    src1_read += WIDTH1 * 2;
                } )
                if ( kernel_width_is_odd )
                {
                    // Trailing non-interleaved (zero-padded) row: half-width read.
                    p2BlockB00[KERNEL_WIDTH - 1] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
                    src1_read += WIDTH1 * 2;
                }
                // Perform MADs
                kernel_idx = 0;
                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
                    kernel_y = interleaved_y * 2;
                    // blockC0x: filters [0..15]; blockC1x: filters [16..31].
                    DOT_PRODUCT_16( blockC00, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_16( blockC01, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_16( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_16( blockC01, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_16( blockC10, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_16( blockC11, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_16( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_16( blockC11, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                } )
                if ( kernel_width_is_odd )
                {
                    kernel_y = interleaved_y * 2;
                    DOT_PRODUCT_16( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_16( blockC01, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_16( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_16( blockC11, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                }
            }

            //while( ++patch_row < 1 ); //debug
            while( ++patch_row < KERNEL_HEIGHT );
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
            curr_y0 = saved_y0;
            curr_y1 = saved_y1;
#endif
            // reset to start of next slice of patch
            src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y);
            src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y);
        }
        //while ( ++patch_depth < 1 );  //debug
        while ( ++patch_depth < INPUT_DEPTH );

        // Dst resembles a cube of width x height x (output channel * batches).  Each tile writes:
        // (SIMD * TILE_M) x 1 x TILE_N.  Partial writes most likely generated if padding used.
        int out0_offset = global_z * out_pitch_z                                                       // batch offset
         + ( group_x * TILE_N ) * out_pitch_y                                                          // channel offset
         + ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X             // y offset
         + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT;                            // x offset
        int out1_offset = global_z * out_pitch_z                                                       // batch offset
         + ( group_x * TILE_N ) * out_pitch_y                                                          // channel offset
         + ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X             // y offset
         + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT;                            // x offset

#if APPLY_BIAS
        Dtype bias[2];
        Dtype2 *bias_vec;
        bias_vec = (Dtype2*)bias;
        *bias_vec = as_Dtype2(SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)biases_base + group_x * TILE_N));
        // NOTE(review): this condition can never hold for a valid group id;
        // presumably a dead store kept to stop the compiler from optimizing
        // away the bias block read — confirm before removing.
        if (group_x > 0xFFFFFFFEul) {
            dst[0] = bias[0] + bias[1];
        }
#else
        const Dtype bias[2] = {0, 0};
#endif
        INTERLEAVED_SIMD16_OUTPUT(dst, out0_offset, 0);
        INTERLEAVED_SIMD16_OUTPUT(dst, out1_offset, 1);
    }
}
#endif
1767
1768
#elif defined KERNEL_DWCONV

// Depthwise convolution: one work-item per output element (x, y, channel).
// Each output channel convolves with its own KERNEL_W x KERNEL_H filter;
// out-of-bounds taps (from padding) are skipped, which is equivalent to
// zero-padding the input.
__kernel void DWCONV(
    ELTWISE_DATA_ARG
    FUSED_ARG
    __global Dtype* image_data,
    __global Dtype* kernel_data,
    BIAS_KERNEL_ARG
    __global Dtype* convolved_image_base,
    const int convolved_image_offset,
    const ushort input_width,
    const ushort input_height,
    const ushort output_width,
    const ushort output_height) {
  __global Dtype* convolved_image = convolved_image_base + convolved_image_offset;
  const int outputX = get_global_id(0);
  const int outputY = get_global_id(1);
  const int outputZ = get_global_id(2);
  // Global size may be rounded up; guard against out-of-range work-items.
  if(outputX < output_width && outputY < output_height)
  {
    Dtype sum = 0.;

    // Top-left input coordinate of this output's receptive field
    // (may be negative inside the padded border).
    const int org_y = outputY * STRIDE_Y - INPUT_PAD_H;
    const int org_x = outputX * STRIDE_X - INPUT_PAD_W;
    // outputZ encodes batch*CHANNELS + channel; filter/bias are per-channel.
    const int currentKernelOffset = KERNEL_SIZE*(outputZ%CHANNELS);
    const int biasIndex=outputZ%CHANNELS;
    const int local_image_offset = org_y*input_width + org_x;
    const int imageSize = input_width*input_height;

    __global Dtype* image_dataPtrFloat = (image_data + (imageSize*outputZ + local_image_offset));
    __global Dtype* kernel_dataPtrFloat = (kernel_data + (currentKernelOffset));

    for(int y = 0; y < KERNEL_H; y++)
    {
      for(int x = 0; x < KERNEL_W; x++)
      {
        // Skip taps that fall outside the real input (zero padding).
        if(!(org_y + y * DILATION_Y >= 0 && org_y + y * DILATION_Y < input_height && org_x + x * DILATION_X >= 0 && org_x + x * DILATION_X < input_width))
        {
          continue;
        }
        sum += image_dataPtrFloat[x * DILATION_X] * kernel_dataPtrFloat[x];
      }
      // Advance input by one (dilated) row, filter by one row.
      image_dataPtrFloat += input_width * DILATION_Y;
      kernel_dataPtrFloat += KERNEL_W;
    }

#if APPLY_BIAS
    int offset = outputZ*output_height*output_width + outputY*output_width + outputX;
    ACTIVATION_FUNCTION(convolved_image, offset, sum + biases_base[biasIndex], biasIndex);
#else
    int offset = outputZ*output_height*output_width + outputY*output_width + outputX;
    ACTIVATION_FUNCTION(convolved_image, offset, sum, biasIndex);
#endif
  }
}
#endif // KERNEL_BASIC/IDLF/GEMM_LIKE/DWCONV
1824
1825