Path: blob/master/modules/dnn/src/opencl/conv_layer_spatial.cl
16337 views
/*M///////////////////////////////////////////////////////////////////////////////////////1//2// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.3//4// By downloading, copying, installing or using the software you agree to this license.5// If you do not agree to this license, do not download, install,6// copy or use the software.7//8//9// License Agreement10// For Open Source Computer Vision Library11//12// Copyright (C) 2017, Intel Corporation, all rights reserved.13// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.14// Third party copyrights are property of their respective owners.15//16// Redistribution and use in source and binary forms, with or without modification,17// are permitted provided that the following conditions are met:18//19// * Redistribution's of source code must retain the above copyright notice,20// this list of conditions and the following disclaimer.21//22// * Redistribution's in binary form must reproduce the above copyright notice,23// this list of conditions and the following disclaimer in the documentation24// and/or other materials provided with the distribution.25//26// * The name of the copyright holders may not be used to endorse or promote products27// derived from this software without specific prior written permission.28//29// This software is provided by the copyright holders and contributors "as is" and30// any express or implied warranties, including, but not limited to, the implied31// warranties of merchantability and fitness for a particular purpose are disclaimed.32// In no event shall the Intel Corporation or contributors be liable for any direct,33// indirect, incidental, special, exemplary, or consequential damages34// (including, but not limited to, procurement of substitute goods or services;35// loss of use, data, or profits; or business interruption) however caused36// and on any theory of liability, whether in contract, strict liability,37// or tort (including negligence or otherwise) arising in any way out of38// the use of this software, even if advised of the possibility of such damage.39//40//M*/4142#if defined(cl_khr_fp16)43#pragma OPENCL EXTENSION cl_khr_fp16 : enable44#endif4546#define KERNEL_ARG_DTYPE float47#define TYPE_FLOAT 148#define TYPE_HALF 24950#if defined(FUSED_CONV_RELU)51#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (negative_slope)))52#define FUSED_ARG KERNEL_ARG_DTYPE negative_slope,53#elif defined(FUSED_CONV_PRELU)54#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (negative_slope[c])))55#define FUSED_ARG __global const KERNEL_ARG_DTYPE* negative_slope,56#elif defined(FUSED_CONV_POWER)57#define ACTIVATION_RELU_FUNCTION(x, c) pow(x, (Dtype)power)58#define FUSED_ARG KERNEL_ARG_DTYPE power,59#elif defined(FUSED_CONV_TANH)60#define ACTIVATION_RELU_FUNCTION(x, c) tanh(x)61#define FUSED_ARG62#elif defined(FUSED_CONV_RELU6)63#define ACTIVATION_RELU_FUNCTION(x, c) (clamp((Dtype)(x), (Dtype)min_value, (Dtype)max_value))64#define FUSED_ARG KERNEL_ARG_DTYPE min_value, KERNEL_ARG_DTYPE max_value,65#else66#define ACTIVATION_RELU_FUNCTION(x, c) (x)67#define FUSED_ARG68#endif6970#ifdef FUSED_CONV_ELTWISE71#define ACTIVATION_FUNCTION(_dst_, _offset_, _data_, _channel_) do { \72const Dtype _x_ = eltwise_data[(_offset_)] + (_data_); \73(_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \74} while(0)75#define ELTWISE_DATA_ARG __global Dtype* eltwise_data,76#else77#define ACTIVATION_FUNCTION(_dst_, _offset_, _data_, _channel_) do { \78const Dtype _x_ = (_data_); \79(_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \80} while(0)81#define ELTWISE_DATA_ARG82#endif8384#if APPLY_BIAS85#define BIAS_KERNEL_ARG __global Dtype * biases_base,86#else87#define BIAS_KERNEL_ARG88#endif8990#define __CAT(x, y) x##y91#define CAT(x, y) __CAT(x, y)92#define LOOP0(VAR, STMT)93#define LOOP1(VAR, STMT) (STMT); (VAR)++;94#define LOOP2(VAR, STMT) LOOP1(VAR, STMT); (STMT); (VAR)++;95#define LOOP3(VAR, STMT) LOOP2(VAR, STMT); (STMT); (VAR)++;96#define LOOP4(VAR, STMT) LOOP3(VAR, STMT); (STMT); (VAR)++;97#define LOOP5(VAR, STMT) LOOP4(VAR, STMT); (STMT); (VAR)++;98#define LOOP6(VAR, STMT) LOOP5(VAR, STMT); (STMT); (VAR)++;99#define LOOP7(VAR, STMT) LOOP6(VAR, STMT); (STMT); (VAR)++;100#define LOOP8(VAR, STMT) LOOP7(VAR, STMT); (STMT); (VAR)++;101#define LOOP9(VAR, STMT) LOOP8(VAR, STMT); (STMT); (VAR)++;102#define LOOP10(VAR, STMT) LOOP9(VAR, STMT); (STMT); (VAR)++;103#define LOOP11(VAR, STMT) LOOP10(VAR, STMT); (STMT); (VAR)++;104#define LOOP12(VAR, STMT) LOOP11(VAR, STMT); (STMT); (VAR)++;105#define LOOP13(VAR, STMT) LOOP12(VAR, STMT); (STMT); (VAR)++;106#define LOOP14(VAR, STMT) LOOP13(VAR, STMT); (STMT); (VAR)++;107#define LOOP15(VAR, STMT) LOOP14(VAR, STMT); (STMT); (VAR)++;108#define LOOP16(VAR, STMT) LOOP15(VAR, STMT); (STMT); (VAR)++;109#define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT))110111#if defined(convolve_simd) || defined(Conv_Interleaved)112#if TYPE == TYPE_HALF113#define INT_TYPE ushort114#define INT_TYPE2 ushort2115#define INT_TYPE4 ushort4116#define INT_TYPE8 ushort8117#define SUB_GROUP_BLOCK_READ2 intel_sub_group_block_read_us2118#define SUB_GROUP_BLOCK_READ4 intel_sub_group_block_read_us4119#define SUB_GROUP_BLOCK_READ8 intel_sub_group_block_read_us8120#define SUB_GROUP_BLOCK_READ intel_sub_group_block_read_us121#else122#define INT_TYPE uint123#define INT_TYPE2 uint2124#define INT_TYPE4 uint4125#define INT_TYPE8 uint8126#define SUB_GROUP_BLOCK_READ2 intel_sub_group_block_read2127#define SUB_GROUP_BLOCK_READ4 intel_sub_group_block_read4128#define SUB_GROUP_BLOCK_READ8 intel_sub_group_block_read8129#define SUB_GROUP_BLOCK_READ intel_sub_group_block_read130#endif131#endif132133#ifdef KERNEL_BASIC134135__kernel void ConvolveBasic(136ELTWISE_DATA_ARG137FUSED_ARG138__global Dtype* image_data,139int image_offset,140__global Dtype* kernel_data,141int kernel_offset,142__global Dtype* bias,143const int bias_offset,144__global Dtype* convolved_image_base,145const int convolved_image_base_offset,146const int convolved_image_offset,147const ushort input_width,148const ushort input_height,149const ushort output_width,150const ushort output_height,151const ushort pad_w,152const ushort pad_h153)154{155__global Dtype* convolved_image = convolved_image_base + convolved_image_base_offset;156const int outputX = get_global_id(0);157const int outputY = get_global_id(1);158const int kernelNum = get_global_id(2) * ZPAR;159if (outputX < output_width && outputY < output_height)160{161Dtype sum[ZPAR];162for (int kern = 0; kern < ZPAR; kern++)163{164sum[kern] = 0.0f;165}166const int org_y = outputY * STRIDE_Y - pad_h;167const int org_x = outputX * STRIDE_X - pad_w;168const int currentKernelOffset = kernel_offset + kernelNum*KERNEL_HEIGHT*KERNEL_WIDTH*CHANNELS;169#if APPLY_BIAS170const int biasIndex = bias_offset + kernelNum;171#endif172const int local_image_offset = org_y * input_width + org_x;173const int imageSize = input_width * input_height;174__global Dtype* image_dataPtr = (image_data + (image_offset + local_image_offset));175__global Dtype* kernel_dataPtr = (kernel_data + (currentKernelOffset));176for (int c = 0; c < CHANNELS; c++)177{178for (int y = 0; y < KERNEL_HEIGHT; y++)179{180for (int x = 0; x < KERNEL_WIDTH; x++)181{182int y_ = org_y + y * DILATION_Y;183int x_ = org_x + x * DILATION_X;184if (!(y_ >= 0 && y_ < input_height && x_ >= 0 && x_ < input_width))185{186continue;187}188for (int kern = 0; kern < ZPAR; kern++)189{190sum[kern] += image_dataPtr[x * DILATION_X] * kernel_dataPtr[kern*KERNEL_HEIGHT*KERNEL_WIDTH*CHANNELS + x];191}192}193image_dataPtr += input_width * DILATION_Y;194kernel_dataPtr += KERNEL_WIDTH;195}196image_dataPtr += imageSize - input_width*KERNEL_HEIGHT*DILATION_Y;197}198199for (int kern = 0; kern < ZPAR; kern++)200{201if (kernelNum + kern < OUTPUT_Z)202{203int offset = convolved_image_offset + (kernelNum+kern)*output_height*output_width + outputY*output_width + outputX;204#if APPLY_BIAS205ACTIVATION_FUNCTION(convolved_image, offset, sum[kern] + bias[biasIndex + kern], biasIndex + kern);206#else207ACTIVATION_FUNCTION(convolved_image, offset, sum[kern], biasIndex + kern);208#endif209}210}211}212}213214#elif defined KERNEL_IDLF215216// Each work-item computes a OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT region of one output map.217// Each work-group (which will be mapped to 1 SIMD16/SIMD8 EU thread) will compute 16/8 different feature maps, but each feature map is for the same region of the input image.218// NDRange: (output_width+pad)/ OUT_BLOCK_WIDTH, (output_height+pad)/OUT_BLOCK_HEIGHT, NUM_FILTERS/OUT_BLOCK_DEPTH219220// NOTE: for beignet this reqd_work_group_size does not guarantee that SIMD16 mode will be used, the compiler could choose to use two SIMD8 threads, and if that happens the code will break.221__attribute__((reqd_work_group_size(1, 1, SIMD_SIZE)))222__attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))223__kernel void224convolve_simd(225ELTWISE_DATA_ARG226FUSED_ARG227__global Dtype* inputs,228__global Dtype* weights,229BIAS_KERNEL_ARG230__global Dtype* outputs_base,231const int outputs_offset,232const ushort input_width,233const ushort input_height,234const ushort output_width,235const ushort output_height)236{237__global Dtype* outputs = outputs_base + outputs_offset;238unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH; // oc = Output Column239unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row240unsigned int fm = get_global_id(2); // fm = Feature Map = od = Output Depth241unsigned int fmg = get_group_id(2);242unsigned int lid = get_local_id(2);243244Dtype out[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT] = { 0.0f };245246// find weights address of given neuron (lid is index)247unsigned int weight_addr = (fmg % FILTERS_IN_GROUP) *248INPUT_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE + lid;249250unsigned int num_in_batch = fm / ALIGNED_NUM_FILTERS;251252unsigned int input_batch_offset = num_in_batch * INPUT_PITCH * TOTAL_INPUT_DEPTH_SIZE;253254int curr_y = or * STRIDE_Y;255int curr_x = oc * STRIDE_X + lid;256257int in_addr = input_batch_offset258+ (curr_y - INPUT_PAD_H) * INPUT_WIDTH // y tile offset259+ curr_x - INPUT_PAD_W; // x tile offset260261const int in_limit = (get_global_size(2) / ALIGNED_NUM_FILTERS) * TOTAL_INPUT_DEPTH_SIZE * INPUT_PITCH - 1;262263Dtype in_buf[INVEC_SIZE];264265for(int kd = 0; kd < INPUT_DEPTH; kd++)266{267#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0268const bool cx_out_of_range = !(curr_x >= INPUT_PAD_W && curr_x < INPUT_WIDTH + INPUT_PAD_W);269int in_offset = in_addr;270__attribute__((opencl_unroll_hint(INVEC_SIZE)))271for (int reg = 0; reg < INVEC_SIZE; reg++, in_offset += INPUT_WIDTH)272{273Dtype input = inputs[clamp(in_offset, 0, in_limit)];274int cy = curr_y + reg;275in_buf[reg] = (cx_out_of_range || cy < INPUT_PAD_H || cy >= INPUT_HEIGHT + INPUT_PAD_H) ? 0 : input;276}277#else278int in_offset = in_addr;279__attribute__((opencl_unroll_hint(INVEC_SIZE)))280for (int reg = 0; reg < INVEC_SIZE; reg++, in_offset += INPUT_WIDTH)281{282in_buf[reg] = inputs[min(in_offset, in_limit)];283}284#endif285286in_addr += INPUT_PITCH;287288#define BLOCK_IN(n, c) intel_sub_group_shuffle(in_buf[n], (c))289290int kr = 0; // kr = Kernel Row291LOOP(KERNEL_HEIGHT, kr,// LOOP is a macro that unrolls the loop.292{293int kc = 0; // kc = Kernel Column294LOOP(KERNEL_WIDTH, kc,295{296Dtype weight_value = weights[weight_addr];297weight_addr += SIMD_SIZE;298for (int br=0; br < OUT_BLOCK_HEIGHT; br++)299{300for(int bc=0; bc < OUT_BLOCK_WIDTH; bc++)301{302Dtype input = BLOCK_IN((br * STRIDE_Y + kr * DILATION_Y), bc * STRIDE_X + kc * DILATION_X);303out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_value, input, out[br * OUT_BLOCK_WIDTH + bc]);304}305}306});307});308}309310fm = fm % ALIGNED_NUM_FILTERS;311312#if LEFT_FILTERS > 0313if (fm < NUM_FILTERS)314#endif315{316unsigned int out_addr = (num_in_batch * TOTAL_OUTPUT_DEPTH + fm) * OUTPUT_PITCH;317out_addr += or * output_width + oc;318// we need this address calculation for biases because we support views and batching319#if APPLY_BIAS320Dtype bias = biases_base[fm];321#else322Dtype bias = 0;323#endif324325for(unsigned int r = 0; r < OUT_BLOCK_HEIGHT; r++)326{327if (r + or >= output_height) break;328for(unsigned int c = 0; c < OUT_BLOCK_WIDTH; c++)329{330if (c + oc >= output_width) break;331// this does a scattered write to SIMD_SIZE different feature maps,332// so that data within one map is contiguous, thus ready for input to next layer.333ACTIVATION_FUNCTION(outputs, out_addr + r * output_width + c, bias + out[r * OUT_BLOCK_WIDTH + c], fm);334}335}336}337}338339#elif defined KERNEL_GEMM_LIKE340341#if APPLY_BIAS342#define SUBGROUP_GET_BIAS(k, i) intel_sub_group_shuffle(bias[k], i)343#else344#define SUBGROUP_GET_BIAS(k, i) ((Dtype)0)345#endif346347#ifdef Conv_Interleaved348typedef struct float1 { float s0; } float1;349typedef struct float5 { float s0; float s1; float s2; float s3; float s4; } float5;350typedef struct float6 { float s0; float s1; float s2; float s3; float s4; float s5; } float6;351typedef struct float7 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; } float7;352typedef struct float9 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; float s7; float s8; } float9;353typedef struct float10 { float s0; float s1; float s2; float s3; float s4; float s5;354float s6; float s7; float s8; float s9;} float10;355typedef struct float11 { float s0; float s1; float s2; float s3; float s4; float s5;356float s6; float s7; float s8; float s9; float sa;} float11;357typedef struct float12 { float s0; float s1; float s2; float s3; float s4; float s5;358float s6; float s7; float s8; float s9; float sa; float sb; } float12;359typedef struct float13 { float s0; float s1; float s2; float s3; float s4; float s5;360float s6; float s7; float s8; float s9; float sa; float sb; float sc;} float13;361typedef struct float14 { float s0; float s1; float s2; float s3; float s4; float s5;362float s6; float s7; float s8; float s9; float sa; float sb; float sc; float sd; } float14;363typedef struct float15 { float s0; float s1; float s2; float s3; float s4; float s5;364float s6; float s7; float s8; float s9; float sa; float sb; float sc; float sd; float se; } float15;365typedef struct float0 { float s0; } float0; //never used but makes compiler happy.366367typedef struct half1 { half s0; } half1;368typedef struct half5 { half s0; half s1; half s2; half s3; half s4; } half5;369typedef struct half6 { half s0; half s1; half s2; half s3; half s4; half s5; } half6;370typedef struct half7 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; } half7;371typedef struct half9 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; half s8; } half9;372typedef struct half10 { half s0; half s1; half s2; half s3; half s4; half s5;373half s6; half s7; half s8; half s9; } half10;374typedef struct half11 { half s0; half s1; half s2; half s3; half s4; half s5;375half s6; half s7; half s8; half s9; half sa; } half11;376typedef struct half12 { half s0; half s1; half s2; half s3; half s4; half s5;377half s6; half s7; half s8; half s9; half sa; half sb; } half12;378typedef struct half13 { half s0; half s1; half s2; half s3; half s4; half s5;379half s6; half s7; half s8; half s9; half sa; half sb; half sc; } half13;380typedef struct half14 { half s0; half s1; half s2; half s3; half s4; half s5;381half s6; half s7; half s8; half s9; half sa; half sb; half sc; half sd; } half14;382typedef struct half15 { half s0; half s1; half s2; half s3; half s4; half s5;383half s6; half s7; half s8; half s9; half sa; half sb; half sc; half sd; half se; } half15;384typedef struct half0 { half s0; } half0; //never used but makes compiler happy.385386#define OUT_PITCH_X output_width387#define ROW_PITCH input_width388389#define GEMM_LIKE_KERNEL_ARGS \390ELTWISE_DATA_ARG \391FUSED_ARG \392const __global Dtype *src0, \393const __global Dtype *src1, \394BIAS_KERNEL_ARG \395__global Dtype *dst_base, \396const int dst_offset, \397const ushort input_width, \398const ushort input_height, \399const ushort output_width, \400const ushort output_height, \401const int out_pitch_y, \402const int out_pitch_z, \403const int aligned_input_size, \404const int slice_pitch405#endif406407#ifdef GEMM_LIKE_CONV_32_1408//////////////////////////////////////////////////////////////////////////////409// Conv_Interleaved_32_1_flex410//411// Convolution: each workitem computes 1 patch x 32 filters worth of output412// data. Kernel's inner loop works on a single tile consisting of one413// row from each patch and the filter data corresponding to that row. Filter414// matrix is interleaved to reduce GRF bank conflicts. Patches are walked415// by rows and then by slices. Relies on sub_group extension for block416// reads and SIMD broadcast. Allows flexible sizing of TILE width (TILE_N)417// by dynamically selecting one of two code paths: one uses TILE_N = 32 and418// the other uses TILE_N = 8, 16, or 24.419#define TILE_M 1420#define TILE_K KERNEL_WIDTH421#define TILE_N 32422423__attribute__((intel_reqd_sub_group_size(8)))424__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)425{426__global Dtype *dst = dst_base + dst_offset;427const int group_x = get_group_id(0);428const int group_y = get_group_id(1);429const int global_x = get_global_id(0);430const int global_y = get_global_id(1);431const int global_z = get_global_id(2);432int interleaved_y;433int kernel_y;434int kernel_idx;435436#define DOT_PRODUCT_8( _result, _rowA, colB ) \437{ \438_result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); \439_result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); \440_result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); \441_result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); \442_result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); \443_result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); \444_result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); \445_result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); \446}447typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;448449// True for all threads if filter_width is multiple of TILE_N450// else, true for all but right-most column of threads.451if( TILE_N_LAST == 0 || global_x < WIDTH1 / TILE_N )452{453// Result ctile (*dst) is M rows x N columns454// LWG size is 1x8. Thus each thread calculates 8*M rows x N cols of ctile.455Dtype8 blockC00 = 0.f;456Dtype8 blockC10 = 0.f;457Dtype8 blockC20 = 0.f;458Dtype8 blockC30 = 0.f;459460// Src0 (patch input) is directly used as atile.461// Each work item points to the start of a different patch.462// atile is M rows x K columns.463int curr_x = ( global_y % output_width ) * STRIDE_X;464int curr_y = ( global_y / output_width ) * STRIDE_Y;465#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0466int saved_y = curr_y;467#endif468const __global Dtype *src0_read = src0469+ aligned_input_size * global_z // batch offset470+ (curr_y - INPUT_PAD_H) * ROW_PITCH // y offset471+ (curr_x - INPUT_PAD_W); // x offset472473// Src1 (filter) is directly used as btile.474// It starts at the top of src1 and walks down.475// btile is K rows x N columns.476const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);477478// Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.479// Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch480// and KERNEL_WIDTH/2 rows of interleaved filter.481int patch_depth = 0;482do483{484int patch_row = 0;485#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0486curr_y = saved_y;487#endif488489do490{491// Load atile and btile.492// Kernel data is partially interleaved. Every 2 rows are interleaved at Dtype8 granularity.493// The exception is that if KERNEL_WIDTH is odd the last row is not interleaved. The non494// interleaved row is padded with zero to ensure same size as interleaved rows. This495// interleaving is done to ensure 0% GDR bank conflicts. For example, this is how the496// kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3.497// (0, 0) (8, 0) (16, 0) (24, 0) ... (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) ..498// (0, 1) (8, 1) (16, 1) (24, 1) ... => (0, 2) (8, 2) (16, 2) (24, 2) ...499// (0, 2) (8, 2) (16, 2) (24, 2) ... ...500// ...501const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;502503#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0504#if KERNEL_WIDTH == 3505Dtype_t blockA00 = vload3(0, src0_read);506Dtype* pblockA00 = (Dtype*)(&blockA00);507#else508Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ];509Dtype* pblockA00 = (Dtype*)(&blockA00);510#endif511#else512Dtype_t blockA00;513Dtype* pblockA00 = (Dtype*)(&blockA00);514int pos = 0;515LOOP(KERNEL_WIDTH, pos,516{517if (curr_y >= INPUT_PAD_H &&518curr_y < input_height + INPUT_PAD_H &&519curr_x + pos * DILATION_X >= INPUT_PAD_W &&520curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)521pblockA00[pos] = src0_read[pos * DILATION_X];522else523pblockA00[pos] = 0;524})525curr_y += DILATION_Y;526#endif527src0_read += (ROW_PITCH * DILATION_Y);528529Dtype blockB00[KERNEL_WIDTH*4];530Dtype8* p8BlockB00 = (Dtype8*)blockB00;531Dtype4* p4BlockB00 = (Dtype4*)blockB00;532Dtype* pBlockB00 = (Dtype* )blockB00;533534interleaved_y = 0;535LOOP(KERNEL_WIDTH_DIV2, interleaved_y,536{537p8BlockB00[interleaved_y] = as_Dtype8( SUB_GROUP_BLOCK_READ8( (const __global INT_TYPE *)src1_read ) );538src1_read += WIDTH1 * 2;539} )540if ( kernel_width_is_odd )541{542p4BlockB00[KERNEL_WIDTH - 1] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE *)src1_read ) );543src1_read += WIDTH1 * 2;544}545546// Perform MADs547kernel_idx = 0;548interleaved_y = 0;549LOOP(KERNEL_WIDTH_DIV2, interleaved_y,550{551kernel_y = interleaved_y * 2;552DOT_PRODUCT_8( blockC00, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;553DOT_PRODUCT_8( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;554DOT_PRODUCT_8( blockC10, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;555DOT_PRODUCT_8( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;556DOT_PRODUCT_8( blockC20, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;557DOT_PRODUCT_8( blockC20, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;558DOT_PRODUCT_8( blockC30, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;559DOT_PRODUCT_8( blockC30, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;560} )561kernel_y = interleaved_y * 2;562if ( kernel_width_is_odd )563{564DOT_PRODUCT_8( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;565DOT_PRODUCT_8( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;566DOT_PRODUCT_8( blockC20, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;567DOT_PRODUCT_8( blockC30, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;568}569}570571//while( ++patch_row < 1 ); //debug572while( ++patch_row < KERNEL_HEIGHT );573574// reset to start of next slice of patch575src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y);576}577//while ( ++patch_depth < 1 ); //debug578while ( ++patch_depth < INPUT_DEPTH );579580// Dst resembles a cube of width x height x (output channel * batches). Each tile writes:581// (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used.582int out_offset = global_z * out_pitch_z // batch offset583+ ( group_x * TILE_N ) * out_pitch_y // channel offset584+ ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X // y offset585+ ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset586587__global Dtype *out = dst + out_offset;588#if APPLY_BIAS589Dtype bias[4];590Dtype4 *bias_vec;591bias_vec = (Dtype4*)bias;592*bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));593if (group_x > 0xFFFFFFFEul) {594dst[0] = bias[0] + bias[1] + bias[2] + bias[3];595}596#else597const Dtype bias[4] = {0, 0, 0, 0};598#endif599if (global_y * TILE_M < output_width * output_height )600{601for (int i = 0; i < 8; i++)602{603ACTIVATION_FUNCTION(dst, out_offset + ( 0 + i ) * out_pitch_y, blockC00[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);604ACTIVATION_FUNCTION(dst, out_offset + ( 8 + i ) * out_pitch_y, blockC10[i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + 8 + i);605ACTIVATION_FUNCTION(dst, out_offset + ( 16 + i ) * out_pitch_y, blockC20[i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + 16 + i);606ACTIVATION_FUNCTION(dst, out_offset + ( 24 + i ) * out_pitch_y, blockC30[i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + 24 + i);607}608}609}610#if TILE_N_LAST > 0611else612{613614// Result ctile (*dst) is M rows x N columns615// LWG size is 1x8. Thus each thread calculates 8*M rows x N cols of ctile.616int i = 0;617Dtype8 blockC[TILE_N_LAST_DIV8];618LOOP(TILE_N_LAST_DIV8, i,619{620blockC[i] = 0.f;621} )622623// Src0 (patch input) is directly used as atile.624// Each work item points to the start of a different patch.625// atile is M rows x K columns.626int curr_x = ( global_y % output_width ) * STRIDE_X;627int curr_y = ( global_y / output_width ) * STRIDE_Y;628#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0629int saved_y = curr_y;630#endif631const __global Dtype *src0_read = src0632+ aligned_input_size * global_z // batch offset633+ (curr_y - INPUT_PAD_H) * ROW_PITCH // y offset634+ (curr_x - INPUT_PAD_W); // x offset635636// Src1 (filter) is directly used as btile.637// It starts at the top of src1 and walks down.638// btile is K rows x N columns.639const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);640641// Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.642// Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch643// and KERNEL_WIDTH/2 rows of interleaved filter.644int patch_depth = 0;645do646{647int patch_row = 0;648#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0649curr_y = saved_y;650#endif651do652{653// Load atile and interleaved btile.654const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;655#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0656Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ];657Dtype* pblockA00 = (Dtype*)(&blockA00);658#else659Dtype_t blockA00;660Dtype* pblockA00 = (Dtype*)(&blockA00);661int pos = 0;662LOOP(KERNEL_WIDTH, pos,663{664if (curr_y >= INPUT_PAD_H &&665curr_y < input_height + INPUT_PAD_H &&666curr_x + pos * DILATION_X >= INPUT_PAD_W &&667curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)668pblockA00[pos] = src0_read[pos * DILATION_X];669else670pblockA00[pos] = 0;671})672curr_y += DILATION_Y;673#endif674src0_read += (ROW_PITCH * DILATION_Y);675Dtype blockB[KERNEL_WIDTH * TILE_N_LAST_DIV8];676677interleaved_y = 0;678LOOP(KERNEL_WIDTH_DIV2, interleaved_y,679{680#if TILE_N_LAST_DIV8 == 1681Dtype2* p2BlockB = (Dtype2* )blockB;682p2BlockB[interleaved_y] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );683#elif TILE_N_LAST_DIV8 == 2684Dtype4* p4BlockB = (Dtype4* )blockB;685p4BlockB[interleaved_y] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );686#elif TILE_N_LAST_DIV8 == 3687//TODO: broken. No block_read6688Dtype6* p6BlockB = (Dtype6* )blockB;689(*((Dtype8*)(&p6BlockB[interleaved_y]))).s0123 = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );690(*((Dtype8*)(&p6BlockB[interleaved_y]))).s45 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)(src1_read + 4 * 8) ) );691#endif692src1_read += WIDTH1 * 2;693} )694if ( kernel_width_is_odd )695{696#if TILE_N_LAST_DIV8 == 1697Dtype* pBlockB = (Dtype* )blockB;698pBlockB[KERNEL_WIDTH - 1] = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*)src1_read ) );699#elif TILE_N_LAST_DIV8 == 2700Dtype2* p2BlockB = (Dtype2* )blockB;701p2BlockB[KERNEL_WIDTH - 1] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );702#elif TILE_N_LAST_DIV8 == 3703Dtype3* p3BlockB = (Dtype3* )blockB;704p3BlockB[KERNEL_WIDTH - 1].s01 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );705p3BlockB[KERNEL_WIDTH - 1].s2 = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*) (src1_read + 2 * 8) ) );706#endif707src1_read += WIDTH1 * 2;708}709710// Perform MADs711Dtype* pBlockB = (Dtype*)blockB;712kernel_idx = 0;713interleaved_y = 0;714LOOP(KERNEL_WIDTH_DIV2, interleaved_y,715{716kernel_y = interleaved_y * 2;717DOT_PRODUCT_8( blockC[0], pblockA00[kernel_y ], pBlockB[kernel_idx] ); kernel_idx++;718DOT_PRODUCT_8( blockC[0], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;719#if TILE_N_LAST_DIV8 >= 2720DOT_PRODUCT_8( blockC[1], pblockA00[kernel_y ], pBlockB[kernel_idx] ); kernel_idx++;721DOT_PRODUCT_8( blockC[1], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;722#if TILE_N_LAST_DIV8 >= 3723DOT_PRODUCT_8( blockC[2], pblockA00[kernel_y ], pBlockB[kernel_idx] ); kernel_idx++;724DOT_PRODUCT_8( blockC[2], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;725#endif726#endif727} )728kernel_y = interleaved_y * 2;729if ( kernel_width_is_odd )730{731DOT_PRODUCT_8( blockC[0], pblockA00[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;732#if TILE_N_LAST_DIV8 >= 2733DOT_PRODUCT_8( blockC[1], pblockA00[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;734#if TILE_N_LAST_DIV8 >= 3735DOT_PRODUCT_8( blockC[2], pblockA00[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;736#endif737#endif738}739}740741//while( ++patch_row < 1 ); //debug742while( ++patch_row < KERNEL_HEIGHT );743744// reset to start of next slice of patch745src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );746}747//while ( ++patch_depth < 1 ); //debug748while ( ++patch_depth < INPUT_DEPTH );749750// Dst resembles a cube of width x height x (output channel * batches). Each tile writes:751// (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used.752int out_offset = global_z * out_pitch_z // batch offset753+ ( group_x * TILE_N ) * out_pitch_y // channel offset754+ ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X // y offset755+ ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset756__global Dtype *out = dst + out_offset;757#if APPLY_BIAS758Dtype bias[4];759Dtype4 *bias_vec;760bias_vec = (Dtype4*)bias;761*bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));762if (group_x > 0xFFFFFFFEul) {763dst[0] = bias[0] + bias[1] + bias[2] + bias[3];764}765#else766const Dtype bias[4] = {0, 0, 0, 0};767#endif768769if (global_y * TILE_M < output_width * output_height )770{771for (int i = 0; i < 8; i++)772{773if ( TILE_N_LAST_DIV8 > 0 )774{775ACTIVATION_FUNCTION(dst, out_offset + ( 0+i) * out_pitch_y, blockC[0][i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);776}777if ( TILE_N_LAST_DIV8 > 1 )778{779ACTIVATION_FUNCTION(dst, out_offset + ( 8+i) * out_pitch_y, blockC[1][i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 8);780}781if ( TILE_N_LAST_DIV8 > 2 )782{783ACTIVATION_FUNCTION(dst, out_offset + (16+i) * out_pitch_y, blockC[2][i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + i + 16);784}785if ( TILE_N_LAST_DIV8 > 3 )786{787ACTIVATION_FUNCTION(dst, out_offset + (24+i) * out_pitch_y, blockC[3][i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + i + 24);788}789}790}791}792#endif793}794#endif795#ifdef GEMM_LIKE_CONV_32_2796797//////////////////////////////////////////////////////////////////////////////798// Conv_Interleaved_32_2_flex799//800// Convolution: each workitem computes 1 patch x 32 filters worth of output801// data. Kernel's inner loop works on a single tile consisting of one802// row from each patch and the filter data corresponding to that row. Filter803// matrix is interleaved to reduce GRF bank conflicts. Patches are walked804// by rows and then by slices. Relies on sub_group extension for block805// reads and SIMD broadcast. Allows flexible sizing of TILE width (TILE_N)806// by dynamically selecting one of two code paths: one uses TILE_N = 32 and807// the other uses TILE_N = 8, 16, or 24.808#define TILE_M 2809#define TILE_K KERNEL_WIDTH810#define TILE_N 32811812__attribute__((intel_reqd_sub_group_size(8)))813__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)814{815__global Dtype *dst = dst_base + dst_offset;816const int group_x = get_group_id(0);817const int group_y = get_group_id(1);818const int global_x = get_global_id(0);819const int global_y = get_global_id(1);820const int global_z = get_global_id(2);821int interleaved_y;822int kernel_y;823int kernel_idx;824825#define DOT_PRODUCT_8( _result, _rowA, colB ) \826{ \827_result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); \828_result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); \829_result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); \830_result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); \831_result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); \832_result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); \833_result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); \834_result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); \835}836typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;837838// True for all threads if filter_width is multiple of TILE_N839// else, true for all but right-most column of threads.840if( TILE_N_LAST == 0 || global_x < WIDTH1 / TILE_N )841{842// Result ctile (*dst) is M rows x N columns843// LWG size is 1x8. Thus each thread calculates 8*M rows x N cols of ctile.844Dtype8 blockC00 = 0.f;845Dtype8 blockC10 = 0.f;846Dtype8 blockC20 = 0.f;847Dtype8 blockC30 = 0.f;848Dtype8 blockC01 = 0.f;849Dtype8 blockC11 = 0.f;850Dtype8 blockC21 = 0.f;851Dtype8 blockC31 = 0.f;852853// Src0 (patch input) is directly used as atile.854// Each work item points to the start of a different patch.855// atile is M rows x K columns.856int curr_x0 = ( ( global_y * TILE_M + 0 ) % output_width ) * STRIDE_X;857int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X;858int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y;859int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y;860#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0861int saved_y0 = curr_y0;862int saved_y1 = curr_y1;863#endif864const __global Dtype *src0_read0 = src0865+ aligned_input_size * global_z // batch offset866+ (curr_y0 - INPUT_PAD_H) * ROW_PITCH // y offset867+ curr_x0 - INPUT_PAD_W; // x offset868const __global Dtype *src0_read1 = src0869+ aligned_input_size * global_z // batch offset870+ (curr_y1 - INPUT_PAD_H) * ROW_PITCH // y offset871+ curr_x1 - INPUT_PAD_W; // x offset872873// Src1 (filter) is directly used as btile.874// It starts at the top of src1 and walks down.875// btile is K rows x N columns.876const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);877878// Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.879// Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch880// and KERNEL_WIDTH/2 rows of interleaved filter.881int patch_depth = 0;882do883{884int patch_row = 0;885do886{887// Load atile and btile.888// Kernel data is partially interleaved. Every 2 rows are interleaved at Dtype8 granularity.889// The exception is that if KERNEL_WIDTH is odd the last row is not interleaved. The non890// interleaved row is padded with zero to ensure same size as interleaved rows. This891// interleaving is done to ensure 0% GDR bank conflicts. For example, this is how the892// kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3.893// (0, 0) (8, 0) (16, 0) (24, 0) ... (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) ..894// (0, 1) (8, 1) (16, 1) (24, 1) ... => (0, 2) (8, 2) (16, 2) (24, 2) ...895// (0, 2) (8, 2) (16, 2) (24, 2) ... ...896// ...897const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;898#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0899#if KERNEL_WIDTH == 3900Dtype_t blockA00 = vload3(0, src0_read0); src0_read0 += ROW_PITCH;901Dtype_t blockA01 = vload3(0, src0_read1); src0_read1 += ROW_PITCH;902Dtype* pblockA00 = (Dtype*)(&blockA00);903Dtype* pblockA01 = (Dtype*)(&blockA01);904#else905Dtype_t blockA00 = { (Dtype)0.f };906Dtype_t blockA01 = { (Dtype)0.f };907Dtype* pblockA00 = (Dtype*)(&blockA00);908Dtype* pblockA01 = (Dtype*)(&blockA01);909int pos = 0;910LOOP(KERNEL_WIDTH, pos,911{912if (curr_x0 + pos < input_width)913pblockA00[pos] = src0_read0[pos];914915if (curr_x1 + pos < input_width)916pblockA01[pos] = src0_read1[pos];917})918src0_read0 += ROW_PITCH;919src0_read1 += ROW_PITCH;920#endif921#else922Dtype_t blockA00;923Dtype* pblockA00 = (Dtype*)(&blockA00);924int pos = 0;925LOOP(KERNEL_WIDTH, pos,926{927if (curr_y0 >= INPUT_PAD_H &&928curr_y0 < input_height + INPUT_PAD_H &&929curr_x0 + pos * DILATION_X >= INPUT_PAD_W &&930curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)931pblockA00[pos] = src0_read0[pos * DILATION_X];932else933pblockA00[pos] = 0;934})935curr_y0 += DILATION_Y;936Dtype_t blockA01;937Dtype* pblockA01 = (Dtype*)(&blockA01);938pos = 0;939LOOP(KERNEL_WIDTH, pos,940{941if (curr_y1 >= INPUT_PAD_H &&942curr_y1 < input_height + INPUT_PAD_H &&943curr_x1 + pos * DILATION_X >= INPUT_PAD_W &&944curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)945pblockA01[pos] = src0_read1[pos * DILATION_X];946else947pblockA01[pos] = 0;948})949curr_y1 += DILATION_Y;950src0_read0 += (ROW_PITCH * DILATION_Y);951src0_read1 += (ROW_PITCH * DILATION_Y);952#endif953Dtype blockB00[KERNEL_WIDTH*4];954Dtype8* p8BlockB00 = (Dtype8*)blockB00;955Dtype4* p4BlockB00 = (Dtype4*)blockB00;956Dtype* pBlockB00 = (Dtype* )blockB00;957958interleaved_y = 0;959LOOP(KERNEL_WIDTH_DIV2, interleaved_y,960{961p8BlockB00[interleaved_y] = as_Dtype8( SUB_GROUP_BLOCK_READ8( (const __global INT_TYPE*)src1_read ) );962src1_read += WIDTH1 * 2;963} )964if ( kernel_width_is_odd )965{966p4BlockB00[KERNEL_WIDTH - 1] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );967src1_read += WIDTH1 * 2;968}969// Perform MADs970kernel_idx = 0;971interleaved_y = 0;972LOOP(KERNEL_WIDTH_DIV2, interleaved_y,973{974kernel_y = interleaved_y * 2;975DOT_PRODUCT_8( blockC00, pblockA00[kernel_y ], pBlockB00[kernel_idx] );976DOT_PRODUCT_8( blockC01, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;977DOT_PRODUCT_8( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );978DOT_PRODUCT_8( blockC01, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;979DOT_PRODUCT_8( blockC10, pblockA00[kernel_y ], pBlockB00[kernel_idx] );980DOT_PRODUCT_8( blockC11, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;981DOT_PRODUCT_8( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );982DOT_PRODUCT_8( blockC11, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;983DOT_PRODUCT_8( blockC20, pblockA00[kernel_y ], pBlockB00[kernel_idx] );984DOT_PRODUCT_8( blockC21, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;985DOT_PRODUCT_8( blockC20, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );986DOT_PRODUCT_8( blockC21, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;987DOT_PRODUCT_8( blockC30, pblockA00[kernel_y ], pBlockB00[kernel_idx] );988DOT_PRODUCT_8( blockC31, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;989DOT_PRODUCT_8( blockC30, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );990DOT_PRODUCT_8( blockC31, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;991} )992if ( kernel_width_is_odd )993{994kernel_y = interleaved_y * 2;995DOT_PRODUCT_8( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] );996DOT_PRODUCT_8( blockC01, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;997DOT_PRODUCT_8( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] );998DOT_PRODUCT_8( blockC11, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;999DOT_PRODUCT_8( blockC20, pblockA00[kernel_y], pBlockB00[kernel_idx] );1000DOT_PRODUCT_8( blockC21, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;1001DOT_PRODUCT_8( blockC30, pblockA00[kernel_y], pBlockB00[kernel_idx] );1002DOT_PRODUCT_8( blockC31, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;1003}1004}10051006//while( ++patch_row < 1 ); //debug1007while( ++patch_row < KERNEL_HEIGHT );1008#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 01009curr_y0 = saved_y0;1010curr_y1 = saved_y1;1011#endif1012// reset to start of next slice of patch1013src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );1014src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );1015}1016//while ( ++patch_depth < 1 ); //debug1017while ( ++patch_depth < INPUT_DEPTH );10181019// Dst resembles a cube of width x height x (output channel * batches). Each tile writes:1020// (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used.1021int out0_offset = global_z * out_pitch_z // batch offset1022+ ( group_x * TILE_N ) * out_pitch_y // channel offset1023+ ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset1024+ ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset1025int out1_offset = global_z * out_pitch_z // batch offset1026+ ( group_x * TILE_N ) * out_pitch_y // channel offset1027+ ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset1028+ ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset10291030#if APPLY_BIAS1031Dtype bias[4];1032Dtype4 *bias_vec;1033bias_vec = (Dtype4*)bias;1034*bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));1035if (group_x > 0xFFFFFFFEul) {1036dst[0] = bias[0] + bias[1] + bias[2] + bias[3];1037}1038#else1039const Dtype bias[4] = {0, 0, 0, 0};1040#endif10411042if( global_y * TILE_M < output_width * output_height )1043{1044for( int i = 0; i < 8; i++ )1045{1046ACTIVATION_FUNCTION(dst, out0_offset + ( 0+i) * out_pitch_y, blockC00[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);1047ACTIVATION_FUNCTION(dst, out0_offset + ( 8+i) * out_pitch_y, blockC10[i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 8);1048ACTIVATION_FUNCTION(dst, out0_offset + (16+i) * out_pitch_y, blockC20[i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + i + 16);1049ACTIVATION_FUNCTION(dst, out0_offset + (24+i) * out_pitch_y, blockC30[i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + i + 24);1050}1051}1052if( global_y * TILE_M + 1 < output_width * output_height )1053{1054for( int i = 0; i < 8; i++ )1055{1056ACTIVATION_FUNCTION(dst, out1_offset + ( 0+i) * out_pitch_y, blockC01[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);1057ACTIVATION_FUNCTION(dst, out1_offset + ( 8+i) * out_pitch_y, blockC11[i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 8);1058ACTIVATION_FUNCTION(dst, out1_offset + (16+i) * out_pitch_y, blockC21[i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + i + 16);1059ACTIVATION_FUNCTION(dst, out1_offset + (24+i) * out_pitch_y, blockC31[i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + i + 24);1060}1061}1062}1063#if TILE_N_LAST > 01064else1065{10661067// Result ctile (*dst) is M rows x N columns1068// LWG size is 1x8. Thus each thread calculates 8*M rows x N cols of ctile.1069int i = 0;1070Dtype8 blockC0[TILE_N_LAST_DIV8];1071Dtype8 blockC1[TILE_N_LAST_DIV8];1072LOOP(TILE_N_LAST_DIV8, i,1073{1074blockC0[i] = 0.f;1075blockC1[i] = 0.f;1076} )10771078// Src0 (patch input) is directly used as atile.1079// Each work item points to the start of a different patch.1080// atile is M rows x K columns.1081int curr_x0 = ( ( global_y * TILE_M + 0 ) % output_width ) * STRIDE_X;1082int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X;1083int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y;1084int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y;1085#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 01086int saved_y0 = curr_y0;1087int saved_y1 = curr_y1;1088#endif1089const __global Dtype *src0_read0 = src01090+ aligned_input_size * global_z // batch offset1091+ (curr_y0 - INPUT_PAD_H) * ROW_PITCH // y offset1092+ curr_x0 - INPUT_PAD_W; // x offset1093const __global Dtype *src0_read1 = src01094+ aligned_input_size * global_z // batch offset1095+ (curr_y1 - INPUT_PAD_H) * ROW_PITCH // y offset1096+ curr_x1 - INPUT_PAD_W; // x offset10971098// Src1 (filter) is directly used as btile.1099// It starts at the top of src1 and walks down.1100// btile is K rows x N columns.1101const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);11021103// Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.1104// Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch1105// and KERNEL_WIDTH/2 rows of interleaved filter.1106int patch_depth = 0;1107do1108{1109int patch_row = 0;1110do1111{1112// Load atile and interleaved btile.1113const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;1114#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 01115Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH;1116Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH;1117Dtype* pblockA00 = (Dtype*)(&blockA00);1118Dtype* pblockA01 = (Dtype*)(&blockA01);1119#else1120Dtype_t blockA00;1121Dtype* pblockA00 = (Dtype*)(&blockA00);1122int pos = 0;1123LOOP(KERNEL_WIDTH, pos,1124{1125if (curr_y0 >= INPUT_PAD_H &&1126curr_y0 < input_height + INPUT_PAD_H &&1127curr_x0 + pos * DILATION_X >= INPUT_PAD_W &&1128curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)1129pblockA00[pos] = src0_read0[pos * DILATION_X];1130else1131pblockA00[pos] = 0;1132})1133curr_y0 += DILATION_Y;1134Dtype_t blockA01;1135Dtype* pblockA01 = (Dtype*)(&blockA01);1136pos = 0;1137LOOP(KERNEL_WIDTH, pos,1138{1139if (curr_y1 >= INPUT_PAD_H &&1140curr_y1 < input_height + INPUT_PAD_H &&1141curr_x1 + pos * DILATION_X >= INPUT_PAD_W &&1142curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)1143pblockA01[pos] = src0_read1[pos * DILATION_X];1144else1145pblockA01[pos] = 0;1146})1147curr_y1 += DILATION_Y;1148src0_read0 += (ROW_PITCH * DILATION_Y);1149src0_read1 += (ROW_PITCH * DILATION_Y);1150#endif1151Dtype blockB[KERNEL_WIDTH * TILE_N_LAST_DIV8];11521153interleaved_y = 0;1154LOOP(KERNEL_WIDTH_DIV2, interleaved_y,1155{1156#if TILE_N_LAST_DIV8 == 11157Dtype2* p2BlockB = (Dtype2* )blockB;1158p2BlockB[interleaved_y] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );1159#elif TILE_N_LAST_DIV8 == 21160Dtype4* p4BlockB = (Dtype4* )blockB;1161p4BlockB[interleaved_y] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );1162#elif TILE_N_LAST_DIV8 == 31163//TODO: broken. No block_read61164Dtype6* p6BlockB = (Dtype6* )blockB;1165(*((Dtype8*)(&p6BlockB[interleaved_y]))).s0123 = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );1166(*((Dtype8*)(&p6BlockB[interleaved_y]))).s45 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)(src1_read + 4 * 8) ) );1167#endif1168src1_read += WIDTH1 * 2;1169} )1170if ( kernel_width_is_odd )1171{1172#if TILE_N_LAST_DIV8 == 11173Dtype* pBlockB = (Dtype* )blockB;1174pBlockB[KERNEL_WIDTH - 1] = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*)src1_read ) );1175#elif TILE_N_LAST_DIV8 == 21176Dtype2* p2BlockB = (Dtype2* )blockB;1177p2BlockB[KERNEL_WIDTH - 1] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );1178#elif TILE_N_LAST_DIV8 == 31179Dtype3* p3BlockB = (Dtype3* )blockB;1180p3BlockB[KERNEL_WIDTH - 1].s01 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );1181p3BlockB[KERNEL_WIDTH - 1].s2 = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*) (src1_read + 8) ) );1182#endif1183src1_read += WIDTH1 * 2;1184}11851186// Perform MADs1187Dtype* pBlockB = (Dtype*)blockB;1188kernel_idx = 0;1189interleaved_y = 0;1190LOOP(KERNEL_WIDTH_DIV2, interleaved_y,1191{1192kernel_y = interleaved_y * 2;1193DOT_PRODUCT_8( blockC0[0], pblockA00[kernel_y ], pBlockB[kernel_idx] );1194DOT_PRODUCT_8( blockC1[0], pblockA01[kernel_y ], pBlockB[kernel_idx] ); kernel_idx++;1195DOT_PRODUCT_8( blockC0[0], pblockA00[kernel_y + 1], pBlockB[kernel_idx] );1196DOT_PRODUCT_8( blockC1[0], pblockA01[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;1197#if TILE_N_LAST_DIV8 >= 21198DOT_PRODUCT_8( blockC0[1], pblockA00[kernel_y ], pBlockB[kernel_idx] );1199DOT_PRODUCT_8( blockC1[1], pblockA01[kernel_y ], pBlockB[kernel_idx] ); kernel_idx++;1200DOT_PRODUCT_8( blockC0[1], pblockA00[kernel_y + 1], pBlockB[kernel_idx] );1201DOT_PRODUCT_8( blockC1[1], pblockA01[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;1202#if TILE_N_LAST_DIV8 >= 31203DOT_PRODUCT_8( blockC0[2], pblockA00[kernel_y ], pBlockB[kernel_idx] );1204DOT_PRODUCT_8( blockC1[2], pblockA01[kernel_y ], pBlockB[kernel_idx] ); kernel_idx++;1205DOT_PRODUCT_8( blockC0[2], pblockA00[kernel_y + 1], pBlockB[kernel_idx] );1206DOT_PRODUCT_8( blockC1[2], pblockA01[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;1207#endif1208#endif1209} )1210kernel_y = interleaved_y * 2;1211if ( kernel_width_is_odd )1212{1213DOT_PRODUCT_8( blockC0[0], pblockA00[kernel_y], pBlockB[kernel_idx] );1214DOT_PRODUCT_8( blockC1[0], pblockA01[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;1215#if TILE_N_LAST_DIV8 >= 21216DOT_PRODUCT_8( blockC0[1], pblockA00[kernel_y], pBlockB[kernel_idx] );1217DOT_PRODUCT_8( blockC1[1], pblockA01[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;1218#if TILE_N_LAST_DIV8 >= 31219DOT_PRODUCT_8( blockC0[2], pblockA00[kernel_y], pBlockB[kernel_idx] );1220DOT_PRODUCT_8( blockC1[2], pblockA01[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;1221#endif1222#endif1223}1224}12251226//while( ++patch_row < 1 ); //debug1227while( ++patch_row < KERNEL_HEIGHT );1228#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 01229curr_y0 = saved_y0;1230curr_y1 = saved_y1;1231#endif1232// reset to start of next slice of patch1233src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );1234src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );1235}1236//while ( ++patch_depth < 1 ); //debug1237while ( ++patch_depth < INPUT_DEPTH );12381239// Dst resembles a cube of width x height x (output channel * batches). Each tile writes:1240// (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used.1241int out0_offset = global_z * out_pitch_z // batch offset1242+ ( group_x * TILE_N ) * out_pitch_y // channel offset1243+ ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset1244+ ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset1245int out1_offset = global_z * out_pitch_z // batch offset1246+ ( group_x * TILE_N ) * out_pitch_y // channel offset1247+ ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset1248+ ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset1249__global Dtype *out1 = dst + out1_offset;12501251#if APPLY_BIAS1252Dtype bias[4];1253Dtype4 *bias_vec;1254bias_vec = (Dtype4*)bias;1255*bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));1256if (group_x > 0xFFFFFFFEul) {1257dst[0] = bias[0] + bias[1] + bias[2] + bias[3];1258}1259#else1260const Dtype bias[4] = {0, 0, 0, 0};1261#endif1262if( global_y * TILE_M < output_width * output_height )1263{1264for( int i = 0; i < 8; i++ )1265{1266if ( TILE_N_LAST_DIV8 > 0 )1267{1268ACTIVATION_FUNCTION(dst, out0_offset + ( 0+i) * out_pitch_y, blockC0[0][i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);1269}1270if ( TILE_N_LAST_DIV8 > 1 )1271{1272ACTIVATION_FUNCTION(dst, out0_offset + ( 8+i) * out_pitch_y, blockC0[1][i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 8);1273}1274if ( TILE_N_LAST_DIV8 > 2 )1275{1276ACTIVATION_FUNCTION(dst, out0_offset + (16+i) * out_pitch_y, blockC0[2][i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + i + 16);1277}1278if ( TILE_N_LAST_DIV8 > 3 )1279{1280ACTIVATION_FUNCTION(dst, out0_offset + (24+i) * out_pitch_y, blockC0[3][i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + i + 24);1281}1282}1283}1284if( global_y * TILE_M + 1 < output_width * output_height )1285{1286for( int i = 0; i < 8; i++ )1287{1288if ( TILE_N_LAST_DIV8 > 0 )1289{1290ACTIVATION_FUNCTION(dst, out1_offset + ( 0+i) * out_pitch_y, blockC1[0][i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);1291}1292if ( TILE_N_LAST_DIV8 > 1 )1293{1294ACTIVATION_FUNCTION(dst, out1_offset + ( 8+i) * out_pitch_y, blockC1[1][i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 8);1295}1296if ( TILE_N_LAST_DIV8 > 2 )1297{1298ACTIVATION_FUNCTION(dst, out1_offset + (16+i) * out_pitch_y, blockC1[2][i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + i + 16);1299}1300if ( TILE_N_LAST_DIV8 > 3 )1301{1302ACTIVATION_FUNCTION(dst, out1_offset + (24+i) * out_pitch_y, blockC1[3][i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + i + 24);1303}1304}1305}1306}1307#endif1308}1309#endif13101311#if defined(GEMM_LIKE_CONV_32_2_SIMD16) || defined(GEMM_LIKE_CONV_32_1_SIMD16)1312#define INTERLEAVED_SIMD16_OUTPUT(_out_, _offset_, _m_) do {\1313if (global_y * TILE_M < output_width * output_height ) \1314{ \1315if ( ( OUT_DEPTH % TILE_N ) == 0 ) {\1316for (int i = 0; i < 16; i++) \1317{ \1318ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \1319ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_ [i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 16); \1320} \1321} \1322else if( ( OUT_DEPTH % 16 ) == 0 ) { \1323if ( ( global_x + 1 ) < get_global_size(0) ) { \1324for ( int i = 0; i < 16; i++ ) \1325{ \1326ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \1327ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_ [i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 16); \1328} \1329} \1330else { \1331for (int i = 0; i < 16; i++) \1332{ \1333ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \1334} \1335} \1336} \1337else { \1338if ( ( global_x + 1 ) < get_global_size(0) ) \1339{ \1340for ( int i = 0; i < 16; i++ ) \1341{ \1342ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \1343ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_[i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 16); \1344} \1345} \1346else { \1347if ( (OUT_DEPTH % TILE_N) > 16 ) { \1348for (int i = 0; i < 16 ; i++) \1349{ \1350ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \1351} \1352for (int i = 0; i < OUT_DEPTH % 16 ; i++) \1353{ \1354ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_[i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 16); \1355} \1356} \1357else { \1358for (int i = 0; i < OUT_DEPTH % 16 ; i++) \1359{ \1360ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \1361} \1362} \1363} \1364} \1365} \1366}while(0)1367#endif13681369#ifdef GEMM_LIKE_CONV_32_1_SIMD161370#define TILE_M 11371#define TILE_K KERNEL_WIDTH1372#define TILE_N 3213731374__attribute__((intel_reqd_sub_group_size(16)))1375__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)1376{1377__global Dtype *dst = dst_base + dst_offset;1378const int group_x = get_group_id(0);1379const int group_y = get_group_id(1);1380const int global_x = get_global_id(0);1381const int global_y = get_global_id(1);1382const int global_z = get_global_id(2);1383int interleaved_y;1384int kernel_y;1385int kernel_idx;13861387// Result ctile (*dst) is M rows x N columns1388// LWG size is 1x16. Thus each thread calculates 16*M rows x N cols of ctile.1389Dtype16 blockC00 = 0.f;1390Dtype16 blockC10 = 0.f;13911392// Src0 (patch input) is directly used as atile.1393// Each work item points to the start of a different patch.1394// atile is M rows x K columns.1395int curr_x = ( global_y % output_width ) * STRIDE_X;1396int curr_y = ( global_y / output_width ) * STRIDE_Y;1397#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 01398int saved_y = curr_y;1399#endif1400const __global Dtype *src0_read = src01401+ aligned_input_size * global_z // batch offset1402+ (curr_y - INPUT_PAD_H) * ROW_PITCH // y offset1403+ curr_x - INPUT_PAD_W; // x offset1404const __global Dtype *src0_read_orig = src0_read;14051406// Src1 (filter) is directly used as btile.1407// It starts at the top of src1 and walks down.1408// btile is K rows x N columns.1409const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2 );14101411#define DOT_PRODUCT_16( _result, _rowA, colB ) \1412{ \1413_result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); \1414_result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); \1415_result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); \1416_result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); \1417_result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); \1418_result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); \1419_result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); \1420_result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); \1421_result.s8 = mad( _rowA, sub_group_broadcast( colB, 8 ), _result.s8 ); \1422_result.s9 = mad( _rowA, sub_group_broadcast( colB, 9 ), _result.s9 ); \1423_result.sa = mad( _rowA, sub_group_broadcast( colB, 10 ), _result.sa ); \1424_result.sb = mad( _rowA, sub_group_broadcast( colB, 11 ), _result.sb ); \1425_result.sc = mad( _rowA, sub_group_broadcast( colB, 12 ), _result.sc ); \1426_result.sd = mad( _rowA, sub_group_broadcast( colB, 13 ), _result.sd ); \1427_result.se = mad( _rowA, sub_group_broadcast( colB, 14 ), _result.se ); \1428_result.sf = mad( _rowA, sub_group_broadcast( colB, 15 ), _result.sf ); \1429}1430typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;1431// Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.1432// Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch1433// and KERNEL_WIDTH/2 rows of interleaved filter.1434int patch_depth = 0;1435__attribute__((opencl_unroll_hint(1)))1436do1437{1438int patch_row = 0;1439#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 01440curr_y = saved_y;1441#endif1442__attribute__((opencl_unroll_hint(1)))1443do1444{1445// Load atile and btile.1446// Kernel data is partially interleaved. Every 2 rows are interleaved at Dtype16 granularity.1447// The exception is that if KERNEL_WIDTH is odd the last row is not interleaved. The non1448// interleaved row is padded with zero to ensure same size as interleaved rows. This1449// interleaving is done to ensure 0% GDR bank conflicts. For example, this is how the1450// kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3.1451// (0, 0) (16, 0) (32, 0) (48, 0) ... (0, 0) ( 0, 1) (16, 0) ( 0, 1) (32, 0) (0, 1) (48, 0) ...1452// (0, 1) (16, 1) (32, 1) (48, 1) ... => (0, 2) (16, 2) (32, 2) (48, 2) ...1453// (0, 2) (16, 2) (32, 2) (48, 2) ... ...1454// ...1455const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;14561457#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 01458#if KERNEL_WIDTH == 31459Dtype_t blockA00 = vload3(0, src0_read);1460Dtype* pblockA00 = (Dtype*)(&blockA00);1461#else1462Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ];1463Dtype* pblockA00 = (Dtype*)(&blockA00);1464#endif1465#else1466Dtype_t blockA00;1467Dtype* pblockA00 = (Dtype*)(&blockA00);1468int pos = 0;1469LOOP(KERNEL_WIDTH, pos,1470{1471if (curr_y >= INPUT_PAD_H &&1472curr_y < input_height + INPUT_PAD_H &&1473curr_x + pos * DILATION_X >= INPUT_PAD_W &&1474curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)1475pblockA00[pos] = src0_read[pos * DILATION_X];1476else1477pblockA00[pos] = 0;1478})1479curr_y += DILATION_Y;1480#endif1481src0_read += ROW_PITCH * DILATION_Y;1482INT_TYPE blockB00[KERNEL_WIDTH * 2];1483INT_TYPE4* p4BlockB00 = (INT_TYPE4*)blockB00;1484INT_TYPE2* p2BlockB00 = (INT_TYPE2*)blockB00;1485Dtype* pBlockB00 = (Dtype*)blockB00;1486interleaved_y = 0;1487LOOP(KERNEL_WIDTH_DIV2, interleaved_y,1488{1489p4BlockB00[interleaved_y] = SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read );1490src1_read += WIDTH1 * 2;1491} )1492if ( kernel_width_is_odd )1493{1494p2BlockB00[KERNEL_WIDTH - 1] = SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read );1495src1_read += WIDTH1 * 2;1496}14971498// Perform MADs1499kernel_idx = 0;1500interleaved_y = 0;1501LOOP(KERNEL_WIDTH_DIV2, interleaved_y,1502{1503kernel_y = interleaved_y * 2;1504DOT_PRODUCT_16( blockC00, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;1505DOT_PRODUCT_16( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;1506DOT_PRODUCT_16( blockC10, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;1507DOT_PRODUCT_16( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;1508} )1509if ( kernel_width_is_odd )1510{1511kernel_y = interleaved_y * 2;1512DOT_PRODUCT_16( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;1513DOT_PRODUCT_16( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;1514}1515}15161517//while( ++patch_row < 1 ); //debug1518while( ++patch_row < KERNEL_HEIGHT );15191520// reset to start of next slice of patch1521src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );1522}1523//while ( ++patch_depth < 1 ); //debug1524while ( ++patch_depth < INPUT_DEPTH );15251526// Dst resembles a cube of width x height x (output channel * batches). Each tile writes:1527// (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used.1528int out_offset = global_z * out_pitch_z // batch offset1529+ ( group_x * TILE_N ) * out_pitch_y // channel offset1530+ ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X // y offset1531+ ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset1532__global Dtype *out = dst + out_offset;15331534#if APPLY_BIAS1535Dtype bias[2];1536Dtype2 *bias_vec;1537bias_vec = (Dtype2*)bias;1538*bias_vec = as_Dtype2(SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)biases_base + group_x * TILE_N));1539if (group_x > 0xFFFFFFFEul) {1540dst[0] = bias[0] + bias[1];1541}1542#else1543const Dtype bias[2] = {0, 0};1544#endif1545INTERLEAVED_SIMD16_OUTPUT(dst, out_offset, 0);1546}1547#endif15481549#ifdef GEMM_LIKE_CONV_32_2_SIMD1615501551//////////////////////////////////////////////////////////////////////////////1552// Conv_Interleaved_32_2_SIMD161553//1554// Convolution: each workitem computes 1 patch x 32 filters worth of output1555// data.1556#define TILE_M 21557#define TILE_K KERNEL_WIDTH1558#define TILE_N 3215591560__attribute__((intel_reqd_sub_group_size(16)))1561__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)1562{1563__global Dtype *dst = dst_base + dst_offset;1564const int group_x = get_group_id(0);1565const int group_y = get_group_id(1);1566const int global_x = get_global_id(0);1567const int global_y = get_global_id(1);1568const int global_z = get_global_id(2);1569int interleaved_y;1570int kernel_y;1571int kernel_idx;1572#define DOT_PRODUCT_16( _result, _rowA, colB ) \1573{ \1574_result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); \1575_result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); \1576_result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); \1577_result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); \1578_result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); \1579_result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); \1580_result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); \1581_result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); \1582_result.s8 = mad( _rowA, sub_group_broadcast( colB, 8 ), _result.s8 ); \1583_result.s9 = mad( _rowA, sub_group_broadcast( colB, 9 ), _result.s9 ); \1584_result.sa = mad( _rowA, sub_group_broadcast( colB, 10 ), _result.sa ); \1585_result.sb = mad( _rowA, sub_group_broadcast( colB, 11 ), _result.sb ); \1586_result.sc = mad( _rowA, sub_group_broadcast( colB, 12 ), _result.sc ); \1587_result.sd = mad( _rowA, sub_group_broadcast( colB, 13 ), _result.sd ); \1588_result.se = mad( _rowA, sub_group_broadcast( colB, 14 ), _result.se ); \1589_result.sf = mad( _rowA, sub_group_broadcast( colB, 15 ), _result.sf ); \1590}1591typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;15921593// True for all threads if filter_width is multiple of TILE_N1594// else, true for all but right-most column of threads.1595{1596// Result ctile (*dst) is M rows x N columns1597// LWG size is 1x8. Thus each thread calculates 8*M rows x N cols of ctile.1598Dtype16 blockC00 = 0.f;1599Dtype16 blockC10 = 0.f;1600Dtype16 blockC01 = 0.f;1601Dtype16 blockC11 = 0.f;16021603// Src0 (patch input) is directly used as atile.1604// Each work item points to the start of a different patch.1605// atile is M rows x K columns.1606int curr_x0 = ( ( global_y * TILE_M + 0 ) % output_width ) * STRIDE_X;1607int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X;1608int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y;1609int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y;1610#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 01611int saved_y0 = curr_y0;1612int saved_y1 = curr_y1;1613#endif1614const __global Dtype *src0_read0 = src01615+ aligned_input_size * global_z // batch offset1616+ (curr_y0 - INPUT_PAD_H) * ROW_PITCH // y offset1617+ curr_x0 - INPUT_PAD_W; // x offset1618const __global Dtype *src0_read1 = src01619+ aligned_input_size * global_z // batch offset1620+ (curr_y1 - INPUT_PAD_H) * ROW_PITCH // y offset1621+ curr_x1 - INPUT_PAD_W; // x offset16221623// Src1 (filter) is directly used as btile.1624// It starts at the top of src1 and walks down.1625// btile is K rows x N columns.1626const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);16271628// Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.1629// Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch1630// and KERNEL_WIDTH/2 rows of interleaved filter.1631int patch_depth = 0;1632do1633{1634int patch_row = 0;1635do1636{1637// Load atile and btile.1638// Kernel data is partially interleaved. Every 2 rows are interleaved at Dtype8 granularity.1639// The exception is that if KERNEL_WIDTH is odd the last row is not interleaved. The non1640// interleaved row is padded with zero to ensure same size as interleaved rows. This1641// interleaving is done to ensure 0% GDR bank conflicts. For example, this is how the1642// kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3.1643// (0, 0) (8, 0) (16, 0) (24, 0) ... (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) ..1644// (0, 1) (8, 1) (16, 1) (24, 1) ... => (0, 2) (8, 2) (16, 2) (24, 2) ...1645// (0, 2) (8, 2) (16, 2) (24, 2) ... ...1646// ...1647const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;1648#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 01649Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH;1650Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH;1651Dtype* pblockA00 = (Dtype*)(&blockA00);1652Dtype* pblockA01 = (Dtype*)(&blockA01);1653#else1654Dtype_t blockA00;1655Dtype* pblockA00 = (Dtype*)(&blockA00);1656int pos = 0;1657LOOP(KERNEL_WIDTH, pos,1658{1659if (curr_y0 >= INPUT_PAD_H &&1660curr_y0 < input_height + INPUT_PAD_H &&1661curr_x0 + pos * DILATION_X >= INPUT_PAD_W &&1662curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)1663pblockA00[pos] = src0_read0[pos * DILATION_X];1664else1665pblockA00[pos] = 0;1666})1667curr_y0 += DILATION_Y;1668Dtype_t blockA01;1669Dtype* pblockA01 = (Dtype*)(&blockA01);1670pos = 0;1671LOOP(KERNEL_WIDTH, pos,1672{1673if (curr_y1 >= INPUT_PAD_H &&1674curr_y1 < input_height + INPUT_PAD_H &&1675curr_x1 + pos * DILATION_X >= INPUT_PAD_W &&1676curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)1677pblockA01[pos] = src0_read1[pos * DILATION_X];1678else1679pblockA01[pos] = 0;1680})1681curr_y1 += DILATION_Y;1682src0_read0 += (ROW_PITCH * DILATION_Y);1683src0_read1 += (ROW_PITCH * DILATION_Y);1684#endif1685Dtype blockB00[KERNEL_WIDTH*2];1686Dtype4* p4BlockB00 = (Dtype4*)blockB00;1687Dtype2* p2BlockB00 = (Dtype2*)blockB00;1688Dtype* pBlockB00 = (Dtype* )blockB00;16891690interleaved_y = 0;1691LOOP(KERNEL_WIDTH_DIV2, interleaved_y,1692{1693p4BlockB00[interleaved_y] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );1694src1_read += WIDTH1 * 2;1695} )1696if ( kernel_width_is_odd )1697{1698p2BlockB00[KERNEL_WIDTH - 1] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );1699src1_read += WIDTH1 * 2;1700}1701// Perform MADs1702kernel_idx = 0;1703interleaved_y = 0;1704LOOP(KERNEL_WIDTH_DIV2, interleaved_y,1705{1706kernel_y = interleaved_y * 2;1707DOT_PRODUCT_16( blockC00, pblockA00[kernel_y ], pBlockB00[kernel_idx] );1708DOT_PRODUCT_16( blockC01, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;1709DOT_PRODUCT_16( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );1710DOT_PRODUCT_16( blockC01, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;1711DOT_PRODUCT_16( blockC10, pblockA00[kernel_y ], pBlockB00[kernel_idx] );1712DOT_PRODUCT_16( blockC11, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;1713DOT_PRODUCT_16( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );1714DOT_PRODUCT_16( blockC11, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;1715} )1716if ( kernel_width_is_odd )1717{1718kernel_y = interleaved_y * 2;1719DOT_PRODUCT_16( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] );1720DOT_PRODUCT_16( blockC01, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;1721DOT_PRODUCT_16( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] );1722DOT_PRODUCT_16( blockC11, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;1723}1724}17251726//while( ++patch_row < 1 ); //debug1727while( ++patch_row < KERNEL_HEIGHT );1728#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 01729curr_y0 = saved_y0;1730curr_y1 = saved_y1;1731#endif1732// reset to start of next slice of patch1733src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y);1734src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y);1735}1736//while ( ++patch_depth < 1 ); //debug1737while ( ++patch_depth < INPUT_DEPTH );17381739// Dst resembles a cube of width x height x (output channel * batches). Each tile writes:1740// (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used.1741int out0_offset = global_z * out_pitch_z // batch offset1742+ ( group_x * TILE_N ) * out_pitch_y // channel offset1743+ ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset1744+ ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset1745int out1_offset = global_z * out_pitch_z // batch offset1746+ ( group_x * TILE_N ) * out_pitch_y // channel offset1747+ ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset1748+ ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset17491750#if APPLY_BIAS1751Dtype bias[2];1752Dtype2 *bias_vec;1753bias_vec = (Dtype2*)bias;1754*bias_vec = as_Dtype2(SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)biases_base + group_x * TILE_N));1755if (group_x > 0xFFFFFFFEul) {1756dst[0] = bias[0] + bias[1];1757}1758#else1759const Dtype bias[2] = {0, 0};1760#endif1761INTERLEAVED_SIMD16_OUTPUT(dst, out0_offset, 0);1762INTERLEAVED_SIMD16_OUTPUT(dst, out1_offset, 1);1763}1764}1765#endif17661767#elif defined KERNEL_DWCONV17681769__kernel void DWCONV(1770ELTWISE_DATA_ARG1771FUSED_ARG1772__global Dtype* image_data,1773__global Dtype* kernel_data,1774BIAS_KERNEL_ARG1775__global Dtype* convolved_image_base,1776const int convolved_image_offset,1777const ushort input_width,1778const ushort input_height,1779const ushort output_width,1780const ushort output_height) {1781__global Dtype* convolved_image = convolved_image_base + convolved_image_offset;1782const int outputX = get_global_id(0);1783const int outputY = get_global_id(1);1784const int outputZ = get_global_id(2);1785if(outputX < output_width && outputY < output_height)1786{1787Dtype sum = 0.;17881789const int org_y = outputY * STRIDE_Y - INPUT_PAD_H;1790const int org_x = outputX * STRIDE_X - INPUT_PAD_W;1791const int currentKernelOffset = KERNEL_SIZE*(outputZ%CHANNELS);1792const int biasIndex=outputZ%CHANNELS;1793const int local_image_offset = org_y*input_width + org_x;1794const int imageSize = input_width*input_height;17951796__global Dtype* image_dataPtrFloat = (image_data + (imageSize*outputZ + local_image_offset));1797__global Dtype* kernel_dataPtrFloat = (kernel_data + (currentKernelOffset));17981799for(int y = 0; y < KERNEL_H; y++)1800{1801for(int x = 0; x < KERNEL_W; x++)1802{1803if(!(org_y + y * DILATION_Y >= 0 && org_y + y * DILATION_Y < input_height && org_x + x * DILATION_X >= 0 && org_x + x * DILATION_X < input_width))1804{1805continue;1806}1807sum += image_dataPtrFloat[x * DILATION_X] * kernel_dataPtrFloat[x];1808}1809image_dataPtrFloat += input_width * DILATION_Y;1810kernel_dataPtrFloat += KERNEL_W;1811}18121813#if APPLY_BIAS1814int offset = outputZ*output_height*output_width + outputY*output_width + outputX;1815ACTIVATION_FUNCTION(convolved_image, offset, sum + biases_base[biasIndex], biasIndex);1816#else1817int offset = outputZ*output_height*output_width + outputY*output_width + outputX;1818ACTIVATION_FUNCTION(convolved_image, offset, sum, biasIndex);1819#endif1820}1821}1822#endif // KERNEL_BASIC/IDLF/GEMM_LIKE/DWCONV182318241825