Path: blob/master/modules/objdetect/src/opencl/objdetect_hog.cl
16337 views
/*M///////////////////////////////////////////////////////////////////////////////////////1//2// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.3//4// By downloading, copying, installing or using the software you agree to this license.5// If you do not agree to this license, do not download, install,6// copy or use the software.7//8//9// License Agreement10// For Open Source Computer Vision Library11//12// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.13// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.14// Third party copyrights are property of their respective owners.15//16// @Authors17// Wenju He, wenju@multicorewareinc.com18//19// Redistribution and use in source and binary forms, with or without modification,20// are permitted provided that the following conditions are met:21//22// * Redistribution's of source code must retain the above copyright notice,23// this list of conditions and the following disclaimer.24//25// * Redistribution's in binary form must reproduce the above copyright notice,26// this list of conditions and the following disclaimer in the documentation27// and/or other materials provided with the distribution.28//29// * The name of the copyright holders may not be used to endorse or promote products30// derived from this software without specific prior written permission.31//32// This software is provided by the copyright holders and contributors as is and33// any express or implied warranties, including, but not limited to, the implied34// warranties of merchantability and fitness for a particular purpose are disclaimed.35// In no event shall the Intel Corporation or contributors be liable for any direct,36// indirect, incidental, special, exemplary, or consequential damages37// (including, but not limited to, procurement of substitute goods or services;38// loss of use, data, or profits; or business interruption) however caused39// and on any theory of liability, whether in contract, strict liability,40// or tort (including negligence or otherwise) arising in any way out of41// the use of this software, even if advised of the possibility of such damage.42//43//M*/4445#define CELL_WIDTH 846#define CELL_HEIGHT 847#define CELLS_PER_BLOCK_X 248#define CELLS_PER_BLOCK_Y 249#define NTHREADS 25650#define CV_PI_F M_PI_F5152#ifdef INTEL_DEVICE53#define QANGLE_TYPE int54#define QANGLE_TYPE2 int255#else56#define QANGLE_TYPE uchar57#define QANGLE_TYPE2 uchar258#endif5960//----------------------------------------------------------------------------61// Histogram computation62// 12 threads for a cell, 12x4 threads per block63// Use pre-computed gaussian and interp_weight lookup tables64__kernel void compute_hists_lut_kernel(65const int cblock_stride_x, const int cblock_stride_y,66const int cnbins, const int cblock_hist_size, const int img_block_width,67const int blocks_in_group, const int blocks_total,68const int grad_quadstep, const int qangle_step,69__global const float* grad, __global const QANGLE_TYPE* qangle,70__global const float* gauss_w_lut,71__global float* block_hists, __local float* smem)72{73const int lx = get_local_id(0);74const int lp = lx / 24; /* local group id */75const int gid = get_group_id(0) * blocks_in_group + lp;/* global group id */76const int gidY = gid / img_block_width;77const int gidX = gid - gidY * img_block_width;7879const int lidX = lx - lp * 24;80const int lidY = get_local_id(1);8182const int cell_x = lidX / 12;83const int cell_y = lidY;84const int cell_thread_x = lidX - cell_x * 12;8586__local float* hists = smem + lp * cnbins * (CELLS_PER_BLOCK_X *87CELLS_PER_BLOCK_Y * 12 + CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y);88__local float* final_hist = hists + cnbins *89(CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12);9091const int offset_x = gidX * cblock_stride_x + (cell_x << 2) + cell_thread_x;92const int offset_y = gidY * cblock_stride_y + (cell_y << 2);9394__global const float* grad_ptr = (gid < blocks_total) ?95grad + offset_y * grad_quadstep + (offset_x << 1) : grad;96__global const QANGLE_TYPE* qangle_ptr = (gid < blocks_total) ?97qangle + offset_y * qangle_step + (offset_x << 1) : qangle;9899__local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) +100cell_thread_x;101for (int bin_id = 0; bin_id < cnbins; ++bin_id)102hist[bin_id * 48] = 0.f;103104const int dist_x = -4 + cell_thread_x - 4 * cell_x;105const int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);106107const int dist_y_begin = -4 - 4 * lidY;108for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)109{110float2 vote = (float2) (grad_ptr[0], grad_ptr[1]);111QANGLE_TYPE2 bin = (QANGLE_TYPE2) (qangle_ptr[0], qangle_ptr[1]);112113grad_ptr += grad_quadstep;114qangle_ptr += qangle_step;115116int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);117118int idx = (dist_center_y + 8) * 16 + (dist_center_x + 8);119float gaussian = gauss_w_lut[idx];120idx = (dist_y + 8) * 16 + (dist_x + 8);121float interp_weight = gauss_w_lut[256+idx];122123hist[bin.x * 48] += gaussian * interp_weight * vote.x;124hist[bin.y * 48] += gaussian * interp_weight * vote.y;125}126barrier(CLK_LOCAL_MEM_FENCE);127128volatile __local float* hist_ = hist;129for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48)130{131if (cell_thread_x < 6)132hist_[0] += hist_[6];133barrier(CLK_LOCAL_MEM_FENCE);134if (cell_thread_x < 3)135hist_[0] += hist_[3];136#ifdef CPU137barrier(CLK_LOCAL_MEM_FENCE);138#endif139if (cell_thread_x == 0)140final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] =141hist_[0] + hist_[1] + hist_[2];142}143144barrier(CLK_LOCAL_MEM_FENCE);145146int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x;147if ((tid < cblock_hist_size) && (gid < blocks_total))148{149__global float* block_hist = block_hists +150(gidY * img_block_width + gidX) * cblock_hist_size;151block_hist[tid] = final_hist[tid];152}153}154155//-------------------------------------------------------------156// Normalization of histograms via L2Hys_norm157// optimized for the case of 9 bins158__kernel void normalize_hists_36_kernel(__global float* block_hists,159const float threshold, __local float *squares)160{161const int tid = get_local_id(0);162const int gid = get_global_id(0);163const int bid = tid / 36; /* block-hist id, (0 - 6) */164const int boffset = bid * 36; /* block-hist offset in the work-group */165const int hid = tid - boffset; /* histogram bin id, (0 - 35) */166167float elem = block_hists[gid];168squares[tid] = elem * elem;169barrier(CLK_LOCAL_MEM_FENCE);170171__local float* smem = squares + boffset;172float sum = smem[hid];173if (hid < 18)174smem[hid] = sum = sum + smem[hid + 18];175barrier(CLK_LOCAL_MEM_FENCE);176if (hid < 9)177smem[hid] = sum = sum + smem[hid + 9];178barrier(CLK_LOCAL_MEM_FENCE);179if (hid < 4)180smem[hid] = sum + smem[hid + 4];181barrier(CLK_LOCAL_MEM_FENCE);182sum = smem[0] + smem[1] + smem[2] + smem[3] + smem[8];183184elem = elem / (sqrt(sum) + 3.6f);185elem = min(elem, threshold);186187barrier(CLK_LOCAL_MEM_FENCE);188squares[tid] = elem * elem;189barrier(CLK_LOCAL_MEM_FENCE);190191sum = smem[hid];192if (hid < 18)193smem[hid] = sum = sum + smem[hid + 18];194barrier(CLK_LOCAL_MEM_FENCE);195if (hid < 9)196smem[hid] = sum = sum + smem[hid + 9];197barrier(CLK_LOCAL_MEM_FENCE);198if (hid < 4)199smem[hid] = sum + smem[hid + 4];200barrier(CLK_LOCAL_MEM_FENCE);201sum = smem[0] + smem[1] + smem[2] + smem[3] + smem[8];202203block_hists[gid] = elem / (sqrt(sum) + 1e-3f);204}205206//-------------------------------------------------------------207// Normalization of histograms via L2Hys_norm208//209inline float reduce_smem(volatile __local float* smem, int size)210{211unsigned int tid = get_local_id(0);212float sum = smem[tid];213214if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256];215barrier(CLK_LOCAL_MEM_FENCE); }216if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128];217barrier(CLK_LOCAL_MEM_FENCE); }218if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64];219barrier(CLK_LOCAL_MEM_FENCE); }220#ifdef CPU221if (size >= 64) { if (tid < 32) smem[tid] = sum = sum + smem[tid + 32];222barrier(CLK_LOCAL_MEM_FENCE); }223if (size >= 32) { if (tid < 16) smem[tid] = sum = sum + smem[tid + 16];224barrier(CLK_LOCAL_MEM_FENCE); }225if (size >= 16) { if (tid < 8) smem[tid] = sum = sum + smem[tid + 8];226barrier(CLK_LOCAL_MEM_FENCE); }227if (size >= 8) { if (tid < 4) smem[tid] = sum = sum + smem[tid + 4];228barrier(CLK_LOCAL_MEM_FENCE); }229if (size >= 4) { if (tid < 2) smem[tid] = sum = sum + smem[tid + 2];230barrier(CLK_LOCAL_MEM_FENCE); }231if (size >= 2) { if (tid < 1) smem[tid] = sum = sum + smem[tid + 1];232barrier(CLK_LOCAL_MEM_FENCE); }233#else234if (tid < 32)235{236if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];237#if WAVE_SIZE < 32238} barrier(CLK_LOCAL_MEM_FENCE);239if (tid < 16) {240#endif241if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];242if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];243if (size >= 8) smem[tid] = sum = sum + smem[tid + 4];244if (size >= 4) smem[tid] = sum = sum + smem[tid + 2];245if (size >= 2) smem[tid] = sum = sum + smem[tid + 1];246}247#endif248249return sum;250}251252__kernel void normalize_hists_kernel(253const int nthreads, const int block_hist_size, const int img_block_width,254__global float* block_hists, const float threshold, __local float *squares)255{256const int tid = get_local_id(0);257const int gidX = get_group_id(0);258const int gidY = get_group_id(1);259260__global float* hist = block_hists + (gidY * img_block_width + gidX) *261block_hist_size + tid;262263float elem = 0.f;264if (tid < block_hist_size)265elem = hist[0];266267squares[tid] = elem * elem;268269barrier(CLK_LOCAL_MEM_FENCE);270float sum = reduce_smem(squares, nthreads);271272float scale = 1.0f / (sqrt(sum) + 0.1f * block_hist_size);273elem = min(elem * scale, threshold);274275barrier(CLK_LOCAL_MEM_FENCE);276squares[tid] = elem * elem;277278barrier(CLK_LOCAL_MEM_FENCE);279sum = reduce_smem(squares, nthreads);280scale = 1.0f / (sqrt(sum) + 1e-3f);281282if (tid < block_hist_size)283hist[0] = elem * scale;284}285286//---------------------------------------------------------------------287// Linear SVM based classification288// 48x96 window, 9 bins and default parameters289// 180 threads, each thread corresponds to a bin in a row290__kernel void classify_hists_180_kernel(291const int cdescr_width, const int cdescr_height, const int cblock_hist_size,292const int img_win_width, const int img_block_width,293const int win_block_stride_x, const int win_block_stride_y,294__global const float * block_hists, __global const float* coefs,295float free_coef, float threshold, __global uchar* labels)296{297const int tid = get_local_id(0);298const int gidX = get_group_id(0);299const int gidY = get_group_id(1);300301__global const float* hist = block_hists + (gidY * win_block_stride_y *302img_block_width + gidX * win_block_stride_x) * cblock_hist_size;303304float product = 0.f;305306for (int i = 0; i < cdescr_height; i++)307{308product += coefs[i * cdescr_width + tid] *309hist[i * img_block_width * cblock_hist_size + tid];310}311312__local float products[180];313314products[tid] = product;315316barrier(CLK_LOCAL_MEM_FENCE);317318if (tid < 90) products[tid] = product = product + products[tid + 90];319barrier(CLK_LOCAL_MEM_FENCE);320321if (tid < 45) products[tid] = product = product + products[tid + 45];322barrier(CLK_LOCAL_MEM_FENCE);323324volatile __local float* smem = products;325#ifdef CPU326if (tid < 13) smem[tid] = product = product + smem[tid + 32];327barrier(CLK_LOCAL_MEM_FENCE);328if (tid < 16) smem[tid] = product = product + smem[tid + 16];329barrier(CLK_LOCAL_MEM_FENCE);330if(tid<8) smem[tid] = product = product + smem[tid + 8];331barrier(CLK_LOCAL_MEM_FENCE);332if(tid<4) smem[tid] = product = product + smem[tid + 4];333barrier(CLK_LOCAL_MEM_FENCE);334if(tid<2) smem[tid] = product = product + smem[tid + 2];335barrier(CLK_LOCAL_MEM_FENCE);336#else337if (tid < 13)338{339smem[tid] = product = product + smem[tid + 32];340}341#if WAVE_SIZE < 32342barrier(CLK_LOCAL_MEM_FENCE);343#endif344if (tid < 16)345{346smem[tid] = product = product + smem[tid + 16];347smem[tid] = product = product + smem[tid + 8];348smem[tid] = product = product + smem[tid + 4];349smem[tid] = product = product + smem[tid + 2];350}351#endif352353if (tid == 0){354product = product + smem[tid + 1];355labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);356}357}358359//---------------------------------------------------------------------360// Linear SVM based classification361// 64x128 window, 9 bins and default parameters362// 256 threads, 252 of them are used363__kernel void classify_hists_252_kernel(364const int cdescr_width, const int cdescr_height, const int cblock_hist_size,365const int img_win_width, const int img_block_width,366const int win_block_stride_x, const int win_block_stride_y,367__global const float * block_hists, __global const float* coefs,368float free_coef, float threshold, __global uchar* labels)369{370const int tid = get_local_id(0);371const int gidX = get_group_id(0);372const int gidY = get_group_id(1);373374__global const float* hist = block_hists + (gidY * win_block_stride_y *375img_block_width + gidX * win_block_stride_x) * cblock_hist_size;376377float product = 0.f;378if (tid < cdescr_width)379{380for (int i = 0; i < cdescr_height; i++)381product += coefs[i * cdescr_width + tid] *382hist[i * img_block_width * cblock_hist_size + tid];383}384385__local float products[NTHREADS];386387products[tid] = product;388389barrier(CLK_LOCAL_MEM_FENCE);390391if (tid < 128) products[tid] = product = product + products[tid + 128];392barrier(CLK_LOCAL_MEM_FENCE);393394if (tid < 64) products[tid] = product = product + products[tid + 64];395barrier(CLK_LOCAL_MEM_FENCE);396397volatile __local float* smem = products;398#ifdef CPU399if(tid<32) smem[tid] = product = product + smem[tid + 32];400barrier(CLK_LOCAL_MEM_FENCE);401if(tid<16) smem[tid] = product = product + smem[tid + 16];402barrier(CLK_LOCAL_MEM_FENCE);403if(tid<8) smem[tid] = product = product + smem[tid + 8];404barrier(CLK_LOCAL_MEM_FENCE);405if(tid<4) smem[tid] = product = product + smem[tid + 4];406barrier(CLK_LOCAL_MEM_FENCE);407if(tid<2) smem[tid] = product = product + smem[tid + 2];408barrier(CLK_LOCAL_MEM_FENCE);409#else410if (tid < 32)411{412smem[tid] = product = product + smem[tid + 32];413#if WAVE_SIZE < 32414} barrier(CLK_LOCAL_MEM_FENCE);415if (tid < 16) {416#endif417smem[tid] = product = product + smem[tid + 16];418smem[tid] = product = product + smem[tid + 8];419smem[tid] = product = product + smem[tid + 4];420smem[tid] = product = product + smem[tid + 2];421}422#endif423if (tid == 0){424product = product + smem[tid + 1];425labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);426}427}428429//---------------------------------------------------------------------430// Linear SVM based classification431// 256 threads432__kernel void classify_hists_kernel(433const int cdescr_size, const int cdescr_width, const int cblock_hist_size,434const int img_win_width, const int img_block_width,435const int win_block_stride_x, const int win_block_stride_y,436__global const float * block_hists, __global const float* coefs,437float free_coef, float threshold, __global uchar* labels)438{439const int tid = get_local_id(0);440const int gidX = get_group_id(0);441const int gidY = get_group_id(1);442443__global const float* hist = block_hists + (gidY * win_block_stride_y *444img_block_width + gidX * win_block_stride_x) * cblock_hist_size;445446float product = 0.f;447for (int i = tid; i < cdescr_size; i += NTHREADS)448{449int offset_y = i / cdescr_width;450int offset_x = i - offset_y * cdescr_width;451product += coefs[i] *452hist[offset_y * img_block_width * cblock_hist_size + offset_x];453}454455__local float products[NTHREADS];456457products[tid] = product;458459barrier(CLK_LOCAL_MEM_FENCE);460461if (tid < 128) products[tid] = product = product + products[tid + 128];462barrier(CLK_LOCAL_MEM_FENCE);463464if (tid < 64) products[tid] = product = product + products[tid + 64];465barrier(CLK_LOCAL_MEM_FENCE);466467volatile __local float* smem = products;468#ifdef CPU469if(tid<32) smem[tid] = product = product + smem[tid + 32];470barrier(CLK_LOCAL_MEM_FENCE);471if(tid<16) smem[tid] = product = product + smem[tid + 16];472barrier(CLK_LOCAL_MEM_FENCE);473if(tid<8) smem[tid] = product = product + smem[tid + 8];474barrier(CLK_LOCAL_MEM_FENCE);475if(tid<4) smem[tid] = product = product + smem[tid + 4];476barrier(CLK_LOCAL_MEM_FENCE);477if(tid<2) smem[tid] = product = product + smem[tid + 2];478barrier(CLK_LOCAL_MEM_FENCE);479#else480if (tid < 32)481{482smem[tid] = product = product + smem[tid + 32];483#if WAVE_SIZE < 32484} barrier(CLK_LOCAL_MEM_FENCE);485if (tid < 16) {486#endif487smem[tid] = product = product + smem[tid + 16];488smem[tid] = product = product + smem[tid + 8];489smem[tid] = product = product + smem[tid + 4];490smem[tid] = product = product + smem[tid + 2];491}492#endif493if (tid == 0){494smem[tid] = product = product + smem[tid + 1];495labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);496}497}498499//----------------------------------------------------------------------------500// Extract descriptors501502__kernel void extract_descrs_by_rows_kernel(503const int cblock_hist_size, const int descriptors_quadstep,504const int cdescr_size, const int cdescr_width, const int img_block_width,505const int win_block_stride_x, const int win_block_stride_y,506__global const float* block_hists, __global float* descriptors)507{508int tid = get_local_id(0);509int gidX = get_group_id(0);510int gidY = get_group_id(1);511512// Get left top corner of the window in src513__global const float* hist = block_hists + (gidY * win_block_stride_y *514img_block_width + gidX * win_block_stride_x) * cblock_hist_size;515516// Get left top corner of the window in dst517__global float* descriptor = descriptors +518(gidY * get_num_groups(0) + gidX) * descriptors_quadstep;519520// Copy elements from src to dst521for (int i = tid; i < cdescr_size; i += NTHREADS)522{523int offset_y = i / cdescr_width;524int offset_x = i - offset_y * cdescr_width;525descriptor[i] = hist[offset_y * img_block_width * cblock_hist_size + offset_x];526}527}528529__kernel void extract_descrs_by_cols_kernel(530const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size,531const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width,532const int win_block_stride_x, const int win_block_stride_y,533__global const float* block_hists, __global float* descriptors)534{535int tid = get_local_id(0);536int gidX = get_group_id(0);537int gidY = get_group_id(1);538539// Get left top corner of the window in src540__global const float* hist = block_hists + (gidY * win_block_stride_y *541img_block_width + gidX * win_block_stride_x) * cblock_hist_size;542543// Get left top corner of the window in dst544__global float* descriptor = descriptors +545(gidY * get_num_groups(0) + gidX) * descriptors_quadstep;546547// Copy elements from src to dst548for (int i = tid; i < cdescr_size; i += NTHREADS)549{550int block_idx = i / cblock_hist_size;551int idx_in_block = i - block_idx * cblock_hist_size;552553int y = block_idx / cnblocks_win_x;554int x = block_idx - y * cnblocks_win_x;555556descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block] =557hist[(y * img_block_width + x) * cblock_hist_size + idx_in_block];558}559}560561//----------------------------------------------------------------------------562// Gradients computation563564__kernel void compute_gradients_8UC4_kernel(565const int height, const int width,566const int img_step, const int grad_quadstep, const int qangle_step,567const __global uchar4 * img, __global float * grad, __global QANGLE_TYPE * qangle,568const float angle_scale, const char correct_gamma, const int cnbins)569{570const int x = get_global_id(0);571const int tid = get_local_id(0);572const int gSizeX = get_local_size(0);573const int gidY = get_group_id(1);574575__global const uchar4* row = img + gidY * img_step;576577__local float sh_row[(NTHREADS + 2) * 3];578579uchar4 val;580if (x < width)581val = row[x];582else583val = row[width - 2];584585sh_row[tid + 1] = val.x;586sh_row[tid + 1 + (NTHREADS + 2)] = val.y;587sh_row[tid + 1 + 2 * (NTHREADS + 2)] = val.z;588589if (tid == 0)590{591val = row[max(x - 1, 1)];592sh_row[0] = val.x;593sh_row[(NTHREADS + 2)] = val.y;594sh_row[2 * (NTHREADS + 2)] = val.z;595}596597if (tid == gSizeX - 1)598{599val = row[min(x + 1, width - 2)];600sh_row[gSizeX + 1] = val.x;601sh_row[gSizeX + 1 + (NTHREADS + 2)] = val.y;602sh_row[gSizeX + 1 + 2 * (NTHREADS + 2)] = val.z;603}604605barrier(CLK_LOCAL_MEM_FENCE);606if (x < width)607{608float4 a = (float4) (sh_row[tid], sh_row[tid + (NTHREADS + 2)],609sh_row[tid + 2 * (NTHREADS + 2)], 0);610float4 b = (float4) (sh_row[tid + 2], sh_row[tid + 2 + (NTHREADS + 2)],611sh_row[tid + 2 + 2 * (NTHREADS + 2)], 0);612613float4 dx;614if (correct_gamma == 1)615dx = sqrt(b) - sqrt(a);616else617dx = b - a;618619float4 dy = (float4) 0.f;620621if (gidY > 0 && gidY < height - 1)622{623a = convert_float4(img[(gidY - 1) * img_step + x].xyzw);624b = convert_float4(img[(gidY + 1) * img_step + x].xyzw);625626if (correct_gamma == 1)627dy = sqrt(b) - sqrt(a);628else629dy = b - a;630}631632float4 mag = hypot(dx, dy);633float best_dx = dx.x;634float best_dy = dy.x;635636float mag0 = mag.x;637if (mag0 < mag.y)638{639best_dx = dx.y;640best_dy = dy.y;641mag0 = mag.y;642}643644if (mag0 < mag.z)645{646best_dx = dx.z;647best_dy = dy.z;648mag0 = mag.z;649}650651float ang = (atan2(best_dy, best_dx) + CV_PI_F) * angle_scale - 0.5f;652int hidx = (int)floor(ang);653ang -= hidx;654hidx = (hidx + cnbins) % cnbins;655656qangle[(gidY * qangle_step + x) << 1] = hidx;657qangle[((gidY * qangle_step + x) << 1) + 1] = (hidx + 1) % cnbins;658grad[(gidY * grad_quadstep + x) << 1] = mag0 * (1.f - ang);659grad[((gidY * grad_quadstep + x) << 1) + 1] = mag0 * ang;660}661}662663__kernel void compute_gradients_8UC1_kernel(664const int height, const int width,665const int img_step, const int grad_quadstep, const int qangle_step,666__global const uchar * img, __global float * grad, __global QANGLE_TYPE * qangle,667const float angle_scale, const char correct_gamma, const int cnbins)668{669const int x = get_global_id(0);670const int tid = get_local_id(0);671const int gSizeX = get_local_size(0);672const int gidY = get_group_id(1);673674__global const uchar* row = img + gidY * img_step;675676__local float sh_row[NTHREADS + 2];677678if (x < width)679sh_row[tid + 1] = row[x];680else681sh_row[tid + 1] = row[width - 2];682683if (tid == 0)684sh_row[0] = row[max(x - 1, 1)];685686if (tid == gSizeX - 1)687sh_row[gSizeX + 1] = row[min(x + 1, width - 2)];688689barrier(CLK_LOCAL_MEM_FENCE);690if (x < width)691{692float dx;693694if (correct_gamma == 1)695dx = sqrt(sh_row[tid + 2]) - sqrt(sh_row[tid]);696else697dx = sh_row[tid + 2] - sh_row[tid];698699float dy = 0.f;700if (gidY > 0 && gidY < height - 1)701{702float a = (float) img[ (gidY + 1) * img_step + x ];703float b = (float) img[ (gidY - 1) * img_step + x ];704if (correct_gamma == 1)705dy = sqrt(a) - sqrt(b);706else707dy = a - b;708}709float mag = hypot(dx, dy);710711float ang = (atan2(dy, dx) + CV_PI_F) * angle_scale - 0.5f;712int hidx = (int)floor(ang);713ang -= hidx;714hidx = (hidx + cnbins) % cnbins;715716qangle[ (gidY * qangle_step + x) << 1 ] = hidx;717qangle[ ((gidY * qangle_step + x) << 1) + 1 ] = (hidx + 1) % cnbins;718grad[ (gidY * grad_quadstep + x) << 1 ] = mag * (1.f - ang);719grad[ ((gidY * grad_quadstep + x) << 1) + 1 ] = mag * ang;720}721}722723724