Path: blob/master/modules/features2d/src/opencl/brute_force_match.cl
16339 views
/*M///////////////////////////////////////////////////////////////////////////////////////1//2// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.3//4// By downloading, copying, installing or using the software you agree to this license.5// If you do not agree to this license, do not download, install,6// copy or use the software.7//8//9// License Agreement10// For Open Source Computer Vision Library11//12// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.13// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.14// Third party copyrights are property of their respective owners.15//16// @Authors17// Nathan, liujun@multicorewareinc.com18// Peng Xiao, pengxiao@outlook.com19// Baichuan Su, baichuan@multicorewareinc.com20//21// Redistribution and use in source and binary forms, with or without modification,22// are permitted provided that the following conditions are met:23//24// * Redistribution's of source code must retain the above copyright notice,25// this list of conditions and the following disclaimer.26//27// * Redistribution's in binary form must reproduce the above copyright notice,28// this list of conditions and the following disclaimer in the documentation29// and/or other materials provided with the distribution.30//31// * The name of the copyright holders may not be used to endorse or promote products32// derived from this software without specific prior written permission.33//34// This software is provided by the copyright holders and contributors "as is" and35// any express or implied warranties, including, but not limited to, the implied36// warranties of merchantability and fitness for a particular purpose are disclaimed.37// In no event shall the Intel Corporation or contributors be liable for any direct,38// indirect, incidental, special, exemplary, or consequential damages39// (including, but not limited to, procurement of substitute goods or services;40// loss of use, data, or profits; or business interruption) however caused41// and on any theory of liability, whether in contract, strict liability,42// or tort (including negligence or otherwise) arising in any way out of43// the use of this software, even if advised of the possibility of such damage.44//45//M*/4647#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable48#define MAX_FLOAT 3.40282e+038f4950#ifndef T51#define T float52#endif5354#ifndef BLOCK_SIZE55#define BLOCK_SIZE 1656#endif57#ifndef MAX_DESC_LEN58#define MAX_DESC_LEN 6459#endif6061#define BLOCK_SIZE_ODD (BLOCK_SIZE + 1)62#ifndef SHARED_MEM_SZ63# if (BLOCK_SIZE < MAX_DESC_LEN)64# define SHARED_MEM_SZ (kercn * (BLOCK_SIZE * MAX_DESC_LEN + BLOCK_SIZE * BLOCK_SIZE))65# else66# define SHARED_MEM_SZ (kercn * 2 * BLOCK_SIZE_ODD * BLOCK_SIZE)67# endif68#endif6970#ifndef DIST_TYPE71#define DIST_TYPE 272#endif7374// dirty fix for non-template support75#if (DIST_TYPE == 2) // L1Dist76# ifdef T_FLOAT77typedef float result_type;78# if (8 == kercn)79typedef float8 value_type;80# define DIST(x, y) {value_type d = fabs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3 + d.s4 + d.s5 + d.s6 + d.s7;}81# elif (4 == kercn)82typedef float4 value_type;83# define DIST(x, y) {value_type d = fabs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3;}84# else85typedef float value_type;86# define DIST(x, y) result += fabs((x) - (y))87# endif88# else89typedef int result_type;90# if (8 == kercn)91typedef int8 value_type;92# define DIST(x, y) {value_type d = abs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3 + d.s4 + d.s5 + d.s6 + d.s7;}93# elif (4 == kercn)94typedef int4 value_type;95# define DIST(x, y) {value_type d = abs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3;}96# else97typedef int value_type;98# define DIST(x, y) result += abs((x) - (y))99# endif100# endif101# define DIST_RES(x) (x)102#elif (DIST_TYPE == 4) // L2Dist103typedef float result_type;104# if (8 == kercn)105typedef float8 value_type;106# define DIST(x, y) {value_type d = ((x) - (y)); result += dot(d.s0123, d.s0123) + dot(d.s4567, d.s4567);}107# elif (4 == kercn)108typedef float4 value_type;109# define DIST(x, y) {value_type d = ((x) - (y)); result += dot(d, d);}110# else111typedef float value_type;112# define DIST(x, y) {value_type d = ((x) - (y)); result = mad(d, d, result);}113# endif114# define DIST_RES(x) sqrt(x)115#elif (DIST_TYPE == 6) // Hamming116# if (8 == kercn)117typedef int8 value_type;118# elif (4 == kercn)119typedef int4 value_type;120# else121typedef int value_type;122# endif123typedef int result_type;124# define DIST(x, y) result += popcount( (x) ^ (y) )125# define DIST_RES(x) (x)126#endif127128inline result_type reduce_block(129__local value_type *s_query,130__local value_type *s_train,131int lidx,132int lidy133)134{135result_type result = 0;136#pragma unroll137for (int j = 0 ; j < BLOCK_SIZE ; j++)138{139DIST(s_query[lidy * BLOCK_SIZE_ODD + j], s_train[j * BLOCK_SIZE_ODD + lidx]);140}141return DIST_RES(result);142}143144inline result_type reduce_block_match(145__local value_type *s_query,146__local value_type *s_train,147int lidx,148int lidy149)150{151result_type result = 0;152#pragma unroll153for (int j = 0 ; j < BLOCK_SIZE ; j++)154{155DIST(s_query[lidy * BLOCK_SIZE_ODD + j], s_train[j * BLOCK_SIZE_ODD + lidx]);156}157return result;158}159160inline result_type reduce_multi_block(161__local value_type *s_query,162__local value_type *s_train,163int block_index,164int lidx,165int lidy166)167{168result_type result = 0;169#pragma unroll170for (int j = 0 ; j < BLOCK_SIZE ; j++)171{172DIST(s_query[lidy * MAX_DESC_LEN + block_index * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + lidx]);173}174return result;175}176177__kernel void BruteForceMatch_Match(178__global T *query,179__global T *train,180__global int *bestTrainIdx,181__global float *bestDistance,182int query_rows,183int query_cols,184int train_rows,185int train_cols,186int step187)188{189const int lidx = get_local_id(0);190const int lidy = get_local_id(1);191const int groupidx = get_group_id(0);192193const int queryIdx = mad24(BLOCK_SIZE, groupidx, lidy);194const int queryOffset = min(queryIdx, query_rows - 1) * step;195__global TN *query_vec = (__global TN *)(query + queryOffset);196query_cols /= kercn;197198__local float sharebuffer[SHARED_MEM_SZ];199__local value_type *s_query = (__local value_type *)sharebuffer;200201#if 0 < MAX_DESC_LEN202__local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;203// load the query into local memory.204#pragma unroll205for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; i++)206{207const int loadx = mad24(BLOCK_SIZE, i, lidx);208s_query[mad24(MAX_DESC_LEN, lidy, loadx)] = loadx < query_cols ? query_vec[loadx] : 0;209}210#else211__local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE;212const int s_query_i = mad24(BLOCK_SIZE_ODD, lidy, lidx);213const int s_train_i = mad24(BLOCK_SIZE_ODD, lidx, lidy);214#endif215216float myBestDistance = MAX_FLOAT;217int myBestTrainIdx = -1;218219// loopUnrolledCached to find the best trainIdx and best distance.220for (int t = 0, endt = (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; t++)221{222result_type result = 0;223224const int trainOffset = min(mad24(BLOCK_SIZE, t, lidy), train_rows - 1) * step;225__global TN *train_vec = (__global TN *)(train + trainOffset);226#if 0 < MAX_DESC_LEN227#pragma unroll228for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; i++)229{230//load a BLOCK_SIZE * BLOCK_SIZE block into local train.231const int loadx = mad24(BLOCK_SIZE, i, lidx);232s_train[mad24(BLOCK_SIZE, lidx, lidy)] = loadx < train_cols ? train_vec[loadx] : 0;233234//synchronize to make sure each elem for reduceIteration in share memory is written already.235barrier(CLK_LOCAL_MEM_FENCE);236237result += reduce_multi_block(s_query, s_train, i, lidx, lidy);238239barrier(CLK_LOCAL_MEM_FENCE);240}241#else242for (int i = 0, endq = (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endq; i++)243{244const int loadx = mad24(i, BLOCK_SIZE, lidx);245//load query and train into local memory246if (loadx < query_cols)247{248s_query[s_query_i] = query_vec[loadx];249s_train[s_train_i] = train_vec[loadx];250}251else252{253s_query[s_query_i] = 0;254s_train[s_train_i] = 0;255}256257barrier(CLK_LOCAL_MEM_FENCE);258259result += reduce_block_match(s_query, s_train, lidx, lidy);260261barrier(CLK_LOCAL_MEM_FENCE);262}263#endif264result = DIST_RES(result);265266const int trainIdx = mad24(BLOCK_SIZE, t, lidx);267268if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/)269{270myBestDistance = result;271myBestTrainIdx = trainIdx;272}273}274275barrier(CLK_LOCAL_MEM_FENCE);276277__local float *s_distance = (__local float *)sharebuffer;278__local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE);279280//findBestMatch281s_distance += lidy * BLOCK_SIZE_ODD;282s_trainIdx += lidy * BLOCK_SIZE_ODD;283s_distance[lidx] = myBestDistance;284s_trainIdx[lidx] = myBestTrainIdx;285286barrier(CLK_LOCAL_MEM_FENCE);287288//reduce -- now all reduce implement in each threads.289#pragma unroll290for (int k = 0 ; k < BLOCK_SIZE; k++)291{292if (myBestDistance > s_distance[k])293{294myBestDistance = s_distance[k];295myBestTrainIdx = s_trainIdx[k];296}297}298299if (queryIdx < query_rows && lidx == 0)300{301bestTrainIdx[queryIdx] = myBestTrainIdx;302bestDistance[queryIdx] = myBestDistance;303}304}305306//radius_match307__kernel void BruteForceMatch_RadiusMatch(308__global T *query,309__global T *train,310float maxDistance,311__global int *bestTrainIdx,312__global float *bestDistance,313__global int *nMatches,314int query_rows,315int query_cols,316int train_rows,317int train_cols,318int bestTrainIdx_cols,319int step,320int ostep321)322{323const int lidx = get_local_id(0);324const int lidy = get_local_id(1);325const int groupidx = get_group_id(0);326const int groupidy = get_group_id(1);327328const int queryIdx = mad24(BLOCK_SIZE, groupidy, lidy);329const int queryOffset = min(queryIdx, query_rows - 1) * step;330__global TN *query_vec = (__global TN *)(query + queryOffset);331332const int trainIdx = mad24(BLOCK_SIZE, groupidx, lidx);333const int trainOffset = min(mad24(BLOCK_SIZE, groupidx, lidy), train_rows - 1) * step;334__global TN *train_vec = (__global TN *)(train + trainOffset);335336query_cols /= kercn;337338__local float sharebuffer[SHARED_MEM_SZ];339__local value_type *s_query = (__local value_type *)sharebuffer;340__local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE;341342result_type result = 0;343const int s_query_i = mad24(BLOCK_SIZE_ODD, lidy, lidx);344const int s_train_i = mad24(BLOCK_SIZE_ODD, lidx, lidy);345for (int i = 0 ; i < (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE ; ++i)346{347//load a BLOCK_SIZE * BLOCK_SIZE block into local train.348const int loadx = mad24(BLOCK_SIZE, i, lidx);349350if (loadx < query_cols)351{352s_query[s_query_i] = query_vec[loadx];353s_train[s_train_i] = train_vec[loadx];354}355else356{357s_query[s_query_i] = 0;358s_train[s_train_i] = 0;359}360361//synchronize to make sure each elem for reduceIteration in share memory is written already.362barrier(CLK_LOCAL_MEM_FENCE);363364result += reduce_block(s_query, s_train, lidx, lidy);365366barrier(CLK_LOCAL_MEM_FENCE);367}368if (queryIdx < query_rows && trainIdx < train_rows && convert_float(result) < maxDistance)369{370int ind = atom_inc(nMatches + queryIdx);371372if(ind < bestTrainIdx_cols)373{374bestTrainIdx[mad24(queryIdx, ostep, ind)] = trainIdx;375bestDistance[mad24(queryIdx, ostep, ind)] = result;376}377}378}379380__kernel void BruteForceMatch_knnMatch(381__global T *query,382__global T *train,383__global int2 *bestTrainIdx,384__global float2 *bestDistance,385int query_rows,386int query_cols,387int train_rows,388int train_cols,389int step390)391{392const int lidx = get_local_id(0);393const int lidy = get_local_id(1);394const int groupidx = get_group_id(0);395396const int queryIdx = mad24(BLOCK_SIZE, groupidx, lidy);397const int queryOffset = min(queryIdx, query_rows - 1) * step;398__global TN *query_vec = (__global TN *)(query + queryOffset);399query_cols /= kercn;400401__local float sharebuffer[SHARED_MEM_SZ];402__local value_type *s_query = (__local value_type *)sharebuffer;403404#if 0 < MAX_DESC_LEN405__local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;406// load the query into local memory.407#pragma unroll408for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE; i ++)409{410int loadx = mad24(BLOCK_SIZE, i, lidx);411s_query[mad24(MAX_DESC_LEN, lidy, loadx)] = loadx < query_cols ? query_vec[loadx] : 0;412}413#else414__local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE;415const int s_query_i = mad24(BLOCK_SIZE_ODD, lidy, lidx);416const int s_train_i = mad24(BLOCK_SIZE_ODD, lidx, lidy);417#endif418419float myBestDistance1 = MAX_FLOAT;420float myBestDistance2 = MAX_FLOAT;421int myBestTrainIdx1 = -1;422int myBestTrainIdx2 = -1;423424for (int t = 0, endt = (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt ; t++)425{426result_type result = 0;427428int trainOffset = min(mad24(BLOCK_SIZE, t, lidy), train_rows - 1) * step;429__global TN *train_vec = (__global TN *)(train + trainOffset);430#if 0 < MAX_DESC_LEN431#pragma unroll432for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; i++)433{434//load a BLOCK_SIZE * BLOCK_SIZE block into local train.435const int loadx = mad24(BLOCK_SIZE, i, lidx);436s_train[mad24(BLOCK_SIZE, lidx, lidy)] = loadx < train_cols ? train_vec[loadx] : 0;437438//synchronize to make sure each elem for reduceIteration in share memory is written already.439barrier(CLK_LOCAL_MEM_FENCE);440441result += reduce_multi_block(s_query, s_train, i, lidx, lidy);442443barrier(CLK_LOCAL_MEM_FENCE);444}445#else446for (int i = 0, endq = (query_cols + BLOCK_SIZE -1) / BLOCK_SIZE; i < endq ; i++)447{448const int loadx = mad24(BLOCK_SIZE, i, lidx);449//load query and train into local memory450if (loadx < query_cols)451{452s_query[s_query_i] = query_vec[loadx];453s_train[s_train_i] = train_vec[loadx];454}455else456{457s_query[s_query_i] = 0;458s_train[s_train_i] = 0;459}460461barrier(CLK_LOCAL_MEM_FENCE);462463result += reduce_block_match(s_query, s_train, lidx, lidy);464465barrier(CLK_LOCAL_MEM_FENCE);466}467#endif468result = DIST_RES(result);469470const int trainIdx = mad24(BLOCK_SIZE, t, lidx);471472if (queryIdx < query_rows && trainIdx < train_rows)473{474if (result < myBestDistance1)475{476myBestDistance2 = myBestDistance1;477myBestTrainIdx2 = myBestTrainIdx1;478myBestDistance1 = result;479myBestTrainIdx1 = trainIdx;480}481else if (result < myBestDistance2)482{483myBestDistance2 = result;484myBestTrainIdx2 = trainIdx;485}486}487}488489barrier(CLK_LOCAL_MEM_FENCE);490491__local float *s_distance = (__local float *)sharebuffer;492__local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE);493494// find BestMatch495s_distance += lidy * BLOCK_SIZE_ODD;496s_trainIdx += lidy * BLOCK_SIZE_ODD;497s_distance[lidx] = myBestDistance1;498s_trainIdx[lidx] = myBestTrainIdx1;499500float bestDistance1 = MAX_FLOAT;501float bestDistance2 = MAX_FLOAT;502int bestTrainIdx1 = -1;503int bestTrainIdx2 = -1;504barrier(CLK_LOCAL_MEM_FENCE);505506if (lidx == 0)507{508for (int i = 0 ; i < BLOCK_SIZE ; i++)509{510float val = s_distance[i];511if (val < bestDistance1)512{513bestDistance2 = bestDistance1;514bestTrainIdx2 = bestTrainIdx1;515516bestDistance1 = val;517bestTrainIdx1 = s_trainIdx[i];518}519else if (val < bestDistance2)520{521bestDistance2 = val;522bestTrainIdx2 = s_trainIdx[i];523}524}525}526527barrier(CLK_LOCAL_MEM_FENCE);528529s_distance[lidx] = myBestDistance2;530s_trainIdx[lidx] = myBestTrainIdx2;531532barrier(CLK_LOCAL_MEM_FENCE);533534if (lidx == 0)535{536for (int i = 0 ; i < BLOCK_SIZE ; i++)537{538float val = s_distance[i];539540if (val < bestDistance2)541{542bestDistance2 = val;543bestTrainIdx2 = s_trainIdx[i];544}545}546}547548myBestDistance1 = bestDistance1;549myBestDistance2 = bestDistance2;550551myBestTrainIdx1 = bestTrainIdx1;552myBestTrainIdx2 = bestTrainIdx2;553554if (queryIdx < query_rows && lidx == 0)555{556bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);557bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);558}559}560561