Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/modules/features2d/src/opencl/brute_force_match.cl
16339 views
1
/*M///////////////////////////////////////////////////////////////////////////////////////
2
//
3
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4
//
5
// By downloading, copying, installing or using the software you agree to this license.
6
// If you do not agree to this license, do not download, install,
7
// copy or use the software.
8
//
9
//
10
// License Agreement
11
// For Open Source Computer Vision Library
12
//
13
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
14
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15
// Third party copyrights are property of their respective owners.
16
//
17
// @Authors
18
// Nathan, liujun@multicorewareinc.com
19
// Peng Xiao, pengxiao@outlook.com
20
// Baichuan Su, baichuan@multicorewareinc.com
21
//
22
// Redistribution and use in source and binary forms, with or without modification,
23
// are permitted provided that the following conditions are met:
24
//
25
// * Redistribution's of source code must retain the above copyright notice,
26
// this list of conditions and the following disclaimer.
27
//
28
// * Redistribution's in binary form must reproduce the above copyright notice,
29
// this list of conditions and the following disclaimer in the documentation
30
// and/or other materials provided with the distribution.
31
//
32
// * The name of the copyright holders may not be used to endorse or promote products
33
// derived from this software without specific prior written permission.
34
//
35
// This software is provided by the copyright holders and contributors "as is" and
36
// any express or implied warranties, including, but not limited to, the implied
37
// warranties of merchantability and fitness for a particular purpose are disclaimed.
38
// In no event shall the Intel Corporation or contributors be liable for any direct,
39
// indirect, incidental, special, exemplary, or consequential damages
40
// (including, but not limited to, procurement of substitute goods or services;
41
// loss of use, data, or profits; or business interruption) however caused
42
// and on any theory of liability, whether in contract, strict liability,
43
// or tort (including negligence or otherwise) arising in any way out of
44
// the use of this software, even if advised of the possibility of such damage.
45
//
46
//M*/
47
48
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable

// Largest finite single-precision value; used as the "no match yet" distance.
#define MAX_FLOAT 3.40282e+038f

// Descriptor element type; normally supplied by the host via -D T=...
#ifndef T
#define T float
#endif

// Side length of the square tile processed per work-group.
#ifndef BLOCK_SIZE
#define BLOCK_SIZE 16
#endif
// Maximum descriptor length (in vector lanes) handled by the fully unrolled
// "cached query" path; 0 selects the generic tiled path.
#ifndef MAX_DESC_LEN
#define MAX_DESC_LEN 64
#endif

// Row pitch padded by one element to reduce local-memory bank conflicts.
#define BLOCK_SIZE_ODD (BLOCK_SIZE + 1)
#ifndef SHARED_MEM_SZ
// Cached-query layout: whole query rows (BLOCK_SIZE x MAX_DESC_LEN) plus one
// train tile; otherwise two padded BLOCK_SIZE x BLOCK_SIZE tiles.
// kercn (vector width) is defined by the host compile options.
# if (BLOCK_SIZE < MAX_DESC_LEN)
# define SHARED_MEM_SZ (kercn * (BLOCK_SIZE * MAX_DESC_LEN + BLOCK_SIZE * BLOCK_SIZE))
# else
# define SHARED_MEM_SZ (kercn * 2 * BLOCK_SIZE_ODD * BLOCK_SIZE)
# endif
#endif

// Distance metric selector: 2 = L1, 4 = L2, 6 = Hamming.
#ifndef DIST_TYPE
#define DIST_TYPE 2
#endif

// dirty fix for non-template support
// NOTE: every DIST(x, y) variant accumulates into a local variable that MUST
// be named `result` in the calling scope (deliberate name capture).
// DIST_RES(x) finalizes the metric (e.g. sqrt for L2).
#if (DIST_TYPE == 2) // L1Dist
# ifdef T_FLOAT
typedef float result_type;
# if (8 == kercn)
typedef float8 value_type;
# define DIST(x, y) {value_type d = fabs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3 + d.s4 + d.s5 + d.s6 + d.s7;}
# elif (4 == kercn)
typedef float4 value_type;
# define DIST(x, y) {value_type d = fabs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3;}
# else
typedef float value_type;
# define DIST(x, y) result += fabs((x) - (y))
# endif
# else
typedef int result_type;
# if (8 == kercn)
typedef int8 value_type;
# define DIST(x, y) {value_type d = abs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3 + d.s4 + d.s5 + d.s6 + d.s7;}
# elif (4 == kercn)
typedef int4 value_type;
# define DIST(x, y) {value_type d = abs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3;}
# else
typedef int value_type;
# define DIST(x, y) result += abs((x) - (y))
# endif
# endif
# define DIST_RES(x) (x)
#elif (DIST_TYPE == 4) // L2Dist
typedef float result_type;
# if (8 == kercn)
typedef float8 value_type;
# define DIST(x, y) {value_type d = ((x) - (y)); result += dot(d.s0123, d.s0123) + dot(d.s4567, d.s4567);}
# elif (4 == kercn)
typedef float4 value_type;
# define DIST(x, y) {value_type d = ((x) - (y)); result += dot(d, d);}
# else
typedef float value_type;
# define DIST(x, y) {value_type d = ((x) - (y)); result = mad(d, d, result);}
# endif
# define DIST_RES(x) sqrt(x)
#elif (DIST_TYPE == 6) // Hamming
# if (8 == kercn)
typedef int8 value_type;
# elif (4 == kercn)
typedef int4 value_type;
# else
typedef int value_type;
# endif
typedef int result_type;
# define DIST(x, y) result += popcount( (x) ^ (y) )
# define DIST_RES(x) (x)
#endif
128
129
// Partial distance between one query row (lidy) and one train column (lidx)
// of the padded BLOCK_SIZE x BLOCK_SIZE tiles held in local memory, with the
// metric finalizer (DIST_RES, e.g. sqrt for L2) applied to the tile sum.
// The accumulator must be named `result`: the DIST macro captures that name.
inline result_type reduce_block(
        __local value_type *s_query,
        __local value_type *s_train,
        int lidx,
        int lidy
        )
{
    result_type result = 0;
#pragma unroll
    for (int k = 0 ; k < BLOCK_SIZE ; ++k)
    {
        // s_query is row-major, s_train is stored transposed; both use the
        // BLOCK_SIZE_ODD pitch to dodge bank conflicts.
        DIST(s_query[mad24(lidy, BLOCK_SIZE_ODD, k)],
             s_train[mad24(k, BLOCK_SIZE_ODD, lidx)]);
    }
    return DIST_RES(result);
}
144
145
// Same tile reduction as reduce_block but WITHOUT the DIST_RES finalizer, so
// callers can keep accumulating raw partial sums across multiple tiles and
// finalize once at the end. The accumulator must be named `result` because
// the DIST macro captures that name.
inline result_type reduce_block_match(
        __local value_type *s_query,
        __local value_type *s_train,
        int lidx,
        int lidy
        )
{
    result_type result = 0;
#pragma unroll
    for (int k = 0 ; k < BLOCK_SIZE ; ++k)
    {
        DIST(s_query[mad24(lidy, BLOCK_SIZE_ODD, k)],
             s_train[mad24(k, BLOCK_SIZE_ODD, lidx)]);
    }
    return result;
}
160
161
// Tile reduction for the "cached query" layout: the whole query row
// (MAX_DESC_LEN elements, unpadded pitch) stays resident in local memory and
// block_index selects which BLOCK_SIZE-wide slice to pair with the current
// (transposed, unpadded) train tile. No DIST_RES here - the caller finalizes.
// The accumulator must be named `result`; the DIST macro captures that name.
inline result_type reduce_multi_block(
        __local value_type *s_query,
        __local value_type *s_train,
        int block_index,
        int lidx,
        int lidy
        )
{
    result_type result = 0;
    // Invariant base offset of this thread's query slice, hoisted out of the loop.
    const int qbase = mad24(lidy, MAX_DESC_LEN, block_index * BLOCK_SIZE);
#pragma unroll
    for (int k = 0 ; k < BLOCK_SIZE ; ++k)
    {
        DIST(s_query[qbase + k], s_train[mad24(k, BLOCK_SIZE, lidx)]);
    }
    return result;
}
177
178
// Exhaustive 1-NN matcher: for each query descriptor, find the single train
// descriptor with the smallest distance (metric fixed at compile time by
// DIST_TYPE). One work-group handles BLOCK_SIZE query rows; launched 1-D over
// query rows with a BLOCK_SIZE x BLOCK_SIZE local size.
// Outputs: bestTrainIdx[q] / bestDistance[q] for each query row q.
// step is the row pitch of query/train in elements of T.
__kernel void BruteForceMatch_Match(
    __global T *query,
    __global T *train,
    __global int *bestTrainIdx,
    __global float *bestDistance,
    int query_rows,
    int query_cols,
    int train_rows,
    int train_cols,
    int step
)
{
    const int lidx = get_local_id(0);
    const int lidy = get_local_id(1);
    const int groupidx = get_group_id(0);

    const int queryIdx = mad24(BLOCK_SIZE, groupidx, lidy);
    // Clamp out-of-range rows to the last valid row so every thread can load
    // safely; out-of-range results are discarded by the final bounds check.
    const int queryOffset = min(queryIdx, query_rows - 1) * step;
    __global TN *query_vec = (__global TN *)(query + queryOffset);
    // Reinterpret rows as kercn-wide vectors; iterate in vector units.
    query_cols /= kercn;

    __local float sharebuffer[SHARED_MEM_SZ];
    __local value_type *s_query = (__local value_type *)sharebuffer;

#if 0 < MAX_DESC_LEN
    // Cached-query path: the entire query row fits in local memory and is
    // loaded once, outside the train loop.
    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;
    // load the query into local memory.
#pragma unroll
    for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; i++)
    {
        const int loadx = mad24(BLOCK_SIZE, i, lidx);
        s_query[mad24(MAX_DESC_LEN, lidy, loadx)] = loadx < query_cols ? query_vec[loadx] : 0;
    }
#else
    // Generic path: query and train are streamed tile by tile through two
    // padded (BLOCK_SIZE_ODD pitch) tiles; the train tile is stored transposed.
    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE;
    const int s_query_i = mad24(BLOCK_SIZE_ODD, lidy, lidx);
    const int s_train_i = mad24(BLOCK_SIZE_ODD, lidx, lidy);
#endif

    float myBestDistance = MAX_FLOAT;
    int myBestTrainIdx = -1;

    // loopUnrolledCached to find the best trainIdx and best distance.
    for (int t = 0, endt = (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; t++)
    {
        result_type result = 0;

        const int trainOffset = min(mad24(BLOCK_SIZE, t, lidy), train_rows - 1) * step;
        __global TN *train_vec = (__global TN *)(train + trainOffset);
#if 0 < MAX_DESC_LEN
#pragma unroll
        for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; i++)
        {
            //load a BLOCK_SIZE * BLOCK_SIZE block into local train.
            const int loadx = mad24(BLOCK_SIZE, i, lidx);
            s_train[mad24(BLOCK_SIZE, lidx, lidy)] = loadx < train_cols ? train_vec[loadx] : 0;

            //synchronize to make sure each elem for reduceIteration in share memory is written already.
            barrier(CLK_LOCAL_MEM_FENCE);

            result += reduce_multi_block(s_query, s_train, i, lidx, lidy);

            barrier(CLK_LOCAL_MEM_FENCE);
        }
#else
        for (int i = 0, endq = (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endq; i++)
        {
            const int loadx = mad24(i, BLOCK_SIZE, lidx);
            //load query and train into local memory
            // Zero-pad past the descriptor end so the padding contributes
            // nothing to any of the supported distance metrics.
            if (loadx < query_cols)
            {
                s_query[s_query_i] = query_vec[loadx];
                s_train[s_train_i] = train_vec[loadx];
            }
            else
            {
                s_query[s_query_i] = 0;
                s_train[s_train_i] = 0;
            }

            barrier(CLK_LOCAL_MEM_FENCE);

            result += reduce_block_match(s_query, s_train, lidx, lidy);

            barrier(CLK_LOCAL_MEM_FENCE);
        }
#endif
        // Finalize the metric once over the full accumulated sum (sqrt for L2).
        result = DIST_RES(result);

        const int trainIdx = mad24(BLOCK_SIZE, t, lidx);

        if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/)
        {
            myBestDistance = result;
            myBestTrainIdx = trainIdx;
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // Reuse the shared buffer for the per-row reduction of the BLOCK_SIZE
    // per-thread candidates down to a single best match.
    __local float *s_distance = (__local float *)sharebuffer;
    __local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE);

    //findBestMatch
    s_distance += lidy * BLOCK_SIZE_ODD;
    s_trainIdx += lidy * BLOCK_SIZE_ODD;
    s_distance[lidx] = myBestDistance;
    s_trainIdx[lidx] = myBestTrainIdx;

    barrier(CLK_LOCAL_MEM_FENCE);

    //reduce -- now all reduce implement in each threads.
#pragma unroll
    for (int k = 0 ; k < BLOCK_SIZE; k++)
    {
        if (myBestDistance > s_distance[k])
        {
            myBestDistance = s_distance[k];
            myBestTrainIdx = s_trainIdx[k];
        }
    }

    // Every thread in the row computed the same minimum; thread 0 writes it.
    if (queryIdx < query_rows && lidx == 0)
    {
        bestTrainIdx[queryIdx] = myBestTrainIdx;
        bestDistance[queryIdx] = myBestDistance;
    }
}
306
307
//radius_match
308
// Radius matcher: record every (query, train) pair whose distance is below
// maxDistance. Launched 2-D: group x tiles train rows, group y tiles query
// rows. nMatches[q] counts matches per query (via global atomics; requires
// cl_khr_global_int32_base_atomics); at most bestTrainIdx_cols matches per
// query are stored. ostep is the row pitch of the output arrays.
// NOTE(review): reduce_block applies DIST_RES per tile and the partial
// results are summed, so for multi-tile descriptors the L2 value is a sum of
// per-tile square roots rather than one sqrt of the total - presumably the
// host accounts for this; confirm against the caller.
__kernel void BruteForceMatch_RadiusMatch(
    __global T *query,
    __global T *train,
    float maxDistance,
    __global int *bestTrainIdx,
    __global float *bestDistance,
    __global int *nMatches,
    int query_rows,
    int query_cols,
    int train_rows,
    int train_cols,
    int bestTrainIdx_cols,
    int step,
    int ostep
)
{
    const int lidx = get_local_id(0);
    const int lidy = get_local_id(1);
    const int groupidx = get_group_id(0);
    const int groupidy = get_group_id(1);

    const int queryIdx = mad24(BLOCK_SIZE, groupidy, lidy);
    // Clamp to the last row so loads stay in bounds; out-of-range threads are
    // filtered by the bounds check before any result is written.
    const int queryOffset = min(queryIdx, query_rows - 1) * step;
    __global TN *query_vec = (__global TN *)(query + queryOffset);

    const int trainIdx = mad24(BLOCK_SIZE, groupidx, lidx);
    // Loads use lidy (cooperative, one row per thread-row); the trainIdx this
    // thread scores uses lidx - the tile is stored transposed below.
    const int trainOffset = min(mad24(BLOCK_SIZE, groupidx, lidy), train_rows - 1) * step;
    __global TN *train_vec = (__global TN *)(train + trainOffset);

    // Iterate descriptors in kercn-wide vector units.
    query_cols /= kercn;

    __local float sharebuffer[SHARED_MEM_SZ];
    __local value_type *s_query = (__local value_type *)sharebuffer;
    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE;

    result_type result = 0;
    // Padded-pitch tile indices; s_train is written transposed.
    const int s_query_i = mad24(BLOCK_SIZE_ODD, lidy, lidx);
    const int s_train_i = mad24(BLOCK_SIZE_ODD, lidx, lidy);
    for (int i = 0 ; i < (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE ; ++i)
    {
        //load a BLOCK_SIZE * BLOCK_SIZE block into local train.
        const int loadx = mad24(BLOCK_SIZE, i, lidx);

        // Zero-pad past the descriptor end so padding adds no distance.
        if (loadx < query_cols)
        {
            s_query[s_query_i] = query_vec[loadx];
            s_train[s_train_i] = train_vec[loadx];
        }
        else
        {
            s_query[s_query_i] = 0;
            s_train[s_train_i] = 0;
        }

        //synchronize to make sure each elem for reduceIteration in share memory is written already.
        barrier(CLK_LOCAL_MEM_FENCE);

        result += reduce_block(s_query, s_train, lidx, lidy);

        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (queryIdx < query_rows && trainIdx < train_rows && convert_float(result) < maxDistance)
    {
        // Reserve a slot in this query's match list; excess matches beyond
        // bestTrainIdx_cols are counted in nMatches but not stored.
        int ind = atom_inc(nMatches + queryIdx);

        if(ind < bestTrainIdx_cols)
        {
            bestTrainIdx[mad24(queryIdx, ostep, ind)] = trainIdx;
            bestDistance[mad24(queryIdx, ostep, ind)] = result;
        }
    }
}
380
381
// 2-NN matcher: for each query descriptor, find the two closest train
// descriptors (k = 2, as used by ratio-test matching). Same tiling scheme as
// BruteForceMatch_Match; outputs the two best indices/distances packed as
// int2 / float2 per query row.
__kernel void BruteForceMatch_knnMatch(
    __global T *query,
    __global T *train,
    __global int2 *bestTrainIdx,
    __global float2 *bestDistance,
    int query_rows,
    int query_cols,
    int train_rows,
    int train_cols,
    int step
)
{
    const int lidx = get_local_id(0);
    const int lidy = get_local_id(1);
    const int groupidx = get_group_id(0);

    const int queryIdx = mad24(BLOCK_SIZE, groupidx, lidy);
    // Clamp to the last valid row so loads stay in bounds; results from
    // clamped rows are discarded by the bounds checks below.
    const int queryOffset = min(queryIdx, query_rows - 1) * step;
    __global TN *query_vec = (__global TN *)(query + queryOffset);
    // Iterate descriptors in kercn-wide vector units.
    query_cols /= kercn;

    __local float sharebuffer[SHARED_MEM_SZ];
    __local value_type *s_query = (__local value_type *)sharebuffer;

#if 0 < MAX_DESC_LEN
    // Cached-query path: the whole query row is loaded once up front.
    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;
    // load the query into local memory.
#pragma unroll
    for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE; i ++)
    {
        int loadx = mad24(BLOCK_SIZE, i, lidx);
        s_query[mad24(MAX_DESC_LEN, lidy, loadx)] = loadx < query_cols ? query_vec[loadx] : 0;
    }
#else
    // Generic path: stream padded, transposed tiles.
    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE;
    const int s_query_i = mad24(BLOCK_SIZE_ODD, lidy, lidx);
    const int s_train_i = mad24(BLOCK_SIZE_ODD, lidx, lidy);
#endif

    // Per-thread running best and second-best candidates.
    float myBestDistance1 = MAX_FLOAT;
    float myBestDistance2 = MAX_FLOAT;
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;

    for (int t = 0, endt = (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt ; t++)
    {
        result_type result = 0;

        int trainOffset = min(mad24(BLOCK_SIZE, t, lidy), train_rows - 1) * step;
        __global TN *train_vec = (__global TN *)(train + trainOffset);
#if 0 < MAX_DESC_LEN
#pragma unroll
        for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; i++)
        {
            //load a BLOCK_SIZE * BLOCK_SIZE block into local train.
            const int loadx = mad24(BLOCK_SIZE, i, lidx);
            s_train[mad24(BLOCK_SIZE, lidx, lidy)] = loadx < train_cols ? train_vec[loadx] : 0;

            //synchronize to make sure each elem for reduceIteration in share memory is written already.
            barrier(CLK_LOCAL_MEM_FENCE);

            result += reduce_multi_block(s_query, s_train, i, lidx, lidy);

            barrier(CLK_LOCAL_MEM_FENCE);
        }
#else
        for (int i = 0, endq = (query_cols + BLOCK_SIZE -1) / BLOCK_SIZE; i < endq ; i++)
        {
            const int loadx = mad24(BLOCK_SIZE, i, lidx);
            //load query and train into local memory
            // Zero-pad past the descriptor end so padding adds no distance.
            if (loadx < query_cols)
            {
                s_query[s_query_i] = query_vec[loadx];
                s_train[s_train_i] = train_vec[loadx];
            }
            else
            {
                s_query[s_query_i] = 0;
                s_train[s_train_i] = 0;
            }

            barrier(CLK_LOCAL_MEM_FENCE);

            result += reduce_block_match(s_query, s_train, lidx, lidy);

            barrier(CLK_LOCAL_MEM_FENCE);
        }
#endif
        // Finalize the metric once over the accumulated sum (sqrt for L2).
        result = DIST_RES(result);

        const int trainIdx = mad24(BLOCK_SIZE, t, lidx);

        // Standard top-2 insertion: a new best demotes the old best to
        // second place.
        if (queryIdx < query_rows && trainIdx < train_rows)
        {
            if (result < myBestDistance1)
            {
                myBestDistance2 = myBestDistance1;
                myBestTrainIdx2 = myBestTrainIdx1;
                myBestDistance1 = result;
                myBestTrainIdx1 = trainIdx;
            }
            else if (result < myBestDistance2)
            {
                myBestDistance2 = result;
                myBestTrainIdx2 = trainIdx;
            }
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // Reuse the shared buffer to reduce the BLOCK_SIZE per-thread top-2
    // candidates for each query row down to a single top-2, in two passes.
    __local float *s_distance = (__local float *)sharebuffer;
    __local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE);

    // find BestMatch
    s_distance += lidy * BLOCK_SIZE_ODD;
    s_trainIdx += lidy * BLOCK_SIZE_ODD;
    // Pass 1: publish each thread's best candidate.
    s_distance[lidx] = myBestDistance1;
    s_trainIdx[lidx] = myBestTrainIdx1;

    float bestDistance1 = MAX_FLOAT;
    float bestDistance2 = MAX_FLOAT;
    int bestTrainIdx1 = -1;
    int bestTrainIdx2 = -1;
    barrier(CLK_LOCAL_MEM_FENCE);

    // Thread 0 of each row scans the published bests, keeping a top-2.
    if (lidx == 0)
    {
        for (int i = 0 ; i < BLOCK_SIZE ; i++)
        {
            float val = s_distance[i];
            if (val < bestDistance1)
            {
                bestDistance2 = bestDistance1;
                bestTrainIdx2 = bestTrainIdx1;

                bestDistance1 = val;
                bestTrainIdx1 = s_trainIdx[i];
            }
            else if (val < bestDistance2)
            {
                bestDistance2 = val;
                bestTrainIdx2 = s_trainIdx[i];
            }
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // Pass 2: publish each thread's second-best candidate, since the global
    // runner-up may be some thread's #2 rather than any thread's #1.
    s_distance[lidx] = myBestDistance2;
    s_trainIdx[lidx] = myBestTrainIdx2;

    barrier(CLK_LOCAL_MEM_FENCE);

    if (lidx == 0)
    {
        for (int i = 0 ; i < BLOCK_SIZE ; i++)
        {
            float val = s_distance[i];

            if (val < bestDistance2)
            {
                bestDistance2 = val;
                bestTrainIdx2 = s_trainIdx[i];
            }
        }
    }

    // Only lidx == 0 holds the reduced values; other threads still carry the
    // initial MAX_FLOAT/-1 but never write output below.
    myBestDistance1 = bestDistance1;
    myBestDistance2 = bestDistance2;

    myBestTrainIdx1 = bestTrainIdx1;
    myBestTrainIdx2 = bestTrainIdx2;

    if (queryIdx < query_rows && lidx == 0)
    {
        bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
        bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
    }
}
561