Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/modules/features2d/src/opencl/brute_force_match.cl
16339 views
1
/*M///////////////////////////////////////////////////////////////////////////////////////
2
//
3
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4
//
5
// By downloading, copying, installing or using the software you agree to this license.
6
// If you do not agree to this license, do not download, install,
7
// copy or use the software.
8
//
9
//
10
// License Agreement
11
// For Open Source Computer Vision Library
12
//
13
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
14
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15
// Third party copyrights are property of their respective owners.
16
//
17
// @Authors
18
// Nathan, liujun@multicorewareinc.com
19
// Peng Xiao, pengxiao@outlook.com
20
// Baichuan Su, baichuan@multicorewareinc.com
21
//
22
// Redistribution and use in source and binary forms, with or without modification,
23
// are permitted provided that the following conditions are met:
24
//
25
// * Redistribution's of source code must retain the above copyright notice,
26
// this list of conditions and the following disclaimer.
27
//
28
// * Redistribution's in binary form must reproduce the above copyright notice,
29
// this list of conditions and the following disclaimer in the documentation
30
// and/or other materials provided with the distribution.
31
//
32
// * The name of the copyright holders may not be used to endorse or promote products
33
// derived from this software without specific prior written permission.
34
//
35
// This software is provided by the copyright holders and contributors "as is" and
36
// any express or implied warranties, including, but not limited to, the implied
37
// warranties of merchantability and fitness for a particular purpose are disclaimed.
38
// In no event shall the Intel Corporation or contributors be liable for any direct,
39
// indirect, incidental, special, exemplary, or consequential damages
40
// (including, but not limited to, procurement of substitute goods or services;
41
// loss of use, data, or profits; or business interruption) however caused
42
// and on any theory of liability, whether in contract, strict liability,
43
// or tort (including negligence or otherwise) arising in any way out of
44
// the use of this software, even if advised of the possibility of such damage.
45
//
46
//M*/
47
48
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable

// Largest finite single-precision value; used as the "no match yet" distance.
#define MAX_FLOAT 3.40282e+038f

// Descriptor element type; normally supplied by the host via -D T=...
#ifndef T
#define T float
#endif

// Side length of the square tile processed per work-group.
#ifndef BLOCK_SIZE
#define BLOCK_SIZE 16
#endif
// Maximum descriptor length (in vector lanes) handled by the fully unrolled
// "cached query" path; 0 selects the generic tiled path.
#ifndef MAX_DESC_LEN
#define MAX_DESC_LEN 64
#endif

// Row pitch padded by one element to reduce local-memory bank conflicts.
#define BLOCK_SIZE_ODD (BLOCK_SIZE + 1)
#ifndef SHARED_MEM_SZ
// Cached-query layout: whole query rows (BLOCK_SIZE x MAX_DESC_LEN) plus one
// train tile; otherwise two padded BLOCK_SIZE x BLOCK_SIZE tiles.
// kercn (vector width) is defined by the host compile options.
# if (BLOCK_SIZE < MAX_DESC_LEN)
# define SHARED_MEM_SZ (kercn * (BLOCK_SIZE * MAX_DESC_LEN + BLOCK_SIZE * BLOCK_SIZE))
# else
# define SHARED_MEM_SZ (kercn * 2 * BLOCK_SIZE_ODD * BLOCK_SIZE)
# endif
#endif

// Distance metric selector: 2 = L1, 4 = L2, 6 = Hamming.
#ifndef DIST_TYPE
#define DIST_TYPE 2
#endif

// dirty fix for non-template support
// NOTE: every DIST(x, y) variant accumulates into a local variable that MUST
// be named `result` in the calling scope (deliberate name capture).
// DIST_RES(x) finalizes the metric (e.g. sqrt for L2).
#if (DIST_TYPE == 2) // L1Dist
# ifdef T_FLOAT
typedef float result_type;
# if (8 == kercn)
typedef float8 value_type;
# define DIST(x, y) {value_type d = fabs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3 + d.s4 + d.s5 + d.s6 + d.s7;}
# elif (4 == kercn)
typedef float4 value_type;
# define DIST(x, y) {value_type d = fabs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3;}
# else
typedef float value_type;
# define DIST(x, y) result += fabs((x) - (y))
# endif
# else
typedef int result_type;
# if (8 == kercn)
typedef int8 value_type;
# define DIST(x, y) {value_type d = abs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3 + d.s4 + d.s5 + d.s6 + d.s7;}
# elif (4 == kercn)
typedef int4 value_type;
# define DIST(x, y) {value_type d = abs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3;}
# else
typedef int value_type;
# define DIST(x, y) result += abs((x) - (y))
# endif
# endif
# define DIST_RES(x) (x)
#elif (DIST_TYPE == 4) // L2Dist
typedef float result_type;
# if (8 == kercn)
typedef float8 value_type;
# define DIST(x, y) {value_type d = ((x) - (y)); result += dot(d.s0123, d.s0123) + dot(d.s4567, d.s4567);}
# elif (4 == kercn)
typedef float4 value_type;
# define DIST(x, y) {value_type d = ((x) - (y)); result += dot(d, d);}
# else
typedef float value_type;
# define DIST(x, y) {value_type d = ((x) - (y)); result = mad(d, d, result);}
# endif
# define DIST_RES(x) sqrt(x)
#elif (DIST_TYPE == 6) // Hamming
# if (8 == kercn)
typedef int8 value_type;
# elif (4 == kercn)
typedef int4 value_type;
# else
typedef int value_type;
# endif
typedef int result_type;
# define DIST(x, y) result += popcount( (x) ^ (y) )
# define DIST_RES(x) (x)
#endif
128
129
// Partial distance between one query row (lidy) and one train column (lidx)
// of the padded BLOCK_SIZE x BLOCK_SIZE tiles held in local memory, with the
// metric finalizer (DIST_RES, e.g. sqrt for L2) applied to the tile sum.
// The accumulator must be named `result`: the DIST macro captures that name.
inline result_type reduce_block(
        __local value_type *s_query,
        __local value_type *s_train,
        int lidx,
        int lidy
        )
{
    result_type result = 0;
#pragma unroll
    for (int k = 0 ; k < BLOCK_SIZE ; ++k)
    {
        // s_query is row-major, s_train is stored transposed; both use the
        // BLOCK_SIZE_ODD pitch to dodge bank conflicts.
        DIST(s_query[mad24(lidy, BLOCK_SIZE_ODD, k)],
             s_train[mad24(k, BLOCK_SIZE_ODD, lidx)]);
    }
    return DIST_RES(result);
}
144
145
// Same tile reduction as reduce_block but WITHOUT the DIST_RES finalizer, so
// callers can keep accumulating raw partial sums across multiple tiles and
// finalize once at the end. The accumulator must be named `result` because
// the DIST macro captures that name.
inline result_type reduce_block_match(
        __local value_type *s_query,
        __local value_type *s_train,
        int lidx,
        int lidy
        )
{
    result_type result = 0;
#pragma unroll
    for (int k = 0 ; k < BLOCK_SIZE ; ++k)
    {
        DIST(s_query[mad24(lidy, BLOCK_SIZE_ODD, k)],
             s_train[mad24(k, BLOCK_SIZE_ODD, lidx)]);
    }
    return result;
}
160
161
// Tile reduction for the "cached query" layout: the whole query row
// (MAX_DESC_LEN elements, unpadded pitch) stays resident in local memory and
// block_index selects which BLOCK_SIZE-wide slice to pair with the current
// (transposed, unpadded) train tile. No DIST_RES here - the caller finalizes.
// The accumulator must be named `result`; the DIST macro captures that name.
inline result_type reduce_multi_block(
        __local value_type *s_query,
        __local value_type *s_train,
        int block_index,
        int lidx,
        int lidy
        )
{
    result_type result = 0;
    // Invariant base offset of this thread's query slice, hoisted out of the loop.
    const int qbase = mad24(lidy, MAX_DESC_LEN, block_index * BLOCK_SIZE);
#pragma unroll
    for (int k = 0 ; k < BLOCK_SIZE ; ++k)
    {
        DIST(s_query[qbase + k], s_train[mad24(k, BLOCK_SIZE, lidx)]);
    }
    return result;
}
177
178
// Exhaustive 1-NN matcher: for each query descriptor, find the single train
// descriptor with the smallest distance (metric fixed at compile time by
// DIST_TYPE). One work-group handles BLOCK_SIZE query rows; launched 1-D over
// query rows with a BLOCK_SIZE x BLOCK_SIZE local size.
// Outputs: bestTrainIdx[q] / bestDistance[q] for each query row q.
// step is the row pitch of query/train in elements of T.
__kernel void BruteForceMatch_Match(
    __global T *query,
    __global T *train,
    __global int *bestTrainIdx,
    __global float *bestDistance,
    int query_rows,
    int query_cols,
    int train_rows,
    int train_cols,
    int step
)
{
    const int lidx = get_local_id(0);
    const int lidy = get_local_id(1);
    const int groupidx = get_group_id(0);

    const int queryIdx = mad24(BLOCK_SIZE, groupidx, lidy);
    // Clamp out-of-range rows to the last valid row so every thread can load
    // safely; out-of-range results are discarded by the final bounds check.
    const int queryOffset = min(queryIdx, query_rows - 1) * step;
    __global TN *query_vec = (__global TN *)(query + queryOffset);
    // Reinterpret rows as kercn-wide vectors; iterate in vector units.
    query_cols /= kercn;

    __local float sharebuffer[SHARED_MEM_SZ];
    __local value_type *s_query = (__local value_type *)sharebuffer;

#if 0 < MAX_DESC_LEN
    // Cached-query path: the entire query row fits in local memory and is
    // loaded once, outside the train loop.
    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;
    // load the query into local memory.
#pragma unroll
    for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; i++)
    {
        const int loadx = mad24(BLOCK_SIZE, i, lidx);
        s_query[mad24(MAX_DESC_LEN, lidy, loadx)] = loadx < query_cols ? query_vec[loadx] : 0;
    }
#else
    // Generic path: query and train are streamed tile by tile through two
    // padded (BLOCK_SIZE_ODD pitch) tiles; the train tile is stored transposed.
    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE;
    const int s_query_i = mad24(BLOCK_SIZE_ODD, lidy, lidx);
    const int s_train_i = mad24(BLOCK_SIZE_ODD, lidx, lidy);
#endif

    float myBestDistance = MAX_FLOAT;
    int myBestTrainIdx = -1;

    // loopUnrolledCached to find the best trainIdx and best distance.
    for (int t = 0, endt = (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; t++)
    {
        result_type result = 0;

        const int trainOffset = min(mad24(BLOCK_SIZE, t, lidy), train_rows - 1) * step;
        __global TN *train_vec = (__global TN *)(train + trainOffset);
#if 0 < MAX_DESC_LEN
#pragma unroll
        for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; i++)
        {
            //load a BLOCK_SIZE * BLOCK_SIZE block into local train.
            const int loadx = mad24(BLOCK_SIZE, i, lidx);
            s_train[mad24(BLOCK_SIZE, lidx, lidy)] = loadx < train_cols ? train_vec[loadx] : 0;

            //synchronize to make sure each elem for reduceIteration in share memory is written already.
            barrier(CLK_LOCAL_MEM_FENCE);

            result += reduce_multi_block(s_query, s_train, i, lidx, lidy);

            barrier(CLK_LOCAL_MEM_FENCE);
        }
#else
        for (int i = 0, endq = (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endq; i++)
        {
            const int loadx = mad24(i, BLOCK_SIZE, lidx);
            //load query and train into local memory
            // Zero-pad past the descriptor end so the padding contributes
            // nothing to any of the supported distance metrics.
            if (loadx < query_cols)
            {
                s_query[s_query_i] = query_vec[loadx];
                s_train[s_train_i] = train_vec[loadx];
            }
            else
            {
                s_query[s_query_i] = 0;
                s_train[s_train_i] = 0;
            }

            barrier(CLK_LOCAL_MEM_FENCE);

            result += reduce_block_match(s_query, s_train, lidx, lidy);

            barrier(CLK_LOCAL_MEM_FENCE);
        }
#endif
        // Finalize the metric once over the full accumulated sum (sqrt for L2).
        result = DIST_RES(result);

        const int trainIdx = mad24(BLOCK_SIZE, t, lidx);

        if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/)
        {
            myBestDistance = result;
            myBestTrainIdx = trainIdx;
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // Reuse the shared buffer for the per-row reduction of the BLOCK_SIZE
    // per-thread candidates down to a single best match.
    __local float *s_distance = (__local float *)sharebuffer;
    __local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE);

    //findBestMatch
    s_distance += lidy * BLOCK_SIZE_ODD;
    s_trainIdx += lidy * BLOCK_SIZE_ODD;
    s_distance[lidx] = myBestDistance;
    s_trainIdx[lidx] = myBestTrainIdx;

    barrier(CLK_LOCAL_MEM_FENCE);

    //reduce -- now all reduce implement in each threads.
#pragma unroll
    for (int k = 0 ; k < BLOCK_SIZE; k++)
    {
        if (myBestDistance > s_distance[k])
        {
            myBestDistance = s_distance[k];
            myBestTrainIdx = s_trainIdx[k];
        }
    }

    // Every thread in the row computed the same minimum; thread 0 writes it.
    if (queryIdx < query_rows && lidx == 0)
    {
        bestTrainIdx[queryIdx] = myBestTrainIdx;
        bestDistance[queryIdx] = myBestDistance;
    }
}
306
307
//radius_match
308
// Radius matcher: record every (query, train) pair whose distance is below
// maxDistance. Launched 2-D: group x tiles train rows, group y tiles query
// rows. nMatches[q] counts matches per query (via global atomics; requires
// cl_khr_global_int32_base_atomics); at most bestTrainIdx_cols matches per
// query are stored. ostep is the row pitch of the output arrays.
// NOTE(review): reduce_block applies DIST_RES per tile and the partial
// results are summed, so for multi-tile descriptors the L2 value is a sum of
// per-tile square roots rather than one sqrt of the total - presumably the
// host accounts for this; confirm against the caller.
__kernel void BruteForceMatch_RadiusMatch(
    __global T *query,
    __global T *train,
    float maxDistance,
    __global int *bestTrainIdx,
    __global float *bestDistance,
    __global int *nMatches,
    int query_rows,
    int query_cols,
    int train_rows,
    int train_cols,
    int bestTrainIdx_cols,
    int step,
    int ostep
)
{
    const int lidx = get_local_id(0);
    const int lidy = get_local_id(1);
    const int groupidx = get_group_id(0);
    const int groupidy = get_group_id(1);

    const int queryIdx = mad24(BLOCK_SIZE, groupidy, lidy);
    // Clamp to the last row so loads stay in bounds; out-of-range threads are
    // filtered by the bounds check before any result is written.
    const int queryOffset = min(queryIdx, query_rows - 1) * step;
    __global TN *query_vec = (__global TN *)(query + queryOffset);

    const int trainIdx = mad24(BLOCK_SIZE, groupidx, lidx);
    // Loads use lidy (cooperative, one row per thread-row); the trainIdx this
    // thread scores uses lidx - the tile is stored transposed below.
    const int trainOffset = min(mad24(BLOCK_SIZE, groupidx, lidy), train_rows - 1) * step;
    __global TN *train_vec = (__global TN *)(train + trainOffset);

    // Iterate descriptors in kercn-wide vector units.
    query_cols /= kercn;

    __local float sharebuffer[SHARED_MEM_SZ];
    __local value_type *s_query = (__local value_type *)sharebuffer;
    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE;

    result_type result = 0;
    // Padded-pitch tile indices; s_train is written transposed.
    const int s_query_i = mad24(BLOCK_SIZE_ODD, lidy, lidx);
    const int s_train_i = mad24(BLOCK_SIZE_ODD, lidx, lidy);
    for (int i = 0 ; i < (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE ; ++i)
    {
        //load a BLOCK_SIZE * BLOCK_SIZE block into local train.
        const int loadx = mad24(BLOCK_SIZE, i, lidx);

        // Zero-pad past the descriptor end so padding adds no distance.
        if (loadx < query_cols)
        {
            s_query[s_query_i] = query_vec[loadx];
            s_train[s_train_i] = train_vec[loadx];
        }
        else
        {
            s_query[s_query_i] = 0;
            s_train[s_train_i] = 0;
        }

        //synchronize to make sure each elem for reduceIteration in share memory is written already.
        barrier(CLK_LOCAL_MEM_FENCE);

        result += reduce_block(s_query, s_train, lidx, lidy);

        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (queryIdx < query_rows && trainIdx < train_rows && convert_float(result) < maxDistance)
    {
        // Reserve a slot in this query's match list; excess matches beyond
        // bestTrainIdx_cols are counted in nMatches but not stored.
        int ind = atom_inc(nMatches + queryIdx);

        if(ind < bestTrainIdx_cols)
        {
            bestTrainIdx[mad24(queryIdx, ostep, ind)] = trainIdx;
            bestDistance[mad24(queryIdx, ostep, ind)] = result;
        }
    }
}
380
381
// 2-NN matcher: for each query descriptor, find the two closest train
// descriptors (k = 2, as used by ratio-test matching). Same tiling scheme as
// BruteForceMatch_Match; outputs the two best indices/distances packed as
// int2 / float2 per query row.
__kernel void BruteForceMatch_knnMatch(
    __global T *query,
    __global T *train,
    __global int2 *bestTrainIdx,
    __global float2 *bestDistance,
    int query_rows,
    int query_cols,
    int train_rows,
    int train_cols,
    int step
)
{
    const int lidx = get_local_id(0);
    const int lidy = get_local_id(1);
    const int groupidx = get_group_id(0);

    const int queryIdx = mad24(BLOCK_SIZE, groupidx, lidy);
    // Clamp to the last valid row so loads stay in bounds; results from
    // clamped rows are discarded by the bounds checks below.
    const int queryOffset = min(queryIdx, query_rows - 1) * step;
    __global TN *query_vec = (__global TN *)(query + queryOffset);
    // Iterate descriptors in kercn-wide vector units.
    query_cols /= kercn;

    __local float sharebuffer[SHARED_MEM_SZ];
    __local value_type *s_query = (__local value_type *)sharebuffer;

#if 0 < MAX_DESC_LEN
    // Cached-query path: the whole query row is loaded once up front.
    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;
    // load the query into local memory.
#pragma unroll
    for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE; i ++)
    {
        int loadx = mad24(BLOCK_SIZE, i, lidx);
        s_query[mad24(MAX_DESC_LEN, lidy, loadx)] = loadx < query_cols ? query_vec[loadx] : 0;
    }
#else
    // Generic path: stream padded, transposed tiles.
    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE;
    const int s_query_i = mad24(BLOCK_SIZE_ODD, lidy, lidx);
    const int s_train_i = mad24(BLOCK_SIZE_ODD, lidx, lidy);
#endif

    // Per-thread running best and second-best candidates.
    float myBestDistance1 = MAX_FLOAT;
    float myBestDistance2 = MAX_FLOAT;
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;

    for (int t = 0, endt = (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt ; t++)
    {
        result_type result = 0;

        int trainOffset = min(mad24(BLOCK_SIZE, t, lidy), train_rows - 1) * step;
        __global TN *train_vec = (__global TN *)(train + trainOffset);
#if 0 < MAX_DESC_LEN
#pragma unroll
        for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; i++)
        {
            //load a BLOCK_SIZE * BLOCK_SIZE block into local train.
            const int loadx = mad24(BLOCK_SIZE, i, lidx);
            s_train[mad24(BLOCK_SIZE, lidx, lidy)] = loadx < train_cols ? train_vec[loadx] : 0;

            //synchronize to make sure each elem for reduceIteration in share memory is written already.
            barrier(CLK_LOCAL_MEM_FENCE);

            result += reduce_multi_block(s_query, s_train, i, lidx, lidy);

            barrier(CLK_LOCAL_MEM_FENCE);
        }
#else
        for (int i = 0, endq = (query_cols + BLOCK_SIZE -1) / BLOCK_SIZE; i < endq ; i++)
        {
            const int loadx = mad24(BLOCK_SIZE, i, lidx);
            //load query and train into local memory
            // Zero-pad past the descriptor end so padding adds no distance.
            if (loadx < query_cols)
            {
                s_query[s_query_i] = query_vec[loadx];
                s_train[s_train_i] = train_vec[loadx];
            }
            else
            {
                s_query[s_query_i] = 0;
                s_train[s_train_i] = 0;
            }

            barrier(CLK_LOCAL_MEM_FENCE);

            result += reduce_block_match(s_query, s_train, lidx, lidy);

            barrier(CLK_LOCAL_MEM_FENCE);
        }
#endif
        // Finalize the metric once over the accumulated sum (sqrt for L2).
        result = DIST_RES(result);

        const int trainIdx = mad24(BLOCK_SIZE, t, lidx);

        // Standard top-2 insertion: a new best demotes the old best to
        // second place.
        if (queryIdx < query_rows && trainIdx < train_rows)
        {
            if (result < myBestDistance1)
            {
                myBestDistance2 = myBestDistance1;
                myBestTrainIdx2 = myBestTrainIdx1;
                myBestDistance1 = result;
                myBestTrainIdx1 = trainIdx;
            }
            else if (result < myBestDistance2)
            {
                myBestDistance2 = result;
                myBestTrainIdx2 = trainIdx;
            }
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // Reuse the shared buffer to reduce the BLOCK_SIZE per-thread top-2
    // candidates for each query row down to a single top-2, in two passes.
    __local float *s_distance = (__local float *)sharebuffer;
    __local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE);

    // find BestMatch
    s_distance += lidy * BLOCK_SIZE_ODD;
    s_trainIdx += lidy * BLOCK_SIZE_ODD;
    // Pass 1: publish each thread's best candidate.
    s_distance[lidx] = myBestDistance1;
    s_trainIdx[lidx] = myBestTrainIdx1;

    float bestDistance1 = MAX_FLOAT;
    float bestDistance2 = MAX_FLOAT;
    int bestTrainIdx1 = -1;
    int bestTrainIdx2 = -1;
    barrier(CLK_LOCAL_MEM_FENCE);

    // Thread 0 of each row scans the published bests, keeping a top-2.
    if (lidx == 0)
    {
        for (int i = 0 ; i < BLOCK_SIZE ; i++)
        {
            float val = s_distance[i];
            if (val < bestDistance1)
            {
                bestDistance2 = bestDistance1;
                bestTrainIdx2 = bestTrainIdx1;

                bestDistance1 = val;
                bestTrainIdx1 = s_trainIdx[i];
            }
            else if (val < bestDistance2)
            {
                bestDistance2 = val;
                bestTrainIdx2 = s_trainIdx[i];
            }
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // Pass 2: publish each thread's second-best candidate, since the global
    // runner-up may be some thread's #2 rather than any thread's #1.
    s_distance[lidx] = myBestDistance2;
    s_trainIdx[lidx] = myBestTrainIdx2;

    barrier(CLK_LOCAL_MEM_FENCE);

    if (lidx == 0)
    {
        for (int i = 0 ; i < BLOCK_SIZE ; i++)
        {
            float val = s_distance[i];

            if (val < bestDistance2)
            {
                bestDistance2 = val;
                bestTrainIdx2 = s_trainIdx[i];
            }
        }
    }

    // Only lidx == 0 holds the reduced values; other threads still carry the
    // initial MAX_FLOAT/-1 but never write output below.
    myBestDistance1 = bestDistance1;
    myBestDistance2 = bestDistance2;

    myBestTrainIdx1 = bestTrainIdx1;
    myBestTrainIdx2 = bestTrainIdx2;

    if (queryIdx < query_rows && lidx == 0)
    {
        bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
        bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
    }
}
561