Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/modules/objdetect/src/opencl/cascadedetect.cl
16337 views
1
///////////////////////////// OpenCL kernels for face detection //////////////////////////////
2
////////////////////////////// see the opencv/doc/license.txt ///////////////////////////////
3
4
//
5
// the code has been derived from the OpenCL Haar cascade kernel by
6
//
7
// Niko Li, newlife20080214@gmail.com
8
// Wang Weiyan, wangweiyanster@gmail.com
9
// Jia Haipeng, jiahaipeng95@gmail.com
10
// Nathan, liujun@multicorewareinc.com
11
// Peng Xiao, pengxiao@outlook.com
12
// Erping Pang, erping@multicorewareinc.com
13
//
14
15
#ifdef HAAR
16
typedef struct __attribute__((aligned(4))) OptHaarFeature
17
{
18
int4 ofs[3] __attribute__((aligned (4)));
19
float4 weight __attribute__((aligned (4)));
20
}
21
OptHaarFeature;
22
#endif
23
24
#ifdef LBP
25
typedef struct __attribute__((aligned(4))) OptLBPFeature
26
{
27
int16 ofs __attribute__((aligned (4)));
28
}
29
OptLBPFeature;
30
#endif
31
32
typedef struct __attribute__((aligned(4))) Stump
33
{
34
float4 st __attribute__((aligned (4)));
35
}
36
Stump;
37
38
typedef struct __attribute__((aligned(4))) Node
39
{
40
int4 n __attribute__((aligned (4)));
41
}
42
Node;
43
44
typedef struct __attribute__((aligned (4))) Stage
45
{
46
int first __attribute__((aligned (4)));
47
int ntrees __attribute__((aligned (4)));
48
float threshold __attribute__((aligned (4)));
49
}
50
Stage;
51
52
typedef struct __attribute__((aligned (4))) ScaleData
53
{
54
float scale __attribute__((aligned (4)));
55
int szi_width __attribute__((aligned (4)));
56
int szi_height __attribute__((aligned (4)));
57
int layer_ofs __attribute__((aligned (4)));
58
int ystep __attribute__((aligned (4)));
59
}
60
ScaleData;
61
62
#ifndef SUM_BUF_SIZE
63
#define SUM_BUF_SIZE 0
64
#endif
65
66
#ifndef NODE_COUNT
67
#define NODE_COUNT 1
68
#endif
69
70
#ifdef HAAR
71
__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X,LOCAL_SIZE_Y,1)))
72
void runHaarClassifier(
73
int nscales, __global const ScaleData* scaleData,
74
__global const int* sum,
75
int _sumstep, int sumoffset,
76
__global const OptHaarFeature* optfeatures,
77
__global const Stage* stages,
78
__global const Node* nodes,
79
__global const float* leaves0,
80
81
volatile __global int* facepos,
82
int4 normrect, int sqofs, int2 windowsize)
83
{
84
int lx = get_local_id(0);
85
int ly = get_local_id(1);
86
int groupIdx = get_group_id(0);
87
int i, ngroups = get_global_size(0)/LOCAL_SIZE_X;
88
int scaleIdx, tileIdx, stageIdx;
89
int sumstep = (int)(_sumstep/sizeof(int));
90
int4 nofs0 = (int4)(mad24(normrect.y, sumstep, normrect.x),
91
mad24(normrect.y, sumstep, normrect.x + normrect.z),
92
mad24(normrect.y + normrect.w, sumstep, normrect.x),
93
mad24(normrect.y + normrect.w, sumstep, normrect.x + normrect.z));
94
int normarea = normrect.z * normrect.w;
95
float invarea = 1.f/normarea;
96
int lidx = ly*LOCAL_SIZE_X + lx;
97
98
#if SUM_BUF_SIZE > 0
99
int4 nofs = (int4)(mad24(normrect.y, SUM_BUF_STEP, normrect.x),
100
mad24(normrect.y, SUM_BUF_STEP, normrect.x + normrect.z),
101
mad24(normrect.y + normrect.w, SUM_BUF_STEP, normrect.x),
102
mad24(normrect.y + normrect.w, SUM_BUF_STEP, normrect.x + normrect.z));
103
#else
104
int4 nofs = nofs0;
105
#endif
106
#define LOCAL_SIZE (LOCAL_SIZE_X*LOCAL_SIZE_Y)
107
__local int lstore[SUM_BUF_SIZE + LOCAL_SIZE*5/2+1];
108
#if SUM_BUF_SIZE > 0
109
__local int* ibuf = lstore;
110
__local int* lcount = ibuf + SUM_BUF_SIZE;
111
#else
112
__local int* lcount = lstore;
113
#endif
114
__local float* lnf = (__local float*)(lcount + 1);
115
__local float* lpartsum = lnf + LOCAL_SIZE;
116
__local short* lbuf = (__local short*)(lpartsum + LOCAL_SIZE);
117
118
for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- )
119
{
120
__global const ScaleData* s = scaleData + scaleIdx;
121
int ystep = s->ystep;
122
int2 worksize = (int2)(max(s->szi_width - windowsize.x, 0), max(s->szi_height - windowsize.y, 0));
123
int2 ntiles = (int2)((worksize.x + LOCAL_SIZE_X-1)/LOCAL_SIZE_X,
124
(worksize.y + LOCAL_SIZE_Y-1)/LOCAL_SIZE_Y);
125
int totalTiles = ntiles.x*ntiles.y;
126
127
for( tileIdx = groupIdx; tileIdx < totalTiles; tileIdx += ngroups )
128
{
129
int ix0 = (tileIdx % ntiles.x)*LOCAL_SIZE_X;
130
int iy0 = (tileIdx / ntiles.x)*LOCAL_SIZE_Y;
131
int ix = lx, iy = ly;
132
__global const int* psum0 = sum + mad24(iy0, sumstep, ix0) + s->layer_ofs;
133
__global const int* psum1 = psum0 + mad24(iy, sumstep, ix);
134
135
if( ix0 >= worksize.x || iy0 >= worksize.y )
136
continue;
137
#if SUM_BUF_SIZE > 0
138
for( i = lidx*4; i < SUM_BUF_SIZE; i += LOCAL_SIZE_X*LOCAL_SIZE_Y*4 )
139
{
140
int dy = i/SUM_BUF_STEP, dx = i - dy*SUM_BUF_STEP;
141
vstore4(vload4(0, psum0 + mad24(dy, sumstep, dx)), 0, ibuf+i);
142
}
143
#endif
144
145
if( lidx == 0 )
146
lcount[0] = 0;
147
barrier(CLK_LOCAL_MEM_FENCE);
148
149
if( ix0 + ix < worksize.x && iy0 + iy < worksize.y )
150
{
151
#if NODE_COUNT==1
152
__global const Stump* stump = (__global const Stump*)nodes;
153
#else
154
__global const Node* node = nodes;
155
__global const float* leaves = leaves0;
156
#endif
157
#if SUM_BUF_SIZE > 0
158
__local const int* psum = ibuf + mad24(iy, SUM_BUF_STEP, ix);
159
#else
160
__global const int* psum = psum1;
161
#endif
162
163
__global const int* psqsum = (__global const int*)(psum1 + sqofs);
164
float sval = (psum[nofs.x] - psum[nofs.y] - psum[nofs.z] + psum[nofs.w])*invarea;
165
float sqval = (psqsum[nofs0.x] - psqsum[nofs0.y] - psqsum[nofs0.z] + psqsum[nofs0.w])*invarea;
166
float nf = (float)normarea * sqrt(max(sqval - sval * sval, 0.f));
167
nf = nf > 0 ? nf : 1.f;
168
169
for( stageIdx = 0; stageIdx < SPLIT_STAGE; stageIdx++ )
170
{
171
int ntrees = stages[stageIdx].ntrees;
172
float s = 0.f;
173
#if NODE_COUNT==1
174
for( i = 0; i < ntrees; i++ )
175
{
176
float4 st = stump[i].st;
177
__global const OptHaarFeature* f = optfeatures + as_int(st.x);
178
float4 weight = f->weight;
179
180
int4 ofs = f->ofs[0];
181
sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
182
ofs = f->ofs[1];
183
sval = mad((float)(psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval);
184
if( weight.z > 0 )
185
{
186
ofs = f->ofs[2];
187
sval = mad((float)(psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval);
188
}
189
190
s += (sval < st.y*nf) ? st.z : st.w;
191
}
192
stump += ntrees;
193
#else
194
for( i = 0; i < ntrees; i++, node += NODE_COUNT, leaves += NODE_COUNT+1 )
195
{
196
int idx = 0;
197
do
198
{
199
int4 n = node[idx].n;
200
__global const OptHaarFeature* f = optfeatures + n.x;
201
float4 weight = f->weight;
202
203
int4 ofs = f->ofs[0];
204
205
sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
206
ofs = f->ofs[1];
207
sval = mad((float)(psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval);
208
if( weight.z > 0 )
209
{
210
ofs = f->ofs[2];
211
sval = mad((float)(psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval);
212
}
213
214
idx = (sval < as_float(n.y)*nf) ? n.z : n.w;
215
}
216
while(idx > 0);
217
s += leaves[-idx];
218
}
219
#endif
220
221
if( s < stages[stageIdx].threshold )
222
break;
223
}
224
225
if( stageIdx == SPLIT_STAGE && (ystep == 1 || ((ix | iy) & 1) == 0) )
226
{
227
int count = atomic_inc(lcount);
228
lbuf[count] = (int)(ix | (iy << 8));
229
lnf[count] = nf;
230
}
231
}
232
233
for( stageIdx = SPLIT_STAGE; stageIdx < N_STAGES; stageIdx++ )
234
{
235
barrier(CLK_LOCAL_MEM_FENCE);
236
int nrects = lcount[0];
237
238
if( nrects == 0 )
239
break;
240
barrier(CLK_LOCAL_MEM_FENCE);
241
if( lidx == 0 )
242
lcount[0] = 0;
243
244
{
245
#if NODE_COUNT == 1
246
__global const Stump* stump = (__global const Stump*)nodes + stages[stageIdx].first;
247
#else
248
__global const Node* node = nodes + stages[stageIdx].first*NODE_COUNT;
249
__global const float* leaves = leaves0 + stages[stageIdx].first*(NODE_COUNT+1);
250
#endif
251
int nparts = LOCAL_SIZE / nrects;
252
int ntrees = stages[stageIdx].ntrees;
253
int ntrees_p = (ntrees + nparts - 1)/nparts;
254
int nr = lidx / nparts;
255
int partidx = -1, idxval = 0;
256
float partsum = 0.f, nf = 0.f;
257
258
if( nr < nrects )
259
{
260
partidx = lidx % nparts;
261
idxval = lbuf[nr];
262
nf = lnf[nr];
263
264
{
265
int ntrees0 = ntrees_p*partidx;
266
int ntrees1 = min(ntrees0 + ntrees_p, ntrees);
267
int ix1 = idxval & 255, iy1 = idxval >> 8;
268
#if SUM_BUF_SIZE > 0
269
__local const int* psum = ibuf + mad24(iy1, SUM_BUF_STEP, ix1);
270
#else
271
__global const int* psum = psum0 + mad24(iy1, sumstep, ix1);
272
#endif
273
274
#if NODE_COUNT == 1
275
for( i = ntrees0; i < ntrees1; i++ )
276
{
277
float4 st = stump[i].st;
278
__global const OptHaarFeature* f = optfeatures + as_int(st.x);
279
float4 weight = f->weight;
280
281
int4 ofs = f->ofs[0];
282
float sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
283
ofs = f->ofs[1];
284
sval = mad((float)(psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval);
285
//if( weight.z > 0 )
286
if( fabs(weight.z) > 0 )
287
{
288
ofs = f->ofs[2];
289
sval = mad((float)(psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval);
290
}
291
292
partsum += (sval < st.y*nf) ? st.z : st.w;
293
}
294
#else
295
for( i = ntrees0; i < ntrees1; i++ )
296
{
297
int idx = 0;
298
do
299
{
300
int4 n = node[i*2 + idx].n;
301
__global const OptHaarFeature* f = optfeatures + n.x;
302
float4 weight = f->weight;
303
int4 ofs = f->ofs[0];
304
305
float sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
306
ofs = f->ofs[1];
307
sval = mad((float)(psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval);
308
if( weight.z > 0 )
309
{
310
ofs = f->ofs[2];
311
sval = mad((float)(psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval);
312
}
313
314
idx = (sval < as_float(n.y)*nf) ? n.z : n.w;
315
}
316
while(idx > 0);
317
partsum += leaves[i*3-idx];
318
}
319
#endif
320
}
321
}
322
lpartsum[lidx] = partsum;
323
barrier(CLK_LOCAL_MEM_FENCE);
324
325
if( partidx == 0 )
326
{
327
float s = lpartsum[nr*nparts];
328
for( i = 1; i < nparts; i++ )
329
s += lpartsum[i + nr*nparts];
330
if( s >= stages[stageIdx].threshold )
331
{
332
int count = atomic_inc(lcount);
333
lbuf[count] = idxval;
334
lnf[count] = nf;
335
}
336
}
337
}
338
}
339
340
barrier(CLK_LOCAL_MEM_FENCE);
341
if( stageIdx == N_STAGES )
342
{
343
int nrects = lcount[0];
344
if( lidx < nrects )
345
{
346
int nfaces = atomic_inc(facepos);
347
if( nfaces < MAX_FACES )
348
{
349
volatile __global int* face = facepos + 1 + nfaces*3;
350
int val = lbuf[lidx];
351
face[0] = scaleIdx;
352
face[1] = ix0 + (val & 255);
353
face[2] = iy0 + (val >> 8);
354
}
355
}
356
}
357
}
358
}
359
}
360
#endif
361
362
#ifdef LBP
363
#undef CALC_SUM_OFS_
364
#define CALC_SUM_OFS_(p0, p1, p2, p3, ptr) \
365
((ptr)[p0] - (ptr)[p1] - (ptr)[p2] + (ptr)[p3])
366
367
__kernel void runLBPClassifierStumpSimple(
368
int nscales, __global const ScaleData* scaleData,
369
__global const int* sum,
370
int _sumstep, int sumoffset,
371
__global const OptLBPFeature* optfeatures,
372
__global const Stage* stages,
373
__global const Stump* stumps,
374
__global const int* bitsets,
375
int bitsetSize,
376
377
volatile __global int* facepos,
378
int2 windowsize)
379
{
380
int lx = get_local_id(0);
381
int ly = get_local_id(1);
382
int local_size_x = get_local_size(0);
383
int local_size_y = get_local_size(1);
384
int groupIdx = get_group_id(1)*get_num_groups(0) + get_group_id(0);
385
int ngroups = get_num_groups(0)*get_num_groups(1);
386
int scaleIdx, tileIdx, stageIdx;
387
int sumstep = (int)(_sumstep/sizeof(int));
388
389
for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- )
390
{
391
__global const ScaleData* s = scaleData + scaleIdx;
392
int ystep = s->ystep;
393
int2 worksize = (int2)(max(s->szi_width - windowsize.x, 0), max(s->szi_height - windowsize.y, 0));
394
int2 ntiles = (int2)((worksize.x/ystep + local_size_x-1)/local_size_x,
395
(worksize.y/ystep + local_size_y-1)/local_size_y);
396
int totalTiles = ntiles.x*ntiles.y;
397
398
for( tileIdx = groupIdx; tileIdx < totalTiles; tileIdx += ngroups )
399
{
400
int iy = mad24((tileIdx / ntiles.x), local_size_y, ly) * ystep;
401
int ix = mad24((tileIdx % ntiles.x), local_size_x, lx) * ystep;
402
403
if( ix < worksize.x && iy < worksize.y )
404
{
405
__global const int* p = sum + mad24(iy, sumstep, ix) + s->layer_ofs;
406
__global const Stump* stump = stumps;
407
__global const int* bitset = bitsets;
408
409
for( stageIdx = 0; stageIdx < N_STAGES; stageIdx++ )
410
{
411
int i, ntrees = stages[stageIdx].ntrees;
412
float s = 0.f;
413
for( i = 0; i < ntrees; i++, stump++, bitset += bitsetSize )
414
{
415
float4 st = stump->st;
416
__global const OptLBPFeature* f = optfeatures + as_int(st.x);
417
int16 ofs = f->ofs;
418
419
int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p );
420
421
int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0
422
idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1
423
idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2
424
425
mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5
426
mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0); // 8
427
mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0); // 7
428
mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0); // 6
429
mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0); // 7
430
431
s += (bitset[idx] & (1 << mask)) ? st.z : st.w;
432
}
433
434
if( s < stages[stageIdx].threshold )
435
break;
436
}
437
438
if( stageIdx == N_STAGES )
439
{
440
int nfaces = atomic_inc(facepos);
441
if( nfaces < MAX_FACES )
442
{
443
volatile __global int* face = facepos + 1 + nfaces*3;
444
face[0] = scaleIdx;
445
face[1] = ix;
446
face[2] = iy;
447
}
448
}
449
}
450
}
451
}
452
}
453
454
__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X,LOCAL_SIZE_Y,1)))
455
void runLBPClassifierStump(
456
int nscales, __global const ScaleData* scaleData,
457
__global const int* sum,
458
int _sumstep, int sumoffset,
459
__global const OptLBPFeature* optfeatures,
460
__global const Stage* stages,
461
__global const Stump* stumps,
462
__global const int* bitsets,
463
int bitsetSize,
464
465
volatile __global int* facepos,
466
int2 windowsize)
467
{
468
int lx = get_local_id(0);
469
int ly = get_local_id(1);
470
int groupIdx = get_group_id(0);
471
int i, ngroups = get_global_size(0)/LOCAL_SIZE_X;
472
int scaleIdx, tileIdx, stageIdx;
473
int sumstep = (int)(_sumstep/sizeof(int));
474
int lidx = ly*LOCAL_SIZE_X + lx;
475
476
#define LOCAL_SIZE (LOCAL_SIZE_X*LOCAL_SIZE_Y)
477
__local int lstore[SUM_BUF_SIZE + LOCAL_SIZE*3/2+1];
478
#if SUM_BUF_SIZE > 0
479
__local int* ibuf = lstore;
480
__local int* lcount = ibuf + SUM_BUF_SIZE;
481
#else
482
__local int* lcount = lstore;
483
#endif
484
__local float* lpartsum = (__local float*)(lcount + 1);
485
__local short* lbuf = (__local short*)(lpartsum + LOCAL_SIZE);
486
487
for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- )
488
{
489
__global const ScaleData* s = scaleData + scaleIdx;
490
int ystep = s->ystep;
491
int2 worksize = (int2)(max(s->szi_width - windowsize.x, 0), max(s->szi_height - windowsize.y, 0));
492
int2 ntiles = (int2)((worksize.x + LOCAL_SIZE_X-1)/LOCAL_SIZE_X,
493
(worksize.y + LOCAL_SIZE_Y-1)/LOCAL_SIZE_Y);
494
int totalTiles = ntiles.x*ntiles.y;
495
496
for( tileIdx = groupIdx; tileIdx < totalTiles; tileIdx += ngroups )
497
{
498
int ix0 = (tileIdx % ntiles.x)*LOCAL_SIZE_X;
499
int iy0 = (tileIdx / ntiles.x)*LOCAL_SIZE_Y;
500
int ix = lx, iy = ly;
501
__global const int* psum0 = sum + mad24(iy0, sumstep, ix0) + s->layer_ofs;
502
503
if( ix0 >= worksize.x || iy0 >= worksize.y )
504
continue;
505
#if SUM_BUF_SIZE > 0
506
for( i = lidx*4; i < SUM_BUF_SIZE; i += LOCAL_SIZE_X*LOCAL_SIZE_Y*4 )
507
{
508
int dy = i/SUM_BUF_STEP, dx = i - dy*SUM_BUF_STEP;
509
vstore4(vload4(0, psum0 + mad24(dy, sumstep, dx)), 0, ibuf+i);
510
}
511
barrier(CLK_LOCAL_MEM_FENCE);
512
#endif
513
514
if( lidx == 0 )
515
lcount[0] = 0;
516
barrier(CLK_LOCAL_MEM_FENCE);
517
518
if( ix0 + ix < worksize.x && iy0 + iy < worksize.y )
519
{
520
__global const Stump* stump = stumps;
521
__global const int* bitset = bitsets;
522
#if SUM_BUF_SIZE > 0
523
__local const int* p = ibuf + mad24(iy, SUM_BUF_STEP, ix);
524
#else
525
__global const int* p = psum0 + mad24(iy, sumstep, ix);
526
#endif
527
528
for( stageIdx = 0; stageIdx < SPLIT_STAGE; stageIdx++ )
529
{
530
int ntrees = stages[stageIdx].ntrees;
531
float s = 0.f;
532
for( i = 0; i < ntrees; i++, stump++, bitset += bitsetSize )
533
{
534
float4 st = stump->st;
535
__global const OptLBPFeature* f = optfeatures + as_int(st.x);
536
int16 ofs = f->ofs;
537
538
int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p );
539
540
int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0
541
idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1
542
idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2
543
544
mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5
545
mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0); // 8
546
mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0); // 7
547
mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0); // 6
548
mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0); // 7
549
550
s += (bitset[idx] & (1 << mask)) ? st.z : st.w;
551
}
552
553
if( s < stages[stageIdx].threshold )
554
break;
555
}
556
557
if( stageIdx == SPLIT_STAGE && (ystep == 1 || ((ix | iy) & 1) == 0) )
558
{
559
int count = atomic_inc(lcount);
560
lbuf[count] = (int)(ix | (iy << 8));
561
}
562
}
563
564
for( stageIdx = SPLIT_STAGE; stageIdx < N_STAGES; stageIdx++ )
565
{
566
int nrects = lcount[0];
567
568
barrier(CLK_LOCAL_MEM_FENCE);
569
if( nrects == 0 )
570
break;
571
if( lidx == 0 )
572
lcount[0] = 0;
573
574
{
575
__global const Stump* stump = stumps + stages[stageIdx].first;
576
__global const int* bitset = bitsets + stages[stageIdx].first*bitsetSize;
577
int nparts = LOCAL_SIZE / nrects;
578
int ntrees = stages[stageIdx].ntrees;
579
int ntrees_p = (ntrees + nparts - 1)/nparts;
580
int nr = lidx / nparts;
581
int partidx = -1, idxval = 0;
582
float partsum = 0.f, nf = 0.f;
583
584
if( nr < nrects )
585
{
586
partidx = lidx % nparts;
587
idxval = lbuf[nr];
588
589
{
590
int ntrees0 = ntrees_p*partidx;
591
int ntrees1 = min(ntrees0 + ntrees_p, ntrees);
592
int ix1 = idxval & 255, iy1 = idxval >> 8;
593
#if SUM_BUF_SIZE > 0
594
__local const int* p = ibuf + mad24(iy1, SUM_BUF_STEP, ix1);
595
#else
596
__global const int* p = psum0 + mad24(iy1, sumstep, ix1);
597
#endif
598
599
for( i = ntrees0; i < ntrees1; i++ )
600
{
601
float4 st = stump[i].st;
602
__global const OptLBPFeature* f = optfeatures + as_int(st.x);
603
int16 ofs = f->ofs;
604
605
#define CALC_SUM_OFS_(p0, p1, p2, p3, ptr) \
606
((ptr)[p0] - (ptr)[p1] - (ptr)[p2] + (ptr)[p3])
607
608
int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p );
609
610
int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0
611
idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1
612
idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2
613
614
mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5
615
mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0); // 8
616
mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0); // 7
617
mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0); // 6
618
mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0); // 7
619
620
partsum += (bitset[i*bitsetSize + idx] & (1 << mask)) ? st.z : st.w;
621
}
622
}
623
}
624
lpartsum[lidx] = partsum;
625
barrier(CLK_LOCAL_MEM_FENCE);
626
627
if( partidx == 0 )
628
{
629
float s = lpartsum[nr*nparts];
630
for( i = 1; i < nparts; i++ )
631
s += lpartsum[i + nr*nparts];
632
if( s >= stages[stageIdx].threshold )
633
{
634
int count = atomic_inc(lcount);
635
lbuf[count] = idxval;
636
}
637
}
638
}
639
}
640
641
barrier(CLK_LOCAL_MEM_FENCE);
642
if( stageIdx == N_STAGES )
643
{
644
int nrects = lcount[0];
645
if( lidx < nrects )
646
{
647
int nfaces = atomic_inc(facepos);
648
if( nfaces < MAX_FACES )
649
{
650
volatile __global int* face = facepos + 1 + nfaces*3;
651
int val = lbuf[lidx];
652
face[0] = scaleIdx;
653
face[1] = ix0 + (val & 255);
654
face[2] = iy0 + (val >> 8);
655
}
656
}
657
}
658
}
659
}
660
}
661
#endif
662
663