Path: blob/master/modules/objdetect/src/opencl/cascadedetect.cl
16337 views
///////////////////////////// OpenCL kernels for face detection //////////////////////////////1////////////////////////////// see the opencv/doc/license.txt ///////////////////////////////23//4// the code has been derived from the OpenCL Haar cascade kernel by5//6// Niko Li, newlife20080214@gmail.com7// Wang Weiyan, wangweiyanster@gmail.com8// Jia Haipeng, jiahaipeng95@gmail.com9// Nathan, liujun@multicorewareinc.com10// Peng Xiao, pengxiao@outlook.com11// Erping Pang, erping@multicorewareinc.com12//1314#ifdef HAAR15typedef struct __attribute__((aligned(4))) OptHaarFeature16{17int4 ofs[3] __attribute__((aligned (4)));18float4 weight __attribute__((aligned (4)));19}20OptHaarFeature;21#endif2223#ifdef LBP24typedef struct __attribute__((aligned(4))) OptLBPFeature25{26int16 ofs __attribute__((aligned (4)));27}28OptLBPFeature;29#endif3031typedef struct __attribute__((aligned(4))) Stump32{33float4 st __attribute__((aligned (4)));34}35Stump;3637typedef struct __attribute__((aligned(4))) Node38{39int4 n __attribute__((aligned (4)));40}41Node;4243typedef struct __attribute__((aligned (4))) Stage44{45int first __attribute__((aligned (4)));46int ntrees __attribute__((aligned (4)));47float threshold __attribute__((aligned (4)));48}49Stage;5051typedef struct __attribute__((aligned (4))) ScaleData52{53float scale __attribute__((aligned (4)));54int szi_width __attribute__((aligned (4)));55int szi_height __attribute__((aligned (4)));56int layer_ofs __attribute__((aligned (4)));57int ystep __attribute__((aligned (4)));58}59ScaleData;6061#ifndef SUM_BUF_SIZE62#define SUM_BUF_SIZE 063#endif6465#ifndef NODE_COUNT66#define NODE_COUNT 167#endif6869#ifdef HAAR70__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X,LOCAL_SIZE_Y,1)))71void runHaarClassifier(72int nscales, __global const ScaleData* scaleData,73__global const int* sum,74int _sumstep, int sumoffset,75__global const OptHaarFeature* optfeatures,76__global const Stage* stages,77__global const Node* nodes,78__global const float* leaves0,7980volatile __global int* facepos,81int4 normrect, int sqofs, int2 windowsize)82{83int lx = get_local_id(0);84int ly = get_local_id(1);85int groupIdx = get_group_id(0);86int i, ngroups = get_global_size(0)/LOCAL_SIZE_X;87int scaleIdx, tileIdx, stageIdx;88int sumstep = (int)(_sumstep/sizeof(int));89int4 nofs0 = (int4)(mad24(normrect.y, sumstep, normrect.x),90mad24(normrect.y, sumstep, normrect.x + normrect.z),91mad24(normrect.y + normrect.w, sumstep, normrect.x),92mad24(normrect.y + normrect.w, sumstep, normrect.x + normrect.z));93int normarea = normrect.z * normrect.w;94float invarea = 1.f/normarea;95int lidx = ly*LOCAL_SIZE_X + lx;9697#if SUM_BUF_SIZE > 098int4 nofs = (int4)(mad24(normrect.y, SUM_BUF_STEP, normrect.x),99mad24(normrect.y, SUM_BUF_STEP, normrect.x + normrect.z),100mad24(normrect.y + normrect.w, SUM_BUF_STEP, normrect.x),101mad24(normrect.y + normrect.w, SUM_BUF_STEP, normrect.x + normrect.z));102#else103int4 nofs = nofs0;104#endif105#define LOCAL_SIZE (LOCAL_SIZE_X*LOCAL_SIZE_Y)106__local int lstore[SUM_BUF_SIZE + LOCAL_SIZE*5/2+1];107#if SUM_BUF_SIZE > 0108__local int* ibuf = lstore;109__local int* lcount = ibuf + SUM_BUF_SIZE;110#else111__local int* lcount = lstore;112#endif113__local float* lnf = (__local float*)(lcount + 1);114__local float* lpartsum = lnf + LOCAL_SIZE;115__local short* lbuf = (__local short*)(lpartsum + LOCAL_SIZE);116117for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- )118{119__global const ScaleData* s = scaleData + scaleIdx;120int ystep = s->ystep;121int2 worksize = (int2)(max(s->szi_width - windowsize.x, 0), max(s->szi_height - windowsize.y, 0));122int2 ntiles = (int2)((worksize.x + LOCAL_SIZE_X-1)/LOCAL_SIZE_X,123(worksize.y + LOCAL_SIZE_Y-1)/LOCAL_SIZE_Y);124int totalTiles = ntiles.x*ntiles.y;125126for( tileIdx = groupIdx; tileIdx < totalTiles; tileIdx += ngroups )127{128int ix0 = (tileIdx % ntiles.x)*LOCAL_SIZE_X;129int iy0 = (tileIdx / ntiles.x)*LOCAL_SIZE_Y;130int ix = lx, iy = ly;131__global const int* psum0 = sum + mad24(iy0, sumstep, ix0) + s->layer_ofs;132__global const int* psum1 = psum0 + mad24(iy, sumstep, ix);133134if( ix0 >= worksize.x || iy0 >= worksize.y )135continue;136#if SUM_BUF_SIZE > 0137for( i = lidx*4; i < SUM_BUF_SIZE; i += LOCAL_SIZE_X*LOCAL_SIZE_Y*4 )138{139int dy = i/SUM_BUF_STEP, dx = i - dy*SUM_BUF_STEP;140vstore4(vload4(0, psum0 + mad24(dy, sumstep, dx)), 0, ibuf+i);141}142#endif143144if( lidx == 0 )145lcount[0] = 0;146barrier(CLK_LOCAL_MEM_FENCE);147148if( ix0 + ix < worksize.x && iy0 + iy < worksize.y )149{150#if NODE_COUNT==1151__global const Stump* stump = (__global const Stump*)nodes;152#else153__global const Node* node = nodes;154__global const float* leaves = leaves0;155#endif156#if SUM_BUF_SIZE > 0157__local const int* psum = ibuf + mad24(iy, SUM_BUF_STEP, ix);158#else159__global const int* psum = psum1;160#endif161162__global const int* psqsum = (__global const int*)(psum1 + sqofs);163float sval = (psum[nofs.x] - psum[nofs.y] - psum[nofs.z] + psum[nofs.w])*invarea;164float sqval = (psqsum[nofs0.x] - psqsum[nofs0.y] - psqsum[nofs0.z] + psqsum[nofs0.w])*invarea;165float nf = (float)normarea * sqrt(max(sqval - sval * sval, 0.f));166nf = nf > 0 ? nf : 1.f;167168for( stageIdx = 0; stageIdx < SPLIT_STAGE; stageIdx++ )169{170int ntrees = stages[stageIdx].ntrees;171float s = 0.f;172#if NODE_COUNT==1173for( i = 0; i < ntrees; i++ )174{175float4 st = stump[i].st;176__global const OptHaarFeature* f = optfeatures + as_int(st.x);177float4 weight = f->weight;178179int4 ofs = f->ofs[0];180sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;181ofs = f->ofs[1];182sval = mad((float)(psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval);183if( weight.z > 0 )184{185ofs = f->ofs[2];186sval = mad((float)(psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval);187}188189s += (sval < st.y*nf) ? st.z : st.w;190}191stump += ntrees;192#else193for( i = 0; i < ntrees; i++, node += NODE_COUNT, leaves += NODE_COUNT+1 )194{195int idx = 0;196do197{198int4 n = node[idx].n;199__global const OptHaarFeature* f = optfeatures + n.x;200float4 weight = f->weight;201202int4 ofs = f->ofs[0];203204sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;205ofs = f->ofs[1];206sval = mad((float)(psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval);207if( weight.z > 0 )208{209ofs = f->ofs[2];210sval = mad((float)(psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval);211}212213idx = (sval < as_float(n.y)*nf) ? n.z : n.w;214}215while(idx > 0);216s += leaves[-idx];217}218#endif219220if( s < stages[stageIdx].threshold )221break;222}223224if( stageIdx == SPLIT_STAGE && (ystep == 1 || ((ix | iy) & 1) == 0) )225{226int count = atomic_inc(lcount);227lbuf[count] = (int)(ix | (iy << 8));228lnf[count] = nf;229}230}231232for( stageIdx = SPLIT_STAGE; stageIdx < N_STAGES; stageIdx++ )233{234barrier(CLK_LOCAL_MEM_FENCE);235int nrects = lcount[0];236237if( nrects == 0 )238break;239barrier(CLK_LOCAL_MEM_FENCE);240if( lidx == 0 )241lcount[0] = 0;242243{244#if NODE_COUNT == 1245__global const Stump* stump = (__global const Stump*)nodes + stages[stageIdx].first;246#else247__global const Node* node = nodes + stages[stageIdx].first*NODE_COUNT;248__global const float* leaves = leaves0 + stages[stageIdx].first*(NODE_COUNT+1);249#endif250int nparts = LOCAL_SIZE / nrects;251int ntrees = stages[stageIdx].ntrees;252int ntrees_p = (ntrees + nparts - 1)/nparts;253int nr = lidx / nparts;254int partidx = -1, idxval = 0;255float partsum = 0.f, nf = 0.f;256257if( nr < nrects )258{259partidx = lidx % nparts;260idxval = lbuf[nr];261nf = lnf[nr];262263{264int ntrees0 = ntrees_p*partidx;265int ntrees1 = min(ntrees0 + ntrees_p, ntrees);266int ix1 = idxval & 255, iy1 = idxval >> 8;267#if SUM_BUF_SIZE > 0268__local const int* psum = ibuf + mad24(iy1, SUM_BUF_STEP, ix1);269#else270__global const int* psum = psum0 + mad24(iy1, sumstep, ix1);271#endif272273#if NODE_COUNT == 1274for( i = ntrees0; i < ntrees1; i++ )275{276float4 st = stump[i].st;277__global const OptHaarFeature* f = optfeatures + as_int(st.x);278float4 weight = f->weight;279280int4 ofs = f->ofs[0];281float sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;282ofs = f->ofs[1];283sval = mad((float)(psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval);284//if( weight.z > 0 )285if( fabs(weight.z) > 0 )286{287ofs = f->ofs[2];288sval = mad((float)(psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval);289}290291partsum += (sval < st.y*nf) ? st.z : st.w;292}293#else294for( i = ntrees0; i < ntrees1; i++ )295{296int idx = 0;297do298{299int4 n = node[i*2 + idx].n;300__global const OptHaarFeature* f = optfeatures + n.x;301float4 weight = f->weight;302int4 ofs = f->ofs[0];303304float sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;305ofs = f->ofs[1];306sval = mad((float)(psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval);307if( weight.z > 0 )308{309ofs = f->ofs[2];310sval = mad((float)(psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval);311}312313idx = (sval < as_float(n.y)*nf) ? n.z : n.w;314}315while(idx > 0);316partsum += leaves[i*3-idx];317}318#endif319}320}321lpartsum[lidx] = partsum;322barrier(CLK_LOCAL_MEM_FENCE);323324if( partidx == 0 )325{326float s = lpartsum[nr*nparts];327for( i = 1; i < nparts; i++ )328s += lpartsum[i + nr*nparts];329if( s >= stages[stageIdx].threshold )330{331int count = atomic_inc(lcount);332lbuf[count] = idxval;333lnf[count] = nf;334}335}336}337}338339barrier(CLK_LOCAL_MEM_FENCE);340if( stageIdx == N_STAGES )341{342int nrects = lcount[0];343if( lidx < nrects )344{345int nfaces = atomic_inc(facepos);346if( nfaces < MAX_FACES )347{348volatile __global int* face = facepos + 1 + nfaces*3;349int val = lbuf[lidx];350face[0] = scaleIdx;351face[1] = ix0 + (val & 255);352face[2] = iy0 + (val >> 8);353}354}355}356}357}358}359#endif360361#ifdef LBP362#undef CALC_SUM_OFS_363#define CALC_SUM_OFS_(p0, p1, p2, p3, ptr) \364((ptr)[p0] - (ptr)[p1] - (ptr)[p2] + (ptr)[p3])365366__kernel void runLBPClassifierStumpSimple(367int nscales, __global const ScaleData* scaleData,368__global const int* sum,369int _sumstep, int sumoffset,370__global const OptLBPFeature* optfeatures,371__global const Stage* stages,372__global const Stump* stumps,373__global const int* bitsets,374int bitsetSize,375376volatile __global int* facepos,377int2 windowsize)378{379int lx = get_local_id(0);380int ly = get_local_id(1);381int local_size_x = get_local_size(0);382int local_size_y = get_local_size(1);383int groupIdx = get_group_id(1)*get_num_groups(0) + get_group_id(0);384int ngroups = get_num_groups(0)*get_num_groups(1);385int scaleIdx, tileIdx, stageIdx;386int sumstep = (int)(_sumstep/sizeof(int));387388for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- )389{390__global const ScaleData* s = scaleData + scaleIdx;391int ystep = s->ystep;392int2 worksize = (int2)(max(s->szi_width - windowsize.x, 0), max(s->szi_height - windowsize.y, 0));393int2 ntiles = (int2)((worksize.x/ystep + local_size_x-1)/local_size_x,394(worksize.y/ystep + local_size_y-1)/local_size_y);395int totalTiles = ntiles.x*ntiles.y;396397for( tileIdx = groupIdx; tileIdx < totalTiles; tileIdx += ngroups )398{399int iy = mad24((tileIdx / ntiles.x), local_size_y, ly) * ystep;400int ix = mad24((tileIdx % ntiles.x), local_size_x, lx) * ystep;401402if( ix < worksize.x && iy < worksize.y )403{404__global const int* p = sum + mad24(iy, sumstep, ix) + s->layer_ofs;405__global const Stump* stump = stumps;406__global const int* bitset = bitsets;407408for( stageIdx = 0; stageIdx < N_STAGES; stageIdx++ )409{410int i, ntrees = stages[stageIdx].ntrees;411float s = 0.f;412for( i = 0; i < ntrees; i++, stump++, bitset += bitsetSize )413{414float4 st = stump->st;415__global const OptLBPFeature* f = optfeatures + as_int(st.x);416int16 ofs = f->ofs;417418int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p );419420int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0421idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1422idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2423424mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5425mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0); // 8426mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0); // 7427mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0); // 6428mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0); // 7429430s += (bitset[idx] & (1 << mask)) ? st.z : st.w;431}432433if( s < stages[stageIdx].threshold )434break;435}436437if( stageIdx == N_STAGES )438{439int nfaces = atomic_inc(facepos);440if( nfaces < MAX_FACES )441{442volatile __global int* face = facepos + 1 + nfaces*3;443face[0] = scaleIdx;444face[1] = ix;445face[2] = iy;446}447}448}449}450}451}452453__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X,LOCAL_SIZE_Y,1)))454void runLBPClassifierStump(455int nscales, __global const ScaleData* scaleData,456__global const int* sum,457int _sumstep, int sumoffset,458__global const OptLBPFeature* optfeatures,459__global const Stage* stages,460__global const Stump* stumps,461__global const int* bitsets,462int bitsetSize,463464volatile __global int* facepos,465int2 windowsize)466{467int lx = get_local_id(0);468int ly = get_local_id(1);469int groupIdx = get_group_id(0);470int i, ngroups = get_global_size(0)/LOCAL_SIZE_X;471int scaleIdx, tileIdx, stageIdx;472int sumstep = (int)(_sumstep/sizeof(int));473int lidx = ly*LOCAL_SIZE_X + lx;474475#define LOCAL_SIZE (LOCAL_SIZE_X*LOCAL_SIZE_Y)476__local int lstore[SUM_BUF_SIZE + LOCAL_SIZE*3/2+1];477#if SUM_BUF_SIZE > 0478__local int* ibuf = lstore;479__local int* lcount = ibuf + SUM_BUF_SIZE;480#else481__local int* lcount = lstore;482#endif483__local float* lpartsum = (__local float*)(lcount + 1);484__local short* lbuf = (__local short*)(lpartsum + LOCAL_SIZE);485486for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- )487{488__global const ScaleData* s = scaleData + scaleIdx;489int ystep = s->ystep;490int2 worksize = (int2)(max(s->szi_width - windowsize.x, 0), max(s->szi_height - windowsize.y, 0));491int2 ntiles = (int2)((worksize.x + LOCAL_SIZE_X-1)/LOCAL_SIZE_X,492(worksize.y + LOCAL_SIZE_Y-1)/LOCAL_SIZE_Y);493int totalTiles = ntiles.x*ntiles.y;494495for( tileIdx = groupIdx; tileIdx < totalTiles; tileIdx += ngroups )496{497int ix0 = (tileIdx % ntiles.x)*LOCAL_SIZE_X;498int iy0 = (tileIdx / ntiles.x)*LOCAL_SIZE_Y;499int ix = lx, iy = ly;500__global const int* psum0 = sum + mad24(iy0, sumstep, ix0) + s->layer_ofs;501502if( ix0 >= worksize.x || iy0 >= worksize.y )503continue;504#if SUM_BUF_SIZE > 0505for( i = lidx*4; i < SUM_BUF_SIZE; i += LOCAL_SIZE_X*LOCAL_SIZE_Y*4 )506{507int dy = i/SUM_BUF_STEP, dx = i - dy*SUM_BUF_STEP;508vstore4(vload4(0, psum0 + mad24(dy, sumstep, dx)), 0, ibuf+i);509}510barrier(CLK_LOCAL_MEM_FENCE);511#endif512513if( lidx == 0 )514lcount[0] = 0;515barrier(CLK_LOCAL_MEM_FENCE);516517if( ix0 + ix < worksize.x && iy0 + iy < worksize.y )518{519__global const Stump* stump = stumps;520__global const int* bitset = bitsets;521#if SUM_BUF_SIZE > 0522__local const int* p = ibuf + mad24(iy, SUM_BUF_STEP, ix);523#else524__global const int* p = psum0 + mad24(iy, sumstep, ix);525#endif526527for( stageIdx = 0; stageIdx < SPLIT_STAGE; stageIdx++ )528{529int ntrees = stages[stageIdx].ntrees;530float s = 0.f;531for( i = 0; i < ntrees; i++, stump++, bitset += bitsetSize )532{533float4 st = stump->st;534__global const OptLBPFeature* f = optfeatures + as_int(st.x);535int16 ofs = f->ofs;536537int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p );538539int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0540idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1541idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2542543mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5544mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0); // 8545mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0); // 7546mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0); // 6547mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0); // 7548549s += (bitset[idx] & (1 << mask)) ? st.z : st.w;550}551552if( s < stages[stageIdx].threshold )553break;554}555556if( stageIdx == SPLIT_STAGE && (ystep == 1 || ((ix | iy) & 1) == 0) )557{558int count = atomic_inc(lcount);559lbuf[count] = (int)(ix | (iy << 8));560}561}562563for( stageIdx = SPLIT_STAGE; stageIdx < N_STAGES; stageIdx++ )564{565int nrects = lcount[0];566567barrier(CLK_LOCAL_MEM_FENCE);568if( nrects == 0 )569break;570if( lidx == 0 )571lcount[0] = 0;572573{574__global const Stump* stump = stumps + stages[stageIdx].first;575__global const int* bitset = bitsets + stages[stageIdx].first*bitsetSize;576int nparts = LOCAL_SIZE / nrects;577int ntrees = stages[stageIdx].ntrees;578int ntrees_p = (ntrees + nparts - 1)/nparts;579int nr = lidx / nparts;580int partidx = -1, idxval = 0;581float partsum = 0.f, nf = 0.f;582583if( nr < nrects )584{585partidx = lidx % nparts;586idxval = lbuf[nr];587588{589int ntrees0 = ntrees_p*partidx;590int ntrees1 = min(ntrees0 + ntrees_p, ntrees);591int ix1 = idxval & 255, iy1 = idxval >> 8;592#if SUM_BUF_SIZE > 0593__local const int* p = ibuf + mad24(iy1, SUM_BUF_STEP, ix1);594#else595__global const int* p = psum0 + mad24(iy1, sumstep, ix1);596#endif597598for( i = ntrees0; i < ntrees1; i++ )599{600float4 st = stump[i].st;601__global const OptLBPFeature* f = optfeatures + as_int(st.x);602int16 ofs = f->ofs;603604#define CALC_SUM_OFS_(p0, p1, p2, p3, ptr) \605((ptr)[p0] - (ptr)[p1] - (ptr)[p2] + (ptr)[p3])606607int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p );608609int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0610idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1611idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2612613mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5614mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0); // 8615mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0); // 7616mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0); // 6617mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0); // 7618619partsum += (bitset[i*bitsetSize + idx] & (1 << mask)) ? st.z : st.w;620}621}622}623lpartsum[lidx] = partsum;624barrier(CLK_LOCAL_MEM_FENCE);625626if( partidx == 0 )627{628float s = lpartsum[nr*nparts];629for( i = 1; i < nparts; i++ )630s += lpartsum[i + nr*nparts];631if( s >= stages[stageIdx].threshold )632{633int count = atomic_inc(lcount);634lbuf[count] = idxval;635}636}637}638}639640barrier(CLK_LOCAL_MEM_FENCE);641if( stageIdx == N_STAGES )642{643int nrects = lcount[0];644if( lidx < nrects )645{646int nfaces = atomic_inc(facepos);647if( nfaces < MAX_FACES )648{649volatile __global int* face = facepos + 1 + nfaces*3;650int val = lbuf[lidx];651face[0] = scaleIdx;652face[1] = ix0 + (val & 255);653face[2] = iy0 + (val >> 8);654}655}656}657}658}659}660#endif661662663