Path: blob/master/modules/video/src/opencl/optical_flow_farneback.cl
16337 views
/*M///////////////////////////////////////////////////////////////////////////////////////1//2// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.3//4// By downloading, copying, installing or using the software you agree to this license.5// If you do not agree to this license, do not download, install,6// copy or use the software.7//8//9// License Agreement10// For Open Source Computer Vision Library11//12// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.13// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.14// Third party copyrights are property of their respective owners.15//16// @Authors17// Sen Liu, swjtuls1987@126.com18//19// Redistribution and use in source and binary forms, with or without modification,20// are permitted provided that the following conditions are met:21//22// * Redistribution's of source code must retain the above copyright notice,23// this list of conditions and the following disclaimer.24//25// * Redistribution's in binary form must reproduce the above copyright notice,26// this list of conditions and the following disclaimer in the documentation27// and/or other materials provided with the distribution.28//29// * The name of the copyright holders may not be used to endorse or promote products30// derived from this software without specific prior written permission.31//32// This software is provided by the copyright holders and contributors as is and33// any express or implied warranties, including, but not limited to, the implied34// warranties of merchantability and fitness for a particular purpose are disclaimed.35// In no event shall the Intel Corporation or contributors be liable for any direct,36// indirect, incidental, special, exemplary, or consequential damages37// (including, but not limited to, procurement of substitute goods or services;38// loss of use, data, or profits; or business interruption) however caused39// and on any theory of liability, whether in contract, strict liability,40// or tort (including negligence or otherwise) arising in any way out of41// the use of this software, even if advised of the possibility of such damage.42//43//M*/444546#define tx (int)get_local_id(0)47#define ty get_local_id(1)48#define bx get_group_id(0)49#define bdx (int)get_local_size(0)5051#define BORDER_SIZE 552#define MAX_KSIZE_HALF 1005354#ifndef polyN55#define polyN 556#endif5758#if USE_DOUBLE59#ifdef cl_amd_fp6460#pragma OPENCL EXTENSION cl_amd_fp64:enable61#elif defined (cl_khr_fp64)62#pragma OPENCL EXTENSION cl_khr_fp64:enable63#endif64#define TYPE double65#define VECTYPE double466#else67#define TYPE float68#define VECTYPE float469#endif7071__kernel void polynomialExpansion(__global __const float * src, int srcStep,72__global float * dst, int dstStep,73const int rows, const int cols,74__global __const float * c_g,75__global __const float * c_xg,76__global __const float * c_xxg,77__local float * smem,78const VECTYPE ig)79{80const int y = get_global_id(1);81const int x = bx * (bdx - 2*polyN) + tx - polyN;8283int xWarped;84__local float *row = smem + tx;8586if (y < rows && y >= 0)87{88xWarped = min(max(x, 0), cols - 1);8990row[0] = src[mad24(y, srcStep, xWarped)] * c_g[0];91row[bdx] = 0.f;92row[2*bdx] = 0.f;9394#pragma unroll95for (int k = 1; k <= polyN; ++k)96{97float t0 = src[mad24(max(y - k, 0), srcStep, xWarped)];98float t1 = src[mad24(min(y + k, rows - 1), srcStep, xWarped)];99100row[0] += c_g[k] * (t0 + t1);101row[bdx] += c_xg[k] * (t1 - t0);102row[2*bdx] += c_xxg[k] * (t0 + t1);103}104}105106barrier(CLK_LOCAL_MEM_FENCE);107108if (y < rows && y >= 0 && tx >= polyN && tx + polyN < bdx && x < cols)109{110TYPE b1 = c_g[0] * row[0];111TYPE b3 = c_g[0] * row[bdx];112TYPE b5 = c_g[0] * row[2*bdx];113TYPE b2 = 0, b4 = 0, b6 = 0;114115#pragma unroll116for (int k = 1; k <= polyN; ++k)117{118b1 += (row[k] + row[-k]) * c_g[k];119b4 += (row[k] + row[-k]) * c_xxg[k];120b2 += (row[k] - row[-k]) * c_xg[k];121b3 += (row[k + bdx] + row[-k + bdx]) * c_g[k];122b6 += (row[k + bdx] - row[-k + bdx]) * c_xg[k];123b5 += (row[k + 2*bdx] + row[-k + 2*bdx]) * c_g[k];124}125126dst[mad24(y, dstStep, xWarped)] = (float)(b3*ig.s0);127dst[mad24(rows + y, dstStep, xWarped)] = (float)(b2*ig.s0);128dst[mad24(2*rows + y, dstStep, xWarped)] = (float)(b1*ig.s1 + b5*ig.s2);129dst[mad24(3*rows + y, dstStep, xWarped)] = (float)(b1*ig.s1 + b4*ig.s2);130dst[mad24(4*rows + y, dstStep, xWarped)] = (float)(b6*ig.s3);131}132}133134inline int idx_row_low(const int y, const int last_row)135{136return abs(y) % (last_row + 1);137}138139inline int idx_row_high(const int y, const int last_row)140{141return abs(last_row - abs(last_row - y)) % (last_row + 1);142}143144inline int idx_col_low(const int x, const int last_col)145{146return abs(x) % (last_col + 1);147}148149inline int idx_col_high(const int x, const int last_col)150{151return abs(last_col - abs(last_col - x)) % (last_col + 1);152}153154inline int idx_col(const int x, const int last_col)155{156return idx_col_low(idx_col_high(x, last_col), last_col);157}158159__kernel void gaussianBlur(__global const float * src, int srcStep,160__global float * dst, int dstStep, const int rows, const int cols,161__global const float * c_gKer, const int ksizeHalf,162__local float * smem)163{164const int y = get_global_id(1);165const int x = get_global_id(0);166167__local float *row = smem + ty * (bdx + 2*ksizeHalf);168169if (y < rows)170{171// Vertical pass172for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)173{174int xExt = (int)(bx * bdx) + i - ksizeHalf;175xExt = idx_col(xExt, cols - 1);176row[i] = src[mad24(y, srcStep, xExt)] * c_gKer[0];177for (int j = 1; j <= ksizeHalf; ++j)178row[i] += (src[mad24(idx_row_low(y - j, rows - 1), srcStep, xExt)]179+ src[mad24(idx_row_high(y + j, rows - 1), srcStep, xExt)]) * c_gKer[j];180}181}182183barrier(CLK_LOCAL_MEM_FENCE);184185if (y < rows && y >= 0 && x < cols && x >= 0)186{187// Horizontal pass188row += tx + ksizeHalf;189float res = row[0] * c_gKer[0];190for (int i = 1; i <= ksizeHalf; ++i)191res += (row[-i] + row[i]) * c_gKer[i];192193dst[mad24(y, dstStep, x)] = res;194}195}196197__kernel void gaussianBlur5(__global const float * src, int srcStep,198__global float * dst, int dstStep,199const int rows, const int cols,200__global const float * c_gKer, const int ksizeHalf,201__local float * smem)202{203const int y = get_global_id(1);204const int x = get_global_id(0);205206const int smw = bdx + 2*ksizeHalf; // shared memory "cols"207__local volatile float *row = smem + 5 * ty * smw;208209if (y < rows)210{211// Vertical pass212for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)213{214int xExt = (int)(bx * bdx) + i - ksizeHalf;215xExt = idx_col(xExt, cols - 1);216217#pragma unroll218for (int k = 0; k < 5; ++k)219row[k*smw + i] = src[mad24(k*rows + y, srcStep, xExt)] * c_gKer[0];220221for (int j = 1; j <= ksizeHalf; ++j)222#pragma unroll223for (int k = 0; k < 5; ++k)224row[k*smw + i] +=225(src[mad24(k*rows + idx_row_low(y - j, rows - 1), srcStep, xExt)] +226src[mad24(k*rows + idx_row_high(y + j, rows - 1), srcStep, xExt)]) * c_gKer[j];227}228}229230barrier(CLK_LOCAL_MEM_FENCE);231232if (y < rows && y >= 0 && x < cols && x >= 0)233{234// Horizontal pass235236row += tx + ksizeHalf;237float res[5];238239#pragma unroll240for (int k = 0; k < 5; ++k)241res[k] = row[k*smw] * c_gKer[0];242243for (int i = 1; i <= ksizeHalf; ++i)244#pragma unroll245for (int k = 0; k < 5; ++k)246res[k] += (row[k*smw - i] + row[k*smw + i]) * c_gKer[i];247248#pragma unroll249for (int k = 0; k < 5; ++k)250dst[mad24(k*rows + y, dstStep, x)] = res[k];251}252}253__constant float c_border[BORDER_SIZE + 1] = { 0.14f, 0.14f, 0.4472f, 0.4472f, 0.4472f, 1.f };254255__kernel void updateMatrices(__global const float * flowx, int xStep,256__global const float * flowy, int yStep,257const int rows, const int cols,258__global const float * R0, int R0Step,259__global const float * R1, int R1Step,260__global float * M, int mStep)261{262const int y = get_global_id(1);263const int x = get_global_id(0);264265if (y < rows && y >= 0 && x < cols && x >= 0)266{267float dx = flowx[mad24(y, xStep, x)];268float dy = flowy[mad24(y, yStep, x)];269float fx = x + dx;270float fy = y + dy;271272int x1 = convert_int(floor(fx));273int y1 = convert_int(floor(fy));274fx -= x1;275fy -= y1;276277float r2, r3, r4, r5, r6;278279if (x1 >= 0 && y1 >= 0 && x1 < cols - 1 && y1 < rows - 1)280{281float a00 = (1.f - fx) * (1.f - fy);282float a01 = fx * (1.f - fy);283float a10 = (1.f - fx) * fy;284float a11 = fx * fy;285286r2 = a00 * R1[mad24(y1, R1Step, x1)] +287a01 * R1[mad24(y1, R1Step, x1 + 1)] +288a10 * R1[mad24(y1 + 1, R1Step, x1)] +289a11 * R1[mad24(y1 + 1, R1Step, x1 + 1)];290291r3 = a00 * R1[mad24(rows + y1, R1Step, x1)] +292a01 * R1[mad24(rows + y1, R1Step, x1 + 1)] +293a10 * R1[mad24(rows + y1 + 1, R1Step, x1)] +294a11 * R1[mad24(rows + y1 + 1, R1Step, x1 + 1)];295296r4 = a00 * R1[mad24(2*rows + y1, R1Step, x1)] +297a01 * R1[mad24(2*rows + y1, R1Step, x1 + 1)] +298a10 * R1[mad24(2*rows + y1 + 1, R1Step, x1)] +299a11 * R1[mad24(2*rows + y1 + 1, R1Step, x1 + 1)];300301r5 = a00 * R1[mad24(3*rows + y1, R1Step, x1)] +302a01 * R1[mad24(3*rows + y1, R1Step, x1 + 1)] +303a10 * R1[mad24(3*rows + y1 + 1, R1Step, x1)] +304a11 * R1[mad24(3*rows + y1 + 1, R1Step, x1 + 1)];305306r6 = a00 * R1[mad24(4*rows + y1, R1Step, x1)] +307a01 * R1[mad24(4*rows + y1, R1Step, x1 + 1)] +308a10 * R1[mad24(4*rows + y1 + 1, R1Step, x1)] +309a11 * R1[mad24(4*rows + y1 + 1, R1Step, x1 + 1)];310311r4 = (R0[mad24(2*rows + y, R0Step, x)] + r4) * 0.5f;312r5 = (R0[mad24(3*rows + y, R0Step, x)] + r5) * 0.5f;313r6 = (R0[mad24(4*rows + y, R0Step, x)] + r6) * 0.25f;314}315else316{317r2 = r3 = 0.f;318r4 = R0[mad24(2*rows + y, R0Step, x)];319r5 = R0[mad24(3*rows + y, R0Step, x)];320r6 = R0[mad24(4*rows + y, R0Step, x)] * 0.5f;321}322323r2 = (R0[mad24(y, R0Step, x)] - r2) * 0.5f;324r3 = (R0[mad24(rows + y, R0Step, x)] - r3) * 0.5f;325326r2 += r4*dy + r6*dx;327r3 += r6*dy + r5*dx;328329float scale =330c_border[min(x, BORDER_SIZE)] *331c_border[min(y, BORDER_SIZE)] *332c_border[min(cols - x - 1, BORDER_SIZE)] *333c_border[min(rows - y - 1, BORDER_SIZE)];334335r2 *= scale;336r3 *= scale;337r4 *= scale;338r5 *= scale;339r6 *= scale;340341M[mad24(y, mStep, x)] = r4*r4 + r6*r6;342M[mad24(rows + y, mStep, x)] = (r4 + r5)*r6;343M[mad24(2*rows + y, mStep, x)] = r5*r5 + r6*r6;344M[mad24(3*rows + y, mStep, x)] = r4*r2 + r6*r3;345M[mad24(4*rows + y, mStep, x)] = r6*r2 + r5*r3;346}347}348349__kernel void boxFilter5(__global const float * src, int srcStep,350__global float * dst, int dstStep,351const int rows, const int cols,352const int ksizeHalf,353__local float * smem)354{355const int y = get_global_id(1);356const int x = get_global_id(0);357358const float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));359const int smw = bdx + 2*ksizeHalf; // shared memory "width"360__local float *row = smem + 5 * ty * smw;361362if (y < rows)363{364// Vertical pass365for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)366{367int xExt = (int)(bx * bdx) + i - ksizeHalf;368xExt = min(max(xExt, 0), cols - 1);369370#pragma unroll371for (int k = 0; k < 5; ++k)372row[k*smw + i] = src[mad24(k*rows + y, srcStep, xExt)];373374for (int j = 1; j <= ksizeHalf; ++j)375#pragma unroll376for (int k = 0; k < 5; ++k)377row[k*smw + i] +=378src[mad24(k*rows + max(y - j, 0), srcStep, xExt)] +379src[mad24(k*rows + min(y + j, rows - 1), srcStep, xExt)];380}381}382383barrier(CLK_LOCAL_MEM_FENCE);384385if (y < rows && y >= 0 && x < cols && x >= 0)386{387// Horizontal pass388389row += tx + ksizeHalf;390float res[5];391392#pragma unroll393for (int k = 0; k < 5; ++k)394res[k] = row[k*smw];395396for (int i = 1; i <= ksizeHalf; ++i)397#pragma unroll398for (int k = 0; k < 5; ++k)399res[k] += row[k*smw - i] + row[k*smw + i];400401#pragma unroll402for (int k = 0; k < 5; ++k)403dst[mad24(k*rows + y, dstStep, x)] = res[k] * boxAreaInv;404}405}406407__kernel void updateFlow(__global const float * M, int mStep,408__global float * flowx, int xStep,409__global float * flowy, int yStep,410const int rows, const int cols)411{412const int y = get_global_id(1);413const int x = get_global_id(0);414415if (y < rows && y >= 0 && x < cols && x >= 0)416{417float g11 = M[mad24(y, mStep, x)];418float g12 = M[mad24(rows + y, mStep, x)];419float g22 = M[mad24(2*rows + y, mStep, x)];420float h1 = M[mad24(3*rows + y, mStep, x)];421float h2 = M[mad24(4*rows + y, mStep, x)];422423float detInv = 1.f / (g11*g22 - g12*g12 + 1e-3f);424425flowx[mad24(y, xStep, x)] = (g11*h2 - g12*h1) * detInv;426flowy[mad24(y, yStep, x)] = (g22*h1 - g12*h2) * detInv;427}428}429430431