Path: blob/master/modules/photo/src/opencl/nlmeans.cl
16348 views
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

// Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.

// Build-time configuration
// ------------------------
// Nothing below defines the following names; they must be supplied from outside this file
// (presumably as -D compiler options by the host code -- confirm against the caller):
//   OP_CALC_WEIGHTS / OP_CALC_FASTNLMEANS  select which kernel is compiled
//   ABS                                    absolute-difference distance instead of squared difference
//   DOUBLE_SUPPORT                         request an fp64 extension when one is advertised
//   cn, psz                                channel count (1..4) and the per-pixel byte stride used in offsets
//   pixel_t, int_t, sum_t, weight_t, wlut_t, w_t, FT and the matching convert_* macros
//   TEMPLATE_SIZE, TEMPLATE_SIZE2, SEARCH_SIZE, SEARCH_SIZE2, CTA_SIZE, BLOCK_COLS, BLOCK_ROWS
// The indexing below assumes TEMPLATE_SIZE == 2 * TEMPLATE_SIZE2 + 1 (see calcFirstElementInRow).

#ifdef cl_amd_printf
// The directive is spelled "OPENCL EXTENSION" (with a space); an underscore makes it an unknown pragma.
#pragma OPENCL EXTENSION cl_amd_printf:enable
#endif

#ifdef DOUBLE_SUPPORT
#ifdef cl_amd_fp64
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#endif


#ifdef OP_CALC_WEIGHTS

// Fills the distance -> weight lookup table, one work-item per table entry.
//
//   almostDist2Weight               output table; entry `almostDist` is written when almostDist < almostMaxDist
//   almostMaxDist                   number of entries to fill
//   almostDist2ActualDistMultiplier scale applied to the table index to obtain the distance
//   fixedPointMult                  multiplier applied to the weight before conversion to wlut_t
//   den                             factor multiplied with the negated (squared, when ABS) distance inside exp()
//   WEIGHT_THRESHOLD                weights below WEIGHT_THRESHOLD * fixedPointMult are stored as zero
__kernel void calcAlmostDist2Weight(__global wlut_t * almostDist2Weight, int almostMaxDist,
                                    FT almostDist2ActualDistMultiplier, int fixedPointMult,
                                    w_t den, FT WEIGHT_THRESHOLD)
{
    int almostDist = get_global_id(0);

    if (almostDist < almostMaxDist)
    {
        FT dist = almostDist * almostDist2ActualDistMultiplier;
#ifdef ABS
        w_t w = exp((w_t)(-dist*dist) * den);
#else
        w_t w = exp((w_t)(-dist) * den);
#endif
        // A NaN result is replaced by 1.0 before scaling.
        wlut_t weight = convert_wlut_t(fixedPointMult * (isnan(w) ? (w_t)1.0 : w));
        almostDist2Weight[almostDist] =
            weight < (wlut_t)(WEIGHT_THRESHOLD * fixedPointMult) ? (wlut_t)0 : weight;
    }
}

#elif defined OP_CALC_FASTNLMEANS

// Expands to nothing; presumably lets a convert_* macro be defined as `noconvert`
// so that the conversion becomes a plain parenthesised expression.
#define noconvert

#define SEARCH_SIZE_SQ (SEARCH_SIZE * SEARCH_SIZE)

// Distance between two pixels summed over the `cn` channels:
// absolute difference when ABS is defined, squared difference otherwise.
inline int calcDist(pixel_t a, pixel_t b)
{
#ifdef ABS
    int_t retval = convert_int_t(abs_diff(a, b));
#else
    int_t diff = convert_int_t(a) - convert_int_t(b);
    int_t retval = diff * diff;
#endif

#if cn == 1
    return retval;
#elif cn == 2
    return retval.x + retval.y;
#elif cn == 3
    return retval.x + retval.y + retval.z;
#elif cn == 4
    return retval.x + retval.y + retval.z + retval.w;
#else
#error "cn should be either 1, 2, 3 or 4"
#endif
}

// Returns dist(down_value, down_value_t) - dist(up_value, up_value_t): the change of a column sum
// when the `down` pair is added to it and the `up` pair is removed from it.
#ifdef ABS
inline int calcDistUpDown(pixel_t down_value, pixel_t down_value_t, pixel_t up_value, pixel_t up_value_t)
{
    return calcDist(down_value, down_value_t) - calcDist(up_value, up_value_t);
}
#else
inline int calcDistUpDown(pixel_t down_value, pixel_t down_value_t, pixel_t up_value, pixel_t up_value_t)
{
    int_t A = convert_int_t(down_value) - convert_int_t(down_value_t);
    int_t B = convert_int_t(up_value) - convert_int_t(up_value_t);
    // A*A - B*B written as a single product.
    int_t retval = (A - B) * (A + B);

#if cn == 1
    return retval;
#elif cn == 2
    return retval.x + retval.y;
#elif cn == 3
    return retval.x + retval.y + retval.z;
#elif cn == 4
    return retval.x + retval.y + retval.z + retval.w;
#else
#error "cn should be either 1, 2, 3 or 4"
#endif
}
#endif

// Not referenced anywhere in this file; looks like a leftover debugging guard.
#define COND if (x == 0 && y == 0)

// Computes, from scratch, the patch distances for the pixel (x, y) that starts a row of the block.
//
// Work-item `id` handles the search positions i = id, id + CTA_SIZE, ... < SEARCH_SIZE_SQ.
// For each such position it
//   - accumulates the TEMPLATE_SIZE x TEMPLATE_SIZE patch distance into dists[i],
//   - stores the TEMPLATE_SIZE per-column sums in col_dists[i * TEMPLATE_SIZE ...],
//   - stores the last column sum in up_col_dists[i].
//
// NOTE(review): the addresses formed here reach SEARCH_SIZE2 + TEMPLATE_SIZE2 pixels to the left of
// and above (x, y); presumably `src`/`src_offset` describe an image with a border -- confirm in the host code.
inline void calcFirstElementInRow(__global const uchar * src, int src_step, int src_offset,
                                  __local int * dists, int y, int x, int id,
                                  __global int * col_dists, __global int * up_col_dists)
{
    y -= TEMPLATE_SIZE2;
    int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2;
    int col_dists_current_private[TEMPLATE_SIZE];

    for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE)
    {
        int dist = 0, value;

        __global const pixel_t * src_template = (__global const pixel_t *)(src +
            mad24(sy + i / SEARCH_SIZE, src_step, mad24(psz, sx + i % SEARCH_SIZE, src_offset)));
        __global const pixel_t * src_current = (__global const pixel_t *)(src + mad24(y, src_step, mad24(psz, x, src_offset)));
        __global int * col_dists_current = col_dists + i * TEMPLATE_SIZE;

        #pragma unroll
        for (int j = 0; j < TEMPLATE_SIZE; ++j)
            col_dists_current_private[j] = 0;

        for (int ty = 0; ty < TEMPLATE_SIZE; ++ty)
        {
            #pragma unroll
            for (int tx = -TEMPLATE_SIZE2; tx <= TEMPLATE_SIZE2; ++tx)
            {
                value = calcDist(src_template[tx], src_current[tx]);

                col_dists_current_private[tx + TEMPLATE_SIZE2] += value;
                dist += value;
            }

            // step both patches one image row down
            src_current = (__global const pixel_t *)((__global const uchar *)src_current + src_step);
            src_template = (__global const pixel_t *)((__global const uchar *)src_template + src_step);
        }

        #pragma unroll
        for (int j = 0; j < TEMPLATE_SIZE; ++j)
            col_dists_current[j] = col_dists_current_private[j];

        dists[i] = dist;
        // Store this search position's own last column sum. Reading col_dists[TEMPLATE_SIZE - 1]
        // here would fetch position 0's value, which another work-item writes without any barrier.
        up_col_dists[i] = col_dists_current_private[TEMPLATE_SIZE - 1];
    }
}

// Updates the patch distances for a pixel of the block's first row that is not the first in its row.
//
// Only the column entering the template window (x + TEMPLATE_SIZE2) is summed; it replaces slot
// `first` of the per-position column sums, dists[i] is adjusted by the difference, and the new
// column sum is recorded in up_col_dists[x0 * SEARCH_SIZE_SQ + i].
// `x0` is the column index inside the block (the kernel passes x - x0).
inline void calcElementInFirstRow(__global const uchar * src, int src_step, int src_offset,
                                  __local int * dists, int y, int x0, int x, int id, int first,
                                  __global int * col_dists, __global int * up_col_dists)
{
    x += TEMPLATE_SIZE2;
    y -= TEMPLATE_SIZE2;
    int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2;

    for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE)
    {
        __global const pixel_t * src_current = (__global const pixel_t *)(src + mad24(y, src_step, mad24(psz, x, src_offset)));
        __global const pixel_t * src_template = (__global const pixel_t *)(src +
            mad24(sy + i / SEARCH_SIZE, src_step, mad24(psz, sx + i % SEARCH_SIZE, src_offset)));
        __global int * col_dists_current = col_dists + TEMPLATE_SIZE * i;

        int col_dist = 0;

        #pragma unroll
        for (int ty = 0; ty < TEMPLATE_SIZE; ++ty)
        {
            col_dist += calcDist(src_current[0], src_template[0]);

            src_current = (__global const pixel_t *)((__global const uchar *)src_current + src_step);
            src_template = (__global const pixel_t *)((__global const uchar *)src_template + src_step);
        }

        dists[i] += col_dist - col_dists_current[first];
        col_dists_current[first] = col_dist;
        up_col_dists[mad24(x0, SEARCH_SIZE_SQ, i)] = col_dist;
    }
}

// Updates the patch distances for a pixel that is neither in the block's first row nor first in its row.
//
// The entering column's sum is derived from the value stored for the same block column by the
// previous row (up_col_dists) plus calcDistUpDown() for the pixel pair entering at the bottom
// (row y + TEMPLATE_SIZE2) and the pair leaving at the top (row y - TEMPLATE_SIZE2 - 1).
inline void calcElement(__global const uchar * src, int src_step, int src_offset,
                        __local int * dists, int y, int x0, int x, int id, int first,
                        __global int * col_dists, __global int * up_col_dists)
{
    int sx = x + TEMPLATE_SIZE2;
    int sy_up = y - TEMPLATE_SIZE2 - 1;
    int sy_down = y + TEMPLATE_SIZE2;

    pixel_t up_value = *(__global const pixel_t *)(src + mad24(sy_up, src_step, mad24(psz, sx, src_offset)));
    pixel_t down_value = *(__global const pixel_t *)(src + mad24(sy_down, src_step, mad24(psz, sx, src_offset)));

    // from here on the coordinates address the top-left search position
    sx -= SEARCH_SIZE2;
    sy_up -= SEARCH_SIZE2;
    sy_down -= SEARCH_SIZE2;

    for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE)
    {
        int wx = i % SEARCH_SIZE, wy = i / SEARCH_SIZE;

        pixel_t up_value_t = *(__global const pixel_t *)(src + mad24(sy_up + wy, src_step, mad24(psz, sx + wx, src_offset)));
        pixel_t down_value_t = *(__global const pixel_t *)(src + mad24(sy_down + wy, src_step, mad24(psz, sx + wx, src_offset)));

        __global int * col_dists_current = col_dists + mad24(i, TEMPLATE_SIZE, first);
        __global int * up_col_dists_current = up_col_dists + mad24(x0, SEARCH_SIZE_SQ, i);

        int col_dist = up_col_dists_current[0] + calcDistUpDown(down_value, down_value_t, up_value, up_value_t);

        dists[i] += col_dist - col_dists_current[0];
        col_dists_current[0] = col_dist;
        up_col_dists_current[0] = col_dist;
    }
}

// Produces the output pixel (x, y) from the distances in `dists`.
//
// Each work-item accumulates weight and weight * pixel over its share of the search window,
// the partial results are reduced through weights_local / weighted_sum_local, and work-item 0
// writes weighted_sum / weights to dst.
// The weight for search position i is almostDist2Weight[dists[i] >> almostTemplateWindowSizeSqBinShift].
//
// The final step sums entries 0..3 directly, so the local arrays must hold at least four
// elements (CTA_SIZE >= 4); the halving loop assumes CTA_SIZE is a power of two -- TODO confirm
// the host only uses such sizes.
inline void convolveWindow(__global const uchar * src, int src_step, int src_offset,
                           __local int * dists, __global const wlut_t * almostDist2Weight,
                           __global uchar * dst, int dst_step, int dst_offset,
                           int y, int x, int id, __local weight_t * weights_local,
                           __local sum_t * weighted_sum_local, int almostTemplateWindowSizeSqBinShift)
{
    int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2;
    weight_t weights = (weight_t)0;
    sum_t weighted_sum = (sum_t)0;

    for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE)
    {
        int src_index = mad24(sy + i / SEARCH_SIZE, src_step, mad24(i % SEARCH_SIZE + sx, psz, src_offset));
        sum_t src_value = convert_sum_t(*(__global const pixel_t *)(src + src_index));

        int almostAvgDist = dists[i] >> almostTemplateWindowSizeSqBinShift;
        weight_t weight = convert_weight_t(almostDist2Weight[almostAvgDist]);

        weights += weight;
        weighted_sum += (sum_t)weight * src_value;
    }

    weights_local[id] = weights;
    weighted_sum_local[id] = weighted_sum;
    barrier(CLK_LOCAL_MEM_FENCE);

    // tree reduction down to four partial results
    for (int lsize = CTA_SIZE >> 1; lsize > 2; lsize >>= 1)
    {
        if (id < lsize)
        {
            int id2 = lsize + id;
            weights_local[id] += weights_local[id2];
            weighted_sum_local[id] += weighted_sum_local[id2];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // NOTE(review): work-item 0 reads entries 1..3 below after the last barrier, and the next call
    // lets work-items 1..3 overwrite their entries before any further barrier. Nothing in this file
    // orders those accesses; confirm the targeted devices make this safe or add a trailing barrier.
    if (id == 0)
    {
        int dst_index = mad24(y, dst_step, mad24(psz, x, dst_offset));
        sum_t weighted_sum_local_0 = weighted_sum_local[0] + weighted_sum_local[1] +
            weighted_sum_local[2] + weighted_sum_local[3];
        weight_t weights_local_0 = weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3];

        *(__global pixel_t *)(dst + dst_index) = convert_pixel_t(weighted_sum_local_0 / (sum_t)weights_local_0);
    }
}

// Denoises one BLOCK_COLS x BLOCK_ROWS block of the destination per work-group.
//
//   src, src_step, src_offset            source image bytes, row stride in bytes, byte offset of pixel (0, 0)
//   dst, dst_step, dst_offset            destination image, same conventions
//   dst_rows, dst_cols                   destination size; blocks are clipped against it
//   almostDist2Weight                    lookup table indexed by the shifted patch distance
//   buffer                               global scratch memory for the running column sums
//   almostTemplateWindowSizeSqBinShift   right shift applied to a patch distance before the table lookup
//
// The work-group walks its block row by row. The first pixel of every row is computed from scratch;
// every other pixel updates the running sums incrementally. `first` is the slot of the oldest
// column sum in each search position's ring of TEMPLATE_SIZE column sums; it is reset to 0 at the
// start of every row, before any use.
__kernel void fastNlMeansDenoising(__global const uchar * src, int src_step, int src_offset,
                                   __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
                                   __global const wlut_t * almostDist2Weight, __global uchar * buffer,
                                   int almostTemplateWindowSizeSqBinShift)
{
    int block_x = get_group_id(0), nblocks_x = get_num_groups(0);
    int block_y = get_group_id(1);
    int id = get_local_id(0), first;

    __local int dists[SEARCH_SIZE_SQ];
    __local weight_t weights[CTA_SIZE];
    __local sum_t weighted_sum[CTA_SIZE];

    int x0 = block_x * BLOCK_COLS, x1 = min(x0 + BLOCK_COLS, dst_cols);
    int y0 = block_y * BLOCK_ROWS, y1 = min(y0 + BLOCK_ROWS, dst_rows);

    // Each group needs a SEARCH_SIZE_SQ * TEMPLATE_SIZE integer buffer holding the partial column sums
    // of the current element, and a SEARCH_SIZE_SQ * BLOCK_COLS integer buffer holding, for every
    // element of the search window, the last column sum computed by the row above.
    int block_data_start = SEARCH_SIZE_SQ * (mad24(block_y, dst_cols, x0) + mad24(block_y, nblocks_x, block_x) * TEMPLATE_SIZE);
    __global int * col_dists = (__global int *)(buffer + block_data_start * sizeof(int));
    __global int * up_col_dists = col_dists + SEARCH_SIZE_SQ * TEMPLATE_SIZE;

    for (int y = y0; y < y1; ++y)
        for (int x = x0; x < x1; ++x)
        {
            if (x == x0)
            {
                calcFirstElementInRow(src, src_step, src_offset, dists, y, x, id, col_dists, up_col_dists);
                first = 0;
            }
            else
            {
                if (y == y0)
                    calcElementInFirstRow(src, src_step, src_offset, dists, y, x - x0, x, id, first, col_dists, up_col_dists);
                else
                    calcElement(src, src_step, src_offset, dists, y, x - x0, x, id, first, col_dists, up_col_dists);

                first = (first + 1) % TEMPLATE_SIZE;
            }

            convolveWindow(src, src_step, src_offset, dists, almostDist2Weight, dst, dst_step, dst_offset,
                y, x, id, weights, weighted_sum, almostTemplateWindowSizeSqBinShift);
        }
}

#endif