Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/modules/photo/src/opencl/nlmeans.cl
16348 views
1
// This file is part of OpenCV project.
2
// It is subject to the license terms in the LICENSE file found in the top-level directory
3
// of this distribution and at http://opencv.org/license.html.
4
5
// Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
6
// Third party copyrights are property of their respective owners.
7
8
#ifdef cl_amd_printf
9
#pragma OPENCL_EXTENSION cl_amd_printf:enable
10
#endif
11
12
#ifdef DOUBLE_SUPPORT
13
#ifdef cl_amd_fp64
14
#pragma OPENCL EXTENSION cl_amd_fp64:enable
15
#elif defined cl_khr_fp64
16
#pragma OPENCL EXTENSION cl_khr_fp64:enable
17
#endif
18
#endif
19
20
21
#ifdef OP_CALC_WEIGHTS
22
23
__kernel void calcAlmostDist2Weight(__global wlut_t * almostDist2Weight, int almostMaxDist,
24
FT almostDist2ActualDistMultiplier, int fixedPointMult,
25
w_t den, FT WEIGHT_THRESHOLD)
26
{
27
int almostDist = get_global_id(0);
28
29
if (almostDist < almostMaxDist)
30
{
31
FT dist = almostDist * almostDist2ActualDistMultiplier;
32
#ifdef ABS
33
w_t w = exp((w_t)(-dist*dist) * den);
34
#else
35
w_t w = exp((w_t)(-dist) * den);
36
#endif
37
wlut_t weight = convert_wlut_t(fixedPointMult * (isnan(w) ? (w_t)1.0 : w));
38
almostDist2Weight[almostDist] =
39
weight < (wlut_t)(WEIGHT_THRESHOLD * fixedPointMult) ? (wlut_t)0 : weight;
40
}
41
}
42
43
#elif defined OP_CALC_FASTNLMEANS
44
45
#define noconvert
46
47
#define SEARCH_SIZE_SQ (SEARCH_SIZE * SEARCH_SIZE)
48
49
inline int calcDist(pixel_t a, pixel_t b)
50
{
51
#ifdef ABS
52
int_t retval = convert_int_t(abs_diff(a, b));
53
#else
54
int_t diff = convert_int_t(a) - convert_int_t(b);
55
int_t retval = diff * diff;
56
#endif
57
58
#if cn == 1
59
return retval;
60
#elif cn == 2
61
return retval.x + retval.y;
62
#elif cn == 3
63
return retval.x + retval.y + retval.z;
64
#elif cn == 4
65
return retval.x + retval.y + retval.z + retval.w;
66
#else
67
#error "cn should be either 1, 2, 3 or 4"
68
#endif
69
}
70
71
#ifdef ABS
72
inline int calcDistUpDown(pixel_t down_value, pixel_t down_value_t, pixel_t up_value, pixel_t up_value_t)
73
{
74
return calcDist(down_value, down_value_t) - calcDist(up_value, up_value_t);
75
}
76
#else
77
inline int calcDistUpDown(pixel_t down_value, pixel_t down_value_t, pixel_t up_value, pixel_t up_value_t)
78
{
79
int_t A = convert_int_t(down_value) - convert_int_t(down_value_t);
80
int_t B = convert_int_t(up_value) - convert_int_t(up_value_t);
81
int_t retval = (A - B) * (A + B);
82
83
#if cn == 1
84
return retval;
85
#elif cn == 2
86
return retval.x + retval.y;
87
#elif cn == 3
88
return retval.x + retval.y + retval.z;
89
#elif cn == 4
90
return retval.x + retval.y + retval.z + retval.w;
91
#else
92
#error "cn should be either 1, 2, 3 or 4"
93
#endif
94
}
95
#endif
96
97
#define COND if (x == 0 && y == 0)
98
99
inline void calcFirstElementInRow(__global const uchar * src, int src_step, int src_offset,
100
__local int * dists, int y, int x, int id,
101
__global int * col_dists, __global int * up_col_dists)
102
{
103
y -= TEMPLATE_SIZE2;
104
int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2;
105
int col_dists_current_private[TEMPLATE_SIZE];
106
107
for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE)
108
{
109
int dist = 0, value;
110
111
__global const pixel_t * src_template = (__global const pixel_t *)(src +
112
mad24(sy + i / SEARCH_SIZE, src_step, mad24(psz, sx + i % SEARCH_SIZE, src_offset)));
113
__global const pixel_t * src_current = (__global const pixel_t *)(src + mad24(y, src_step, mad24(psz, x, src_offset)));
114
__global int * col_dists_current = col_dists + i * TEMPLATE_SIZE;
115
116
#pragma unroll
117
for (int j = 0; j < TEMPLATE_SIZE; ++j)
118
col_dists_current_private[j] = 0;
119
120
for (int ty = 0; ty < TEMPLATE_SIZE; ++ty)
121
{
122
#pragma unroll
123
for (int tx = -TEMPLATE_SIZE2; tx <= TEMPLATE_SIZE2; ++tx)
124
{
125
value = calcDist(src_template[tx], src_current[tx]);
126
127
col_dists_current_private[tx + TEMPLATE_SIZE2] += value;
128
dist += value;
129
}
130
131
src_current = (__global const pixel_t *)((__global const uchar *)src_current + src_step);
132
src_template = (__global const pixel_t *)((__global const uchar *)src_template + src_step);
133
}
134
135
#pragma unroll
136
for (int j = 0; j < TEMPLATE_SIZE; ++j)
137
col_dists_current[j] = col_dists_current_private[j];
138
139
dists[i] = dist;
140
up_col_dists[0 + i] = col_dists[TEMPLATE_SIZE - 1];
141
}
142
}
143
144
inline void calcElementInFirstRow(__global const uchar * src, int src_step, int src_offset,
145
__local int * dists, int y, int x0, int x, int id, int first,
146
__global int * col_dists, __global int * up_col_dists)
147
{
148
x += TEMPLATE_SIZE2;
149
y -= TEMPLATE_SIZE2;
150
int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2;
151
152
for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE)
153
{
154
__global const pixel_t * src_current = (__global const pixel_t *)(src + mad24(y, src_step, mad24(psz, x, src_offset)));
155
__global const pixel_t * src_template = (__global const pixel_t *)(src +
156
mad24(sy + i / SEARCH_SIZE, src_step, mad24(psz, sx + i % SEARCH_SIZE, src_offset)));
157
__global int * col_dists_current = col_dists + TEMPLATE_SIZE * i;
158
159
int col_dist = 0;
160
161
#pragma unroll
162
for (int ty = 0; ty < TEMPLATE_SIZE; ++ty)
163
{
164
col_dist += calcDist(src_current[0], src_template[0]);
165
166
src_current = (__global const pixel_t *)((__global const uchar *)src_current + src_step);
167
src_template = (__global const pixel_t *)((__global const uchar *)src_template + src_step);
168
}
169
170
dists[i] += col_dist - col_dists_current[first];
171
col_dists_current[first] = col_dist;
172
up_col_dists[mad24(x0, SEARCH_SIZE_SQ, i)] = col_dist;
173
}
174
}
175
176
inline void calcElement(__global const uchar * src, int src_step, int src_offset,
177
__local int * dists, int y, int x0, int x, int id, int first,
178
__global int * col_dists, __global int * up_col_dists)
179
{
180
int sx = x + TEMPLATE_SIZE2;
181
int sy_up = y - TEMPLATE_SIZE2 - 1;
182
int sy_down = y + TEMPLATE_SIZE2;
183
184
pixel_t up_value = *(__global const pixel_t *)(src + mad24(sy_up, src_step, mad24(psz, sx, src_offset)));
185
pixel_t down_value = *(__global const pixel_t *)(src + mad24(sy_down, src_step, mad24(psz, sx, src_offset)));
186
187
sx -= SEARCH_SIZE2;
188
sy_up -= SEARCH_SIZE2;
189
sy_down -= SEARCH_SIZE2;
190
191
for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE)
192
{
193
int wx = i % SEARCH_SIZE, wy = i / SEARCH_SIZE;
194
195
pixel_t up_value_t = *(__global const pixel_t *)(src + mad24(sy_up + wy, src_step, mad24(psz, sx + wx, src_offset)));
196
pixel_t down_value_t = *(__global const pixel_t *)(src + mad24(sy_down + wy, src_step, mad24(psz, sx + wx, src_offset)));
197
198
__global int * col_dists_current = col_dists + mad24(i, TEMPLATE_SIZE, first);
199
__global int * up_col_dists_current = up_col_dists + mad24(x0, SEARCH_SIZE_SQ, i);
200
201
int col_dist = up_col_dists_current[0] + calcDistUpDown(down_value, down_value_t, up_value, up_value_t);
202
203
dists[i] += col_dist - col_dists_current[0];
204
col_dists_current[0] = col_dist;
205
up_col_dists_current[0] = col_dist;
206
}
207
}
208
209
inline void convolveWindow(__global const uchar * src, int src_step, int src_offset,
210
__local int * dists, __global const wlut_t * almostDist2Weight,
211
__global uchar * dst, int dst_step, int dst_offset,
212
int y, int x, int id, __local weight_t * weights_local,
213
__local sum_t * weighted_sum_local, int almostTemplateWindowSizeSqBinShift)
214
{
215
int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2;
216
weight_t weights = (weight_t)0;
217
sum_t weighted_sum = (sum_t)0;
218
219
for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE)
220
{
221
int src_index = mad24(sy + i / SEARCH_SIZE, src_step, mad24(i % SEARCH_SIZE + sx, psz, src_offset));
222
sum_t src_value = convert_sum_t(*(__global const pixel_t *)(src + src_index));
223
224
int almostAvgDist = dists[i] >> almostTemplateWindowSizeSqBinShift;
225
weight_t weight = convert_weight_t(almostDist2Weight[almostAvgDist]);
226
227
weights += weight;
228
weighted_sum += (sum_t)weight * src_value;
229
}
230
231
weights_local[id] = weights;
232
weighted_sum_local[id] = weighted_sum;
233
barrier(CLK_LOCAL_MEM_FENCE);
234
235
for (int lsize = CTA_SIZE >> 1; lsize > 2; lsize >>= 1)
236
{
237
if (id < lsize)
238
{
239
int id2 = lsize + id;
240
weights_local[id] += weights_local[id2];
241
weighted_sum_local[id] += weighted_sum_local[id2];
242
}
243
barrier(CLK_LOCAL_MEM_FENCE);
244
}
245
246
if (id == 0)
247
{
248
int dst_index = mad24(y, dst_step, mad24(psz, x, dst_offset));
249
sum_t weighted_sum_local_0 = weighted_sum_local[0] + weighted_sum_local[1] +
250
weighted_sum_local[2] + weighted_sum_local[3];
251
weight_t weights_local_0 = weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3];
252
253
*(__global pixel_t *)(dst + dst_index) = convert_pixel_t(weighted_sum_local_0 / (sum_t)weights_local_0);
254
}
255
}
256
257
__kernel void fastNlMeansDenoising(__global const uchar * src, int src_step, int src_offset,
258
__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
259
__global const wlut_t * almostDist2Weight, __global uchar * buffer,
260
int almostTemplateWindowSizeSqBinShift)
261
{
262
int block_x = get_group_id(0), nblocks_x = get_num_groups(0);
263
int block_y = get_group_id(1);
264
int id = get_local_id(0), first;
265
266
__local int dists[SEARCH_SIZE_SQ];
267
__local weight_t weights[CTA_SIZE];
268
__local sum_t weighted_sum[CTA_SIZE];
269
270
int x0 = block_x * BLOCK_COLS, x1 = min(x0 + BLOCK_COLS, dst_cols);
271
int y0 = block_y * BLOCK_ROWS, y1 = min(y0 + BLOCK_ROWS, dst_rows);
272
273
// for each group we need SEARCH_SIZE_SQ * TEMPLATE_SIZE integer buffer for storing part column sum for current element
274
// and SEARCH_SIZE_SQ * BLOCK_COLS integer buffer for storing last column sum for each element of search window of up row
275
int block_data_start = SEARCH_SIZE_SQ * (mad24(block_y, dst_cols, x0) + mad24(block_y, nblocks_x, block_x) * TEMPLATE_SIZE);
276
__global int * col_dists = (__global int *)(buffer + block_data_start * sizeof(int));
277
__global int * up_col_dists = col_dists + SEARCH_SIZE_SQ * TEMPLATE_SIZE;
278
279
for (int y = y0; y < y1; ++y)
280
for (int x = x0; x < x1; ++x)
281
{
282
if (x == x0)
283
{
284
calcFirstElementInRow(src, src_step, src_offset, dists, y, x, id, col_dists, up_col_dists);
285
first = 0;
286
}
287
else
288
{
289
if (y == y0)
290
calcElementInFirstRow(src, src_step, src_offset, dists, y, x - x0, x, id, first, col_dists, up_col_dists);
291
else
292
calcElement(src, src_step, src_offset, dists, y, x - x0, x, id, first, col_dists, up_col_dists);
293
294
first = (first + 1) % TEMPLATE_SIZE;
295
}
296
297
convolveWindow(src, src_step, src_offset, dists, almostDist2Weight, dst, dst_step, dst_offset,
298
y, x, id, weights, weighted_sum, almostTemplateWindowSizeSqBinShift);
299
}
300
}
301
302
#endif
303
304