Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/3rdparty/carotene/src/gaussian_blur.cpp
16337 views
1
/*
 * By downloading, copying, installing or using the software you agree to this license.
 * If you do not agree to this license, do not download, install,
 * copy or use the software.
 *
 *
 *                           License Agreement
 *                For Open Source Computer Vision Library
 *                        (3-clause BSD License)
 *
 * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
 * Third party copyrights are property of their respective owners.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the names of the copyright holders nor the names of the contributors
 *     may be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * This software is provided by the copyright holders and contributors "as is" and
 * any express or implied warranties, including, but not limited to, the implied
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall copyright holders or contributors be liable for any direct,
 * indirect, incidental, special, exemplary, or consequential damages
 * (including, but not limited to, procurement of substitute goods or services;
 * loss of use, data, or profits; or business interruption) however caused
 * and on any theory of liability, whether in contract, strict liability,
 * or tort (including negligence or otherwise) arising in any way out of
 * the use of this software, even if advised of the possibility of such damage.
 */
39
40
#include "common.hpp"
41
#include "saturate_cast.hpp"
42
#include "separable_filter.hpp"
43
44
namespace CAROTENE_NS {
45
46
bool isGaussianBlur3x3Supported(const Size2D &size, BORDER_MODE border)
47
{
48
return isSupportedConfiguration() && size.width >= 8 &&
49
(border == BORDER_MODE_CONSTANT ||
50
border == BORDER_MODE_REPLICATE);
51
}
52
53
void gaussianBlur3x3(const Size2D &size,
54
const u8 * srcBase, ptrdiff_t srcStride,
55
u8 * dstBase, ptrdiff_t dstStride,
56
BORDER_MODE border, u8 borderValue)
57
{
58
internal::assertSupportedConfiguration(isGaussianBlur3x3Supported(size, border));
59
#ifdef CAROTENE_NEON
60
const uint16x8_t v_border_x4 = vdupq_n_u16(borderValue << 2);
61
const uint16x8_t v_zero = vdupq_n_u16(0);
62
const uint8x8_t v_border = vdup_n_u8(borderValue);
63
64
uint16x8_t tprev = v_zero, tcurr = v_zero, tnext = v_zero;
65
uint16x8_t t0 = v_zero, t1 = v_zero, t2 = v_zero;
66
67
ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;
68
69
for (ptrdiff_t y = 0; y < height; ++y)
70
{
71
const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
72
const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
73
const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
74
u8 * drow = internal::getRowPtr(dstBase, dstStride, y);
75
76
s16 prevx = 0, currx = 0, nextx = 0;
77
ptrdiff_t x = 0;
78
const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8);
79
80
// perform vertical convolution
81
for ( ; x <= bwidth; x += 8)
82
{
83
internal::prefetch(srow0 + x);
84
internal::prefetch(srow1 + x);
85
internal::prefetch(srow2 + x);
86
87
uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x);
88
uint8x8_t x1 = vld1_u8(srow1 + x);
89
uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x);
90
91
// calculate values for plain CPU part below if needed
92
if (x + 8 >= bwidth)
93
{
94
ptrdiff_t x3 = x == width ? width - 1 : x;
95
ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);
96
97
if (border == BORDER_MODE_CONSTANT && x4 < 0)
98
prevx = borderValue;
99
else
100
prevx = (srow2 ? srow2[x4] : borderValue) + (srow1[x4] << 1) + (srow0 ? srow0[x4] : borderValue);
101
102
currx = (srow2 ? srow2[x3] : borderValue) + (srow1[x3] << 1) + (srow0 ? srow0[x3] : borderValue);
103
}
104
105
// make shift
106
if (x)
107
{
108
tprev = tcurr;
109
tcurr = tnext;
110
}
111
112
// and calculate next value
113
tnext = vaddq_u16(vaddl_u8(x0, x2), vshll_n_u8(x1, 1));
114
115
// make extrapolation for the first elements
116
if (!x)
117
{
118
// make border
119
if (border == BORDER_MODE_CONSTANT)
120
tcurr = v_border_x4;
121
else if (border == BORDER_MODE_REPLICATE)
122
tcurr = vdupq_n_u16(vgetq_lane_u16(tnext, 0));
123
124
continue;
125
}
126
127
// combine 3 "shifted" vectors
128
t0 = vextq_u16(tprev, tcurr, 7);
129
t1 = tcurr;
130
t2 = vextq_u16(tcurr, tnext, 1);
131
132
// and add them
133
t0 = vqaddq_u16(vshlq_n_u16(t1, 1), vqaddq_u16(t0, t2));
134
vst1_u8(drow + x - 8, vshrn_n_u16(t0, 4));
135
}
136
137
x -= 8;
138
if (x == width)
139
--x;
140
141
for ( ; x < width; ++x)
142
{
143
// make extrapolation for the last elements
144
if (x + 1 >= width)
145
{
146
if (border == BORDER_MODE_CONSTANT)
147
nextx = borderValue << 2;
148
else if (border == BORDER_MODE_REPLICATE)
149
nextx = srow2[x] + (srow1[x] << 1) + srow0[x];
150
}
151
else
152
nextx = (srow2 ? srow2[x + 1] : borderValue) +
153
(srow1[x + 1] << 1) +
154
(srow0 ? srow0[x + 1] : borderValue);
155
156
f32 val = (prevx + (currx << 1) + nextx) >> 4;
157
drow[x] = internal::saturate_cast<u8>((s32)val);
158
159
// make shift
160
prevx = currx;
161
currx = nextx;
162
}
163
}
164
#else
165
(void)srcBase;
166
(void)srcStride;
167
(void)dstBase;
168
(void)dstStride;
169
(void)borderValue;
170
#endif
171
}
172
173
bool isGaussianBlur3x3MarginSupported(const Size2D &size, BORDER_MODE border, Margin borderMargin)
174
{
175
return isSeparableFilter3x3Supported(size, border, 0, 0, borderMargin);
176
}
177
178
void gaussianBlur3x3Margin(const Size2D &size,
179
const u8 * srcBase, ptrdiff_t srcStride,
180
u8 * dstBase, ptrdiff_t dstStride,
181
BORDER_MODE border, u8 borderValue, Margin borderMargin)
182
{
183
internal::assertSupportedConfiguration(isGaussianBlur3x3MarginSupported(size, border, borderMargin));
184
#ifdef CAROTENE_NEON
185
internal::sepFilter3x3<internal::RowFilter3x3S16_121, internal::ColFilter3x3U8_121>::process(
186
size, srcBase, srcStride, dstBase, dstStride,
187
0, 0, border, borderValue, borderMargin);
188
#else
189
(void)srcBase;
190
(void)srcStride;
191
(void)dstBase;
192
(void)dstStride;
193
(void)borderValue;
194
#endif
195
}
196
197
bool isGaussianBlur5x5Supported(const Size2D &size, s32 cn, BORDER_MODE border)
198
{
199
return isSupportedConfiguration() &&
200
cn > 0 && cn <= 4 &&
201
size.width >= 8 && size.height >= 2 &&
202
(border == BORDER_MODE_CONSTANT ||
203
border == BORDER_MODE_REFLECT101 ||
204
border == BORDER_MODE_REFLECT ||
205
border == BORDER_MODE_REPLICATE ||
206
border == BORDER_MODE_WRAP);
207
}
208
209
/*
 * 5x5 Gaussian blur for interleaved u8 data, cn = 1..4 channels.
 *
 * Separable implementation with the binomial kernel [1 4 6 4 1] in each
 * direction (total weight 16*16 = 256, hence the rounding >> 8 at the end).
 * Each image row is processed in two passes:
 *   1) vertical:   5-row weighted sum into a u16 "lane" scratch buffer;
 *   2) horizontal: 5-tap weighted sum over the lane buffer, narrowed to u8.
 * The lane buffer has 2*cn extra elements on each side so the horizontal
 * pass can read x-2..x+2 without branching; those pads are filled from the
 * border mode (constant value or interpolated indices idx_l*/idx_r*).
 *
 * Per-channel-count specializations of the horizontal pass use interleaving
 * vld2/vld3/vld4 loads; on 32-bit ARM with older GCC some of them fall back
 * to hand-written inline asm.
 */
void gaussianBlur5x5(const Size2D &size, s32 cn,
                     const u8 * srcBase, ptrdiff_t srcStride,
                     u8 * dstBase, ptrdiff_t dstStride,
                     BORDER_MODE borderType, u8 borderValue, Margin borderMargin)
{
    internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType));
#ifdef CAROTENE_NEON
    size_t colsn = size.width * cn;  // row length in elements (pixels * channels)

    // For CONSTANT borders, a whole fake source row filled with borderValue;
    // rows outside the image are redirected to it below.
    std::vector<u8> _tmp;
    u8 *tmp = 0;
    if (borderType == BORDER_MODE_CONSTANT)
    {
        _tmp.assign(colsn + 4*cn, borderValue);
        tmp = &_tmp[cn << 1];
    }

    // Horizontal border source indices (in elements) for columns -1, -2,
    // width, width+1 under the requested border mode and margins.
    ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;

    //1-line buffer (u16 vertical sums, 2*cn pad on each side, 32-byte aligned)
    std::vector<u16> _buf(cn * (size.width + 4) + 32 / sizeof(u16));
    u16* lane = internal::alignPtr(&_buf[cn << 1], 32);

    // CONSTANT pads never change per row, so fill them once up front.
    if (borderType == BORDER_MODE_CONSTANT)
        for (s32 k = 0; k < cn; ++k)
        {
            lane[-cn+k] = borderValue;
            lane[-cn-cn+k] = borderValue;
            lane[colsn+k] = borderValue;
            lane[colsn+cn+k] = borderValue;
        }

    // Kernel weights 6 (center) and 4 (+-1); the +-2 taps have weight 1.
    uint8x8_t vc6u8 = vmov_n_u8(6);
    uint16x8_t vc6u16 = vmovq_n_u16(6);
    uint16x8_t vc4u16 = vmovq_n_u16(4);

    for (size_t i = 0; i < size.height; ++i)
    {
        u8* dst = internal::getRowPtr(dstBase, dstStride, i);
        //vertical convolution
        ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom);

        // The five source rows i-2..i+2; out-of-range rows (CONSTANT mode
        // signals them via borderInterpolate) map to the borderValue row.
        const u8* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp;
        const u8* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;
        const u8* ln2 = internal::getRowPtr(srcBase, srcStride, i);
        const u8* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp;
        const u8* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp;

        size_t x = 0;
        for (; x <= colsn - 8; x += 8)
        {
            // NOTE(review): prefetch address is ln2 offset by row x%5-2 --
            // appears to stagger prefetches across the five rows; confirm.
            internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2));
            uint8x8_t v0 = vld1_u8(ln0+x);
            uint8x8_t v1 = vld1_u8(ln1+x);
            uint8x8_t v2 = vld1_u8(ln2+x);
            uint8x8_t v3 = vld1_u8(ln3+x);
            uint8x8_t v4 = vld1_u8(ln4+x);

            // lane[x] = v0 + v4 + 6*v2 + 4*(v1 + v3), widened to u16
            uint16x8_t v = vaddl_u8(v0, v4);
            uint16x8_t v13 = vaddl_u8(v1, v3);

            v = vmlal_u8(v, v2, vc6u8);
            v = vmlaq_u16(v, v13, vc4u16);

            vst1q_u16(lane + x, v);
        }
        // scalar tail of the vertical pass
        for (; x < colsn; ++x)
            lane[x] = ln0[x] + ln4[x] + u16(4) * (ln1[x] + ln3[x]) + u16(6) * ln2[x];

        //left&right borders: refresh pads from this row's vertical sums
        if (borderType != BORDER_MODE_CONSTANT)
            for (s32 k = 0; k < cn; ++k)
            {
                lane[-cn+k] = lane[idx_l1 + k];
                lane[-cn-cn+k] = lane[idx_l2 + k];

                lane[colsn+k] = lane[idx_r1 + k];
                lane[colsn+cn+k] = lane[idx_r2 + k];
            }

        //horizontal convolution (per-channel-count vector specializations)
        x = 0;
        switch(cn)
        {
        case 1:
            for (; x <= colsn - 8; x += 8)
            {
                internal::prefetch(lane + x);

                uint16x8_t lane0 = vld1q_u16(lane + x - 2);
                uint16x8_t lane4 = vld1q_u16(lane + x + 2);
                uint16x8_t lane1 = vld1q_u16(lane + x - 1);
                uint16x8_t lane3 = vld1q_u16(lane + x + 1);
                uint16x8_t lane2 = vld1q_u16(lane + x + 0);

                uint16x8_t ln04 = vaddq_u16(lane0, lane4);
                uint16x8_t ln13 = vaddq_u16(lane1, lane3);

                uint16x8_t ln042 = vmlaq_u16(ln04, lane2, vc6u16);
                uint16x8_t lsw = vmlaq_u16(ln042, ln13, vc4u16);

                // rounding narrow: (sum + 128) >> 8 back to u8
                uint8x8_t ls = vrshrn_n_u16(lsw, 8);

                vst1_u8(dst + x, ls);
            }
            break;
        case 2:
            // Two interleaved channels: vld2/vst2 deinterleave so each
            // q register holds one channel's 5-tap window.
            for (; x <= colsn - 8*2; x += 8*2)
            {
                internal::prefetch(lane + x);

                u16* lidx0 = lane + x - 2*2;
                u16* lidx1 = lane + x - 1*2;
                u16* lidx3 = lane + x + 1*2;
                u16* lidx4 = lane + x + 2*2;
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
                // Hand-scheduled asm for old 32-bit GCC (< 4.7) only.
                __asm__ __volatile__ (
                    "vld2.16 {d0, d2}, [%[in0]]! \n\t"
                    "vld2.16 {d1, d3}, [%[in0]] \n\t"
                    "vld2.16 {d8, d10}, [%[in4]]! \n\t"
                    "vld2.16 {d9, d11}, [%[in4]] \n\t"
                    "vadd.i16 q0, q4 \n\t"
                    "vadd.i16 q1, q5 \n\t"
                    "vld2.16 {d16, d18}, [%[in1]]! \n\t"
                    "vld2.16 {d17, d19}, [%[in1]] \n\t"
                    "vld2.16 {d8, d10}, [%[in3]]! \n\t"
                    "vld2.16 {d9, d11}, [%[in3]] \n\t"
                    "vadd.i16 q4, q8 \n\t"
                    "vadd.i16 q5, q9 \n\t"
                    "vld2.16 {d16, d18}, [%[in2]] \n\t"
                    "vld2.16 {d17, d19}, [%[in22]] \n\t"
                    "vmla.i16 q0, q4, %q[c4] \n\t"
                    "vmla.i16 q1, q5, %q[c4] \n\t"
                    "vmla.i16 q0, q8, %q[c6] \n\t"
                    "vmla.i16 q1, q9, %q[c6] \n\t"
                    "vrshrn.u16 d8, q0, #8 \n\t"
                    "vrshrn.u16 d9, q1, #8 \n\t"
                    "vst2.8 {d8-d9}, [%[out]] \n\t"
                    : [in0] "=r" (lidx0),
                      [in1] "=r" (lidx1),
                      [in3] "=r" (lidx3),
                      [in4] "=r" (lidx4)
                    : [out] "r" (dst + x),
                      "0" (lidx0),
                      "1" (lidx1),
                      "2" (lidx3),
                      "3" (lidx4),
                      [in2] "r" (lane + x),
                      [in22] "r" (lane + x + 4*2),
                      [c4] "w" (vc4u16), [c6] "w" (vc6u16)
                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"
                );
#else
                uint16x8x2_t vLane0 = vld2q_u16(lidx0);
                uint16x8x2_t vLane1 = vld2q_u16(lidx1);
                uint16x8x2_t vLane2 = vld2q_u16(lane + x);
                uint16x8x2_t vLane3 = vld2q_u16(lidx3);
                uint16x8x2_t vLane4 = vld2q_u16(lidx4);

                // Per channel c: out = l0 + l4 + 4*(l1 + l3) + 6*l2
                uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane4.val[0]);
                uint16x8_t vSum_1_5 = vaddq_u16(vLane0.val[1], vLane4.val[1]);

                uint16x8_t vSum_4_8 = vaddq_u16(vLane1.val[0], vLane3.val[0]);
                uint16x8_t vSum_5_9 = vaddq_u16(vLane1.val[1], vLane3.val[1]);

                vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_4_8, vc4u16);
                vSum_1_5 = vmlaq_u16(vSum_1_5, vSum_5_9, vc4u16);
                vSum_0_4 = vmlaq_u16(vSum_0_4, vLane2.val[0], vc6u16);
                vSum_1_5 = vmlaq_u16(vSum_1_5, vLane2.val[1], vc6u16);

                uint8x8x2_t vRes;
                vRes.val[0] = vrshrn_n_u16(vSum_0_4, 8);
                vRes.val[1] = vrshrn_n_u16(vSum_1_5, 8);
                vst2_u8(dst + x, vRes);
#endif
            }
            break;
        case 3:
            // Three interleaved channels via vld3/vst3.
            for (; x <= colsn - 8*3; x += 8*3)
            {
                internal::prefetch(lane + x);

                u16* lidx0 = lane + x - 2*3;
                u16* lidx1 = lane + x - 1*3;
                u16* lidx3 = lane + x + 1*3;
                u16* lidx4 = lane + x + 2*3;
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
                // 32-bit ARM GCC: hand-written asm variant.
                __asm__ __volatile__ (
                    "vld3.16 {d0, d2, d4}, [%[in0]]! \n\t"
                    "vld3.16 {d1, d3, d5}, [%[in0]] \n\t"
                    "vld3.16 {d8, d10, d12}, [%[in4]]! \n\t"
                    "vld3.16 {d9, d11, d13}, [%[in4]] \n\t"
                    "vadd.i16 q0, q4 \n\t"
                    "vadd.i16 q1, q5 \n\t"
                    "vadd.i16 q2, q6 \n\t"
                    "vld3.16 {d16, d18, d20}, [%[in1]]! \n\t"
                    "vld3.16 {d17, d19, d21}, [%[in1]] \n\t"
                    "vld3.16 {d8, d10, d12}, [%[in3]]! \n\t"
                    "vld3.16 {d9, d11, d13}, [%[in3]] \n\t"
                    "vadd.i16 q4, q8 \n\t"
                    "vadd.i16 q5, q9 \n\t"
                    "vadd.i16 q6, q10 \n\t"
                    "vld3.16 {d16, d18, d20}, [%[in2]] \n\t"
                    "vld3.16 {d17, d19, d21}, [%[in22]] \n\t"
                    "vmla.i16 q0, q4, %q[c4] \n\t"
                    "vmla.i16 q1, q5, %q[c4] \n\t"
                    "vmla.i16 q2, q6, %q[c4] \n\t"
                    "vmla.i16 q0, q8, %q[c6] \n\t"
                    "vmla.i16 q1, q9, %q[c6] \n\t"
                    "vmla.i16 q2, q10, %q[c6] \n\t"
                    "vrshrn.u16 d8, q0, #8 \n\t"
                    "vrshrn.u16 d9, q1, #8 \n\t"
                    "vrshrn.u16 d10, q2, #8 \n\t"
                    "vst3.8 {d8-d10}, [%[out]] \n\t"
                    : [in0] "=r" (lidx0),
                      [in1] "=r" (lidx1),
                      [in3] "=r" (lidx3),
                      [in4] "=r" (lidx4)
                    : [out] "r" (dst + x),
                      "0" (lidx0),
                      "1" (lidx1),
                      "2" (lidx3),
                      "3" (lidx4),
                      [in2] "r" (lane + x),
                      [in22] "r" (lane + x + 4*3),
                      [c4] "w" (vc4u16), [c6] "w" (vc6u16)
                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"
                );
#else
                uint16x8x3_t vLane0 = vld3q_u16(lidx0);
                uint16x8x3_t vLane1 = vld3q_u16(lidx1);
                uint16x8x3_t vLane2 = vld3q_u16(lane + x);
                uint16x8x3_t vLane3 = vld3q_u16(lidx3);
                uint16x8x3_t vLane4 = vld3q_u16(lidx4);

                uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane4.val[0]);
                uint16x8_t vSum_1_5 = vaddq_u16(vLane0.val[1], vLane4.val[1]);
                uint16x8_t vSum_2_6 = vaddq_u16(vLane0.val[2], vLane4.val[2]);

                uint16x8_t vSum_3_1 = vaddq_u16(vLane3.val[0], vLane1.val[0]);
                uint16x8_t vSum_4_2 = vaddq_u16(vLane3.val[1], vLane1.val[1]);
                uint16x8_t vSum_5_6 = vaddq_u16(vLane3.val[2], vLane1.val[2]);

                vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_3_1, vc4u16);
                vSum_1_5 = vmlaq_u16(vSum_1_5, vSum_4_2, vc4u16);
                vSum_2_6 = vmlaq_u16(vSum_2_6, vSum_5_6, vc4u16);

                vSum_0_4 = vmlaq_u16(vSum_0_4, vLane2.val[0], vc6u16);
                vSum_1_5 = vmlaq_u16(vSum_1_5, vLane2.val[1], vc6u16);
                vSum_2_6 = vmlaq_u16(vSum_2_6, vLane2.val[2], vc6u16);

                uint8x8x3_t vRes;
                vRes.val[0] = vrshrn_n_u16(vSum_0_4, 8);
                vRes.val[1] = vrshrn_n_u16(vSum_1_5, 8);
                vRes.val[2] = vrshrn_n_u16(vSum_2_6, 8);

                vst3_u8(dst + x, vRes);
#endif
            }
            break;
        case 4:
            // Four interleaved channels via vld4/vst4.
            for (; x <= colsn - 8*4; x += 8*4)
            {
                internal::prefetch(lane + x);
                internal::prefetch(lane + x + 16);

                u16* lidx0 = lane + x - 2*4;
                u16* lidx1 = lane + x - 1*4;
                u16* lidx3 = lane + x + 1*4;
                u16* lidx4 = lane + x + 2*4;
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
                __asm__ __volatile__ (
                    "vld4.16 {d0, d2, d4, d6}, [%[in0]]! \n\t"
                    "vld4.16 {d1, d3, d5, d7}, [%[in0]] \n\t"
                    "vld4.16 {d8, d10, d12, d14}, [%[in4]]! \n\t"
                    "vld4.16 {d9, d11, d13, d15}, [%[in4]] \n\t"
                    "vadd.i16 q0, q4 \n\t"
                    "vadd.i16 q1, q5 \n\t"
                    "vadd.i16 q2, q6 \n\t"
                    "vadd.i16 q3, q7 \n\t"
                    "vld4.16 {d16, d18, d20, d22}, [%[in1]]! \n\t"
                    "vld4.16 {d17, d19, d21, d23}, [%[in1]] \n\t"
                    "vld4.16 {d8, d10, d12, d14}, [%[in3]]! \n\t"
                    "vld4.16 {d9, d11, d13, d15}, [%[in3]] \n\t"
                    "vadd.i16 q4, q8 \n\t"
                    "vadd.i16 q5, q9 \n\t"
                    "vadd.i16 q6, q10 \n\t"
                    "vadd.i16 q7, q11 \n\t"
                    "vld4.16 {d16, d18, d20, d22}, [%[in2],:256] \n\t"
                    "vld4.16 {d17, d19, d21, d23}, [%[in22],:256] \n\t"
                    "vmla.i16 q0, q4, %q[c4] \n\t"
                    "vmla.i16 q1, q5, %q[c4] \n\t"
                    "vmla.i16 q2, q6, %q[c4] \n\t"
                    "vmla.i16 q3, q7, %q[c4] \n\t"
                    "vmla.i16 q0, q8, %q[c6] \n\t"
                    "vmla.i16 q1, q9, %q[c6] \n\t"
                    "vmla.i16 q2, q10, %q[c6] \n\t"
                    "vmla.i16 q3, q11, %q[c6] \n\t"
                    "vrshrn.u16 d8, q0, #8 \n\t"
                    "vrshrn.u16 d9, q1, #8 \n\t"
                    "vrshrn.u16 d10, q2, #8 \n\t"
                    "vrshrn.u16 d11, q3, #8 \n\t"
                    "vst4.8 {d8-d11}, [%[out]] \n\t"
                    : [in0] "=r" (lidx0),
                      [in1] "=r" (lidx1),
                      [in3] "=r" (lidx3),
                      [in4] "=r" (lidx4)
                    : [out] "r" (dst + x),
                      "0" (lidx0),
                      "1" (lidx1),
                      "2" (lidx3),
                      "3" (lidx4),
                      [in2] "r" (lane + x),
                      [in22] "r" (lane + x + 4*4),
                      [c4] "w" (vc4u16), [c6] "w" (vc6u16)
                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"
                );
#else
                // NOTE(review): names are misleading here -- vLane2 loads the
                // +2 taps (lidx4), vLane4 the -1 taps (lidx1), vLane6 the +1
                // taps (lidx3), vLane8 the center; arithmetic below is correct.
                uint16x8x4_t vLane0 = vld4q_u16(lidx0);
                uint16x8x4_t vLane2 = vld4q_u16(lidx4);
                uint16x8x4_t vLane4 = vld4q_u16(lidx1);
                uint16x8x4_t vLane6 = vld4q_u16(lidx3);
                uint16x8x4_t vLane8 = vld4q_u16(lane + x);

                uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane2.val[0]);
                uint16x8_t vSum_1_5 = vaddq_u16(vLane0.val[1], vLane2.val[1]);
                uint16x8_t vSum_2_6 = vaddq_u16(vLane0.val[2], vLane2.val[2]);
                uint16x8_t vSum_3_7 = vaddq_u16(vLane0.val[3], vLane2.val[3]);

                uint16x8_t vSum_4_8 = vaddq_u16(vLane4.val[0], vLane6.val[0]);
                uint16x8_t vSum_5_9 = vaddq_u16(vLane4.val[1], vLane6.val[1]);
                uint16x8_t vSum_6_10 = vaddq_u16(vLane4.val[2], vLane6.val[2]);
                uint16x8_t vSum_7_11 = vaddq_u16(vLane4.val[3], vLane6.val[3]);

                vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_4_8, vc4u16);
                vSum_1_5 = vmlaq_u16(vSum_1_5, vSum_5_9, vc4u16);
                vSum_2_6 = vmlaq_u16(vSum_2_6, vSum_6_10, vc4u16);
                vSum_3_7 = vmlaq_u16(vSum_3_7, vSum_7_11, vc4u16);

                vSum_0_4 = vmlaq_u16(vSum_0_4, vLane8.val[0], vc6u16);
                vSum_1_5 = vmlaq_u16(vSum_1_5, vLane8.val[1], vc6u16);
                vSum_2_6 = vmlaq_u16(vSum_2_6, vLane8.val[2], vc6u16);
                vSum_3_7 = vmlaq_u16(vSum_3_7, vLane8.val[3], vc6u16);

                uint8x8x4_t vRes;
                vRes.val[0] = vrshrn_n_u16(vSum_0_4, 8);
                vRes.val[1] = vrshrn_n_u16(vSum_1_5, 8);
                vRes.val[2] = vrshrn_n_u16(vSum_2_6, 8);
                vRes.val[3] = vrshrn_n_u16(vSum_3_7, 8);

                vst4_u8(dst + x, vRes);
#endif
            }
            break;
        }
        // Scalar tail of the horizontal pass, one channel at a time,
        // with explicit rounding (+128) before the >> 8.
        for (s32 h = 0; h < cn; ++h)
        {
            u16* ln = lane + h;
            u8* dt = dst + h;
            for (size_t k = x; k < colsn; k += cn)
            {
                dt[k] = (u8)((ln[k-2*cn] + ln[k+2*cn]
                              + u16(4) * (ln[k-cn] + ln[k+cn])
                              + u16(6) * ln[k] + (1 << 7)) >> 8);
            }
        }
    }
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
    (void)borderMargin;
#endif
}
591
592
/*
 * 5x5 Gaussian blur for interleaved u16 data, cn = 1..4 channels.
 *
 * Same separable [1 4 6 4 1] scheme as the u8 overload: the vertical pass
 * accumulates five rows into a u32 "lane" scratch buffer (padded by 2*cn
 * elements per side for branch-free horizontal reads), the horizontal pass
 * applies the same taps and narrows back to u16 with rounding >> 8.
 * Unlike the u8 overload there are no per-channel-count specializations:
 * a single vld1q/vst1 loop processes 4 elements at a time regardless of cn.
 */
void gaussianBlur5x5(const Size2D &size, s32 cn,
                     const u16 * srcBase, ptrdiff_t srcStride,
                     u16 * dstBase, ptrdiff_t dstStride,
                     BORDER_MODE borderType, u16 borderValue, Margin borderMargin)
{
    internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType));
#ifdef CAROTENE_NEON
    size_t colsn = size.width * cn;  // row length in elements

    // CONSTANT mode: fake source row filled with borderValue.
    std::vector<u16> _tmp;
    u16 *tmp = 0;
    if (borderType == BORDER_MODE_CONSTANT)
    {
        _tmp.assign(colsn + 4*cn, borderValue);
        tmp = &_tmp[cn << 1];
    }

    // Horizontal border source indices for columns -1, -2, width, width+1.
    ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;

    //1-line buffer (u32 vertical sums, 2*cn pad each side, 32-byte aligned)
    std::vector<u32> _buf(cn * (size.width + 4) + 32 / sizeof(u32));
    u32* lane = internal::alignPtr(&_buf[cn << 1], 32);

    // CONSTANT pads are row-independent; fill once.
    if (borderType == BORDER_MODE_CONSTANT)
        for (s32 k = 0; k < cn; ++k)
        {
            lane[-cn+k] = borderValue;
            lane[-cn-cn+k] = borderValue;
            lane[colsn+k] = borderValue;
            lane[colsn+cn+k] = borderValue;
        }

    // Kernel weights: 6 (center), 4 (+-1); +-2 taps have weight 1.
    uint16x4_t vc6u16 = vmov_n_u16(6);
    uint32x4_t vc6u32 = vmovq_n_u32(6);
    uint32x4_t vc4u32 = vmovq_n_u32(4);

    for (size_t i = 0; i < size.height; ++i)
    {
        u16* dst = internal::getRowPtr(dstBase, dstStride, i);
        //vertical convolution
        ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom);

        // Rows i-2..i+2; out-of-range rows redirect to the borderValue row.
        const u16* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp;
        const u16* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;
        const u16* ln2 = internal::getRowPtr(srcBase, srcStride, i);
        const u16* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp;
        const u16* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp;

        size_t x = 0;
        for (; x <= colsn - 4; x += 4)
        {
            // NOTE(review): prefetch row offset x%5-2 -- staggers prefetches
            // across the five rows; confirm intent.
            internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2));
            uint16x4_t v0 = vld1_u16(ln0+x);
            uint16x4_t v1 = vld1_u16(ln1+x);
            uint16x4_t v2 = vld1_u16(ln2+x);
            uint16x4_t v3 = vld1_u16(ln3+x);
            uint16x4_t v4 = vld1_u16(ln4+x);

            // lane[x] = v0 + v4 + 6*v2 + 4*(v1 + v3), widened to u32
            uint32x4_t v = vaddl_u16(v0, v4);
            uint32x4_t v13 = vaddl_u16(v1, v3);

            v = vmlal_u16(v, v2, vc6u16);
            v = vmlaq_u32(v, v13, vc4u32);

            vst1q_u32(lane + x, v);
        }
        // scalar tail of the vertical pass
        for (; x < colsn; ++x)
            lane[x] = ln0[x] + ln4[x] + 4*(ln1[x] + ln3[x]) + 6*ln2[x];

        //left&right borders: refresh pads from this row's vertical sums
        if (borderType != BORDER_MODE_CONSTANT)
            for (s32 k = 0; k < cn; ++k)
            {
                lane[-cn+k] = lane[idx_l1 + k];
                lane[-cn-cn+k] = lane[idx_l2 + k];

                lane[colsn+k] = lane[idx_r1 + k];
                lane[colsn+cn+k] = lane[idx_r2 + k];
            }

        //horizontal convolution
        x = 0;
        for (; x <= colsn - 4; x += 4)
        {
            internal::prefetch(lane + x);

            uint32x4_t lane0 = vld1q_u32(lane + x - 2);
            uint32x4_t lane4 = vld1q_u32(lane + x + 2);
            uint32x4_t lane1 = vld1q_u32(lane + x - 1);
            uint32x4_t lane3 = vld1q_u32(lane + x + 1);
            uint32x4_t lane2 = vld1q_u32(lane + x + 0);

            uint32x4_t ln04 = vaddq_u32(lane0, lane4);
            uint32x4_t ln13 = vaddq_u32(lane1, lane3);

            uint32x4_t ln042 = vmlaq_u32(ln04, lane2, vc6u32);
            uint32x4_t lsw = vmlaq_u32(ln042, ln13, vc4u32);

            // rounding narrow: (sum + 128) >> 8 back to u16
            uint16x4_t ls = vrshrn_n_u32(lsw, 8);

            vst1_u16(dst + x, ls);
        }
        // Scalar tail, one channel at a time, explicit rounding (+128).
        for (s32 h = 0; h < cn; ++h)
        {
            u32* ln = lane + h;
            u16* dt = dst + h;
            for (size_t k = x; k < colsn; k += cn)
            {
                dt[k] = (u16)((ln[k-2*cn] + ln[k+2*cn] + 4*(ln[k-cn] + ln[k+cn]) + 6*ln[k] + (1<<7))>>8);
            }
        }
    }
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
    (void)borderMargin;
#endif
}
719
720
/*
 * 5x5 Gaussian blur for interleaved s16 data, cn = 1..4 channels.
 *
 * Signed counterpart of the u16 overload: the vertical [1 4 6 4 1] pass
 * accumulates into an s32 "lane" scratch buffer (2*cn pad per side), the
 * horizontal pass applies the same taps and narrows to s16 with rounding
 * >> 8. The cn == 4 case currently shares the generic 4-at-a-time loop;
 * a hand-written vld4.32 asm specialization is kept commented out below.
 */
void gaussianBlur5x5(const Size2D &size, s32 cn,
                     const s16 * srcBase, ptrdiff_t srcStride,
                     s16 * dstBase, ptrdiff_t dstStride,
                     BORDER_MODE borderType, s16 borderValue, Margin borderMargin)
{
    internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType));
#ifdef CAROTENE_NEON
    size_t colsn = size.width * cn;  // row length in elements

    // CONSTANT mode: fake source row filled with borderValue.
    std::vector<s16> _tmp;
    s16 *tmp = 0;
    if (borderType == BORDER_MODE_CONSTANT)
    {
        _tmp.assign(colsn + 4*cn, borderValue);
        tmp = &_tmp[cn << 1];
    }

    // Horizontal border source indices for columns -1, -2, width, width+1.
    ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;

    //1-line buffer (s32 vertical sums, 2*cn pad each side, 32-byte aligned)
    std::vector<s32> _buf(cn * (size.width + 4) + 32 / sizeof(s32));
    s32* lane = internal::alignPtr(&_buf[cn << 1], 32);

    // CONSTANT pads are row-independent; fill once.
    if (borderType == BORDER_MODE_CONSTANT)
        for (s32 k = 0; k < cn; ++k)
        {
            lane[-cn+k] = borderValue;
            lane[-cn-cn+k] = borderValue;
            lane[colsn+k] = borderValue;
            lane[colsn+cn+k] = borderValue;
        }

    // Kernel weights: 6 (center), 4 (+-1); +-2 taps have weight 1.
    int16x4_t vc6s16 = vmov_n_s16(6);
    int32x4_t vc6s32 = vmovq_n_s32(6);
    int32x4_t vc4s32 = vmovq_n_s32(4);

    for (size_t i = 0; i < size.height; ++i)
    {
        s16* dst = internal::getRowPtr(dstBase, dstStride, i);
        //vertical convolution
        ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom);

        // Rows i-2..i+2; out-of-range rows redirect to the borderValue row.
        const s16* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp;
        const s16* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;
        const s16* ln2 = internal::getRowPtr(srcBase, srcStride, i);
        const s16* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp;
        const s16* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp;

        size_t x = 0;
        for (; x <= colsn - 4; x += 4)
        {
            // NOTE(review): prefetch row offset x%5-2 -- staggers prefetches
            // across the five rows; confirm intent.
            internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2));
            int16x4_t v0 = vld1_s16(ln0+x);
            int16x4_t v1 = vld1_s16(ln1+x);
            int16x4_t v2 = vld1_s16(ln2+x);
            int16x4_t v3 = vld1_s16(ln3+x);
            int16x4_t v4 = vld1_s16(ln4+x);

            // lane[x] = v0 + v4 + 6*v2 + 4*(v1 + v3), widened to s32
            int32x4_t v = vaddl_s16(v0, v4);
            int32x4_t v13 = vaddl_s16(v1, v3);

            v = vmlal_s16(v, v2, vc6s16);
            v = vmlaq_s32(v, v13, vc4s32);

            vst1q_s32(lane + x, v);
        }
        // scalar tail of the vertical pass
        for (; x < colsn; ++x)
            lane[x] = ln0[x] + ln4[x] + 4*(ln1[x] + ln3[x]) + 6*ln2[x];

        //left&right borders: refresh pads from this row's vertical sums
        if (borderType != BORDER_MODE_CONSTANT)
            for (s32 k = 0; k < cn; ++k)
            {
                lane[-cn+k] = lane[idx_l1 + k];
                lane[-cn-cn+k] = lane[idx_l2 + k];

                lane[colsn+k] = lane[idx_r1 + k];
                lane[colsn+cn+k] = lane[idx_r2 + k];
            }

        //horizontal convolution
        x = 0;
        switch(cn)
        {
        case 1:
        case 2:
        case 3:
            for (; x <= colsn - 4; x += 4)
            {
                internal::prefetch(lane + x);

                int32x4_t lane0 = vld1q_s32(lane + x - 2);
                int32x4_t lane4 = vld1q_s32(lane + x + 2);
                int32x4_t lane1 = vld1q_s32(lane + x - 1);
                int32x4_t lane3 = vld1q_s32(lane + x + 1);
                int32x4_t lane2 = vld1q_s32(lane + x + 0);

                int32x4_t ln04 = vaddq_s32(lane0, lane4);
                int32x4_t ln13 = vaddq_s32(lane1, lane3);

                int32x4_t ln042 = vmlaq_s32(ln04, lane2, vc6s32);
                int32x4_t lsw = vmlaq_s32(ln042, ln13, vc4s32);

                // rounding narrow: (sum + 128) >> 8 back to s16
                int16x4_t ls = vrshrn_n_s32(lsw, 8);

                vst1_s16(dst + x, ls);
            }
            break;
        case 4:
            // Disabled vld4.32 asm specialization kept for reference;
            // the generic loop below is used instead.
/*          for (; x <= colsn - 4*4; x += 4*4)
            {
                internal::prefetch(lane + x);
                internal::prefetch(lane + x + 16);

                ptrdiff_t* lidx0 = lane + x - 2*4;
                ptrdiff_t* lidx1 = lane + x - 1*4;
                ptrdiff_t* lidx3 = lane + x + 1*4;
                ptrdiff_t* lidx4 = lane + x + 2*4;

                __asm__ __volatile__ (
                    "vld4.32 {d0, d2, d4, d6}, [%[in0]]! \n\t"
                    "vld4.32 {d1, d3, d5, d7}, [%[in0]] \n\t"
                    "vld4.32 {d8, d10, d12, d14}, [%[in4]]! \n\t"
                    "vld4.32 {d9, d11, d13, d15}, [%[in4]] \n\t"
                    "vadd.i32 q0, q4 \n\t"
                    "vadd.i32 q1, q5 \n\t"
                    "vadd.i32 q2, q6 \n\t"
                    "vadd.i32 q3, q7 \n\t"
                    "vld4.32 {d16, d18, d20, d22}, [%[in1]]! \n\t"
                    "vld4.32 {d17, d19, d21, d23}, [%[in1]] \n\t"
                    "vld4.32 {d8, d10, d12, d14}, [%[in3]]! \n\t"
                    "vld4.32 {d9, d11, d13, d15}, [%[in3]] \n\t"
                    "vadd.i32 q4, q8 \n\t"
                    "vadd.i32 q5, q9 \n\t"
                    "vadd.i32 q6, q10 \n\t"
                    "vadd.i32 q7, q11 \n\t"
                    "vld4.32 {d16, d18, d20, d22}, [%[in2],:256] \n\t"
                    "vld4.32 {d17, d19, d21, d23}, [%[in22],:256] \n\t"
                    "vmla.i32 q0, q4, %q[c4] \n\t"
                    "vmla.i32 q1, q5, %q[c4] \n\t"
                    "vmla.i32 q2, q6, %q[c4] \n\t"
                    "vmla.i32 q3, q7, %q[c4] \n\t"
                    "vmla.i32 q0, q8, %q[c6] \n\t"
                    "vmla.i32 q1, q9, %q[c6] \n\t"
                    "vmla.i32 q2, q10, %q[c6] \n\t"
                    "vmla.i32 q3, q11, %q[c6] \n\t"
                    "vrshrn.i32 d8, q0, #8 \n\t"
                    "vrshrn.i32 d9, q1, #8 \n\t"
                    "vrshrn.i32 d10, q2, #8 \n\t"
                    "vrshrn.i32 d11, q3, #8 \n\t"
                    "vst4.16 {d8-d11}, [%[out]] \n\t"
                    : [in0] "=r" (lidx0),
                      [in1] "=r" (lidx1),
                      [in3] "=r" (lidx3),
                      [in4] "=r" (lidx4)
                    : [out] "r" (dst + x),
                      "0" (lidx0),
                      "1" (lidx1),
                      "2" (lidx3),
                      "3" (lidx4),
                      [in2] "r" (lane + x),
                      [in22] "r" (lane + x + 4*2),
                      [c4] "w" (vc4s32), [c6] "w" (vc6s32)
                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"
                );
*/
            for (; x <= colsn - 4; x += 4)
            {
                internal::prefetch(lane + x);

                int32x4_t lane0 = vld1q_s32(lane + x - 2);
                int32x4_t lane4 = vld1q_s32(lane + x + 2);
                int32x4_t lane1 = vld1q_s32(lane + x - 1);
                int32x4_t lane3 = vld1q_s32(lane + x + 1);
                int32x4_t lane2 = vld1q_s32(lane + x + 0);

                int32x4_t ln04 = vaddq_s32(lane0, lane4);
                int32x4_t ln13 = vaddq_s32(lane1, lane3);

                int32x4_t ln042 = vmlaq_s32(ln04, lane2, vc6s32);
                int32x4_t lsw = vmlaq_s32(ln042, ln13, vc4s32);

                int16x4_t ls = vrshrn_n_s32(lsw, 8);

                vst1_s16(dst + x, ls);
            }
            break;
        }
        // Scalar tail, one channel at a time, explicit rounding (+128).
        for (s32 h = 0; h < cn; ++h)
        {
            s32* ln = lane + h;
            s16* dt = dst + h;
            for (size_t k = x; k < colsn; k += cn)
            {
                dt[k] = (s16)((ln[k-2*cn] + ln[k+2*cn] + 4*(ln[k-cn] + ln[k+cn]) + 6*ln[k] + (1<<7))>>8);
            }
        }
    }
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
    (void)borderMargin;
#endif
}
933
934
/*
 * 5x5 Gaussian blur for interleaved s32 data, using the separable kernel
 * [1 4 6 4 1] applied first vertically (into a one-row accumulator) and
 * then horizontally.
 *
 * NOTE: unlike the s16-output variant above, this overload stores the RAW
 * weighted sums (total kernel weight 256) -- no rounding or >>8
 * normalization is applied before writing to dst.
 *
 * size         - image size in pixels
 * cn           - number of interleaved channels
 * srcBase/srcStride - source image base pointer and row stride
 * dstBase/dstStride - destination image base pointer and row stride
 * borderType   - how out-of-image samples are synthesized
 * borderValue  - sample value used for BORDER_MODE_CONSTANT
 * borderMargin - extra valid image data available around the ROI
 */
void gaussianBlur5x5(const Size2D &size, s32 cn,
                     const s32 * srcBase, ptrdiff_t srcStride,
                     s32 * dstBase, ptrdiff_t dstStride,
                     BORDER_MODE borderType, s32 borderValue, Margin borderMargin)
{
    internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType));
#ifdef CAROTENE_NEON
    size_t colsn = size.width * cn;  // row length in elements (pixels * channels)

    // For a constant border, build one full row filled with borderValue;
    // source rows that fall outside the accessible area are redirected to it.
    std::vector<s32> _tmp;
    s32 *tmp = 0;
    if (borderType == BORDER_MODE_CONSTANT)
    {
        _tmp.assign(colsn + 4*cn, borderValue);
        tmp = &_tmp[cn << 1];  // skip the 2-pixel left padding
    }

    // Horizontal border indices (in elements): which in-row columns stand in
    // for the two out-of-range columns on each side (non-constant borders).
    ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;

    //1-line buffer
    // Holds one row of vertical-pass sums, with 2 pixels of padding on each
    // side for the horizontal pass; 'lane' is 32-byte aligned for NEON.
    std::vector<s32> _buf(cn * (size.width + 4) + 32 / sizeof(s32));
    s32* lane = internal::alignPtr(&_buf[cn << 1], 32);

    // Constant border: the lane padding never changes, fill it once up front.
    if (borderType == BORDER_MODE_CONSTANT)
        for (s32 k = 0; k < cn; ++k)
        {
            lane[-cn+k] = borderValue;
            lane[-cn-cn+k] = borderValue;
            lane[colsn+k] = borderValue;
            lane[colsn+cn+k] = borderValue;
        }

    // Kernel weights broadcast into vectors (center weight 6, inner weight 4;
    // the outer weight 1 is implicit in the plain adds below).
    int32x4_t vc6s32 = vmovq_n_s32(6);
    int32x4_t vc4s32 = vmovq_n_s32(4);

    for (size_t i = 0; i < size.height; ++i)
    {
        s32* dst = internal::getRowPtr(dstBase, dstStride, i);
        //vertical convolution
        // Resolve the five source rows i-2 .. i+2, honoring the border mode.
        ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom);

        // Rows outside the accessible margin fall back to the
        // borderValue-filled tmp row (constant border case).
        const s32* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp;
        const s32* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;
        const s32* ln2 = internal::getRowPtr(srcBase, srcStride, i);
        const s32* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp;
        const s32* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp;

        // Column pass: lane[x] = ln0 + 4*ln1 + 6*ln2 + 4*ln3 + ln4,
        // 4 elements per iteration.
        // NOTE(review): colsn is size_t, so 'x <= colsn - 4' wraps if
        // colsn < 4 -- presumably ruled out by isGaussianBlur5x5Supported;
        // TODO confirm.
        size_t x = 0;
        for (; x <= colsn - 4; x += 4)
        {
            // NOTE(review): the prefetch target uses 'x % 5 - 2' as a row
            // offset from ln2 -- looks suspicious; confirm this is the
            // intended prefetch address.
            internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2));
            int32x4_t v0 = vld1q_s32(ln0+x);
            int32x4_t v1 = vld1q_s32(ln1+x);
            int32x4_t v2 = vld1q_s32(ln2+x);
            int32x4_t v3 = vld1q_s32(ln3+x);
            int32x4_t v4 = vld1q_s32(ln4+x);

            int32x4_t v = vaddq_s32(v0, v4);       // outer taps, weight 1
            int32x4_t v13 = vaddq_s32(v1, v3);     // inner taps

            v = vmlaq_s32(v, v2, vc6s32);          // + 6*center
            v = vmlaq_s32(v, v13, vc4s32);         // + 4*(inner sum)

            vst1q_s32(lane + x, v);
        }
        // Scalar tail of the column pass.
        for (; x < colsn; ++x)
            lane[x] = ln0[x] + ln4[x] + 4*(ln1[x] + ln3[x]) + 6*ln2[x];

        //left&right borders
        // Replicate the resolved border columns into the lane padding
        // (the constant-border padding was already filled above).
        if (borderType != BORDER_MODE_CONSTANT)
            for (s32 k = 0; k < cn; ++k)
            {
                lane[-cn+k] = lane[idx_l1 + k];
                lane[-cn-cn+k] = lane[idx_l2 + k];

                lane[colsn+k] = lane[idx_r1 + k];
                lane[colsn+cn+k] = lane[idx_r2 + k];
            }

        //horizontal convolution
        // Row pass over the lane buffer: unaligned loads at offsets
        // -2 .. +2 combine the same [1 4 6 4 1] weights; the raw sums are
        // stored to dst without any normalization shift.
        x = 0;
        for (; x <= colsn - 4; x += 4)
        {
            internal::prefetch(lane + x);

            int32x4_t lane0 = vld1q_s32(lane + x - 2);
            int32x4_t lane4 = vld1q_s32(lane + x + 2);
            int32x4_t lane1 = vld1q_s32(lane + x - 1);
            int32x4_t lane3 = vld1q_s32(lane + x + 1);
            int32x4_t lane2 = vld1q_s32(lane + x + 0);

            int32x4_t ln04 = vaddq_s32(lane0, lane4);   // outer taps
            int32x4_t ln13 = vaddq_s32(lane1, lane3);   // inner taps

            int32x4_t ln042 = vmlaq_s32(ln04, lane2, vc6s32);  // + 6*center
            int32x4_t lsw = vmlaq_s32(ln042, ln13, vc4s32);    // + 4*inner

            vst1q_s32(dst + x, lsw);
        }
        // Scalar tail of the row pass, per channel (offsets step by cn).
        for (s32 h = 0; h < cn; ++h)
        {
            s32* ln = lane + h;
            s32* dt = dst + h;
            for (size_t k = x; k < colsn; k += cn)
            {
                dt[k] = ln[k-2*cn] + ln[k+2*cn] + 4*(ln[k-cn] + ln[k+cn]) + 6*ln[k];
            }
        }
    }
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
    (void)borderMargin;
#endif
}
1058
1059
} // namespace CAROTENE_NS
1060
1061