// Source: OpenCV 3rdparty/carotene/src/blur.cpp (mirror: Tetragramm/opencv, blob/master).
1
/*
2
* By downloading, copying, installing or using the software you agree to this license.
3
* If you do not agree to this license, do not download, install,
4
* copy or use the software.
5
*
6
*
7
* License Agreement
8
* For Open Source Computer Vision Library
9
* (3-clause BSD License)
10
*
11
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
12
* Third party copyrights are property of their respective owners.
13
*
14
* Redistribution and use in source and binary forms, with or without modification,
15
* are permitted provided that the following conditions are met:
16
*
17
* * Redistributions of source code must retain the above copyright notice,
18
* this list of conditions and the following disclaimer.
19
*
20
* * Redistributions in binary form must reproduce the above copyright notice,
21
* this list of conditions and the following disclaimer in the documentation
22
* and/or other materials provided with the distribution.
23
*
24
* * Neither the names of the copyright holders nor the names of the contributors
25
* may be used to endorse or promote products derived from this software
26
* without specific prior written permission.
27
*
28
* This software is provided by the copyright holders and contributors "as is" and
29
* any express or implied warranties, including, but not limited to, the implied
30
* warranties of merchantability and fitness for a particular purpose are disclaimed.
31
* In no event shall copyright holders or contributors be liable for any direct,
32
* indirect, incidental, special, exemplary, or consequential damages
33
* (including, but not limited to, procurement of substitute goods or services;
34
* loss of use, data, or profits; or business interruption) however caused
35
* and on any theory of liability, whether in contract, strict liability,
36
* or tort (including negligence or otherwise) arising in any way out of
37
* the use of this software, even if advised of the possibility of such damage.
38
*/
39
40
#include <vector>
41
42
#include "common.hpp"
43
#include "saturate_cast.hpp"
44
45
namespace CAROTENE_NS {
46
47
bool isBlur3x3Supported(const Size2D &size, BORDER_MODE border)
48
{
49
return isSupportedConfiguration() && size.width >= 8 &&
50
(border == BORDER_MODE_CONSTANT ||
51
border == BORDER_MODE_REPLICATE);
52
}
53
54
/*
 * 3x3 box blur of a single-channel u8 image, normalized by 1/9.
 *
 * @param size        image size; width must be >= 8 (see isBlur3x3Supported)
 * @param srcBase     source base pointer
 * @param srcStride   source row stride in bytes
 * @param dstBase     destination base pointer
 * @param dstStride   destination row stride in bytes
 * @param border      BORDER_MODE_CONSTANT or BORDER_MODE_REPLICATE only
 * @param borderValue value used for pixels outside the image in constant mode
 *
 * Per row: a NEON pass first sums the three source rows vertically into
 * 16-bit lanes, then adds three horizontally shifted copies of those sums
 * and scales the 3x3 total by ~1/9; a scalar tail finishes the last columns.
 */
void blur3x3(const Size2D &size,
             const u8 * srcBase, ptrdiff_t srcStride,
             u8 * dstBase, ptrdiff_t dstStride,
             BORDER_MODE border, u8 borderValue)
{
    internal::assertSupportedConfiguration(isBlur3x3Supported(size, border));
#ifdef CAROTENE_NEON
    // 3640 ~= 65536/18: combined with vqrdmulhq (doubling high-half multiply
    // with rounding) this scales a 16-bit sum by approximately 1/9.
    const int16x8_t v_scale = vmovq_n_s16(3640);
    // Vertical sum of three constant-border pixels.
    const uint16x8_t v_border_x3 = vdupq_n_u16(borderValue * 3);
    const uint16x8_t v_zero = vdupq_n_u16(0);
    const uint8x8_t v_border = vdup_n_u8(borderValue);

    uint16x8_t tprev = v_zero, tcurr = v_zero, tnext = v_zero;
    uint16x8_t t0 = v_zero, t1 = v_zero, t2 = v_zero;

    ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;

    for (ptrdiff_t y = 0; y < height; ++y)
    {
        // A NULL row pointer marks "substitute borderValue" (constant mode
        // at the top/bottom edge); otherwise the edge row is replicated.
        const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
        const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
        const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
        u8 * drow = internal::getRowPtr(dstBase, dstStride, y);

        s16 prevx = 0, currx = 0, nextx = 0;
        ptrdiff_t x = 0;
        // On the last two rows stop one vector (8 columns) early so the
        // loads never run past the row; the scalar tail completes them.
        const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8);

        // perform vertical convolution
        for ( ; x <= bwidth; x += 8)
        {
            internal::prefetch(srow0 + x);
            internal::prefetch(srow1 + x);
            internal::prefetch(srow2 + x);

            uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x);
            uint8x8_t x1 = vld1_u8(srow1 + x);
            uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x);

            // calculate values for plain CPU part below if needed
            if (x + 8 >= bwidth)
            {
                ptrdiff_t x3 = x == width ? width - 1 : x;
                ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);

                if (border == BORDER_MODE_CONSTANT && x4 < 0)
                    prevx = borderValue;
                else
                    prevx = (srow2 ? srow2[x4] : borderValue) + srow1[x4] + (srow0 ? srow0[x4] : borderValue);

                currx = (srow2 ? srow2[x3] : borderValue) + srow1[x3] + (srow0 ? srow0[x3] : borderValue);
            }

            // make shift: tprev/tcurr/tnext form a 3-vector sliding window
            // of vertical column sums
            if (x)
            {
                tprev = tcurr;
                tcurr = tnext;
            }

            // and calculate next value
            tnext = vaddw_u8(vaddl_u8(x0, x1), x2);

            // make extrapolation for the first elements
            if (!x)
            {
                // make border: seed tcurr so that lane 7 (the column just
                // left of x == 0) holds the extrapolated vertical sum
                if (border == BORDER_MODE_CONSTANT)
                    tcurr = v_border_x3;
                else if (border == BORDER_MODE_REPLICATE)
                    tcurr = vdupq_n_u16(vgetq_lane_u16(tnext, 0));

                continue;
            }

            // combine 3 "shifted" vectors (left neighbor, center, right neighbor)
            t0 = vextq_u16(tprev, tcurr, 7);
            t1 = tcurr;
            t2 = vextq_u16(tcurr, tnext, 1);

            // and add them (saturating: 3x3 sum fits u16 anyway, max 9*255)
            t0 = vqaddq_u16(t0, vqaddq_u16(t1, t2));

            int16x8_t tt0 = vqrdmulhq_s16(vreinterpretq_s16_u16(t0), v_scale);
            uint8x8_t it0 = vmovn_u16(vreinterpretq_u16_s16(tt0));
            // Results lag one vector behind the loads, hence "x - 8".
            vst1_u8(drow + x - 8, it0);
        }

        x -= 8;
        if (x == width)
            --x;

        // Scalar tail: finish remaining columns with exact round-half-up 1/9.
        for ( ; x < width; ++x)
        {
            // make extrapolation for the last elements
            if (x + 1 >= width)
            {
                if (border == BORDER_MODE_CONSTANT)
                    nextx = borderValue * 3;
                else if (border == BORDER_MODE_REPLICATE)
                    nextx = srow2[x] + srow1[x] + srow0[x];
            }
            else
                nextx = (srow2 ? srow2[x + 1] : borderValue) +
                        srow1[x + 1] +
                        (srow0 ? srow0[x + 1] : borderValue);

            f32 val = (prevx + currx + nextx) * (1 / 9.f) + 0.5f;
            drow[x] = internal::saturate_cast<u8>((s32)val);

            // make shift
            prevx = currx;
            currx = nextx;
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)border;
    (void)borderValue;
#endif
}
bool isBlurU8Supported(const Size2D &size, s32 cn, BORDER_MODE border)
181
{
182
return isSupportedConfiguration() &&
183
cn > 0 && cn <= 4 &&
184
size.width*cn >= 8 && size.height >= 2 &&
185
(border == BORDER_MODE_CONSTANT ||
186
border == BORDER_MODE_REFLECT101 ||
187
border == BORDER_MODE_REFLECT ||
188
border == BORDER_MODE_REPLICATE);
189
}
190
191
void blur3x3(const Size2D &size, s32 cn,
192
const u8 * srcBase, ptrdiff_t srcStride,
193
u8 * dstBase, ptrdiff_t dstStride,
194
BORDER_MODE borderType, u8 borderValue)
195
{
196
internal::assertSupportedConfiguration(isBlurU8Supported(size, cn, borderType));
197
#ifdef CAROTENE_NEON
198
//#define FLOAT_VARIANT_1_9
199
#ifdef FLOAT_VARIANT_1_9
200
float32x4_t v1_9 = vdupq_n_f32 (1.0/9.0);
201
float32x4_t v0_5 = vdupq_n_f32 (.5);
202
#else
203
const int16x8_t vScale = vmovq_n_s16(3640);
204
#endif
205
206
size_t colsn = size.width*cn;
207
208
std::vector<u8> _tmp;
209
u8 *tmp = 0;
210
if (borderType == BORDER_MODE_CONSTANT)
211
{
212
_tmp.assign(colsn + 2*cn, borderValue);
213
tmp = &_tmp[cn];
214
}
215
216
uint16x8_t tprev = vdupq_n_u16(0x0);
217
uint16x8_t tcurr = tprev;
218
uint16x8_t tnext = tprev;
219
uint16x8_t t0, t1, t2;
220
if(cn == 1)
221
{
222
for( size_t y = 0; y < size.height; y++ )
223
{
224
const u8* srow0;
225
const u8* srow1 = internal::getRowPtr(srcBase, srcStride, y);
226
const u8* srow2;
227
u8* drow = internal::getRowPtr(dstBase, dstStride, y);
228
if (borderType == BORDER_MODE_REFLECT101) {
229
srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 1);
230
srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-2);
231
} else if (borderType == BORDER_MODE_CONSTANT) {
232
srow0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
233
srow2 = y < size.height-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
234
} else { // BORDER_MODE_REFLECT || BORDER_MODE_REPLICATE
235
srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
236
srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-1);
237
}
238
239
// do vertical convolution
240
size_t x = 0;
241
const size_t bcols = y + 2 < size.height ? colsn : (colsn - 8);
242
for( ; x <= bcols; x += 8 )
243
{
244
internal::prefetch(srow0 + x);
245
internal::prefetch(srow1 + x);
246
internal::prefetch(srow2 + x);
247
248
uint8x8_t x0 = vld1_u8(srow0 + x);
249
uint8x8_t x1 = vld1_u8(srow1 + x);
250
uint8x8_t x2 = vld1_u8(srow2 + x);
251
252
tprev = tcurr;
253
tcurr = tnext;
254
tnext = vaddw_u8(vaddl_u8(x0, x1), x2);
255
256
if(!x) {
257
tcurr = tnext;
258
259
// make border
260
if (borderType == BORDER_MODE_CONSTANT)
261
{
262
tcurr = vsetq_lane_u16(borderValue, tcurr, 7);
263
}
264
else if (borderType == BORDER_MODE_REFLECT101)
265
{
266
tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 7);
267
}
268
else // borderType == BORDER_MODE_REFLECT || borderType == BORDER_MODE_REPLICATE
269
{
270
tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 7);
271
}
272
continue;
273
}
274
275
t0 = vextq_u16(tprev, tcurr, 7);
276
t1 = tcurr;
277
t2 = vextq_u16(tcurr, tnext, 1);
278
279
t0 = vqaddq_u16(t0, vqaddq_u16(t1, t2));
280
281
#ifdef FLOAT_VARIANT_1_9
282
uint32x4_t tres1 = vmovl_u16(vget_low_u16(t0));
283
uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
284
float32x4_t vf1 = vmulq_f32(v1_9, vcvtq_f32_u32(tres1));
285
float32x4_t vf2 = vmulq_f32(v1_9, vcvtq_f32_u32(tres2));
286
tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5));
287
tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5));
288
t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
289
vst1_u8(drow + x - 8, vmovn_u16(t0));
290
#else
291
int16x8_t tt0 = vqrdmulhq_s16(vreinterpretq_s16_u16(t0), vScale);
292
uint8x8_t it0 = vmovn_u16(vreinterpretq_u16_s16(tt0));
293
vst1_u8(drow + x - 8, it0);
294
#endif
295
}
296
297
x -= 8;
298
if(x == colsn){
299
x--;
300
}
301
s16 prevx, rowx, nextx;
302
prevx = srow2[x-1] + srow1[x-1] + srow0[x-1];
303
rowx = srow2[x] + srow1[x] + srow0[x];
304
for( ; x < colsn; x++ )
305
{
306
if(x+1 >= colsn) {
307
// make border
308
if (borderType == BORDER_MODE_CONSTANT)
309
{
310
nextx = borderValue;
311
} else if (borderType == BORDER_MODE_REFLECT101)
312
{
313
nextx = srow2[x-1] + srow1[x-1] + srow0[x-1];
314
} else {
315
nextx = srow2[x] + srow1[x] + srow0[x];
316
}
317
} else {
318
nextx = srow2[x+1] + srow1[x+1] + srow0[x+1];
319
}
320
*(drow+x) = internal::saturate_cast<u8>((prevx + rowx + nextx)*(1/9.));
321
prevx = rowx;
322
rowx = nextx;
323
}
324
}
325
}
326
else
327
{
328
for( size_t y = 0; y < size.height; y++ )
329
{
330
const u8* srow0;
331
const u8* srow1 = internal::getRowPtr(srcBase, srcStride, y);
332
const u8* srow2;
333
u8* drow = internal::getRowPtr(dstBase, dstStride, y);
334
if (borderType == BORDER_MODE_REFLECT101) {
335
srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 1);
336
srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-2);
337
} else if (borderType == BORDER_MODE_CONSTANT) {
338
srow0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
339
srow2 = y < size.height-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
340
} else { // BORDER_MODE_REFLECT || BORDER_MODE_REPLICATE
341
srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
342
srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-1);
343
}
344
345
// do vertical convolution
346
size_t x = 0;
347
const size_t bcols = y + 2 < size.height ? colsn : (colsn - 8);
348
for( ; x <= bcols; x += 8 )
349
{
350
internal::prefetch(srow0 + x);
351
internal::prefetch(srow1 + x);
352
internal::prefetch(srow2 + x);
353
354
uint8x8_t x0 = vld1_u8(srow0 + x);
355
uint8x8_t x1 = vld1_u8(srow1 + x);
356
uint8x8_t x2 = vld1_u8(srow2 + x);
357
358
tprev = tcurr;
359
tcurr = tnext;
360
tnext = vaddw_u8(vaddl_u8(x0, x1), x2);
361
362
if(!x) {
363
tcurr = tnext;
364
365
// make border
366
switch(cn)
367
{
368
case 2:
369
if (borderType == BORDER_MODE_CONSTANT)
370
{
371
tcurr = vsetq_lane_u16(borderValue, tcurr, 6);
372
tcurr = vsetq_lane_u16(borderValue, tcurr, 7);
373
}
374
else if (borderType == BORDER_MODE_REFLECT101)
375
{
376
tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tcurr, 6);
377
tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tcurr, 6);
378
}
379
else
380
{
381
tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 6);
382
tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 7);
383
}
384
break;
385
case 3:
386
if (borderType == BORDER_MODE_CONSTANT)
387
{
388
tcurr = vsetq_lane_u16(borderValue, tcurr, 5);
389
tcurr = vsetq_lane_u16(borderValue, tcurr, 6);
390
tcurr = vsetq_lane_u16(borderValue, tcurr, 7);
391
}
392
else if (borderType == BORDER_MODE_REFLECT101)
393
{
394
tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tcurr, 5);
395
tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 4),tcurr, 6);
396
tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 5),tcurr, 7);
397
}
398
else
399
{
400
tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 5);
401
tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 6);
402
tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tcurr, 7);
403
}
404
break;
405
case 4:
406
if (borderType == BORDER_MODE_CONSTANT)
407
{
408
tcurr = vsetq_lane_u16(borderValue, tcurr, 4);
409
tcurr = vsetq_lane_u16(borderValue, tcurr, 5);
410
tcurr = vsetq_lane_u16(borderValue, tcurr, 6);
411
tcurr = vsetq_lane_u16(borderValue, tcurr, 7);
412
}
413
else if (borderType != BORDER_MODE_REFLECT101)
414
{
415
tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 4);
416
tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 5);
417
tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tcurr, 6);
418
tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tcurr, 7);
419
}
420
break;
421
}
422
continue;
423
}
424
425
if(cn==2)
426
t0 = vextq_u16(tprev, tcurr, 6);
427
else if(cn==3)
428
t0 = vextq_u16(tprev, tcurr, 5);
429
else if(cn==4)
430
t0 = vextq_u16(tprev, tcurr, 4);
431
432
t1 = tcurr;
433
434
if(cn==2)
435
t2 = vextq_u16(tcurr, tnext, 2);
436
else if(cn==3)
437
t2 = vextq_u16(tcurr, tnext, 3);
438
else if(cn==4)
439
t2 = vextq_u16(tcurr, tnext, 4);
440
441
t0 = vqaddq_u16(t0, vqaddq_u16(t1, t2));
442
443
#ifdef FLOAT_VARIANT_1_9
444
uint32x4_t tres1 = vmovl_u16(vget_low_u16(t0));
445
uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
446
float32x4_t vf1 = vmulq_f32(v1_9, vcvtq_f32_u32(tres1));
447
float32x4_t vf2 = vmulq_f32(v1_9, vcvtq_f32_u32(tres2));
448
tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5));
449
tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5));
450
t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
451
vst1_u8(drow + x - 8, vmovn_u16(t0));
452
#else
453
int16x8_t tt0 = vqrdmulhq_s16(vreinterpretq_s16_u16(t0), vScale);
454
uint8x8_t it0 = vmovn_u16(vreinterpretq_u16_s16(tt0));
455
vst1_u8(drow + x - 8, it0);
456
#endif
457
}
458
459
x -= 8;
460
if(x == colsn){
461
x -= cn;
462
}
463
s16 prevx[4], rowx[4], nextx[4];
464
for( s32 k = 0; k < cn; k++ )
465
{
466
prevx[(k + x%cn)%cn] = srow2[x+k-cn] + srow1[x+k-cn] + srow0[x+k-cn];
467
rowx[(k + x%cn)%cn] = srow2[x+k] + srow1[x+k] + srow0[x+k];
468
}
469
for( ; x < colsn; x++ )
470
{
471
size_t xx = x%cn;
472
if(x+cn >= colsn) {
473
// make border
474
if (borderType == BORDER_MODE_CONSTANT)
475
{
476
nextx[xx] = borderValue;
477
} else if (borderType == BORDER_MODE_REFLECT101)
478
{
479
nextx[xx] = srow2[x-cn] + srow1[x-cn] + srow0[x-cn];
480
} else {
481
nextx[xx] = srow2[x] + srow1[x] + srow0[x];
482
}
483
} else {
484
nextx[xx] = srow2[x+cn] + srow1[x+cn] + srow0[x+cn];
485
}
486
*(drow+x) = internal::saturate_cast<u8>((prevx[xx] + rowx[xx] + nextx[xx])*(1/9.));
487
prevx[xx] = rowx[xx];
488
rowx[xx] = nextx[xx];
489
}
490
}
491
}
492
#else
493
(void)srcBase;
494
(void)srcStride;
495
(void)dstBase;
496
(void)dstStride;
497
(void)borderValue;
498
#endif
499
}
500
501
/*
 * 5x5 box blur of an interleaved u8 image with 1..4 channels, normalized
 * by 1/25.
 *
 * @param size        image size; width*cn must be >= 8 (see isBlurU8Supported)
 * @param cn          number of interleaved channels (1..4)
 * @param srcBase     source base pointer
 * @param srcStride   source row stride in bytes
 * @param dstBase     destination base pointer
 * @param dstStride   destination row stride in bytes
 * @param borderType  CONSTANT, REFLECT101, REFLECT or REPLICATE
 * @param borderValue value used outside the image in constant mode
 *
 * Per row: a NEON pass sums five source rows vertically into 16-bit lanes,
 * then adds five copies of those sums shifted by -2..+2 pixels (cn lanes
 * per pixel) and scales by 1/25; a scalar tail using borderInterpolate
 * finishes the last columns.
 */
void blur5x5(const Size2D &size, s32 cn,
             const u8 * srcBase, ptrdiff_t srcStride,
             u8 * dstBase, ptrdiff_t dstStride,
             BORDER_MODE borderType, u8 borderValue)
{
    internal::assertSupportedConfiguration(isBlurU8Supported(size, cn, borderType));
#ifdef CAROTENE_NEON
#define FLOAT_VARIANT_1_25
#ifdef FLOAT_VARIANT_1_25
    float32x4_t v1_25 = vdupq_n_f32 (1.0f/25.0f);
    float32x4_t v0_5 = vdupq_n_f32 (.5f);
#else
    // 1310 ~= 65536/50: with vqrdmulhq this scales a 16-bit sum by ~1/25.
    const int16x8_t vScale = vmovq_n_s16(1310);
#endif
    size_t colsn = size.width*cn;

    // In constant mode, tmp is a whole row of borderValue substituted for
    // rows outside the image.
    std::vector<u8> _tmp;
    u8 *tmp = 0;
    if (borderType == BORDER_MODE_CONSTANT)
    {
        _tmp.assign(colsn + 2*cn, borderValue);
        tmp = &_tmp[cn];
    }

    uint16x8_t tprev = vdupq_n_u16(0x0);
    uint16x8_t tcurr = tprev;
    uint16x8_t tnext = tprev;
    uint16x8_t t0, t1, t2, t3, t4;
    for( size_t y = 0; y < size.height; y++ )
    {
        // Select the two rows above (srow0, srow1) and below (srow3, srow4)
        // the current row srow2, per the vertical border mode.
        const u8 *srow0, *srow1;
        const u8 *srow2 = internal::getRowPtr(srcBase, srcStride, y);
        const u8 *srow3, *srow4;
        u8 *drow = internal::getRowPtr(dstBase, dstStride, y);
        if (borderType == BORDER_MODE_REFLECT101) {
            srow0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : 2-y);
            srow1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 1);
            srow3 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-2);
            srow4 = internal::getRowPtr(srcBase, srcStride, y < size.height-2 ? y+2 : (size.height<<1)-4-y);
        } else if (borderType == BORDER_MODE_CONSTANT) {
            srow0 = y > 1 ? internal::getRowPtr(srcBase, srcStride, y-2) : tmp;
            srow1 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
            srow3 = y < size.height-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
            srow4 = y < size.height-2 ? internal::getRowPtr(srcBase, srcStride, y+2) : tmp;
        } else if (borderType == BORDER_MODE_REFLECT) {
            srow0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : 1-y);
            srow1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
            srow3 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-1);
            srow4 = internal::getRowPtr(srcBase, srcStride, y < size.height-2 ? y+2 : (size.height<<1)-3-y);
        } else { // BORDER_MODE_REPLICATE
            srow0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : 0);
            srow1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
            srow3 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-1);
            srow4 = internal::getRowPtr(srcBase, srcStride, y < size.height-2 ? y+2 : size.height-1);
        }

        // do vertical convolution
        size_t x = 0;
        // Stop one vector early near the bottom so the loads stay in-row;
        // the scalar tail completes the remaining columns.
        const size_t bcols = y + 3 < size.height ? colsn : (colsn - 8);
        for( ; x <= bcols; x += 8 )
        {
            internal::prefetch(srow0 + x);
            internal::prefetch(srow1 + x);
            internal::prefetch(srow2 + x);
            internal::prefetch(srow3 + x);
            internal::prefetch(srow4 + x);

            uint8x8_t x0 = vld1_u8(srow0 + x);
            uint8x8_t x1 = vld1_u8(srow1 + x);
            uint8x8_t x2 = vld1_u8(srow2 + x);
            uint8x8_t x3 = vld1_u8(srow3 + x);
            uint8x8_t x4 = vld1_u8(srow4 + x);

            // Sliding 3-vector window of 5-row vertical column sums.
            tprev = tcurr;
            tcurr = tnext;
            tnext = vaddw_u8(vaddq_u16(vaddl_u8(x0, x1), vaddl_u8(x2, x3)), x4);

            if(!x) {
                tcurr = tnext;

                // REFLECT101 needs column 2 to exist for the left border;
                // bail to the scalar tail for very narrow images.
                if(borderType == BORDER_MODE_REFLECT101 && size.width < 3)
                {
                    x = 8;
                    break;
                }

                // make border: the top 2*cn lanes stand in for pixels -2
                // and -1 (two full pixels of cn channel sums each)
                switch(cn)
                {
                case 1:
                    if (borderType == BORDER_MODE_CONSTANT)
                    {
                        // NOTE(review): lanes hold borderValue rather than
                        // the 5-row vertical sum 5*borderValue (which the
                        // scalar tail uses) — confirm against upstream.
                        tcurr = vsetq_lane_u16(borderValue, tcurr, 6);
                        tcurr = vsetq_lane_u16(borderValue, tcurr, 7);
                    }
                    else if (borderType == BORDER_MODE_REFLECT101)
                    {
                        // pixel -2 -> pixel 2, pixel -1 -> pixel 1
                        tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tcurr, 6);
                        tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 7);
                    }
                    else if (borderType == BORDER_MODE_REFLECT)
                    {
                        // pixel -2 -> pixel 1, pixel -1 -> pixel 0
                        tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 6);
                        tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 7);
                    }
                    else
                    {
                        // REPLICATE: both map to pixel 0
                        tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 6);
                        tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 7);
                    }
                    break;
                case 2:
                    if (borderType == BORDER_MODE_CONSTANT)
                    {
                        tcurr = vsetq_lane_u16(borderValue, tcurr, 4);
                        tcurr = vsetq_lane_u16(borderValue, tcurr, 5);
                        tcurr = vsetq_lane_u16(borderValue, tcurr, 6);
                        tcurr = vsetq_lane_u16(borderValue, tcurr, 7);
                    }
                    else if (borderType == BORDER_MODE_REFLECT101)
                    {
                        tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tcurr, 6);
                        tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tcurr, 7);
                    }
                    else if (borderType == BORDER_MODE_REFLECT)
                    {
                        tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tcurr, 4);
                        tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tcurr, 5);
                        tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 6);
                        tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 7);
                    }
                    else
                    {
                        tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 4);
                        tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 5);
                        tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 6);
                        tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 7);
                    }
                    break;
                case 3:
                    if (borderType == BORDER_MODE_CONSTANT)
                    {
                        tcurr = vsetq_lane_u16(borderValue, tcurr, 2);
                        tcurr = vsetq_lane_u16(borderValue, tcurr, 3);
                        tcurr = vsetq_lane_u16(borderValue, tcurr, 4);
                        tcurr = vsetq_lane_u16(borderValue, tcurr, 5);
                        tcurr = vsetq_lane_u16(borderValue, tcurr, 6);
                        tcurr = vsetq_lane_u16(borderValue, tcurr, 7);
                    }
                    else if (borderType == BORDER_MODE_REFLECT101)
                    {
                        // Needs one lane (column 8) beyond the first vector.
                        tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 6),tcurr, 2);
                        tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 7),tprev, 3);
                        tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tprev, 5);
                        tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 4),tprev, 6);
                        tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 5),tprev, 7);
                        s16 lane8 = srow4[8] + srow3[8] + srow2[8] + srow1[8] + srow0[8];
                        tcurr = vsetq_lane_u16(lane8,tprev, 4);
                    }
                    else if (borderType == BORDER_MODE_REFLECT)
                    {
                        tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tcurr, 2);
                        tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 4),tprev, 3);
                        tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 5),tprev, 4);
                        tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tprev, 5);
                        tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tprev, 6);
                        tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tprev, 7);
                    }
                    else
                    {
                        tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 2);
                        tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tprev, 3);
                        tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tprev, 4);
                        tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tprev, 5);
                        tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tprev, 6);
                        tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tprev, 7);
                    }
                    break;
                case 4:
                    if (borderType == BORDER_MODE_CONSTANT)
                    {
                        tcurr = vsetq_lane_u16(borderValue, tcurr, 0);
                        tcurr = vsetq_lane_u16(borderValue, tcurr, 1);
                        tcurr = vsetq_lane_u16(borderValue, tcurr, 2);
                        tcurr = vsetq_lane_u16(borderValue, tcurr, 3);
                        tcurr = vsetq_lane_u16(borderValue, tcurr, 4);
                        tcurr = vsetq_lane_u16(borderValue, tcurr, 5);
                        tcurr = vsetq_lane_u16(borderValue, tcurr, 6);
                        tcurr = vsetq_lane_u16(borderValue, tcurr, 7);
                    }
                    else if (borderType == BORDER_MODE_REFLECT101)
                    {
                        // Pixel -2 reflects to pixel 2 = columns 8..11,
                        // which lie beyond the first vector: sum directly.
                        s16 lane8 = srow4[ 8] + srow3[ 8] + srow2[ 8] + srow1[ 8] + srow0[ 8];
                        s16 lane9 = srow4[ 9] + srow3[ 9] + srow2[ 9] + srow1[ 9] + srow0[ 9];
                        s16 lane10 = srow4[10] + srow3[10] + srow2[10] + srow1[10] + srow0[10];
                        s16 lane11 = srow4[11] + srow3[11] + srow2[11] + srow1[11] + srow0[11];
                        tprev = vsetq_lane_u16( lane8,tcurr, 0);
                        tprev = vsetq_lane_u16( lane9,tprev, 1);
                        tprev = vsetq_lane_u16(lane10,tprev, 2);
                        tcurr = vsetq_lane_u16(lane11,tprev, 3);
                    }
                    else if (borderType == BORDER_MODE_REFLECT)
                    {
                        tcurr = vcombine_u16(vget_high_u16(tcurr),vget_low_u16(tcurr));//swap 64-bit parts
                    }
                    else
                    {
                        tcurr = vcombine_u16(vget_low_u16(tcurr),vget_low_u16(tcurr));//double 64-bit part
                    }
                    break;
                }
                continue;
            }
            // Combine sums shifted by -2..+2 whole pixels (cn lanes each).
            switch(cn)
            {
            case 1:
                t0 = vextq_u16(tprev, tcurr, 6);
                t1 = vextq_u16(tprev, tcurr, 7);
                t2 = tcurr;
                t3 = vextq_u16(tcurr, tnext, 1);
                t4 = vextq_u16(tcurr, tnext, 2);
                break;
            case 2:
                t0 = vextq_u16(tprev, tcurr, 4);
                t1 = vextq_u16(tprev, tcurr, 6);
                t2 = tcurr;
                t3 = vextq_u16(tcurr, tnext, 2);
                t4 = vextq_u16(tcurr, tnext, 4);
                break;
            case 3:
                t0 = vextq_u16(tprev, tcurr, 2);
                t1 = vextq_u16(tprev, tcurr, 5);
                t2 = tcurr;
                t3 = vextq_u16(tcurr, tnext, 3);
                t4 = vextq_u16(tcurr, tnext, 6);
                break;
            case 4:
                t0 = tprev;
                t1 = vextq_u16(tprev, tcurr, 4);
                t2 = tcurr;
                t3 = vextq_u16(tcurr, tnext, 4);
                t4 = tnext;
                break;
            default:
                internal::assertSupportedConfiguration(false);//Unsupported channels number
                return;
            }
            t0 = vqaddq_u16(vqaddq_u16(vqaddq_u16(t0, t1), vqaddq_u16(t2, t3)), t4);

#ifdef FLOAT_VARIANT_1_25
            uint32x4_t tres1 = vmovl_u16(vget_low_u16(t0));
            uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
            float32x4_t vf1 = vmulq_f32(v1_25, vcvtq_f32_u32(tres1));
            float32x4_t vf2 = vmulq_f32(v1_25, vcvtq_f32_u32(tres2));
            tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5));
            tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5));
            t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
            vst1_u8(drow + x - 8, vmovn_u16(t0));
#else
            int16x8_t tt0 = vqrdmulhq_s16(vreinterpretq_s16_u16(t0), vScale);
            uint8x8_t it0 = vmovn_u16(vreinterpretq_u16_s16(tt0));
            // Output lags one vector behind the loads.
            vst1_u8(drow + x - 8, it0);
#endif
        }

        x -= 8;
        if(x == colsn){
            x -= cn;
        }
        // Scalar tail: per-channel 5-pixel sliding sums; borderInterpolate
        // resolves out-of-range pixel indices (returns <0 only in constant
        // mode, where the 5-row column sum is 5*borderValue).
        s16 pprevx[4], prevx[4], rowx[4], nextx[4], nnextx[4];
        ptrdiff_t px = x / cn;
        for( s32 k = 0; k < cn; k++ )
        {
            ptrdiff_t ploc;
            ploc = internal::borderInterpolate(px-2, size.width, borderType);
            pprevx[k] = ploc < 0 ? 5*borderValue :
                        srow4[ploc*cn+k] + srow3[ploc*cn+k] + srow2[ploc*cn+k] + srow1[ploc*cn+k] + srow0[ploc*cn+k];

            ploc = internal::borderInterpolate(px-1, size.width, borderType);
            prevx[k] = ploc < 0 ? 5*borderValue :
                       srow4[ploc*cn+k] + srow3[ploc*cn+k] + srow2[ploc*cn+k] + srow1[ploc*cn+k] + srow0[ploc*cn+k];

            rowx[k] = srow4[px*cn+k] + srow3[px*cn+k] + srow2[px*cn+k] + srow1[px*cn+k] + srow0[px*cn+k];

            ploc = internal::borderInterpolate(px+1, size.width, borderType);
            nextx[k] = ploc < 0 ? 5*borderValue :
                       srow4[ploc*cn+k] + srow3[ploc*cn+k] + srow2[ploc*cn+k] + srow1[ploc*cn+k] + srow0[ploc*cn+k];
        }
        x = px*cn;
        for( ; x < colsn; x+=cn, px++ )
        {
            for( s32 k = 0; k < cn; k++ )
            {
                ptrdiff_t ploc = internal::borderInterpolate(px+2, size.width, borderType);
                nnextx[k] = ploc < 0 ? 5*borderValue :
                            srow4[ploc*cn+k] + srow3[ploc*cn+k] + srow2[ploc*cn+k] + srow1[ploc*cn+k] + srow0[ploc*cn+k];
                *(drow+x+k) = internal::saturate_cast<u8>((pprevx[k] + prevx[k] + rowx[k] + nextx[k] +nnextx[k])*(1/25.));
                pprevx[k] = prevx[k];
                prevx[k] = rowx[k];
                rowx[k] = nextx[k];
                nextx[k] = nnextx[k];
            }
        }
    }
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
#endif
}
bool isBlurF32Supported(const Size2D &size, s32 cn, BORDER_MODE border)
815
{
816
return isSupportedConfiguration() &&
817
cn > 0 && cn <= 4 &&
818
size.width*cn >= 4 && size.height >= 2 &&
819
(border == BORDER_MODE_CONSTANT ||
820
border == BORDER_MODE_REFLECT101 ||
821
border == BORDER_MODE_REFLECT ||
822
border == BORDER_MODE_REPLICATE ||
823
border == BORDER_MODE_WRAP);
824
}
825
826
void blur3x3(const Size2D &size, s32 cn,
827
const f32 * srcBase, ptrdiff_t srcStride,
828
f32 * dstBase, ptrdiff_t dstStride,
829
BORDER_MODE borderType, f32 borderValue, Margin borderMargin)
830
{
831
internal::assertSupportedConfiguration(isBlurF32Supported(size, cn, borderType));
832
#ifdef CAROTENE_NEON
833
size_t colsn = size.width * cn;
834
835
std::vector<f32> _tmp;
836
f32 *tmp = 0;
837
if (borderType == BORDER_MODE_CONSTANT)
838
{
839
_tmp.assign(colsn + 2*cn, borderValue);
840
tmp = &_tmp[cn];
841
}
842
843
ptrdiff_t idx_l = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
844
ptrdiff_t idx_r = internal::borderInterpolate(size.width, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
845
846
//2-line buffer
847
std::vector<f32> _buf(4*(cn * (size.width + 2) + 32 / sizeof(f32)));
848
f32* lanea = internal::alignPtr(&_buf[cn], 32);
849
f32* laneA = internal::alignPtr(lanea + cn * (size.width + 2), 32);
850
851
f32* laneb = internal::alignPtr(laneA + cn * (size.width + 2), 32);
852
f32* laneB = internal::alignPtr(laneb + cn * (size.width + 2), 32);
853
854
if (borderType == BORDER_MODE_CONSTANT)
855
for (s32 k = 0; k < cn; ++k)
856
{
857
lanea[-cn+k] = borderValue;
858
lanea[colsn+k] = borderValue;
859
laneA[-cn+k] = borderValue;
860
laneA[colsn+k] = borderValue;
861
laneb[-cn+k] = borderValue;
862
laneb[colsn+k] = borderValue;
863
laneB[-cn+k] = borderValue;
864
laneB[colsn+k] = borderValue;
865
}
866
867
size_t i = 0;
868
f32* dsta = internal::getRowPtr(dstBase, dstStride, 0);
869
for (; i < size.height-1; i+=2)
870
{
871
//vertical convolution
872
ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
873
ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom);
874
875
const f32* ln0 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;
876
const f32* ln1 = internal::getRowPtr(srcBase, srcStride, i);
877
const f32* ln2 = internal::getRowPtr(srcBase, srcStride, i + 1);
878
const f32* ln3 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp;
879
880
size_t x = 0;
881
for (; x <= colsn - 4; x += 4)
882
{
883
internal::prefetch(ln1 + x);
884
internal::prefetch(ln2 + x);
885
internal::prefetch(ln0 + x);
886
internal::prefetch(ln3 + x);
887
box3x3f32_vert:
888
float32x4_t v1 = vld1q_f32(ln1 + x);
889
float32x4_t v2 = vld1q_f32(ln2 + x);
890
float32x4_t v0 = vld1q_f32(ln0 + x);
891
float32x4_t v3 = vld1q_f32(ln3 + x);
892
893
float32x4_t v = vaddq_f32(v1, v2);
894
float32x4_t w0 = vaddq_f32(v, v0);
895
float32x4_t w1 = vaddq_f32(v, v3);
896
897
vst1q_f32(lanea + x, w0);
898
vst1q_f32(laneb + x, w1);
899
}
900
if(x < colsn)
901
{
902
x = colsn-4;
903
goto box3x3f32_vert;
904
}
905
906
//left&right borders
907
if (borderType != BORDER_MODE_CONSTANT)
908
for (s32 k = 0; k < cn; ++k)
909
{
910
lanea[-cn+k] = lanea[idx_l + k];
911
lanea[colsn+k] = lanea[idx_r + k];
912
laneb[-cn+k] = laneb[idx_l + k];
913
laneb[colsn+k] = laneb[idx_r + k];
914
}
915
916
//horizontal convolution (2 lines from previous iteration)
917
if (i > 0)
918
{
919
f32* dstb = internal::getRowPtr(dstBase, dstStride, i-1);
920
x = 0;
921
for (; x <= colsn - 4; x += 4)
922
{
923
internal::prefetch(laneA + x + cn);
924
internal::prefetch(laneB + x + cn);
925
box3x3f32_horiz:
926
float32x4_t lane0a = vld1q_f32(laneA + x - cn);
927
float32x4_t lane2a = vld1q_f32(laneA + x + cn);
928
float32x4_t lane1a = vld1q_f32(laneA + x);
929
930
float32x4_t lane0b = vld1q_f32(laneB + x - cn);
931
float32x4_t lane2b = vld1q_f32(laneB + x + cn);
932
float32x4_t lane1b = vld1q_f32(laneB + x);
933
934
float32x4_t va = vaddq_f32(lane0a, lane2a);
935
float32x4_t vb = vaddq_f32(lane0b, lane2b);
936
float32x4_t wa = vaddq_f32(va, lane1a);
937
float32x4_t wb = vaddq_f32(vb, lane1b);
938
939
vst1q_f32(dsta + x, wa);
940
vst1q_f32(dstb + x, wb);
941
}
942
if(x < colsn)
943
{
944
x = colsn-4;
945
goto box3x3f32_horiz;
946
}
947
dsta = internal::getRowPtr(dstBase, dstStride, i);
948
}
949
950
std::swap(lanea, laneA);
951
std::swap(laneb, laneB);
952
}
953
954
//last line
955
if(i < size.height)
956
{
957
//vertical convolution
958
ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
959
ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
960
961
const f32* ln0 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;
962
const f32* ln1 = internal::getRowPtr(srcBase, srcStride, i);
963
const f32* ln2 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp;
964
965
size_t x = 0;
966
for (; x <= colsn - 4; x += 4)
967
{
968
internal::prefetch(ln0 + x);
969
internal::prefetch(ln1 + x);
970
internal::prefetch(ln2 + x);
971
box3x3f32_vert_ll:
972
float32x4_t v0 = vld1q_f32(ln0+x);
973
float32x4_t v1 = vld1q_f32(ln1+x);
974
float32x4_t v2 = vld1q_f32(ln2+x);
975
976
float32x4_t v = vaddq_f32(v0, v1);
977
float32x4_t w = vaddq_f32(v, v2);
978
979
vst1q_f32(lanea + x, w);
980
}
981
if(x < colsn)
982
{
983
x = colsn-4;
984
goto box3x3f32_vert_ll;
985
}
986
987
//left&right borders
988
if (borderType != BORDER_MODE_CONSTANT)
989
for (s32 k = 0; k < cn; ++k)
990
{
991
lanea[-cn+k] = lanea[idx_l + k];
992
lanea[colsn+k] = lanea[idx_r + k];
993
}
994
995
//horizontal convolution (last 3 lines)
996
x = 0;
997
f32* dstb = internal::getRowPtr(dstBase, dstStride, i-1);
998
f32* dstc = internal::getRowPtr(dstBase, dstStride, i);
999
for (; x <= colsn - 4; x += 4)
1000
{
1001
internal::prefetch(laneA + x + cn);
1002
internal::prefetch(laneB + x + cn);
1003
internal::prefetch(lanea + x + cn);
1004
box3x3f32_horiz_ll:
1005
float32x4_t lane0a = vld1q_f32(laneA + x - cn);
1006
float32x4_t lane2a = vld1q_f32(laneA + x + cn);
1007
float32x4_t lane1a = vld1q_f32(laneA + x);
1008
1009
float32x4_t lane0b = vld1q_f32(laneB + x - cn);
1010
float32x4_t lane2b = vld1q_f32(laneB + x + cn);
1011
float32x4_t lane1b = vld1q_f32(laneB + x);
1012
1013
float32x4_t lane0c = vld1q_f32(lanea + x - cn);
1014
float32x4_t lane2c = vld1q_f32(lanea + x + cn);
1015
float32x4_t lane1c = vld1q_f32(lanea + x);
1016
1017
float32x4_t va = vaddq_f32(lane0a, lane2a);
1018
float32x4_t vb = vaddq_f32(lane0b, lane2b);
1019
float32x4_t vc = vaddq_f32(lane0c, lane2c);
1020
float32x4_t wa = vaddq_f32(va, lane1a);
1021
float32x4_t wb = vaddq_f32(vb, lane1b);
1022
float32x4_t wc = vaddq_f32(vc, lane1c);
1023
1024
vst1q_f32(dsta + x, wa);
1025
vst1q_f32(dstb + x, wb);
1026
vst1q_f32(dstc + x, wc);
1027
}
1028
if(x < colsn)
1029
{
1030
x = colsn-4;
1031
goto box3x3f32_horiz_ll;
1032
}
1033
}
1034
else
1035
{
1036
//horizontal convolution (last 2 lines)
1037
f32* dstb = internal::getRowPtr(dstBase, dstStride, i-1);
1038
size_t x = 0;
1039
for (; x <= colsn - 4; x += 4)
1040
{
1041
internal::prefetch(laneA + x + cn);
1042
internal::prefetch(laneB + x + cn);
1043
box3x3f32_horiz_last2:
1044
float32x4_t lane0a = vld1q_f32(laneA + x - cn);
1045
float32x4_t lane2a = vld1q_f32(laneA + x + cn);
1046
float32x4_t lane1a = vld1q_f32(laneA + x);
1047
1048
float32x4_t lane0b = vld1q_f32(laneB + x - cn);
1049
float32x4_t lane2b = vld1q_f32(laneB + x + cn);
1050
float32x4_t lane1b = vld1q_f32(laneB + x);
1051
1052
float32x4_t va = vaddq_f32(lane0a, lane2a);
1053
float32x4_t vb = vaddq_f32(lane0b, lane2b);
1054
float32x4_t wa = vaddq_f32(va, lane1a);
1055
float32x4_t wb = vaddq_f32(vb, lane1b);
1056
1057
vst1q_f32(dsta + x, wa);
1058
vst1q_f32(dstb + x, wb);
1059
}
1060
if(x < colsn)
1061
{
1062
x = colsn-4;
1063
goto box3x3f32_horiz_last2;
1064
}
1065
}
1066
#else
1067
(void)srcBase;
1068
(void)srcStride;
1069
(void)dstBase;
1070
(void)dstStride;
1071
(void)borderValue;
1072
(void)borderMargin;
1073
#endif
1074
}
1075
1076
bool isBlurS32Supported(const Size2D &size, s32 cn, BORDER_MODE border)
1077
{
1078
return isSupportedConfiguration() &&
1079
cn > 0 && cn <= 4 &&
1080
size.width*cn >= 4 && size.height >= 2 &&
1081
(border == BORDER_MODE_CONSTANT ||
1082
border == BORDER_MODE_REFLECT101 ||
1083
border == BORDER_MODE_REFLECT ||
1084
border == BORDER_MODE_REPLICATE ||
1085
border == BORDER_MODE_WRAP);
1086
}
1087
1088
void blur3x3(const Size2D &size, s32 cn,
1089
const s32 * srcBase, ptrdiff_t srcStride,
1090
s32 * dstBase, ptrdiff_t dstStride,
1091
BORDER_MODE borderType, s32 borderValue, Margin borderMargin)
1092
{
1093
internal::assertSupportedConfiguration(isBlurS32Supported(size, cn, borderType));
1094
#ifdef CAROTENE_NEON
1095
size_t colsn = size.width * cn;
1096
1097
std::vector<s32> _tmp;
1098
s32 *tmp = 0;
1099
if (borderType == BORDER_MODE_CONSTANT)
1100
{
1101
_tmp.assign(colsn + 2*cn, borderValue);
1102
tmp = &_tmp[cn];
1103
}
1104
1105
ptrdiff_t idx_l = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
1106
ptrdiff_t idx_r = internal::borderInterpolate(size.width, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
1107
1108
//2-line buffer
1109
std::vector<s32> _buf(4*(cn * (size.width + 2) + 32 / sizeof(s32)));
1110
s32* lanea = internal::alignPtr(&_buf[cn], 32);
1111
s32* laneA = internal::alignPtr(lanea + cn * (size.width + 2), 32);
1112
1113
s32* laneb = internal::alignPtr(laneA + cn * (size.width + 2), 32);
1114
s32* laneB = internal::alignPtr(laneb + cn * (size.width + 2), 32);
1115
1116
if (borderType == BORDER_MODE_CONSTANT)
1117
for (s32 k = 0; k < cn; ++k)
1118
{
1119
lanea[-cn+k] = borderValue;
1120
lanea[colsn+k] = borderValue;
1121
laneA[-cn+k] = borderValue;
1122
laneA[colsn+k] = borderValue;
1123
laneb[-cn+k] = borderValue;
1124
laneb[colsn+k] = borderValue;
1125
laneB[-cn+k] = borderValue;
1126
laneB[colsn+k] = borderValue;
1127
}
1128
1129
size_t i = 0;
1130
s32* dsta = internal::getRowPtr(dstBase, dstStride, 0);
1131
for (; i < size.height-1; i+=2)
1132
{
1133
//vertical convolution
1134
ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
1135
ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom);
1136
1137
const s32* ln0 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;
1138
const s32* ln1 = internal::getRowPtr(srcBase, srcStride, i);
1139
const s32* ln2 = internal::getRowPtr(srcBase, srcStride, i + 1);
1140
const s32* ln3 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp;
1141
1142
size_t x = 0;
1143
for (; x <= colsn - 4; x += 4)
1144
{
1145
internal::prefetch(ln1 + x);
1146
internal::prefetch(ln2 + x);
1147
internal::prefetch(ln0 + x);
1148
internal::prefetch(ln3 + x);
1149
box3x3s32_vert:
1150
int32x4_t v1 = vld1q_s32(ln1 + x);
1151
int32x4_t v2 = vld1q_s32(ln2 + x);
1152
int32x4_t v0 = vld1q_s32(ln0 + x);
1153
int32x4_t v3 = vld1q_s32(ln3 + x);
1154
1155
int32x4_t v = vaddq_s32(v1, v2);
1156
int32x4_t w0 = vaddq_s32(v, v0);
1157
int32x4_t w1 = vaddq_s32(v, v3);
1158
1159
vst1q_s32(lanea + x, w0);
1160
vst1q_s32(laneb + x, w1);
1161
}
1162
if(x < colsn)
1163
{
1164
x = colsn-4;
1165
goto box3x3s32_vert;
1166
}
1167
1168
//left&right borders
1169
if (borderType != BORDER_MODE_CONSTANT)
1170
for (s32 k = 0; k < cn; ++k)
1171
{
1172
lanea[-cn+k] = lanea[idx_l + k];
1173
lanea[colsn+k] = lanea[idx_r + k];
1174
laneb[-cn+k] = laneb[idx_l + k];
1175
laneb[colsn+k] = laneb[idx_r + k];
1176
}
1177
1178
//horizontal convolution (2 lines from previous iteration)
1179
if (i > 0)
1180
{
1181
s32* dstb = internal::getRowPtr(dstBase, dstStride, i-1);
1182
x = 0;
1183
for (; x <= colsn - 4; x += 4)
1184
{
1185
internal::prefetch(laneA + x + cn);
1186
internal::prefetch(laneB + x + cn);
1187
box3x3s32_horiz:
1188
int32x4_t lane0a = vld1q_s32(laneA + x - cn);
1189
int32x4_t lane2a = vld1q_s32(laneA + x + cn);
1190
int32x4_t lane1a = vld1q_s32(laneA + x);
1191
1192
int32x4_t lane0b = vld1q_s32(laneB + x - cn);
1193
int32x4_t lane2b = vld1q_s32(laneB + x + cn);
1194
int32x4_t lane1b = vld1q_s32(laneB + x);
1195
1196
int32x4_t va = vaddq_s32(lane0a, lane2a);
1197
int32x4_t vb = vaddq_s32(lane0b, lane2b);
1198
int32x4_t wa = vaddq_s32(va, lane1a);
1199
int32x4_t wb = vaddq_s32(vb, lane1b);
1200
1201
vst1q_s32(dsta + x, wa);
1202
vst1q_s32(dstb + x, wb);
1203
}
1204
if(x < colsn)
1205
{
1206
x = colsn-4;
1207
goto box3x3s32_horiz;
1208
}
1209
dsta = internal::getRowPtr(dstBase, dstStride, i);
1210
}
1211
1212
std::swap(lanea, laneA);
1213
std::swap(laneb, laneB);
1214
}
1215
//last line
1216
if(i < size.height)
1217
{
1218
//vertical convolution
1219
ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
1220
ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
1221
1222
const s32* ln0 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;
1223
const s32* ln1 = internal::getRowPtr(srcBase, srcStride, i);
1224
const s32* ln2 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp;
1225
1226
size_t x = 0;
1227
for (; x <= colsn - 4; x += 4)
1228
{
1229
internal::prefetch(ln0 + x);
1230
internal::prefetch(ln1 + x);
1231
internal::prefetch(ln2 + x);
1232
box3x3s32_vert_ll:
1233
int32x4_t v0 = vld1q_s32(ln0+x);
1234
int32x4_t v1 = vld1q_s32(ln1+x);
1235
int32x4_t v2 = vld1q_s32(ln2+x);
1236
1237
int32x4_t v = vaddq_s32(v0, v1);
1238
int32x4_t w = vaddq_s32(v, v2);
1239
1240
vst1q_s32(lanea + x, w);
1241
}
1242
if(x < colsn)
1243
{
1244
x = colsn-4;
1245
goto box3x3s32_vert_ll;
1246
}
1247
1248
//left&right borders
1249
if (borderType != BORDER_MODE_CONSTANT)
1250
for (s32 k = 0; k < cn; ++k)
1251
{
1252
lanea[-cn+k] = lanea[idx_l + k];
1253
lanea[colsn+k] = lanea[idx_r + k];
1254
}
1255
1256
//horizontal convolution (last 3 lines)
1257
x = 0;
1258
s32* dstb = internal::getRowPtr(dstBase, dstStride, i-1);
1259
s32* dstc = internal::getRowPtr(dstBase, dstStride, i);
1260
for (; x <= colsn - 4; x += 4)
1261
{
1262
internal::prefetch(laneA + x + cn);
1263
internal::prefetch(laneB + x + cn);
1264
internal::prefetch(lanea + x + cn);
1265
box3x3s32_horiz_ll:
1266
int32x4_t lane0a = vld1q_s32(laneA + x - cn);
1267
int32x4_t lane2a = vld1q_s32(laneA + x + cn);
1268
int32x4_t lane1a = vld1q_s32(laneA + x);
1269
1270
int32x4_t lane0b = vld1q_s32(laneB + x - cn);
1271
int32x4_t lane2b = vld1q_s32(laneB + x + cn);
1272
int32x4_t lane1b = vld1q_s32(laneB + x);
1273
1274
int32x4_t lane0c = vld1q_s32(lanea + x - cn);
1275
int32x4_t lane2c = vld1q_s32(lanea + x + cn);
1276
int32x4_t lane1c = vld1q_s32(lanea + x);
1277
1278
int32x4_t va = vaddq_s32(lane0a, lane2a);
1279
int32x4_t vb = vaddq_s32(lane0b, lane2b);
1280
int32x4_t vc = vaddq_s32(lane0c, lane2c);
1281
int32x4_t wa = vaddq_s32(va, lane1a);
1282
int32x4_t wb = vaddq_s32(vb, lane1b);
1283
int32x4_t wc = vaddq_s32(vc, lane1c);
1284
1285
vst1q_s32(dsta + x, wa);
1286
vst1q_s32(dstb + x, wb);
1287
vst1q_s32(dstc + x, wc);
1288
}
1289
if(x < colsn)
1290
{
1291
x = colsn-4;
1292
goto box3x3s32_horiz_ll;
1293
}
1294
}
1295
else
1296
{
1297
//horizontal convolution (last 2 lines)
1298
s32* dstb = internal::getRowPtr(dstBase, dstStride, i-1);
1299
size_t x = 0;
1300
for (; x <= colsn - 4; x += 4)
1301
{
1302
internal::prefetch(laneA + x + cn);
1303
internal::prefetch(laneB + x + cn);
1304
box3x3s32_horiz_last2:
1305
int32x4_t lane0a = vld1q_s32(laneA + x - cn);
1306
int32x4_t lane2a = vld1q_s32(laneA + x + cn);
1307
int32x4_t lane1a = vld1q_s32(laneA + x);
1308
1309
int32x4_t lane0b = vld1q_s32(laneB + x - cn);
1310
int32x4_t lane2b = vld1q_s32(laneB + x + cn);
1311
int32x4_t lane1b = vld1q_s32(laneB + x);
1312
1313
int32x4_t va = vaddq_s32(lane0a, lane2a);
1314
int32x4_t vb = vaddq_s32(lane0b, lane2b);
1315
int32x4_t wa = vaddq_s32(va, lane1a);
1316
int32x4_t wb = vaddq_s32(vb, lane1b);
1317
1318
vst1q_s32(dsta + x, wa);
1319
vst1q_s32(dstb + x, wb);
1320
}
1321
if(x < colsn)
1322
{
1323
x = colsn-4;
1324
goto box3x3s32_horiz_last2;
1325
}
1326
}
1327
#else
1328
(void)srcBase;
1329
(void)srcStride;
1330
(void)dstBase;
1331
(void)dstStride;
1332
(void)borderValue;
1333
(void)borderMargin;
1334
#endif
1335
}
1336
1337
} //namespace CAROTENE_NS
1338
1339