Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/3rdparty/carotene/src/laplacian.cpp
16337 views
1
/*
2
* By downloading, copying, installing or using the software you agree to this license.
3
* If you do not agree to this license, do not download, install,
4
* copy or use the software.
5
*
6
*
7
* License Agreement
8
* For Open Source Computer Vision Library
9
* (3-clause BSD License)
10
*
11
* Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
12
* Third party copyrights are property of their respective owners.
13
*
14
* Redistribution and use in source and binary forms, with or without modification,
15
* are permitted provided that the following conditions are met:
16
*
17
* * Redistributions of source code must retain the above copyright notice,
18
* this list of conditions and the following disclaimer.
19
*
20
* * Redistributions in binary form must reproduce the above copyright notice,
21
* this list of conditions and the following disclaimer in the documentation
22
* and/or other materials provided with the distribution.
23
*
24
* * Neither the names of the copyright holders nor the names of the contributors
25
* may be used to endorse or promote products derived from this software
26
* without specific prior written permission.
27
*
28
* This software is provided by the copyright holders and contributors "as is" and
29
* any express or implied warranties, including, but not limited to, the implied
30
* warranties of merchantability and fitness for a particular purpose are disclaimed.
31
* In no event shall copyright holders or contributors be liable for any direct,
32
* indirect, incidental, special, exemplary, or consequential damages
33
* (including, but not limited to, procurement of substitute goods or services;
34
* loss of use, data, or profits; or business interruption) however caused
35
* and on any theory of liability, whether in contract, strict liability,
36
* or tort (including negligence or otherwise) arising in any way out of
37
* the use of this software, even if advised of the possibility of such damage.
38
*/
39
40
#include "common.hpp"
41
#include "saturate_cast.hpp"
42
43
#include <vector>
44
45
namespace CAROTENE_NS {
46
47
bool isLaplacian3x3Supported(const Size2D &size, BORDER_MODE border)
48
{
49
return isSupportedConfiguration() && size.width >= 8 &&
50
(border == BORDER_MODE_CONSTANT ||
51
border == BORDER_MODE_REPLICATE);
52
}
53
54
void Laplacian3x3(const Size2D &size,
55
const u8 * srcBase, ptrdiff_t srcStride,
56
u8 * dstBase, ptrdiff_t dstStride,
57
BORDER_MODE border, u8 borderValue)
58
{
59
internal::assertSupportedConfiguration(isLaplacian3x3Supported(size, border));
60
#ifdef CAROTENE_NEON
61
const uint16x8_t v_border_x3 = vdupq_n_u16(borderValue * 3);
62
const uint16x8_t v_zero = vdupq_n_u16(0);
63
const uint8x8_t v_border = vdup_n_u8(borderValue);
64
65
uint8x8_t vsub;
66
uint16x8_t tprev = v_zero, tcurr = v_zero, tnext = v_zero;
67
uint16x8_t t0 = v_zero, t1 = v_zero, t2 = v_zero;
68
69
ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;
70
71
for (ptrdiff_t y = 0; y < height; ++y)
72
{
73
const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
74
const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
75
const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
76
u8 * drow = internal::getRowPtr(dstBase, dstStride, y);
77
78
s16 prevx = 0, currx = 0, nextx = 0;
79
ptrdiff_t x = 0;
80
const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8);
81
82
// perform vertical convolution
83
for ( ; x <= bwidth; x += 8)
84
{
85
internal::prefetch(srow0 + x);
86
internal::prefetch(srow1 + x);
87
internal::prefetch(srow2 + x);
88
89
uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x);
90
uint8x8_t x1 = vld1_u8(srow1 + x);
91
uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x);
92
93
// calculate values for plain CPU part below if needed
94
if (x + 8 >= bwidth)
95
{
96
ptrdiff_t x3 = x == width ? width - 1 : x;
97
ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);
98
99
if (border == BORDER_MODE_CONSTANT && x4 < 0)
100
prevx = borderValue;
101
else
102
prevx = (srow2 ? srow2[x4] : borderValue) + srow1[x4] + (srow0 ? srow0[x4] : borderValue);
103
104
currx = (srow2 ? srow2[x3] : borderValue) + srow1[x3] + (srow0 ? srow0[x3] : borderValue);
105
}
106
107
// make shift
108
if (x)
109
{
110
tprev = tcurr;
111
tcurr = tnext;
112
}
113
114
// and calculate next value
115
tnext = vaddw_u8(vaddl_u8(x0, x1), x2);
116
117
// make extrapolation for the first elements
118
if (!x)
119
{
120
// make border
121
if (border == BORDER_MODE_CONSTANT)
122
tcurr = v_border_x3;
123
else if (border == BORDER_MODE_REPLICATE)
124
tcurr = vdupq_n_u16(vgetq_lane_u16(tnext, 0));
125
126
vsub = x1;
127
128
continue;
129
}
130
131
// combine 3 "shifted" vectors
132
t0 = vextq_u16(tprev, tcurr, 7);
133
t1 = tcurr;
134
t2 = vextq_u16(tcurr, tnext, 1);
135
136
// and add them
137
t0 = vqaddq_u16(t0, vqaddq_u16(t1, t2));
138
139
int16x8_t tt0 = vsubq_s16(vreinterpretq_s16_u16(t0),
140
vreinterpretq_s16_u16(vaddw_u8(vshll_n_u8(vsub, 3), vsub)));
141
uint8x8_t it0 = vqmovun_s16(tt0);
142
vst1_u8(drow + x - 8, it0);
143
144
vsub = x1;
145
}
146
147
x -= 8;
148
if (x == width)
149
--x;
150
151
for ( ; x < width; ++x)
152
{
153
// make extrapolation for the last elements
154
if (x + 1 >= width)
155
{
156
if (border == BORDER_MODE_CONSTANT)
157
nextx = borderValue * 3;
158
else if (border == BORDER_MODE_REPLICATE)
159
nextx = srow2[x] + srow1[x] + srow0[x];
160
}
161
else
162
{
163
nextx = (srow2 ? srow2[x + 1] : borderValue) +
164
srow1[x + 1] +
165
(srow0 ? srow0[x + 1] : borderValue);
166
}
167
168
s32 val = (prevx + currx + nextx) - 9 * srow1[x];
169
drow[x] = internal::saturate_cast<u8>((s32)val);
170
171
// make shift
172
prevx = currx;
173
currx = nextx;
174
}
175
}
176
#else
177
(void)size;
178
(void)srcBase;
179
(void)srcStride;
180
(void)dstBase;
181
(void)dstStride;
182
(void)border;
183
(void)borderValue;
184
#endif
185
}
186
187
bool isLaplacianOpenCVSupported(const Size2D &size, BORDER_MODE border)
188
{
189
return isSupportedConfiguration() &&
190
size.width >= 8 && size.height >= 1 &&
191
(border == BORDER_MODE_CONSTANT ||
192
border == BORDER_MODE_REFLECT ||
193
border == BORDER_MODE_REFLECT101 ||
194
border == BORDER_MODE_REPLICATE);
195
}
196
197
void Laplacian1OpenCV(const Size2D &size,
198
const u8 * srcBase, ptrdiff_t srcStride,
199
s16 * dstBase, ptrdiff_t dstStride,
200
BORDER_MODE border, u8 borderValue)
201
{
202
internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
203
#ifdef CAROTENE_NEON
204
ptrdiff_t rows = size.height, cols = size.width;
205
206
std::vector<u8> _tmp;
207
u8 *tmp = 0;
208
if (border == BORDER_MODE_CONSTANT)
209
{
210
_tmp.assign(cols + 4,borderValue);
211
tmp = &_tmp[2];
212
}
213
214
for( ptrdiff_t y = 0; y < rows; y++ )
215
{
216
const u8* v0 = 0;
217
const u8* v1 = internal::getRowPtr(srcBase, srcStride, y);
218
const u8* v2 = 0;
219
// make border
220
if (border == BORDER_MODE_REFLECT101) {
221
v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : y+1);
222
v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
223
} else if (border == BORDER_MODE_CONSTANT) {
224
v0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
225
v2 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
226
} else {
227
v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
228
v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
229
}
230
s16* drow = internal::getRowPtr(dstBase, dstStride, y);
231
232
int16x8_t tcurr = vmovq_n_s16(0x0);
233
int16x8_t tnext = vmovq_n_s16(0x0);
234
int16x8_t t0, t2;
235
uint8x8_t xx0 = vmov_n_u8(0x0);
236
uint8x8_t xx1 = vmov_n_u8(0x0);
237
uint8x8_t xx2 = vmov_n_u8(0x0);
238
ptrdiff_t x = 0;
239
const ptrdiff_t bcols = y + 2 < rows ? cols : (cols - 8);
240
for( ; x <= bcols; x += 8 )
241
{
242
internal::prefetch(v0 + x);
243
internal::prefetch(v1 + x);
244
internal::prefetch(v2 + x);
245
246
uint8x8_t x0 = vld1_u8(v0 + x);
247
uint8x8_t x1 = vld1_u8(v1 + x);
248
uint8x8_t x2 = vld1_u8(v2 + x);
249
250
if(x) {
251
xx0 = xx1;
252
xx1 = xx2;
253
} else {
254
xx1 = x1;
255
// make border
256
if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
257
{
258
xx1 = vset_lane_u8(vget_lane_u8(x1, 0),x1, 7);
259
}
260
else if (border == BORDER_MODE_CONSTANT)
261
{
262
xx1 = vset_lane_u8(borderValue, x1, 7);
263
}
264
else if (border == BORDER_MODE_REFLECT101)
265
{
266
xx1 = vset_lane_u8(vget_lane_u8(x1, 1),x1, 7);
267
}
268
}
269
xx2 = x1;
270
271
if(x) {
272
tcurr = tnext;
273
}
274
tnext = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x0, x2)),
275
vreinterpretq_s16_u16(vshll_n_u8(x1, 2)));
276
277
if(!x) {
278
tcurr = tnext;
279
continue;
280
}
281
t0 = vreinterpretq_s16_u16(vmovl_u8(vext_u8(xx0, xx1, 7)));
282
t2 = vreinterpretq_s16_u16(vmovl_u8(vext_u8(xx1, xx2, 1)));
283
t0 = vaddq_s16(vqaddq_s16(t0, t2), tcurr);
284
285
vst1q_s16(drow + x - 8, t0);
286
}
287
288
x -= 8;
289
if(x == cols){
290
x--;
291
}
292
293
for( ; x < cols; x++ )
294
{
295
s16 nextx;
296
s16 prevx;
297
// make border
298
if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
299
{
300
prevx = x == 0 ? v1[0] : v1[x-1];
301
nextx = x == cols-1 ? v1[x] : v1[x+1];
302
}
303
else if (border == BORDER_MODE_REFLECT101)
304
{
305
prevx = x == 0 ? v1[1] : v1[x-1];
306
nextx = x == cols-1 ? v1[x-1] : v1[x+1];
307
}
308
else //if (border == BORDER_MODE_CONSTANT)
309
{
310
prevx = x == 0 ? borderValue : v1[x-1];
311
nextx = x == cols-1 ? borderValue : v1[x+1];
312
}
313
*(drow+x) = prevx + nextx - 4*v1[x] + v0[x] + v2[x];
314
}
315
}
316
#else
317
(void)size;
318
(void)srcBase;
319
(void)srcStride;
320
(void)dstBase;
321
(void)dstStride;
322
(void)border;
323
(void)borderValue;
324
#endif
325
}
326
327
void Laplacian3OpenCV(const Size2D &size,
328
const u8 * srcBase, ptrdiff_t srcStride,
329
s16 * dstBase, ptrdiff_t dstStride,
330
BORDER_MODE border, u8 borderValue)
331
{
332
internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
333
#ifdef CAROTENE_NEON
334
ptrdiff_t rows = size.height, cols = size.width;
335
336
std::vector<u8> _tmp;
337
u8 *tmp = 0;
338
if (border == BORDER_MODE_CONSTANT)
339
{
340
_tmp.assign(cols + 4,borderValue);
341
tmp = &_tmp[2];
342
}
343
344
for( ptrdiff_t y = 0; y < rows; y++ )
345
{
346
const u8* v0 = 0;
347
const u8* v1 = internal::getRowPtr(srcBase, srcStride, y);
348
const u8* v2 = 0;
349
// make border
350
if (border == BORDER_MODE_REFLECT101) {
351
v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : y+1);
352
v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
353
} else if (border == BORDER_MODE_CONSTANT) {
354
v0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
355
v2 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
356
} else {
357
v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
358
v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
359
}
360
s16* drow = internal::getRowPtr(dstBase, dstStride, y);
361
362
int16x8_t tprev = vmovq_n_s16(0x0);
363
int16x8_t tcurr = vmovq_n_s16(0x0);
364
int16x8_t tnext = vmovq_n_s16(0x0);
365
int16x8_t tc = vmovq_n_s16(0x0);
366
int16x8_t t0, t2, tcnext;
367
ptrdiff_t x = 0;
368
const ptrdiff_t bcols = y + 2 < rows ? cols : (cols - 8);
369
for( ; x <= bcols; x += 8 )
370
{
371
internal::prefetch(v0 + x);
372
internal::prefetch(v1 + x);
373
internal::prefetch(v2 + x);
374
375
uint8x8_t x0 = vld1_u8(v0 + x);
376
uint8x8_t x1 = vld1_u8(v1 + x);
377
uint8x8_t x2 = vld1_u8(v2 + x);
378
tcnext = vreinterpretq_s16_u16(vshll_n_u8(x1, 2));
379
380
if(x) {
381
tprev = tcurr;
382
tcurr = tnext;
383
}
384
tnext = vreinterpretq_s16_u16(vaddl_u8(x0, x2));
385
386
if(!x) {
387
tcurr = tnext;
388
tc = tcnext;
389
390
// make border
391
if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
392
{
393
tcurr = vsetq_lane_s16(vgetq_lane_s16(tcurr, 0),tcurr, 7);
394
}
395
else if (border == BORDER_MODE_CONSTANT)
396
{
397
tcurr = vsetq_lane_s16(borderValue, tcurr, 7);
398
}
399
else if (border == BORDER_MODE_REFLECT101)
400
{
401
tcurr = vsetq_lane_s16(vgetq_lane_s16(tcurr, 1),tcurr, 7);
402
}
403
continue;
404
}
405
406
t0 = vextq_s16(tprev, tcurr, 7);
407
t2 = vextq_s16(tcurr, tnext, 1);
408
409
t0 = vsubq_s16(vqaddq_s16(t0, t2), tc);
410
tc = tcnext;
411
412
t0 = vshlq_n_s16(t0, 1);
413
vst1q_s16(drow + x - 8, t0);
414
}
415
x -= 8;
416
if(x == cols){
417
x--;
418
}
419
420
for( ; x < cols; x++ )
421
{
422
s16 nextx, nextx2;
423
s16 prevx, prevx2;
424
// make border
425
if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
426
{
427
prevx = x == 0 ? v0[0] : v0[x-1];
428
prevx2 = x == 0 ? v2[0] : v2[x-1];
429
nextx = x == cols-1 ? v0[x] : v0[x+1];
430
nextx2 = x == cols-1 ? v2[x] : v2[x+1];
431
}
432
else if (border == BORDER_MODE_REFLECT101)
433
{
434
prevx = x == 0 ? v0[1] : v0[x-1];
435
prevx2 = x == 0 ? v2[1] : v2[x-1];
436
nextx = x == cols-1 ? v0[x-1] : v0[x+1];
437
nextx2 = x == cols-1 ? v2[x-1] : v2[x+1];
438
}
439
else //if (border == BORDER_MODE_CONSTANT)
440
{
441
prevx = x == 0 ? borderValue : v0[x-1];
442
prevx2 = x == 0 ? borderValue : v2[x-1];
443
nextx = x == cols-1 ? borderValue : v0[x+1];
444
nextx2 = x == cols-1 ? borderValue : v2[x+1];
445
}
446
s16 res = prevx + nextx - 4*v1[x] + prevx2 + nextx2;
447
*(drow+x) = 2*res;
448
}
449
}
450
#else
451
(void)size;
452
(void)srcBase;
453
(void)srcStride;
454
(void)dstBase;
455
(void)dstStride;
456
(void)border;
457
(void)borderValue;
458
#endif
459
}
460
461
void Laplacian5OpenCV(const Size2D &size,
462
const u8 * srcBase, ptrdiff_t srcStride,
463
s16 * dstBase, ptrdiff_t dstStride,
464
BORDER_MODE border, u8 borderValue)
465
{
466
internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
467
#ifdef CAROTENE_NEON
468
ptrdiff_t rows = size.height, cols = size.width;
469
470
std::vector<u8> _tmp;
471
u8 *tmp = 0;
472
if (border == BORDER_MODE_CONSTANT)
473
{
474
_tmp.assign(cols + 4,borderValue);
475
tmp = &_tmp[2];
476
}
477
478
for( ptrdiff_t y = 0; y < rows; y++ )
479
{
480
const u8* v0 = 0;
481
const u8* v1 = 0;
482
const u8* v2 = internal::getRowPtr(srcBase, srcStride, y);
483
const u8* v3 = 0;
484
const u8* v4 = 0;
485
// make border
486
if (border == BORDER_MODE_REPLICATE) {
487
v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : 0);
488
v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
489
v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
490
v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 0 ? rows-1 : 0);
491
} else if (border == BORDER_MODE_REFLECT) {
492
v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : rows > 1 ? 1-y : 0);
493
v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
494
v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
495
v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 1 ? 2*rows-(y+3) : 0);
496
} else if (border == BORDER_MODE_REFLECT101) {
497
v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : rows > 2-y ? 2-y : 0); ///check
498
v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : rows > 1 ? 1 : 0);
499
v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
500
v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 2 ? 2*rows-(y+4) : 0);///bad if rows=2 y=1 rows - 4 + (2,1)
501
} else if (border == BORDER_MODE_CONSTANT) {
502
v0 = y > 1 ? internal::getRowPtr(srcBase, srcStride, y-2) : tmp;
503
v1 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
504
v3 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
505
v4 = y < rows-2 ? internal::getRowPtr(srcBase, srcStride, y+2) : tmp;
506
}
507
s16* drow = internal::getRowPtr(dstBase, dstStride, y);
508
509
int16x8_t tnext, tc, t0;
510
int16x8_t tnext2, tnext3;
511
int16x8_t tnext1Old, tnext2Old, tnext3Old;
512
int16x8_t tnext4OldOldOld, tnext5OldOldOld;
513
514
int16x8_t tcurr1 = vmovq_n_s16(0x0);
515
int16x8_t tnext1 = vmovq_n_s16(0x0);
516
int16x8_t tprev1 = vmovq_n_s16(0x0);
517
int16x8_t tpprev1 = vmovq_n_s16(0x0);
518
int16x8_t tppprev1 = vmovq_n_s16(0x0);
519
520
int16x8_t tnext4Old = vmovq_n_s16(0x0);
521
int16x8_t tnext5Old = vmovq_n_s16(0x0);
522
int16x8_t tnext1OldOld = vmovq_n_s16(0x0);
523
int16x8_t tnext2OldOld = vmovq_n_s16(0x0);
524
int16x8_t tnext3OldOld = vmovq_n_s16(0x0);
525
int16x8_t tnext4OldOld = vmovq_n_s16(0x0);
526
int16x8_t tnext5OldOld = vmovq_n_s16(0x0);
527
528
// do vertical convolution
529
ptrdiff_t x = 0;
530
const ptrdiff_t bcols = y + 3 < rows ? cols : (cols - 8);
531
for( ; x <= bcols; x += 8 )
532
{
533
internal::prefetch(v0 + x);
534
internal::prefetch(v1 + x);
535
internal::prefetch(v2 + x);
536
internal::prefetch(v3 + x);
537
internal::prefetch(v4 + x);
538
539
uint8x8_t x0 = vld1_u8(v0 + x);
540
uint8x8_t x1 = vld1_u8(v1 + x);
541
uint8x8_t x2 = vld1_u8(v2 + x);
542
uint8x8_t x3 = vld1_u8(v3 + x);
543
uint8x8_t x4 = vld1_u8(v4 + x);
544
if(x) {
545
tcurr1 = tnext1;
546
}
547
548
tnext4OldOldOld = tnext4Old;
549
tnext5OldOldOld = tnext5Old;
550
tnext1Old = tnext1OldOld;
551
tnext2Old = tnext2OldOld;
552
tnext3Old = tnext3OldOld;
553
tnext4Old = tnext4OldOld;
554
tnext5Old = tnext5OldOld;
555
556
tnext3 = vreinterpretq_s16_u16(vaddq_u16(vaddl_u8(x3, x2),vaddl_u8(x2, x1)));
557
tnext3 = vshlq_n_s16(tnext3, 1);
558
559
tc = vreinterpretq_s16_u16(vsubl_u8(x4, x2));
560
tnext = vreinterpretq_s16_u16(vsubl_u8(x2, x0));
561
tnext2 = vsubq_s16(tc, tnext);
562
563
tnext1 = vaddq_s16(tnext3, tnext2);
564
// tnext1 = x0 + 2*x1 + 2*x2 + 2*x3 + x4
565
566
tnext2 = vshlq_n_s16(tnext2, 1);
567
// tnext2 = 2*x4 - 4*x2 + 2*x0
568
569
tnext3 = vsubq_s16(tnext2, vshlq_n_s16(tnext3, 1));
570
// tnext3 = 2*x0 - 4*x1 - 12*x2 - 4*x3 + 2*x4
571
572
tnext1OldOld = tnext1;
573
tnext2OldOld = tnext2;
574
tnext3OldOld = tnext3;
575
tnext4OldOld = tnext2;
576
tnext5OldOld = tnext1;
577
578
if(x) {
579
tnext1 = vextq_s16(tnext1Old, tnext1, 2);
580
tcurr1 = vextq_s16(tnext2Old, tnext2, 1);
581
tprev1 = tnext3Old;
582
583
if(x!=8) {
584
tpprev1 = vextq_s16(tnext4OldOldOld, tnext4Old, 7);
585
tppprev1 = vextq_s16(tnext5OldOldOld, tnext5Old, 6);
586
}
587
}
588
589
if(!x) {
590
// make border
591
if (border == BORDER_MODE_REPLICATE) {
592
tpprev1 = vextq_s16(tnext2, tnext2, 7);
593
tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 1),tpprev1, 0);
594
595
tprev1 = vextq_s16(tnext1, tnext1, 6);
596
tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 0);
597
tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 1);
598
} else if (border == BORDER_MODE_REFLECT) {
599
tpprev1 = vextq_s16(tnext2, tnext2, 7);
600
tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 1),tpprev1, 0);
601
602
tprev1 = vextq_s16(tnext1, tnext1, 6);
603
tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 3),tprev1, 0);
604
tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 1);
605
} else if (border == BORDER_MODE_REFLECT101) {
606
tpprev1 = vextq_s16(tnext2, tnext2, 7);
607
tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 2),tpprev1, 0);
608
609
tprev1 = vextq_s16(tnext1, tnext1, 6);
610
tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 3),tprev1, 1);
611
tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 4),tprev1, 0);
612
} else if (border == BORDER_MODE_CONSTANT) {
613
tpprev1 = vextq_s16(tnext2, tnext2, 7);
614
tpprev1 = vsetq_lane_s16(borderValue, tpprev1, 0);
615
616
tprev1 = vextq_s16(tnext1, tnext1, 6);
617
tprev1 = vsetq_lane_s16(borderValue, tprev1, 0);
618
tprev1 = vsetq_lane_s16(borderValue, tprev1, 1);
619
}
620
tppprev1 = tprev1;
621
continue;
622
}
623
624
t0 = vaddq_s16(vaddq_s16(vqaddq_s16(tcurr1, tprev1), vqaddq_s16(tpprev1, tppprev1)), tnext1);
625
t0 = vaddq_s16(t0, t0);
626
vst1q_s16(drow + x - 8, t0);
627
}
628
x -= 8;
629
if(x >= cols - 1)
630
x = cols-2;
631
632
s16 pprevx = 0;
633
s16 prevx = 0;
634
s16 nextx = 0;
635
s16 nnextx = 0;
636
637
for( ; x < cols; x++ )
638
{
639
if (x == 0) {
640
// make border
641
if (border == BORDER_MODE_REPLICATE) {
642
pprevx = v0[0] + 2*v1[0] + 2*v2[0] + 2*v3[0] + v4[0];
643
prevx = 2*v0[0] - 4*v2[0] + 2*v4[0];
644
} else if (border == BORDER_MODE_REFLECT) {
645
pprevx = v0[1] + 2*v1[1] + 2*v2[1] + 2*v3[1] + v4[1];
646
prevx = 2*v0[0] - 4*v2[0] + 2*v4[0];
647
} else if (border == BORDER_MODE_REFLECT101) {
648
pprevx = v0[2] + 2*v1[2] + 2*v2[2] + 2*v3[2] + v4[2];
649
prevx = 2*v0[1] - 4*v2[1] + 2*v4[1];
650
} else if (border == BORDER_MODE_CONSTANT) {
651
pprevx = 8 * borderValue;
652
prevx = 0;
653
}
654
} else if (x == 1) {
655
// make border
656
if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) {
657
pprevx = v0[0] + 2*v1[0] + 2*v2[0] + 2*v3[0] + v4[0];
658
} else if (border == BORDER_MODE_REFLECT101) {
659
pprevx = v0[1] + 2*v1[1] + 2*v2[1] + 2*v3[1] + v4[1];
660
} else if (border == BORDER_MODE_CONSTANT) {
661
pprevx = 8 * borderValue;
662
}
663
prevx = 2*v0[0] - 4*v2[0] + 2*v4[0];
664
} else {
665
pprevx = v0[x-2] + 2*v1[x-2] + 2*v2[x-2] + 2*v3[x-2] + v4[x-2];
666
prevx = 2*v0[x-1] - 4*v2[x-1] + 2*v4[x-1];
667
}
668
s16 currx = 2*v0[x] - 4*v1[x] - 12*v2[x] - 4*v3[x] + 2*v4[x];
669
if (x == cols-1) {
670
// make border
671
if (border == BORDER_MODE_REPLICATE) {
672
nextx = 2*v0[x] - 4*v2[x] + 2*v4[x];
673
nnextx = v0[x] + 2*v1[x] + 2*v2[x] + 2*v3[x] + v4[x];
674
} else if (border == BORDER_MODE_REFLECT) {
675
nextx = 2*v0[x] - 4*v2[x] + 2*v4[x];
676
nnextx = v0[x-1] + 2*v1[x-1] + 2*v2[x-1] + 2*v3[x-1] + v4[x-1];
677
} else if (border == BORDER_MODE_REFLECT101) {
678
nextx = 2*v0[x-1] - 4*v2[x-1] + 2*v4[x-1];
679
nnextx = v0[x-2] + 2*v1[x-2] + 2*v2[x-2] + 2*v3[x-2] + v4[x-2];
680
} else if (border == BORDER_MODE_CONSTANT) {
681
nextx = 0;
682
nnextx = 8 * borderValue;
683
}
684
} else if (x == cols-2) {
685
// make border
686
if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) {
687
nnextx = v0[x+1] + 2*v1[x+1] + 2*v2[x+1] + 2*v3[x+1] + v4[x+1];
688
} else if (border == BORDER_MODE_REFLECT101) {
689
nnextx = v0[x] + 2*v1[x] + 2*v2[x] + 2*v3[x] + v4[x];
690
} else if (border == BORDER_MODE_CONSTANT) {
691
nnextx = 8 * borderValue;
692
}
693
nextx = 2*v0[x+1] - 4*v2[x+1] + 2*v4[x+1];
694
} else {
695
nextx = 2*v0[x+1] - 4*v2[x+1] + 2*v4[x+1];
696
nnextx = v0[x+2] + 2*v1[x+2] + 2*v2[x+2] + 2*v3[x+2] + v4[x+2];
697
}
698
s16 res = pprevx + prevx + currx + nextx + nnextx;
699
*(drow+x) = 2*res;
700
}
701
}
702
#else
703
(void)size;
704
(void)srcBase;
705
(void)srcStride;
706
(void)dstBase;
707
(void)dstStride;
708
(void)border;
709
(void)borderValue;
710
#endif
711
}
712
713
} // namespace CAROTENE_NS
714
715