Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/3rdparty/carotene/src/pyramid.cpp
16337 views
1
/*
2
* By downloading, copying, installing or using the software you agree to this license.
3
* If you do not agree to this license, do not download, install,
4
* copy or use the software.
5
*
6
*
7
* License Agreement
8
* For Open Source Computer Vision Library
9
* (3-clause BSD License)
10
*
11
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
12
* Third party copyrights are property of their respective owners.
13
*
14
* Redistribution and use in source and binary forms, with or without modification,
15
* are permitted provided that the following conditions are met:
16
*
17
* * Redistributions of source code must retain the above copyright notice,
18
* this list of conditions and the following disclaimer.
19
*
20
* * Redistributions in binary form must reproduce the above copyright notice,
21
* this list of conditions and the following disclaimer in the documentation
22
* and/or other materials provided with the distribution.
23
*
24
* * Neither the names of the copyright holders nor the names of the contributors
25
* may be used to endorse or promote products derived from this software
26
* without specific prior written permission.
27
*
28
* This software is provided by the copyright holders and contributors "as is" and
29
* any express or implied warranties, including, but not limited to, the implied
30
* warranties of merchantability and fitness for a particular purpose are disclaimed.
31
* In no event shall copyright holders or contributors be liable for any direct,
32
* indirect, incidental, special, exemplary, or consequential damages
33
* (including, but not limited to, procurement of substitute goods or services;
34
* loss of use, data, or profits; or business interruption) however caused
35
* and on any theory of liability, whether in contract, strict liability,
36
* or tort (including negligence or otherwise) arising in any way out of
37
* the use of this software, even if advised of the possibility of such damage.
38
*/
39
40
#include "common.hpp"
41
42
#include <vector>
43
44
namespace CAROTENE_NS {
45
46
bool isGaussianPyramidDownRTZSupported(const Size2D &srcSize, const Size2D &dstSize, BORDER_MODE border_mode)
47
{
48
if (!isSupportedConfiguration())
49
return false;
50
// Need at least 8 pixels for vectorization.
51
// Need to make sure dst width is half the src width.
52
// Don't care about dst height.
53
if ( dstSize.width < 8 || std::abs((ptrdiff_t)dstSize.width*2 - (ptrdiff_t)srcSize.width) > 2 )
54
return false;
55
56
// Current implementation only supports Reflect101 (ie: UNDEFINED mode)
57
if (border_mode != BORDER_MODE_UNDEFINED)
58
return false;
59
60
return true;
61
}
62
63
bool isGaussianPyramidDownU8Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn)
64
{
65
if (!isSupportedConfiguration())
66
return false;
67
if ( (dstSize.width * cn) < 8 ||
68
(cn != 1 && cn !=3 && cn!=4) ||
69
std::abs((ptrdiff_t)dstSize.width*2 - (ptrdiff_t)srcSize.width) > 2 ||
70
std::abs((ptrdiff_t)dstSize.height*2 - (ptrdiff_t)srcSize.height) > 2 )
71
return false;
72
73
return true;
74
}
75
76
bool isGaussianPyramidDownS16Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn)
77
{
78
if (!isSupportedConfiguration())
79
return false;
80
if ( (dstSize.width * cn) < 4 ||
81
(cn != 1 && cn !=3 && cn!=4) ||
82
std::abs((ptrdiff_t)dstSize.width*2 - (ptrdiff_t)srcSize.width) > 2 ||
83
std::abs((ptrdiff_t)dstSize.height*2 - (ptrdiff_t)srcSize.height) > 2 )
84
return false;
85
86
return true;
87
}
88
89
bool isGaussianPyramidDownF32Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn)
90
{
91
if (!isSupportedConfiguration())
92
return false;
93
if ( (dstSize.width * cn) < 4 ||
94
(cn != 1 && cn !=3 && cn!=4) ||
95
std::abs((ptrdiff_t)dstSize.width*2 - (ptrdiff_t)srcSize.width) > 2 ||
96
std::abs((ptrdiff_t)dstSize.height*2 - (ptrdiff_t)srcSize.height) > 2 )
97
return false;
98
99
return true;
100
}
101
102
bool isGaussianPyramidUpU8Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn)
103
{
104
if (!isSupportedConfiguration())
105
return false;
106
if ( (srcSize.width * cn) < 8 ||
107
(cn != 1 && cn !=3 && cn!=4) ||
108
std::abs((ptrdiff_t)dstSize.width - (ptrdiff_t)srcSize.width*2) != (ptrdiff_t)dstSize.width % 2 ||
109
std::abs((ptrdiff_t)dstSize.height - (ptrdiff_t)srcSize.height*2) != (ptrdiff_t)dstSize.height % 2 )
110
return false;
111
112
return true;
113
}
114
115
bool isGaussianPyramidUpS16Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn)
116
{
117
if (!isSupportedConfiguration())
118
return false;
119
if ( (srcSize.width * cn) < 12 ||
120
(cn != 1 && cn !=3 && cn!=4) ||
121
std::abs((ptrdiff_t)dstSize.width - (ptrdiff_t)srcSize.width*2) != (ptrdiff_t)dstSize.width % 2 ||
122
std::abs((ptrdiff_t)dstSize.height - (ptrdiff_t)srcSize.height*2) != (ptrdiff_t)dstSize.height % 2 )
123
return false;
124
125
return true;
126
}
127
128
#ifdef CAROTENE_NEON
129
130
namespace {
131
132
ptrdiff_t borderInterpolate101(ptrdiff_t p, ptrdiff_t len)
{
    // Map an out-of-range coordinate p into [0, len) using Reflect101
    // (BORDER_REFLECT_101) semantics: the edge sample is not duplicated,
    // e.g. for len == 5: -1 -> 1, -2 -> 2, 5 -> 3, 6 -> 2.
    // A length-1 axis always maps to index 0.
    if (len == 1)
        return 0;

    while (p < 0 || p >= len)
        p = (p < 0) ? -p : (len - 1) * 2 - p;

    return p;
}
148
149
} // namespace
150
151
#endif
152
153
void gaussianPyramidDownRTZ(const Size2D &srcSize,
154
const u8 *srcBase, ptrdiff_t srcStride,
155
const Size2D &dstSize,
156
u8 *dstBase, ptrdiff_t dstStride,
157
BORDER_MODE border, u8 borderValue)
158
{
159
internal::assertSupportedConfiguration(isGaussianPyramidDownRTZSupported(srcSize, dstSize, border));
160
#ifdef CAROTENE_NEON
161
// Single-core NEON code
162
const size_t dwidth = dstSize.width;
163
const size_t dheight = dstSize.height;
164
const size_t swidth = srcSize.width;
165
const size_t sheight = srcSize.height;
166
167
ptrdiff_t idx_l1 = borderInterpolate101(-1, swidth);
168
ptrdiff_t idx_l2 = borderInterpolate101(-2, swidth);
169
ptrdiff_t idx_r1 = borderInterpolate101(swidth + 0, swidth);
170
ptrdiff_t idx_r2 = borderInterpolate101(swidth + 1, swidth);
171
172
//1-line buffer
173
std::vector<u16> _buf((swidth + 4) + 32/sizeof(u16));
174
u16* lane = internal::alignPtr(&_buf[2], 32);
175
176
uint8x8_t vc6u8 = vmov_n_u8(6);
177
uint16x8_t vc6u16 = vmovq_n_u16(6);
178
uint16x8_t vc4u16 = vmovq_n_u16(4);
179
180
u8* dst = dstBase;
181
182
for (size_t i = 0; i < dheight; ++i, dst += dstStride)
183
{
184
//vertical convolution
185
const u8* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-2, sheight));
186
const u8* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-1, sheight));
187
const u8* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+0, sheight));
188
const u8* ln3 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+1, sheight));
189
const u8* ln4 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+2, sheight));
190
191
size_t x = 0;
192
for (; x <= swidth - 8; x += 8)
193
{
194
internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2));
195
uint8x8_t v0 = vld1_u8(ln0+x);
196
uint8x8_t v1 = vld1_u8(ln1+x);
197
uint8x8_t v2 = vld1_u8(ln2+x);
198
uint8x8_t v3 = vld1_u8(ln3+x);
199
uint8x8_t v4 = vld1_u8(ln4+x);
200
201
uint16x8_t v = vaddl_u8(v0, v4);
202
uint16x8_t v13 = vaddl_u8(v1, v3);
203
204
v = vmlal_u8(v, v2, vc6u8);
205
v = vmlaq_u16(v, v13, vc4u16);
206
207
vst1q_u16(lane + x, v);
208
}
209
for (; x < swidth; ++x)
210
{
211
lane[x] = ln0[x] + ln4[x] + 4u * (ln1[x] + ln3[x]) + 6u * ln2[x];
212
}
213
214
//left&right borders
215
lane[-1] = lane[idx_l1];
216
lane[-2] = lane[idx_l2];
217
218
lane[swidth] = lane[idx_r1];
219
lane[swidth+1] = lane[idx_r2];
220
221
//horizontal convolution
222
x = 0;
223
size_t vw = (swidth/2) - 7; // Using 7 instead of 8 allows swidth of 14 or 15.
224
for (; x < vw; x += 8)
225
{
226
internal::prefetch(lane + 2 * x);
227
uint16x8x2_t vLane0 = vld2q_u16(lane + 2*x-2); // L0[0] = x0 x2 x4 x6 x8 x10 x12 x14 L0[1] = x1 x3 x5 x7 x9 x11 x13 x15
228
uint16x8x2_t vLane1 = vld2q_u16(lane + 2*x-1); // L1[0] = x1 x3 x5 x7 x9 x11 x13 x15 L1[1] = x2 x4 x6 x8 x10 x12 x14 x16
229
uint16x8x2_t vLane2 = vld2q_u16(lane + 2*x+0); // L2[0] = x2 x4 x6 x8 x10 x12 x14 x16 L2[1] = x3 x5 x7 x9 x11 x13 x15 x17
230
uint16x8x2_t vLane3 = vld2q_u16(lane + 2*x+1); // L3[0] = x3 x5 x7 x9 x11 x13 x15 x17 L3[1] = x4 x6 x8 x10 x12 x14 x16 x18
231
uint16x8x2_t vLane4 = vld2q_u16(lane + 2*x+2); // L4[0] = x4 x6 x8 x10 x12 x14 x16 x18 L4[1] = x5 x7 x9 x11 x13 x15 x17 x19
232
uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane4.val[0]);
233
uint16x8_t vSum_1_3 = vaddq_u16(vLane1.val[0], vLane3.val[0]);
234
vSum_0_4 = vmlaq_u16(vSum_0_4, vLane2.val[0], vc6u16);
235
vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_1_3, vc4u16);
236
uint8x8_t vRes = vshrn_n_u16(vSum_0_4, 8);
237
238
vst1_u8(dst + x, vRes);
239
}
240
241
for (; x < dwidth; x++)
242
{
243
dst[x] = u8((lane[2*x-2] + lane[2*x+2] + 4u * (lane[2*x-1] + lane[2*x+1]) + 6u * lane[2*x]) >> 8);
244
}
245
}
246
#else
247
// Remove 'unused parameter' warnings.
248
(void)srcSize;
249
(void)srcBase;
250
(void)srcStride;
251
(void)dstSize;
252
(void)dstBase;
253
(void)dstStride;
254
(void)border;
255
#endif
256
(void)borderValue;
257
}
258
259
void gaussianPyramidDown(const Size2D &srcSize,
                         const u8 *srcBase, ptrdiff_t srcStride,
                         const Size2D &dstSize,
                         u8 *dstBase, ptrdiff_t dstStride, u8 cn)
{
    // 2x Gaussian pyramid downsampling for interleaved u8 data (cn = 1, 3 or 4).
    // Separable [1 4 6 4 1] kernel: destination row i is built from source rows
    // 2i-2..2i+2 by a vertical pass into a u16 line buffer, then a horizontal
    // pass over every second column; the 16-bit sums are divided by 256 with
    // round-to-nearest. Out-of-image taps use Reflect101 indexing.
    internal::assertSupportedConfiguration(isGaussianPyramidDownU8Supported(srcSize, dstSize, cn));
#ifdef CAROTENE_NEON
    size_t dcolcn = dstSize.width*cn;   // dst row length in interleaved elements
    size_t scolcn = srcSize.width*cn;   // src row length in interleaved elements
    size_t roiw8 = dcolcn - 7;          // last start where an 8-byte store stays in bounds

    // Reflect101 element indices (scaled by cn) for the border taps.
    size_t idx_l1 = borderInterpolate101(-1, srcSize.width) * cn;
    size_t idx_l2 = borderInterpolate101(-2, srcSize.width) * cn;
    size_t idx_r1 = borderInterpolate101(srcSize.width + 0, srcSize.width) * cn;
    size_t idx_r2 = borderInterpolate101(srcSize.width + 1, srcSize.width) * cn;

    //1-line buffer: vertical-pass result with 2*cn guard cells on each side.
    std::vector<u16> _buf(cn*(srcSize.width + 4) + 32/sizeof(u16));
    u16* lane = internal::alignPtr(&_buf[2*cn], 32);

    uint8x8_t vc6u8 = vmov_n_u8(6);
    uint16x8_t vc6u16 = vmovq_n_u16(6);
    uint16x8_t vc4u16 = vmovq_n_u16(4);

    for (size_t i = 0; i < dstSize.height; ++i)
    {
        u8* dst = internal::getRowPtr(dstBase, dstStride, i);
        //vertical convolution: lane[x] = r0 + r4 + 4*(r1 + r3) + 6*r2
        const u8* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-2, srcSize.height));
        const u8* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-1, srcSize.height));
        const u8* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+0, srcSize.height));
        const u8* ln3 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+1, srcSize.height));
        const u8* ln4 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+2, srcSize.height));

        size_t x = 0;
        for (; x <= scolcn - 8; x += 8)
        {
            // Prefetch a row at a cycling offset (-2..+2) around ln2.
            internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, (ptrdiff_t)x % 5 - 2));
            uint8x8_t v0 = vld1_u8(ln0+x);
            uint8x8_t v1 = vld1_u8(ln1+x);
            uint8x8_t v2 = vld1_u8(ln2+x);
            uint8x8_t v3 = vld1_u8(ln3+x);
            uint8x8_t v4 = vld1_u8(ln4+x);

            uint16x8_t v = vaddl_u8(v0, v4);
            uint16x8_t v13 = vaddl_u8(v1, v3);

            v = vmlal_u8(v, v2, vc6u8);
            v = vmlaq_u16(v, v13, vc4u16);

            vst1q_u16(lane + x, v);
        }
        // Scalar tail of the vertical pass.
        for (; x < scolcn; ++x)
        {
            lane[x] = ln0[x] + ln4[x] + 4u * (ln1[x] + ln3[x]) + 6u * ln2[x];
        }

        //left&right borders: replicate Reflect101 pixels channel by channel.
        for (u32 k = 0; k < cn; ++k)
        {
            lane[(s32)(-cn+k)] = lane[idx_l1 + k];
            lane[(s32)(-cn-cn+k)] = lane[idx_l2 + k];

            lane[scolcn+k] = lane[idx_r1 + k];
            lane[scolcn+cn+k] = lane[idx_r2 + k];
        }

        //horizontal convolution over every second pixel; layout differs per cn.
        x = 0;
        switch(cn)
        {
        case 1:
            for (; x < roiw8; x += 8)
            {
                internal::prefetch(lane + 2 * x);
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
                // Hand-scheduled ARMv7 path for old GCC; vld2.16 de-interleaves
                // so the first register of each pair holds the even-column taps.
                __asm__ (
                    "vld2.16 {d0-d3}, [%[in0]] \n\t"
                    "vld2.16 {d4-d7}, [%[in4]] \n\t"
                    "vld2.16 {d12-d15}, [%[in1]] \n\t"
                    "vld2.16 {d16-d19}, [%[in3]] \n\t"
                    "vld2.16 {d8-d11}, [%[in2],:256] \n\t"
                    "vadd.i16 q0, q2 /*q0 = v0 + v4*/ \n\t"
                    "vadd.i16 q6, q8 /*q6 = v1 + v3*/ \n\t"
                    "vmla.i16 q0, q4, %q[c6] /*q0 += v2 * 6*/ \n\t"
                    "vmla.i16 q0, q6, %q[c4] /*q1 += (v1+v3) * 4*/ \n\t"
                    "vrshrn.u16 d8, q0, #8 \n\t"
                    "vst1.8 {d8}, [%[out]] \n\t"
                    : /*no output*/
                    : [out] "r" (dst + x),
                      [in0] "r" (lane + 2*x-2),
                      [in1] "r" (lane + 2*x-1),
                      [in2] "r" (lane + 2*x+0),
                      [in3] "r" (lane + 2*x+1),
                      [in4] "r" (lane + 2*x+2),
                      [c4] "w" (vc4u16), [c6] "w" (vc6u16)
                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"
                );
#else
                // Intrinsics path: val[0] of each de-interleaved load holds the
                // taps at offsets -2..+2 around column 2*x.
                uint16x8x2_t vLane0 = vld2q_u16(lane + 2*x-2);
                uint16x8x2_t vLane1 = vld2q_u16(lane + 2*x-1);
                uint16x8x2_t vLane2 = vld2q_u16(lane + 2*x+0);
                uint16x8x2_t vLane3 = vld2q_u16(lane + 2*x+1);
                uint16x8x2_t vLane4 = vld2q_u16(lane + 2*x+2);

                uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane4.val[0]);
                uint16x8_t vSum_1_3 = vaddq_u16(vLane1.val[0], vLane3.val[0]);
                vSum_0_4 = vmlaq_u16(vSum_0_4, vLane2.val[0], vc6u16);
                vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_1_3, vc4u16);
                // Rounding narrow: (sum + 128) >> 8.
                uint8x8_t vRes = vrshrn_n_u16(vSum_0_4, 8);

                vst1_u8(dst + x, vRes);
#endif
            }
            break;
        case 3:
        {
            // 3-channel path: two output pixels (6 elements) per iteration.
            // v0..v4 are assembled from 4-lane loads; the 4th lane of each
            // half is junk and is dropped by the vtbl shuffle below.
            uint16x4_t vx1 = vld1_u16(lane - 2*3);
            uint16x4_t vx2 = vld1_u16(lane - 1*3);
            uint16x4_t vx3 = vld1_u16(lane + 0*3);
            uint16x8_t v0 = vcombine_u16(vx1, vx3);

            // Table selects result bytes 0,1,2 and 4,5,6 (index 0xFF -> 0),
            // compacting two RGB pixels into 6 contiguous bytes.
            uint8x8_t map = vreinterpret_u8_u64(vmov_n_u64(0xFFFF060504020100ULL));
            for (; x < roiw8; x += 6)
            {
                internal::prefetch(lane + 2 * x + 12);

                uint16x4_t vx_ = vld1_u16(lane + 2*x-1*3 + 6);
                uint16x4_t vx4 = vld1_u16(lane + 2*x+0*3 + 6);
                uint16x4_t vx5 = vld1_u16(lane + 2*x+1*3 + 6);
                uint16x4_t vx6 = vld1_u16(lane + 2*x+2*3 + 6);

                uint16x8_t v1 = vcombine_u16(vx2, vx_);
                uint16x8_t v2 = vcombine_u16(vget_high_u16(v0), vx4);
                uint16x8_t v3 = vcombine_u16(vx_, vx5);
                uint16x8_t v4 = vcombine_u16(vx4, vx6);
                vx2 = vx5;  // carry taps forward to the next iteration

                uint16x8_t v = vaddq_u16(v0, v4);
                uint16x8_t v13 = vaddq_u16(v1, v3);

                v = vmlaq_u16(v, v2, vc6u16);
                v = vmlaq_u16(v, v13, vc4u16);

                uint8x8_t v8 = vrshrn_n_u16(v, 8);

                v0 = v4;  // carry taps forward

                // Stores 8 bytes (2 junk bytes are overwritten next iteration;
                // the loop bound keeps the store in range).
                vst1_u8(dst + x, vtbl1_u8(v8, map));
            }
        }
        break;
        case 4:
        {
            // 4-channel path: two output pixels (8 elements) per iteration;
            // lanes align naturally so no shuffle is needed.
            uint16x4_t vx1 = vld1_u16(lane - 2*4);
            uint16x4_t vx2 = vld1_u16(lane - 1*4);
            uint16x4_t vx3 = vld1_u16(lane + 0*4);
            uint16x8_t v0 = vcombine_u16(vx1, vx3);

            for (; x < roiw8; x += 8)
            {
                internal::prefetch(lane + 2 * x + 16);

                uint16x4_t vx_ = vld1_u16(lane + 2 * x - 1*4 + 8);
                uint16x4_t vx4 = vld1_u16(lane + 2 * x + 0*4 + 8);
                uint16x4_t vx5 = vld1_u16(lane + 2 * x + 1*4 + 8);
                uint16x4_t vx6 = vld1_u16(lane + 2 * x + 2*4 + 8);

                uint16x8_t v1 = vcombine_u16(vx2, vx_);
                uint16x8_t v2 = vcombine_u16(vget_high_u16(v0), vx4);
                uint16x8_t v3 = vcombine_u16(vx_, vx5);
                uint16x8_t v4 = vcombine_u16(vx4, vx6);
                vx2 = vx5;  // carry taps forward to the next iteration

                uint16x8_t v = vaddq_u16(v0, v4);
                uint16x8_t v13 = vaddq_u16(v1, v3);

                v = vmlaq_u16(v, v2, vc6u16);
                v = vmlaq_u16(v, v13, vc4u16);

                uint8x8_t v8 = vrshrn_n_u16(v, 8);

                v0 = v4;  // carry taps forward

                vst1_u8(dst + x, v8);
            }
        }
        break;
        }

        // Scalar tail of the horizontal pass, one channel at a time;
        // (1 << 7) is the rounding bias matching vrshrn's round-to-nearest.
        for (u32 h = 0; h < cn; ++h)
        {
            u16* ln = lane + h;
            u8* dt = dst + h;
            for (size_t k = x; k < dcolcn; k += cn)
                dt[k] = u8((ln[2*k-2*cn] + ln[2*k+2*cn] + 4u * (ln[2*k-cn] + ln[2*k+cn]) + 6u * ln[2*k] + (1 << 7)) >> 8);
        }
    }
#else
    // Remove 'unused parameter' warnings.
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
#endif
}
465
466
void gaussianPyramidDown(const Size2D &srcSize,
                         const s16 *srcBase, ptrdiff_t srcStride,
                         const Size2D &dstSize,
                         s16 *dstBase, ptrdiff_t dstStride, u8 cn)
{
    // 2x Gaussian pyramid downsampling for interleaved s16 data (cn = 1, 3 or 4).
    // Separable [1 4 6 4 1] kernel: a vertical pass into an s32 line buffer,
    // then a horizontal pass over every second column; sums are divided by 256
    // with round-to-nearest. Out-of-image taps use Reflect101 indexing.
    // Works 4 elements at a time (s32 intermediates fill a q-register).
    internal::assertSupportedConfiguration(isGaussianPyramidDownS16Supported(srcSize, dstSize, cn));
#ifdef CAROTENE_NEON
    size_t dcolcn = dstSize.width*cn;   // dst row length in interleaved elements
    size_t scolcn = srcSize.width*cn;   // src row length in interleaved elements
    size_t roiw4 = dcolcn - 3;          // last start where a 4-element store fits

    // Reflect101 element indices (scaled by cn) for the border taps.
    size_t idx_l1 = borderInterpolate101(-1, srcSize.width) * cn;
    size_t idx_l2 = borderInterpolate101(-2, srcSize.width) * cn;
    size_t idx_r1 = borderInterpolate101(srcSize.width + 0, srcSize.width) * cn;
    size_t idx_r2 = borderInterpolate101(srcSize.width + 1, srcSize.width) * cn;

    //1-line buffer: vertical-pass result with 2*cn guard cells on each side.
    std::vector<s32> _buf(cn*(srcSize.width + 4) + 32/sizeof(s32));
    s32* lane = internal::alignPtr(&_buf[2*cn], 32);

    int16x4_t vc6s16 = vmov_n_s16(6);
    int32x4_t vc6s32 = vmovq_n_s32(6);
    int32x4_t vc4s32 = vmovq_n_s32(4);

    for (size_t i = 0; i < dstSize.height; ++i)
    {
        s16* dst = internal::getRowPtr(dstBase, dstStride, i);
        //vertical convolution: lane[x] = r0 + r4 + 4*(r1 + r3) + 6*r2
        const s16* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-2, srcSize.height));
        const s16* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-1, srcSize.height));
        const s16* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+0, srcSize.height));
        const s16* ln3 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+1, srcSize.height));
        const s16* ln4 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+2, srcSize.height));

        size_t x = 0;
        for (; x <= scolcn - 4; x += 4)
        {
            // Prefetch a row at a cycling offset (-2..+2) around ln2.
            internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, (ptrdiff_t)x % 5 - 2));
            int16x4_t v0 = vld1_s16(ln0 + x);
            int16x4_t v1 = vld1_s16(ln1 + x);
            int16x4_t v2 = vld1_s16(ln2 + x);
            int16x4_t v3 = vld1_s16(ln3 + x);
            int16x4_t v4 = vld1_s16(ln4 + x);

            int32x4_t v = vaddl_s16(v0, v4);
            int32x4_t v13 = vaddl_s16(v1, v3);

            v = vmlal_s16(v, v2, vc6s16);
            v = vmlaq_s32(v, v13, vc4s32);

            vst1q_s32(lane + x, v);
        }
        // Scalar tail of the vertical pass.
        for (; x < scolcn; ++x)
        {
            lane[x] = ln0[x] + ln4[x] + 4 * (ln1[x] + ln3[x]) + 6 * ln2[x];
        }

        //left&right borders: replicate Reflect101 pixels channel by channel.
        for (u32 k = 0; k < cn; ++k)
        {
            lane[(s32)(-cn+k)] = lane[idx_l1 + k];
            lane[(s32)(-cn-cn+k)] = lane[idx_l2 + k];

            lane[scolcn+k] = lane[idx_r1 + k];
            lane[scolcn+cn+k] = lane[idx_r2 + k];
        }

        //horizontal convolution over every second pixel; layout differs per cn.
        x = 0;
        switch(cn)
        {
        case 1:
            for (; x < roiw4; x += 4)
            {
                internal::prefetch(lane + 2 * x);
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
                // Hand-scheduled ARMv7 path for old GCC; vld2.32 de-interleaves
                // so the first register of each pair holds the even-column taps.
                __asm__ (
                    "vld2.32 {d0-d3}, [%[in0]] \n\t"
                    "vld2.32 {d4-d7}, [%[in4]] \n\t"
                    "vld2.32 {d12-d15}, [%[in1]] \n\t"
                    "vld2.32 {d16-d19}, [%[in3]] \n\t"
                    "vld2.32 {d8-d11}, [%[in2],:256] \n\t"
                    "vadd.i32 q0, q2 \n\t"
                    "vadd.i32 q6, q8 \n\t"
                    "vmla.i32 q0, q4, %q[c6] \n\t"
                    "vmla.i32 q0, q6, %q[c4] \n\t"
                    "vrshrn.s32 d8, q0, #8 \n\t"
                    "vst1.16 {d8}, [%[out]] \n\t"
                    : /*no output*/
                    : [out] "r" (dst + x),
                      [in0] "r" (lane + 2*x-2),
                      [in1] "r" (lane + 2*x-1),
                      [in2] "r" (lane + 2*x+0),
                      [in3] "r" (lane + 2*x+1),
                      [in4] "r" (lane + 2*x+2),
                      [c4] "w" (vc4s32), [c6] "w" (vc6s32)
                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"
                );
#else
                // Intrinsics path: val[0] of each de-interleaved load holds the
                // taps at offsets -2..+2 around column 2*x.
                int32x4x2_t vLane0 = vld2q_s32(lane + 2*x-2);
                int32x4x2_t vLane1 = vld2q_s32(lane + 2*x-1);
                int32x4x2_t vLane2 = vld2q_s32(lane + 2*x+0);
                int32x4x2_t vLane3 = vld2q_s32(lane + 2*x+1);
                int32x4x2_t vLane4 = vld2q_s32(lane + 2*x+2);

                int32x4_t vSum_0_4 = vaddq_s32(vLane0.val[0], vLane4.val[0]);
                int32x4_t vSum_1_3 = vaddq_s32(vLane1.val[0], vLane3.val[0]);
                vSum_0_4 = vmlaq_s32(vSum_0_4, vLane2.val[0], vc6s32);
                vSum_0_4 = vmlaq_s32(vSum_0_4, vSum_1_3, vc4s32);
                // Rounding narrow: (sum + 128) >> 8.
                int16x4_t vRes = vrshrn_n_s32(vSum_0_4, 8);

                vst1_s16(dst + x, vRes);
#endif
            }
            break;
        case 3:
        {
            // 3-channel path: each 4-lane load spans one pixel plus one extra
            // channel; x advances by 3 so consecutive stores overlap by one
            // element (later stores overwrite the overlap).
            int32x4_t v0 = vld1q_s32(lane - 2*3);
            int32x4_t v1 = vld1q_s32(lane - 1*3);
            int32x4_t v2 = vld1q_s32(lane + 0*3);
            for (; x < roiw4; x += 3)
            {
                internal::prefetch(lane + 2 * x);

                int32x4_t v3 = vld1q_s32(lane + 2 * x + 1*3);
                int32x4_t v4 = vld1q_s32(lane + 2 * x + 2*3);

                int32x4_t v = vaddq_s32(v0, v4);
                int32x4_t v13 = vaddq_s32(v1, v3);

                v = vmlaq_s32(v, v2, vc6s32);
                v = vmlaq_s32(v, v13, vc4s32);

                int16x4_t vv = vrshrn_n_s32(v, 8);

                // Slide the tap window forward by one output pixel.
                v0 = v2;
                v1 = v3;
                v2 = v4;

                vst1_s16(dst + x, vv);
            }
        }
        break;
        case 4:
        {
            // 4-channel path: one 4-lane vector is exactly one pixel.
            int32x4_t v0 = vld1q_s32(lane - 2*4);
            int32x4_t v1 = vld1q_s32(lane - 1*4);
            int32x4_t v2 = vld1q_s32(lane + 0*4);
            for (; x < roiw4; x += 4)
            {
                internal::prefetch(lane + 2 * x + 8);
                int32x4_t v3 = vld1q_s32(lane + 2 * x + 1*4);
                int32x4_t v4 = vld1q_s32(lane + 2 * x + 2*4);

                int32x4_t v = vaddq_s32(v0, v4);
                int32x4_t v13 = vaddq_s32(v1, v3);

                v = vmlaq_s32(v, v2, vc6s32);
                v = vmlaq_s32(v, v13, vc4s32);

                int16x4_t vv = vrshrn_n_s32(v, 8);

                // Slide the tap window forward by one output pixel.
                v0 = v2;
                v1 = v3;
                v2 = v4;

                vst1_s16(dst + x, vv);
            }
        }
        break;
        }

        // Scalar tail of the horizontal pass, one channel at a time;
        // (1 << 7) is the rounding bias matching vrshrn's round-to-nearest.
        for (u32 h = 0; h < cn; ++h)
        {
            s32* ln = lane + h;
            s16* dt = dst + h;
            for (size_t k = x; k < dcolcn; k += cn)
                dt[k] = s16((ln[2*k-2*cn] + ln[2*k+2*cn] + 4 * (ln[2*k-cn] + ln[2*k+cn]) + 6 * ln[2*k] + (1 << 7)) >> 8);
        }
    }
#else
    // Remove 'unused parameter' warnings.
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
#endif
}
654
655
void gaussianPyramidDown(const Size2D &srcSize,
                         const f32 *srcBase, ptrdiff_t srcStride,
                         const Size2D &dstSize,
                         f32 *dstBase, ptrdiff_t dstStride, u8 cn)
{
    // 2x Gaussian pyramid downsampling for interleaved f32 data (cn = 1, 3 or 4).
    // Separable [1 4 6 4 1]/16 kernel split into floating-point factors: the
    // vertical pass applies [1 4 6 4 1]/4 (constants 0.25 and 1.5), and the
    // horizontal pass applies [1 4 6 4 1]/64 (0.015625, 0.0625, 0.09375), so
    // the combined normalization is 1/256. Out-of-image taps use Reflect101.
    internal::assertSupportedConfiguration(isGaussianPyramidDownF32Supported(srcSize, dstSize, cn));
#ifdef CAROTENE_NEON
    size_t dcolcn = dstSize.width*cn;   // dst row length in interleaved elements
    size_t scolcn = srcSize.width*cn;   // src row length in interleaved elements
    size_t roiw4 = dcolcn - 3;          // last start where a 4-element store fits

    // Reflect101 element indices (scaled by cn) for the border taps.
    size_t idx_l1 = borderInterpolate101(-1, srcSize.width) * cn;
    size_t idx_l2 = borderInterpolate101(-2, srcSize.width) * cn;
    size_t idx_r1 = borderInterpolate101(srcSize.width + 0, srcSize.width) * cn;
    size_t idx_r2 = borderInterpolate101(srcSize.width + 1, srcSize.width) * cn;

    //1-line buffer: vertical-pass result with 2*cn guard cells on each side.
    std::vector<f32> _buf(cn*(srcSize.width + 4) + 32/sizeof(f32));
    f32* lane = internal::alignPtr(&_buf[2*cn], 32);

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
    // Pin the kernel constants to q11-q15 for the inline-asm path below,
    // keeping them out of the registers the asm block clobbers.
    register float32x4_t vc6d4f32 asm ("q11") = vmovq_n_f32(1.5f); // 6/4
    register float32x4_t vc1d4f32 asm ("q12") = vmovq_n_f32(0.25f); // 1/4

    register float32x4_t vc1d64f32 asm ("q13") = vmovq_n_f32(0.015625f); //1/4/16
    register float32x4_t vc4d64f32 asm ("q14") = vmovq_n_f32(0.0625f); //4/4/16
    register float32x4_t vc6d64f32 asm ("q15") = vmovq_n_f32(0.09375f); //6/4/16
#else
    float32x4_t vc6d4f32 = vmovq_n_f32(1.5f); // 6/4
    float32x4_t vc1d4f32 = vmovq_n_f32(0.25f); // 1/4

    float32x4_t vc1d64f32 = vmovq_n_f32(0.015625f); //1/4/16
    float32x4_t vc4d64f32 = vmovq_n_f32(0.0625f); //4/4/16
    float32x4_t vc6d64f32 = vmovq_n_f32(0.09375f); //6/4/16
#endif

    for (size_t i = 0; i < dstSize.height; ++i)
    {
        f32* dst = internal::getRowPtr(dstBase, dstStride, i);
        //vertical convolution: lane[x] = 0.25*(r0+r4) + (r1+r3) + 1.5*r2
        const f32* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-2, srcSize.height));
        const f32* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-1, srcSize.height));
        const f32* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+0, srcSize.height));
        const f32* ln3 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+1, srcSize.height));
        const f32* ln4 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+2, srcSize.height));

        size_t x = 0;
        for (; x <= scolcn - 4; x += 4)
        {
            // Prefetch a row at a cycling offset (-2..+2) around ln2.
            internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, (ptrdiff_t)x % 5 - 2));
            float32x4_t v0 = vld1q_f32((const float32_t*)ln0 + x);
            float32x4_t v1 = vld1q_f32((const float32_t*)ln1 + x);
            float32x4_t v2 = vld1q_f32((const float32_t*)ln2 + x);
            float32x4_t v3 = vld1q_f32((const float32_t*)ln3 + x);
            float32x4_t v4 = vld1q_f32((const float32_t*)ln4 + x);

            float32x4_t v = vaddq_f32(v1, v3);
            float32x4_t v04 = vaddq_f32(v0, v4);

            v = vmlaq_f32(v, v2, vc6d4f32);
            v = vmlaq_f32(v, v04, vc1d4f32);

            vst1q_f32(lane + x, v);
        }
        // Scalar tail of the vertical pass.
        for (; x < scolcn; ++x)
        {
            lane[x] = 0.25f*(ln0[x] + ln4[x]) + (ln1[x] + ln3[x]) + 1.5f * ln2[x];
        }

        //left&right borders: replicate Reflect101 pixels channel by channel.
        for (u32 k = 0; k < cn; ++k)
        {
            lane[(s32)(-cn+k)] = lane[idx_l1 + k];
            lane[(s32)(-cn-cn+k)] = lane[idx_l2 + k];

            lane[scolcn+k] = lane[idx_r1 + k];
            lane[scolcn+cn+k] = lane[idx_r2 + k];
        }

        //horizontal convolution over every second pixel; layout differs per cn.
        x = 0;
        switch(cn)
        {
        case 1:
            for (; x < roiw4; x += 4)
            {
                internal::prefetch(lane + 2 * x);
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
                // Hand-scheduled ARMv7 path for old GCC; vld2.32 de-interleaves
                // so the first register of each pair holds the even-column taps.
                __asm__ __volatile__ (
                    "vld2.32 {d0-d3}, [%[in0]] \n\t"
                    "vld2.32 {d8-d11}, [%[in4]] \n\t"
                    "vld2.32 {d14-d17}, [%[in2],:256] \n\t"
                    "vld2.32 {d10-d13}, [%[in1]] \n\t"
                    "vld2.32 {d16-d19}, [%[in3]] \n\t"
                    "vmul.f32 q7, %q[c6d64] \n\t"
                    "vadd.f32 q0, q4 @v04 \n\t"
                    "vadd.f32 q5, q8 @v13 \n\t"
                    "vmla.f32 q7, q0, %q[c1d64] \n\t"
                    "vmla.f32 q7, q5, %q[c4d64] \n\t"
                    "vst1.32 {d14-d15}, [%[out]] \n\t"
                    :
                    : [out] "r" (dst + x),
                      [in0] "r" (lane + 2*x-2),
                      [in1] "r" (lane + 2*x-1),
                      [in2] "r" (lane + 2*x+0),
                      [in3] "r" (lane + 2*x+1),
                      [in4] "r" (lane + 2*x+2),
                      [c4d64] "w" (vc4d64f32), [c6d64] "w" (vc6d64f32), [c1d64] "w" (vc1d64f32)
                    : "d0","d1","d2","d3","d4",/*"d5","d6","d7",*/"d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19" //ugly compiler "bug" - can't touch d5-d7
                );
#else
                // Intrinsics path: val[0] of each de-interleaved load holds the
                // taps at offsets -2..+2 around column 2*x.
                float32x4x2_t vLane0 = vld2q_f32(lane + 2*x-2);
                float32x4x2_t vLane1 = vld2q_f32(lane + 2*x-1);
                float32x4x2_t vLane2 = vld2q_f32(lane + 2*x+0);
                float32x4x2_t vLane3 = vld2q_f32(lane + 2*x+1);
                float32x4x2_t vLane4 = vld2q_f32(lane + 2*x+2);

                float32x4_t vSum_0_4 = vaddq_f32(vLane0.val[0], vLane4.val[0]);
                float32x4_t vSum_1_3 = vaddq_f32(vLane1.val[0], vLane3.val[0]);
                float32x4_t vRes = vmulq_f32(vLane2.val[0], vc6d64f32);
                vRes = vmlaq_f32(vRes, vSum_0_4, vc1d64f32);
                vRes = vmlaq_f32(vRes, vSum_1_3, vc4d64f32);

                vst1q_f32(dst + x, vRes);
#endif
            }
            break;
        case 3:
        {
            // 3-channel path: each 4-lane load spans one pixel plus one extra
            // channel; x advances by 3 so consecutive stores overlap by one
            // element (later stores overwrite the overlap).
            float32x4_t v0 = vld1q_f32((const float32_t*)lane - 2*3);
            float32x4_t v1 = vld1q_f32((const float32_t*)lane - 1*3);
            float32x4_t v2 = vld1q_f32((const float32_t*)lane + 0*3);

            for (; x < roiw4; x += 3)
            {
                internal::prefetch(lane + 2 * x);

                float32x4_t v3 = vld1q_f32((const float32_t*)lane + 2 * x + 1*3);
                float32x4_t v4 = vld1q_f32((const float32_t*)lane + 2 * x + 2*3);

                float32x4_t v04 = vaddq_f32(v0, v4);
                float32x4_t v13 = vaddq_f32(v1, v3);

                float32x4_t v = vmulq_f32(v2, vc6d64f32);
                v = vmlaq_f32(v, v04, vc1d64f32);
                v = vmlaq_f32(v, v13, vc4d64f32);

                // Slide the tap window forward by one output pixel.
                v0 = v2;
                v1 = v3;
                v2 = v4;

                vst1q_f32(dst + x, v);
            }
        }
        break;
        case 4:
        {
            // 4-channel path: one 4-lane vector is exactly one pixel.
            float32x4_t v0 = vld1q_f32((const float32_t*)lane - 2*4);
            float32x4_t v1 = vld1q_f32((const float32_t*)lane - 1*4);
            float32x4_t v2 = vld1q_f32((const float32_t*)lane + 0*4);

            for (; x < roiw4; x += 4)
            {
                internal::prefetch(lane + 2 * x + 8);

                float32x4_t v3 = vld1q_f32((const float32_t*)lane + 2 * x + 1*4);
                float32x4_t v4 = vld1q_f32((const float32_t*)lane + 2 * x + 2*4);

                float32x4_t v04 = vaddq_f32(v0, v4);
                float32x4_t v13 = vaddq_f32(v1, v3);

                float32x4_t v = vmulq_f32(v2, vc6d64f32);
                v = vmlaq_f32(v, v04, vc1d64f32);
                v = vmlaq_f32(v, v13, vc4d64f32);

                // Slide the tap window forward by one output pixel.
                v0 = v2;
                v1 = v3;
                v2 = v4;

                vst1q_f32(dst + x, v);
            }
        }
        break;
        }

        // Scalar tail of the horizontal pass, one channel at a time
        // (no rounding term - floating point).
        for (u32 h = 0; h < cn; ++h)
        {
            f32* ln = lane + h;
            f32* dt = dst + h;
            for (size_t k = x; k < dcolcn; k += cn)
                dt[k] = 0.015625f * (ln[2*k-2*cn] + ln[2*k+2*cn]) + 0.0625f * (ln[2*k-cn] + ln[2*k+cn]) + 0.09375f * ln[2*k];
        }
    }
#else
    // Remove 'unused parameter' warnings.
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
#endif
}
856
857
void gaussianPyramidUp(const Size2D &srcSize,
858
const u8 *srcBase, ptrdiff_t srcStride,
859
const Size2D &dstSize,
860
u8 *dstBase, ptrdiff_t dstStride, u8 cn)
861
{
862
internal::assertSupportedConfiguration(isGaussianPyramidUpU8Supported(srcSize, dstSize, cn));
863
#ifdef CAROTENE_NEON
864
size_t dcolshn = (dstSize.width/2) * cn;
865
size_t dcolshw = ((dstSize.width+1)/2) * cn;
866
size_t scolsn = srcSize.width*cn;
867
868
size_t idx_l = (borderInterpolate101(-2, 2 * srcSize.width)/2) * cn;
869
size_t idx_r1 = (borderInterpolate101(2 * srcSize.width + 0, 2 * srcSize.width)/2) * cn;
870
size_t idx_r2 = (borderInterpolate101(2 * srcSize.width + 2, 2 * srcSize.width + 2)/2) * cn;
871
872
//2-lines buffer
873
std::vector<u16> _buf(2*(cn*(srcSize.width + 3) + 32/sizeof(u16)));
874
u16* lane0 = internal::alignPtr(&_buf[cn], 32);
875
u16* lane1 = internal::alignPtr(lane0 + (3 + srcSize.width)*cn, 32);
876
877
uint8x8_t vc6u8 = vmov_n_u8(6);
878
uint16x8_t vc6u16 = vmovq_n_u16(6);
879
880
for (size_t i = 0; i < (dstSize.height + 1)/2; ++i)
881
{
882
u8* dst = internal::getRowPtr(dstBase, dstStride, 2*i);
883
//vertical convolution
884
const u8* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 - 2, srcSize.height * 2)/2);
885
const u8* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 + 0, srcSize.height * 2)/2);
886
const u8* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 + 2, srcSize.height * 2)/2);
887
888
size_t x = 0;
889
for (; x <= scolsn - 8; x += 8)
890
{
891
internal::prefetch(internal::getRowPtr(ln1 + x, srcStride, (ptrdiff_t)x % 3 - 1));
892
uint8x8_t v0 = vld1_u8(ln0+x);
893
uint8x8_t v2 = vld1_u8(ln2+x);
894
uint8x8_t v1 = vld1_u8(ln1+x);
895
896
uint16x8_t vl0 = vaddl_u8(v0, v2);
897
uint16x8_t vl1 = vaddl_u8(v1, v2);
898
899
vl0 = vmlal_u8(vl0, v1, vc6u8);
900
vl1 = vshlq_n_u16(vl1, 2);
901
902
vst1q_u16(lane0 + x, vl0);
903
vst1q_u16(lane1 + x, vl1);
904
}
905
for (; x < scolsn; ++x)
906
{
907
lane0[x] = ln0[x] + ln2[x] + 6u * ln1[x];
908
lane1[x] = 4u * (ln1[x] + ln2[x]);
909
}
910
911
//left&right borders
912
for (u32 k = 0; k < cn; ++k)
913
{
914
lane0[(s32)(-cn+k)] = lane0[idx_l + k];
915
lane1[(s32)(-cn+k)] = lane1[idx_l + k];
916
917
lane0[scolsn+k] = lane0[idx_r1 + k];
918
lane0[scolsn+cn+k] = lane0[idx_r2 + k];
919
lane1[scolsn+k] = lane1[idx_r1 + k];
920
lane1[scolsn+cn+k] = lane1[idx_r2 + k];
921
}
922
923
//horizontal convolution
924
const u16* lane = lane0;
925
pyrUp8uHorizontalConvolution:
926
x = 0;
927
size_t lim;
928
switch(cn)
929
{
930
case 1:
931
lim = dcolshn > 7 ? dcolshn - 7 : 0;
932
for (; x < lim; x += 8)
933
{
934
internal::prefetch(lane + x);
935
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
936
__asm__ (
937
"vld1.16 {d0-d1}, [%[in0]] /*q0 = v0*/ \n\t"
938
"vld1.16 {d2-d3}, [%[in2]] /*q1 = v2*/ \n\t"
939
"vld1.16 {d4-d5}, [%[in1],:128] /*q2 = v1*/ \n\t"
940
"vadd.i16 q0, q1 /*q0 = v0 + v2*/ \n\t"
941
"vadd.i16 q3, q1, q2 /*q3 = v1 + v2*/ \n\t"
942
"vmla.i16 q0, q2, %q[c6] /*q0 += v1*6*/ \n\t"
943
"vrshrn.u16 d9, q3, #4 \n\t"
944
"vrshrn.u16 d8, q0, #6 \n\t"
945
"vst2.8 {d8-d9}, [%[out]] \n\t"
946
: /*no output*/
947
: [out] "r" (dst + x*2),
948
[in0] "r" (lane + x - 1),
949
[in1] "r" (lane + x + 0),
950
[in2] "r" (lane + x + 1),
951
[c6] "w" (vc6u16)
952
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9"
953
);
954
#else
955
uint16x8_t vLane0 = vld1q_u16(lane + x - 1);
956
uint16x8_t vLane1 = vld1q_u16(lane + x + 0);
957
uint16x8_t vLane2 = vld1q_u16(lane + x + 1);
958
959
vLane0 = vaddq_u16(vLane0, vLane2);
960
vLane2 = vaddq_u16(vLane2, vLane1);
961
vLane0 = vmlaq_u16(vLane0, vLane1, vc6u16);
962
uint8x8x2_t vRes;
963
vRes.val[0] = vrshrn_n_u16(vLane0, 6);
964
vRes.val[1] = vrshrn_n_u16(vLane2, 4);
965
966
vst2_u8(dst + x*2, vRes);
967
#endif
968
}
969
break;
970
case 3:
971
{
972
lim = dcolshn > 23 ? dcolshn - 23 : 0;
973
for (; x < lim; x += 24)
974
{
975
internal::prefetch(lane + x);
976
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
977
__asm__ (
978
"vmov.u16 q9, #6 \n\t"
979
"vld3.16 {d0, d2, d4}, [%[in0]] /*v0*/ \n\t"
980
"vld3.16 {d1, d3, d5}, [%[in02]] \n\t"
981
"vld3.16 {d6, d8, d10}, [%[in2]] /*v2*/ \n\t"
982
"vld3.16 {d7, d9, d11}, [%[in22]] \n\t"
983
"vld3.16 {d12, d14, d16}, [%[in1]] /*v1*/ \n\t"
984
"vld3.16 {d13, d15, d17}, [%[in12]] \n\t"
985
"vadd.i16 q0, q3 /*v0 + v2*/ \n\t"
986
"vadd.i16 q1, q4 /*v0 + v2*/ \n\t"
987
"vadd.i16 q2, q5 /*v0 + v2*/ \n\t"
988
"vadd.i16 q3, q6 /*v1 + v2*/ \n\t"
989
"vadd.i16 q4, q7 /*v1 + v2*/ \n\t"
990
"vadd.i16 q5, q8 /*v1 + v2*/ \n\t"
991
"vmla.i16 q0, q6, q9 /*v0 + v2 + v1*6 */ \n\t"
992
"vmla.i16 q1, q7, q9 /*v0 + v2 + v1*6 */ \n\t"
993
"vmla.i16 q2, q8, q9 /*v0 + v2 + v1*6 */ \n\t"
994
"vrshrn.u16 d19, q3, #4 \n\t"
995
"vrshrn.u16 d21, q4, #4 \n\t"
996
"vrshrn.u16 d23, q5, #4 \n\t"
997
"vrshrn.u16 d18, q0, #6 \n\t"
998
"vrshrn.u16 d20, q1, #6 \n\t"
999
"vrshrn.u16 d22, q2, #6 \n\t"
1000
"vzip.8 d18, d19 \n\t"
1001
"vzip.8 d20, d21 \n\t"
1002
"vzip.8 d22, d23 \n\t"
1003
"vst3.8 {d18, d20, d22}, [%[out1]] \n\t"
1004
"vst3.8 {d19, d21, d23}, [%[out2]] \n\t"
1005
: /*no output*/
1006
: [out1] "r" (dst + 2 * x),
1007
[out2] "r" (dst + 2 * x + 24),
1008
[in0] "r" (lane + x - 3),
1009
[in02] "r" (lane + x + 9),
1010
[in1] "r" (lane + x),
1011
[in12] "r" (lane + x + 12),
1012
[in2] "r" (lane + x + 3),
1013
[in22] "r" (lane + x + 15)
1014
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"
1015
);
1016
#else
1017
uint16x8_t vc6 = vmovq_n_u16(6);
1018
uint16x8x3_t vLane0 = vld3q_u16(lane + x - 3);
1019
uint16x8x3_t vLane1 = vld3q_u16(lane + x + 0);
1020
uint16x8x3_t vLane2 = vld3q_u16(lane + x + 3);
1021
1022
uint16x8_t vSum_0_3 = vaddq_u16(vLane0.val[0], vLane2.val[0]);
1023
uint16x8_t vSum_1_4 = vaddq_u16(vLane0.val[1], vLane2.val[1]);
1024
uint16x8_t vSum_2_5 = vaddq_u16(vLane0.val[2], vLane2.val[2]);
1025
uint16x8_t vSum_3_6 = vaddq_u16(vLane2.val[0], vLane1.val[0]);
1026
uint16x8_t vSum_4_7 = vaddq_u16(vLane2.val[1], vLane1.val[1]);
1027
uint16x8_t vSum_5_8 = vaddq_u16(vLane2.val[2], vLane1.val[2]);
1028
1029
vSum_0_3 = vmlaq_u16(vSum_0_3, vLane1.val[0], vc6);
1030
vSum_1_4 = vmlaq_u16(vSum_1_4, vLane1.val[1], vc6);
1031
vSum_2_5 = vmlaq_u16(vSum_2_5, vLane1.val[2], vc6);
1032
1033
uint8x8x2_t vSumShr3;
1034
vSumShr3.val[0] = vrshrn_n_u16(vSum_3_6, 4);
1035
vSumShr3.val[1] = vrshrn_n_u16(vSum_0_3, 6);;
1036
uint8x8x2_t vSumShr4;
1037
vSumShr4.val[0] = vrshrn_n_u16(vSum_4_7, 4);
1038
vSumShr4.val[1] = vrshrn_n_u16(vSum_1_4, 6);
1039
uint8x8x2_t vSumShr5;
1040
vSumShr5.val[0] = vrshrn_n_u16(vSum_5_8, 4);
1041
vSumShr5.val[1] = vrshrn_n_u16(vSum_2_5, 6);
1042
1043
vSumShr3 = vzip_u8(vSumShr3.val[1], vSumShr3.val[0]);
1044
vSumShr4 = vzip_u8(vSumShr4.val[1], vSumShr4.val[0]);
1045
vSumShr5 = vzip_u8(vSumShr5.val[1], vSumShr5.val[0]);
1046
1047
uint8x8x3_t vRes1;
1048
vRes1.val[0] = vSumShr3.val[0];
1049
vRes1.val[1] = vSumShr4.val[0];
1050
vRes1.val[2] = vSumShr5.val[0];
1051
vst3_u8(dst + 2 * x, vRes1);
1052
1053
uint8x8x3_t vRes2;
1054
vRes2.val[0] = vSumShr3.val[1];
1055
vRes2.val[1] = vSumShr4.val[1];
1056
vRes2.val[2] = vSumShr5.val[1];
1057
vst3_u8(dst + 2 * x + 24, vRes2);
1058
#endif
1059
}
1060
}
1061
break;
1062
case 4:
1063
lim = dcolshn > 7 ? dcolshn - 7 : 0;
1064
for (; x < lim; x += 8)
1065
{
1066
internal::prefetch(lane + x);
1067
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
1068
__asm__ (
1069
"vld1.16 {d0-d1}, [%[in0]] /*q0 = v0*/ \n\t"
1070
"vld1.16 {d2-d3}, [%[in2]] /*q1 = v2*/ \n\t"
1071
"vld1.16 {d4-d5}, [%[in1],:128] /*q2 = v1*/ \n\t"
1072
"vadd.i16 q0, q1 /*q0 = v0 + v2*/ \n\t"
1073
"vadd.i16 q3, q1, q2 /*q3 = v1 + v2*/ \n\t"
1074
"vmla.i16 q0, q2, %q[c6] /*q0 += v1*6*/ \n\t"
1075
"vrshrn.u16 d9, q3, #4 \n\t"
1076
"vrshrn.u16 d8, q0, #6 \n\t"
1077
"vst2.32 {d8-d9}, [%[out]] \n\t"
1078
: /*no output*/
1079
: [out] "r" (dst + x*2),
1080
[in0] "r" (lane + x-4),
1081
[in1] "r" (lane + x),
1082
[in2] "r" (lane + x+4),
1083
[c6] "w" (vc6u16)
1084
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9"
1085
);
1086
#else
1087
uint16x8_t vLane0 = vld1q_u16(lane + x-4);
1088
uint16x8_t vLane1 = vld1q_u16(lane + x+0);
1089
uint16x8_t vLane2 = vld1q_u16(lane + x+4);
1090
1091
vLane0 = vaddq_u16(vLane0, vLane2);
1092
vLane2 = vaddq_u16(vLane2, vLane1);
1093
vLane0 = vmlaq_u16(vLane0, vLane1, vc6u16);
1094
uint32x2x2_t vRes;
1095
vRes.val[1] = vreinterpret_u32_u8(vrshrn_n_u16(vLane2, 4));
1096
vRes.val[0] = vreinterpret_u32_u8(vrshrn_n_u16(vLane0, 6));
1097
1098
vst2_u32((uint32_t*)(dst + x*2), vRes);
1099
#endif
1100
}
1101
break;
1102
};
1103
1104
for (u32 h = 0; h < cn; ++h)
1105
{
1106
const u16* ln = lane + h;
1107
u8* dt = dst + h;
1108
size_t k = x;
1109
for (; k < dcolshn; k += cn)
1110
{
1111
dt[2*k+0] = u8((ln[(ptrdiff_t)(k-cn)] + ln[k+cn] + 6u * ln[k] + (1 << 5)) >> 6);
1112
dt[2*k+cn] = u8(((ln[k] + ln[k+cn]) * 4u + (1 << 5)) >> 6);
1113
}
1114
for (; k < dcolshw; k += cn)
1115
dt[2*k] = u8((ln[(ptrdiff_t)(k-cn)] + ln[k+cn] + 6u * ln[k] + (1 << 5)) >> 6);
1116
}
1117
dst = internal::getRowPtr(dstBase, dstStride, 2*i+1);
1118
1119
//second row
1120
if (lane == lane0 && 2*i+1 < dstSize.height)
1121
{
1122
lane = lane1;
1123
goto pyrUp8uHorizontalConvolution;
1124
}
1125
}
1126
#else
1127
// Remove 'unused parameter' warnings.
1128
(void)srcBase;
1129
(void)srcStride;
1130
(void)dstBase;
1131
(void)dstStride;
1132
#endif
1133
}
1134
1135
/*
 * Gaussian pyramid up-sampling (pyrUp) for s16 images, NEON accelerated.
 *
 * srcBase/srcStride : source image (roughly half the destination size)
 * dstBase/dstStride : destination image
 * cn                : channel count (values accepted by the capability check)
 *
 * For each pair of destination rows the function:
 *   1) runs the vertical 1-6-1 (even row) / 4-4 (odd row) convolution into
 *      two s32 scratch lines, lane0 and lane1, keeping sums unnormalised;
 *   2) mirrors the scratch-line borders (reflect-101);
 *   3) runs the horizontal convolution over lane0, then jumps back via the
 *      goto/label to repeat it over lane1 for the odd output row; results
 *      are rounded and normalised with >>6 (even) and >>4 (odd) narrowing.
 */
void gaussianPyramidUp(const Size2D &srcSize,
                       const s16 *srcBase, ptrdiff_t srcStride,
                       const Size2D &dstSize,
                       s16 *dstBase, ptrdiff_t dstStride, u8 cn)
{
    internal::assertSupportedConfiguration(isGaussianPyramidUpS16Supported(srcSize, dstSize, cn));
#ifdef CAROTENE_NEON
    // Destination row lengths in elements (pixels * cn): dcolshn is the part
    // handled as even/odd column pairs; dcolshw additionally covers the final
    // even column when dstSize.width is odd.
    size_t dcolshn = (dstSize.width/2) * cn;
    size_t dcolshw = ((dstSize.width+1)/2) * cn;
    size_t scolsn = srcSize.width*cn; // source row length in elements

    // Reflect-101 border source indices, computed in up-sampled (2x)
    // coordinates and divided back down to source columns.
    size_t idx_l = (borderInterpolate101(-2, 2 * srcSize.width)/2) * cn;
    size_t idx_r1 = (borderInterpolate101(2 * srcSize.width + 0, 2 * srcSize.width)/2) * cn;
    // NOTE(review): the length argument here is 2*srcSize.width + 2, unlike
    // the 2*srcSize.width used for idx_l/idx_r1; with this length idx_r2
    // resolves to scolsn, i.e. the border cell that is filled from idx_r1
    // just before it is read below. Confirm against the u8 variant whether
    // this asymmetry is intentional.
    size_t idx_r2 = (borderInterpolate101(2 * srcSize.width + 2, 2 * srcSize.width + 2)/2) * cn;

    //2-lines buffer (s32 holds the unnormalised 1-6-1 sums); each lane keeps
    //extra elements around the payload for the mirrored borders.
    std::vector<s32> _buf(2*(cn*(srcSize.width + 3) + 32/sizeof(s32)));
    s32* lane0 = internal::alignPtr(&_buf[cn], 32);
    s32* lane1 = internal::alignPtr(lane0 + (3 + srcSize.width)*cn, 32);

    int16x4_t vc6s16 = vmov_n_s16(6);  // kernel centre weight, vertical pass
    int32x4_t vc6s32 = vmovq_n_s32(6); // kernel centre weight, horizontal pass

    for (size_t i = 0; i < (dstSize.height + 1)/2; ++i)
    {
        s16* dst = internal::getRowPtr(dstBase, dstStride, 2*i);
        //vertical convolution: neighbour rows selected via reflect-101 on the
        //up-sampled row index (2*i - 2, 2*i, 2*i + 2)
        const s16* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 - 2, srcSize.height * 2)/2);
        const s16* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 + 0, srcSize.height * 2)/2);
        const s16* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 + 2, srcSize.height * 2)/2);

        size_t x = 0;
        for (; x <= scolsn - 4; x += 4)
        {
            internal::prefetch(internal::getRowPtr(ln1 + x, srcStride, (ptrdiff_t)x % 3 - 1));
            int16x4_t v0 = vld1_s16(ln0 + x);
            int16x4_t v2 = vld1_s16(ln2 + x);
            int16x4_t v1 = vld1_s16(ln1 + x);

            // lane0 = v0 + 6*v1 + v2  (even output row, unnormalised)
            // lane1 = 4*(v1 + v2)     (odd output row, unnormalised)
            int32x4_t vl0 = vaddl_s16(v0, v2);
            int32x4_t vl1 = vaddl_s16(v1, v2);

            vl0 = vmlal_s16(vl0, v1, vc6s16);
            vl1 = vshlq_n_s32(vl1, 2);

            vst1q_s32(lane0 + x, vl0);
            vst1q_s32(lane1 + x, vl1);
        }
        for (; x < scolsn; ++x) // scalar tail of the vertical pass
        {
            lane0[x] = ln0[x] + ln2[x] + 6 * ln1[x];
            lane1[x] = 4 * (ln1[x] + ln2[x]);
        }

        //left&right borders: per-channel mirroring of the scratch lines
        for (u32 k = 0; k < cn; ++k)
        {
            lane0[(s32)(-cn+k)] = lane0[idx_l + k];
            lane1[(s32)(-cn+k)] = lane1[idx_l + k];

            lane0[scolsn+k] = lane0[idx_r1 + k];
            lane0[scolsn+cn+k] = lane0[idx_r2 + k];
            lane1[scolsn+k] = lane1[idx_r1 + k];
            lane1[scolsn+cn+k] = lane1[idx_r2 + k];
        }

        //horizontal convolution; executed twice per iteration (lane0, then
        //lane1 via the goto at the bottom of the loop body)
        const s32* lane = lane0;
pyrUp16sHorizontalConvolution:
        x = 0;
        size_t lim;
        switch(cn)
        {
        case 1:
            lim = dcolshn > 3 ? dcolshn - 3 : 0;
            for (; x < lim; x += 4)
            {
                internal::prefetch(lane + x);
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
                __asm__ (
                    "vld1.32 {d0-d1}, [%[in0]] /*q0 = v0*/ \n\t"
                    "vld1.32 {d2-d3}, [%[in2]] /*q1 = v2*/ \n\t"
                    "vld1.32 {d4-d5}, [%[in1],:128] /*q2 = v1*/ \n\t"
                    "vadd.i32 q0, q0, q1 /*q0 = v0 + v2*/ \n\t"
                    "vadd.i32 q3, q1, q2 /*q3 = v1 + v2*/ \n\t"
                    "vmla.i32 q0, q2, %q[c6] /*q0 += v1*6*/ \n\t"
                    "vrshrn.s32 d9, q3, #4 \n\t"
                    "vrshrn.s32 d8, q0, #6 \n\t"
                    "vst2.16 {d8-d9}, [%[out]] \n\t"
                    : /*no output*/
                    : [out] "r" (dst + x * 2),
                      [in0] "r" (lane + x - 1),
                      [in1] "r" (lane + x),
                      [in2] "r" (lane + x + 1),
                      [c6] "w" (vc6s32)
                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9"
                );
#else
                int32x4_t vLane0 = vld1q_s32(lane + x - 1);
                int32x4_t vLane1 = vld1q_s32(lane + x);
                int32x4_t vLane2 = vld1q_s32(lane + x + 1);

                vLane0 = vaddq_s32(vLane0, vLane2);         // l[-1] + l[+1]
                vLane2 = vaddq_s32(vLane2, vLane1);         // l[0] + l[+1] (odd columns)
                vLane0 = vmlaq_s32(vLane0, vLane1, vc6s32); // += 6*l[0] (even columns)
                int16x4x2_t vRes;
                vRes.val[0] = vrshrn_n_s32(vLane0, 6); // even columns, rounded
                vRes.val[1] = vrshrn_n_s32(vLane2, 4); // odd columns: (l0+l1)*4 >> 6

                vst2_s16(dst + x * 2, vRes); // interleave even/odd columns
#endif
            }
            break;
        case 3:
        {
            lim = dcolshn > 11 ? dcolshn - 11 : 0;
            for (; x < lim; x += 12)
            {
                internal::prefetch(lane + x + 3);
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
                // The second 4-pixel half of v0 starts at lane + x - 3 + 6,
                // which equals [in2] (lane + x + 3), so [in2] is used for
                // both that half-load and the first half of v2.
                __asm__ (
                    "vmov.s32 q9, #6 \n\t"
                    "vld3.32 {d0, d2, d4}, [%[in0]] /*v0*/ \n\t"
                    "vld3.32 {d1, d3, d5}, [%[in2]] \n\t"
                    "vld3.32 {d6, d8, d10}, [%[in2]] /*v2*/ \n\t"
                    "vld3.32 {d7, d9, d11}, [%[in22]] \n\t"
                    "vld3.32 {d12, d14, d16}, [%[in1]] /*v1*/ \n\t"
                    "vld3.32 {d13, d15, d17}, [%[in12]] \n\t"
                    "vadd.i32 q0, q3 /*v0 + v2*/ \n\t"
                    "vadd.i32 q1, q4 /*v0 + v2*/ \n\t"
                    "vadd.i32 q2, q5 /*v0 + v2*/ \n\t"
                    "vadd.i32 q3, q6 /*v1 + v2*/ \n\t"
                    "vadd.i32 q4, q7 /*v1 + v2*/ \n\t"
                    "vadd.i32 q5, q8 /*v1 + v2*/ \n\t"
                    "vmla.i32 q0, q6, q9 /*v0 + v2 + v1*6 */ \n\t"
                    "vmla.i32 q1, q7, q9 /*v0 + v2 + v1*6 */ \n\t"
                    "vmla.i32 q2, q8, q9 /*v0 + v2 + v1*6 */ \n\t"
                    "vrshrn.s32 d19, q3, #4 \n\t"
                    "vrshrn.s32 d21, q4, #4 \n\t"
                    "vrshrn.s32 d23, q5, #4 \n\t"
                    "vrshrn.s32 d18, q0, #6 \n\t"
                    "vrshrn.s32 d20, q1, #6 \n\t"
                    "vrshrn.s32 d22, q2, #6 \n\t"
                    "vzip.16 d18, d19 \n\t"
                    "vzip.16 d20, d21 \n\t"
                    "vzip.16 d22, d23 \n\t"
                    "vst3.16 {d18, d20, d22}, [%[out1]] \n\t"
                    "vst3.16 {d19, d21, d23}, [%[out2]] \n\t"
                    : /*no output*/
                    : [out1] "r" (dst + 2*x),
                      [out2] "r" (dst + 2*x + 12),
                      [in0] "r" (lane + x - 3),
                      [in1] "r" (lane + x),
                      [in12] "r" (lane + x + 6),
                      [in2] "r" (lane + x + 3),
                      [in22] "r" (lane + x + 9)
                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"
                );
#else
                int32x4_t vc6 = vmovq_n_s32(6);
                // De-interleaved loads: val[0..2] hold the three channels.
                int32x4x3_t vLane0 = vld3q_s32(lane + x - 3);
                int32x4x3_t vLane1 = vld3q_s32(lane + x);
                int32x4x3_t vLane2 = vld3q_s32(lane + x + 3);

                int32x4_t vSum_0_3 = vaddq_s32(vLane0.val[0], vLane2.val[0]); // l[-1]+l[+1], per channel
                int32x4_t vSum_1_4 = vaddq_s32(vLane0.val[1], vLane2.val[1]);
                int32x4_t vSum_2_5 = vaddq_s32(vLane0.val[2], vLane2.val[2]);
                int32x4_t vSum_3_6 = vaddq_s32(vLane2.val[0], vLane1.val[0]); // l[0]+l[+1], per channel
                int32x4_t vSum_4_7 = vaddq_s32(vLane2.val[1], vLane1.val[1]);
                int32x4_t vSum_5_8 = vaddq_s32(vLane2.val[2], vLane1.val[2]);

                vSum_0_3 = vmlaq_s32(vSum_0_3, vLane1.val[0], vc6); // += 6*l[0]
                vSum_1_4 = vmlaq_s32(vSum_1_4, vLane1.val[1], vc6);
                vSum_2_5 = vmlaq_s32(vSum_2_5, vLane1.val[2], vc6);

                // Narrow with rounding: val[0] = even columns, val[1] = odd.
                int16x4x2_t vSumShr1;
                vSumShr1.val[1] = vrshrn_n_s32(vSum_3_6, 4);
                vSumShr1.val[0] = vrshrn_n_s32(vSum_0_3, 6);

                int16x4x2_t vSumShr2;
                vSumShr2.val[1] = vrshrn_n_s32(vSum_4_7, 4);
                vSumShr2.val[0] = vrshrn_n_s32(vSum_1_4, 6);

                int16x4x2_t vSumShr3;
                vSumShr3.val[1] = vrshrn_n_s32(vSum_5_8, 4);
                vSumShr3.val[0] = vrshrn_n_s32(vSum_2_5, 6);

                // Interleave even/odd results per channel before the stores.
                vSumShr1 = vzip_s16(vSumShr1.val[0], vSumShr1.val[1]);
                vSumShr2 = vzip_s16(vSumShr2.val[0], vSumShr2.val[1]);
                vSumShr3 = vzip_s16(vSumShr3.val[0], vSumShr3.val[1]);

                int16x4x3_t vRes1;
                vRes1.val[0] = vSumShr1.val[0];
                vRes1.val[1] = vSumShr2.val[0];
                vRes1.val[2] = vSumShr3.val[0];
                vst3_s16((int16_t*)(dst + 2 * x), vRes1);

                int16x4x3_t vRes2;
                vRes2.val[0] = vSumShr1.val[1];
                vRes2.val[1] = vSumShr2.val[1];
                vRes2.val[2] = vSumShr3.val[1];
                vst3_s16(dst + 2 * x + 12, vRes2);
#endif
            }
        }
            break;
        case 4:
            lim = dcolshn > 3 ? dcolshn - 3 : 0;
            for (; x < lim; x += 4)
            {
                internal::prefetch(lane + x);
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
                __asm__ (
                    "vld1.32 {d0-d1}, [%[in0]] /*q0 = v0*/ \n\t"
                    "vld1.32 {d2-d3}, [%[in2]] /*q1 = v2*/ \n\t"
                    "vld1.32 {d4-d5}, [%[in1],:128] /*q2 = v1*/ \n\t"
                    "vadd.i32 q0, q1 /*q0 = v0 + v2*/ \n\t"
                    "vadd.i32 q3, q1, q2 /*q3 = v1 + v2*/ \n\t"
                    "vmla.i32 q0, q2, %q[c6] /*q0 += v1*6*/ \n\t"
                    "vrshrn.s32 d9, q3, #4 \n\t"
                    "vrshrn.s32 d8, q0, #6 \n\t"
                    "vst1.16 {d8-d9}, [%[out]] \n\t"
                    : /*no output*/
                    : [out] "r" (dst + x * 2),
                      [in0] "r" (lane + x - 4),
                      [in1] "r" (lane + x),
                      [in2] "r" (lane + x + 4),
                      [c6] "w" (vc6s32)
                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9"
                );
#else
                int32x4_t vLane0 = vld1q_s32(lane + x - 4);
                int32x4_t vLane1 = vld1q_s32(lane + x);
                int32x4_t vLane2 = vld1q_s32(lane + x + 4);

                vLane0 = vaddq_s32(vLane0, vLane2);
                vLane2 = vaddq_s32(vLane2, vLane1);
                vLane0 = vmlaq_s32(vLane0, vLane1, vc6s32);
                int16x4x2_t vRes;
                vRes.val[0] = vrshrn_n_s32(vLane0, 6);
                vRes.val[1] = vrshrn_n_s32(vLane2, 4);

                // cn == 4: one even pixel then one odd pixel are 4 contiguous
                // elements each, so a plain (non-interleaving) store suffices.
                vst1q_s16(dst + x * 2, vcombine_s16(vRes.val[0], vRes.val[1]));
#endif
            }
            break;
        };

        // Scalar tail of the horizontal pass: remaining even/odd column pairs,
        // then the final even column when the destination width is odd.
        for (u32 h = 0; h < cn; ++h)
        {
            const s32* ln = lane + h;
            s16* dt = dst + h;
            size_t k = x;
            for (; k < dcolshn; k += cn)
            {
                dt[2*k+0] = s16((ln[(ptrdiff_t)(k-cn)] + ln[k+cn] + 6 * ln[k] + (1 << 5)) >> 6);
                dt[2*k+cn] = s16(((ln[k] + ln[k+cn]) * 4 + (1 << 5)) >> 6);
            }
            for (; k < dcolshw; k += cn)
                dt[2*k] = s16((ln[(ptrdiff_t)(k-cn)] + ln[k+cn] + 6 * ln[k] + (1 << 5)) >> 6);
        }
        dst = internal::getRowPtr(dstBase, dstStride, 2*i+1);

        //second row: redo the horizontal pass over lane1 for the odd output row
        if (lane == lane0 && 2*i+1 < dstSize.height)
        {
            lane = lane1;
            goto pyrUp16sHorizontalConvolution;
        }
    }
#else
    // Remove 'unused parameter' warnings.
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
#endif
}
1413
1414
} // namespace CAROTENE_NS
1415
1416