Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/3rdparty/carotene/src/resize.cpp
16337 views
1
/*
2
* By downloading, copying, installing or using the software you agree to this license.
3
* If you do not agree to this license, do not download, install,
4
* copy or use the software.
5
*
6
*
7
* License Agreement
8
* For Open Source Computer Vision Library
9
* (3-clause BSD License)
10
*
11
* Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
12
* Third party copyrights are property of their respective owners.
13
*
14
* Redistribution and use in source and binary forms, with or without modification,
15
* are permitted provided that the following conditions are met:
16
*
17
* * Redistributions of source code must retain the above copyright notice,
18
* this list of conditions and the following disclaimer.
19
*
20
* * Redistributions in binary form must reproduce the above copyright notice,
21
* this list of conditions and the following disclaimer in the documentation
22
* and/or other materials provided with the distribution.
23
*
24
* * Neither the names of the copyright holders nor the names of the contributors
25
* may be used to endorse or promote products derived from this software
26
* without specific prior written permission.
27
*
28
* This software is provided by the copyright holders and contributors "as is" and
29
* any express or implied warranties, including, but not limited to, the implied
30
* warranties of merchantability and fitness for a particular purpose are disclaimed.
31
* In no event shall copyright holders or contributors be liable for any direct,
32
* indirect, incidental, special, exemplary, or consequential damages
33
* (including, but not limited to, procurement of substitute goods or services;
34
* loss of use, data, or profits; or business interruption) however caused
35
* and on any theory of liability, whether in contract, strict liability,
36
* or tort (including negligence or otherwise) arising in any way out of
37
* the use of this software, even if advised of the possibility of such damage.
38
*/
39
40
#include "common.hpp"
41
#include "vtransform.hpp"
42
43
#include <cmath>
44
#include <vector>
45
#include <algorithm>
46
47
namespace CAROTENE_NS {
48
49
bool isResizeNearestNeighborSupported(const Size2D &ssize, u32 elemSize)
50
{
51
#if SIZE_MAX <= UINT32_MAX
52
(void)ssize;
53
#endif
54
bool supportedElemSize = (elemSize == 1) || (elemSize == 3) || (elemSize == 4);
55
return isSupportedConfiguration()
56
#if SIZE_MAX > UINT32_MAX
57
&& !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF)// Restrict image size since internally used resizeGeneric performs
58
// index evaluation with u32
59
#endif
60
&& supportedElemSize;
61
}
62
63
bool isResizeAreaSupported(f32 wr, f32 hr, u32 channels)
{
    // The area implementation only handles uniform scaling (hr == wr) by
    // 2x or 4x (downsampling) or 0.5x (upsampling) for 1-, 3- and 4-channel
    // images. The original code duplicated the identical ratio check per
    // channel count (and the 1-channel branch used an inconsistent plain
    // 0.5 double literal; 0.5 is exactly representable in f32, so collapsing
    // to 0.5f does not change behavior).
    bool supportedChannels = (channels == 1) || (channels == 3) || (channels == 4);
    bool supportedRatio = supportedChannels && (hr == wr) &&
                          ((wr == 2.0f) || (wr == 4.0f) || (wr == 0.5f));

    return isSupportedConfiguration() && supportedRatio;
}
76
77
bool isResizeLinearSupported(const Size2D &ssize, const Size2D &dsize,
78
f32 wr, f32 hr, u32 channels)
79
{
80
if ((wr <= 2.0f) && (hr <= 2.0f))
81
{
82
bool channelsSupport = (channels == 1) || (channels == 3) || (channels == 4);
83
return (ssize.width >= 16) && (dsize.height >= 8) &&
84
(dsize.width >= 8) && channelsSupport;
85
}
86
87
return false;
88
}
89
90
bool isResizeLinearOpenCVSupported(const Size2D &ssize, const Size2D &dsize, u32 channels)
91
{
92
switch(channels)
93
{
94
case 1:
95
if (ssize.width >= 8
96
#if SIZE_MAX > UINT32_MAX
97
&& !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF)// Restrict image size since internal index evaluation
98
// is performed with u32
99
#endif
100
&& dsize.width >= 8 && dsize.height >= 8)
101
return isSupportedConfiguration();
102
return false;
103
case 4:
104
if (ssize.width >= 2
105
#if SIZE_MAX > UINT32_MAX
106
&& !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF)// Restrict image size since internal index evaluation
107
// is performed with u32
108
#endif
109
&& dsize.width >= 2 && dsize.height >= 8
110
&& (2*dsize.width != ssize.width || 2*dsize.height != ssize.height)) // 2x downscaling is performed as area in OpenCV which differs from this implementation
111
return isSupportedConfiguration();
112
default:
113
return false;
114
};
115
}
116
117
#ifdef CAROTENE_NEON
118
119
namespace {
120
121
// Fills _ofs with one source-column index per destination column for
// nearest-neighbor resize: ofs[x] = floor((x + 0.5) * ratio).
// Returns a raw pointer into _ofs for convenient indexing by the caller.
// NOTE(review): indices are computed in u32, so results are only valid while
// (size + 0.5) * ratio fits in 32 bits — callers gate this via the
// isResize*Supported checks.
u32 * calcLUT(size_t size, f32 ratio,
              std::vector<u32> & _ofs)
{
    _ofs.resize(size);
    u32 * ofs = &_ofs[0];

    // Vector loop bounds: roiw8/roiw4 guarantee 8 (resp. 4) elements remain.
    size_t roiw8 = size >= 7 ? size - 7 : 0;
    size_t roiw4 = size >= 3 ? size - 3 : 0;
    size_t x = 0;

    f32 indeces[4] = { 0, 1, 2, 3 };
    float32x4_t v_index = vld1q_f32(indeces), v_inc = vdupq_n_f32(4);
    float32x4_t v_05 = vdupq_n_f32(0.5f), v_ratio = vdupq_n_f32(ratio);

    // Two unrolled 4-lane iterations per pass: (index + 0.5) * ratio,
    // then vcvtq_u32_f32 truncates toward zero — equal to floor for the
    // non-negative values produced here.
    for ( ; x < roiw8; x += 8)
    {
        float32x4_t v_dstf = vmulq_f32(vaddq_f32(v_index, v_05), v_ratio);
        vst1q_u32(ofs + x, vcvtq_u32_f32(v_dstf));
        v_index = vaddq_f32(v_index, v_inc);

        v_dstf = vmulq_f32(vaddq_f32(v_index, v_05), v_ratio);
        vst1q_u32(ofs + x + 4, vcvtq_u32_f32(v_dstf));
        v_index = vaddq_f32(v_index, v_inc);
    }

    // Single 4-lane pass for a remaining group of at least 4 elements.
    for ( ; x < roiw4; x += 4)
    {
        float32x4_t v_dstf = vmulq_f32(vaddq_f32(v_index, v_05), v_ratio);
        vst1q_u32(ofs + x, vcvtq_u32_f32(v_dstf));
        v_index = vaddq_f32(v_index, v_inc);
    }

    // Scalar tail: same mapping with an explicit floorf.
    for ( ; x < size; ++x)
    {
        ofs[x] = static_cast<u32>(floorf((x + 0.5f) * ratio));
    }

    return ofs;
}
160
161
template <typename T>
162
void resizeGeneric(const Size2D &dsize,
163
const void * srcBase, ptrdiff_t srcStride,
164
void * dstBase, ptrdiff_t dstStride,
165
f32 wr, f32 hr)
166
{
167
std::vector<u32> _x_ofs;
168
u32 * x_ofs = calcLUT(dsize.width, wr, _x_ofs);//32bit LUT is used so we could get issues on src image dimensions greater than (2^32-1)
169
170
for (size_t dst_y = 0; dst_y < dsize.height; ++dst_y)
171
{
172
size_t src_y = static_cast<size_t>(floorf((dst_y + 0.5f) * hr));
173
const T * src = internal::getRowPtr(static_cast<const T *>(srcBase), srcStride, src_y);
174
T * dst = internal::getRowPtr(static_cast<T *>(dstBase), dstStride, dst_y);
175
176
for (size_t dst_x = 0; dst_x < dsize.width; ++dst_x)
177
{
178
internal::prefetch(src + dst_x);
179
dst[dst_x] = src[x_ofs[dst_x]];
180
}
181
}
182
}
183
184
typedef struct _24bit_
185
{
186
u8 a[3];
187
} _24bit;
188
189
} // namespace
190
191
192
#endif
193
194
void resizeNearestNeighbor(const Size2D &ssize, const Size2D &dsize,
195
const void * srcBase, ptrdiff_t srcStride,
196
void * dstBase, ptrdiff_t dstStride,
197
f32 wr, f32 hr, u32 elemSize)
198
{
199
internal::assertSupportedConfiguration(wr > 0 && hr > 0 &&
200
(dsize.width - 0.5) * wr < ssize.width &&
201
(dsize.height - 0.5) * hr < ssize.height && // Ensure we have enough source data
202
(dsize.width + 0.5) * wr >= ssize.width &&
203
(dsize.height + 0.5) * hr >= ssize.height && // Ensure source isn't too big
204
isResizeNearestNeighborSupported(ssize, elemSize));
205
#ifdef CAROTENE_NEON
206
207
if (elemSize == 1)
208
{
209
resizeGeneric<u8>(dsize,
210
srcBase, srcStride,
211
dstBase, dstStride,
212
wr, hr);
213
}
214
else if (elemSize == 3)
215
{
216
resizeGeneric<_24bit>(dsize,
217
srcBase, srcStride,
218
dstBase, dstStride,
219
wr, hr);
220
}
221
else if (elemSize == 4)
222
{
223
resizeGeneric<u32>(dsize,
224
srcBase, srcStride,
225
dstBase, dstStride,
226
wr, hr);
227
}
228
229
#else
230
(void)dsize;
231
(void)srcBase;
232
(void)srcStride;
233
(void)dstBase;
234
(void)dstStride;
235
(void)wr;
236
(void)hr;
237
#endif
238
}
239
240
#ifdef CAROTENE_NEON
241
// Normalizes packed 16-bit area sums: truncating division by 2^shiftsize,
// narrowed back to 8 bits. This primary template is the non-OpenCV path;
// the <true,2>/<true,4> specializations below add OpenCV-style rounding.
template <bool opencv_like, int shiftsize>
inline uint8x8_t areaDownsamplingDivision(uint16x8_t data)
{
    return vshrn_n_u16(data, shiftsize);
}
246
// OpenCV-compatible 2x2 area normalization: divide the 16-bit sums by 4
// with round-to-nearest (vrshrn adds the rounding bias before shifting).
template <>
inline uint8x8_t areaDownsamplingDivision<true,2>(uint16x8_t data)
{
    // rounding
    return vrshrn_n_u16(data,2);
}
252
// OpenCV-compatible 4x4 area normalization: divide the 16-bit sums by 16
// with round-half-to-even semantics, narrowed to 8 bits.
template <>
inline uint8x8_t areaDownsamplingDivision<true,4>(uint16x8_t data)
{
    // bankers rounding
    // vbicq_u16(vdupq_n_u16(1<<4), data) is 16 exactly when bit 4 of data is
    // clear (i.e. the quotient's LSB would be even); shifted right by 4 it
    // becomes a 0/1 correction subtracted before the rounding shift, so an
    // exact .5 tie rounds toward the even quotient instead of always up.
    return vrshrn_n_u16(vqsubq_u16(data, vshrq_n_u16(vbicq_u16(vdupq_n_u16(1<<4), data), 4)),4);
}
258
259
template <bool opencv_like, int shiftsize>
260
inline u8 areaDownsamplingDivision(u16 data)
261
{
262
return data >> shiftsize;
263
}
264
template <>
265
inline u8 areaDownsamplingDivision<true,2>(u16 data)
266
{
267
// rounding
268
return (data + 2) >> 2;
269
}
270
// Scalar OpenCV-compatible divide-by-16 with round-half-to-even semantics,
// mirroring the NEON <true,4> specialization.
template <>
inline u8 areaDownsamplingDivision<true,4>(u16 data)
{
    // bankers rounding
    // (((1<<4) & ~data) >> 4) is 1 exactly when bit 4 of data is clear (even
    // quotient LSB); subtracting it before the +8 half-divisor bias makes an
    // exact .5 tie round to the even result rather than always upward.
    return (data - (((1<<4) & ~data) >> 4) + 8) >> 4;
}
276
#endif
277
278
template <bool opencv_like>
279
inline void resizeAreaRounding(const Size2D &ssize, const Size2D &dsize,
280
const u8 * srcBase, ptrdiff_t srcStride,
281
u8 * dstBase, ptrdiff_t dstStride,
282
f32 wr, f32 hr, u32 channels)
283
{
284
internal::assertSupportedConfiguration(isResizeAreaSupported(wr, hr, channels) &&
285
std::abs(dsize.width * wr - ssize.width) < 0.1 &&
286
std::abs(dsize.height * hr - ssize.height) < 0.1);
287
#ifdef CAROTENE_NEON
288
if (channels == 1)
289
{
290
if ((wr == 2.0f) && (hr == 2.0f))
291
{
292
size_t roiw8 = dsize.width >= 7 ? dsize.width - 7 : 0;
293
294
for (size_t i = 0; i < dsize.height; ++i)
295
{
296
const u8 * src0_row = internal::getRowPtr(srcBase, srcStride, i << 1);
297
const u8 * src1_row = internal::getRowPtr(srcBase, srcStride, (i << 1) + 1);
298
u8 * dst_row = internal::getRowPtr(dstBase, dstStride, i);
299
size_t sj = 0, dj = 0;
300
301
for ( ; dj < roiw8; dj += 8, sj += 16)
302
{
303
internal::prefetch(src0_row + sj);
304
internal::prefetch(src1_row + sj);
305
306
uint16x8_t vSum1 = vpaddlq_u8(vld1q_u8(src0_row + sj));
307
uint16x8_t vSum2 = vpaddlq_u8(vld1q_u8(src1_row + sj));
308
uint8x8_t vRes1 = areaDownsamplingDivision<opencv_like,2>(vaddq_u16(vSum1, vSum2));
309
310
vst1_u8(dst_row + dj, vRes1);
311
}
312
313
for ( ; dj < dsize.width; ++dj, sj += 2)
314
{
315
dst_row[dj] = areaDownsamplingDivision<opencv_like,2>(
316
(u16)src0_row[sj] + src0_row[sj + 1] +
317
src1_row[sj] + src1_row[sj + 1]);
318
}
319
}
320
}
321
else if ((wr == 0.5f) && (hr == 0.5f))
322
{
323
size_t roiw32 = dsize.width >= 31 ? dsize.width - 31 : 0;
324
size_t roiw16 = dsize.width >= 15 ? dsize.width - 15 : 0;
325
326
for (size_t i = 0; i < dsize.height; i += 2)
327
{
328
const u8 * src_row = internal::getRowPtr(srcBase, srcStride, i >> 1);
329
u8 * dst0_row = internal::getRowPtr(dstBase, dstStride, i);
330
u8 * dst1_row = internal::getRowPtr(dstBase, dstStride, std::min(i + 1, dsize.height - 1));
331
size_t sj = 0, dj = 0;
332
333
for ( ; dj < roiw32; dj += 32, sj += 16)
334
{
335
internal::prefetch(src_row + sj);
336
337
uint8x16x2_t v_dst;
338
v_dst.val[0] = v_dst.val[1] = vld1q_u8(src_row + sj);
339
340
vst2q_u8(dst0_row + dj, v_dst);
341
vst2q_u8(dst1_row + dj, v_dst);
342
}
343
344
for ( ; dj < roiw16; dj += 16, sj += 8)
345
{
346
uint8x8x2_t v_dst;
347
v_dst.val[0] = v_dst.val[1] = vld1_u8(src_row + sj);
348
349
vst2_u8(dst0_row + dj, v_dst);
350
vst2_u8(dst1_row + dj, v_dst);
351
}
352
353
for ( ; dj < dsize.width; dj += 2, ++sj)
354
{
355
u8 src_val = src_row[sj];
356
dst0_row[dj] = dst0_row[dj + 1] = src_val;
357
dst1_row[dj] = dst1_row[dj + 1] = src_val;
358
}
359
}
360
}
361
else //if ((wr == 4.0f) && (hr == 4.0f)) //the only scale that lasts after isSupported check
362
{
363
#ifndef __ANDROID__
364
size_t roiw16 = dsize.width >= 15 ? dsize.width - 15 : 0;
365
#endif
366
size_t roiw8 = dsize.width >= 7 ? dsize.width - 7 : 0;
367
368
for (size_t i = 0; i < dsize.height; ++i)
369
{
370
const u8 * src0_row = internal::getRowPtr(srcBase, srcStride, i << 2);
371
const u8 * src1_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 1);
372
const u8 * src2_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 2);
373
const u8 * src3_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 3);
374
u8 * dst_row = internal::getRowPtr(dstBase, dstStride, i);
375
size_t sj = 0, dj = 0;
376
377
#ifndef __ANDROID__
378
for ( ; dj < roiw16; dj += 16, sj += 64)
379
{
380
internal::prefetch(src0_row + sj);
381
internal::prefetch(src1_row + sj);
382
internal::prefetch(src2_row + sj);
383
internal::prefetch(src3_row + sj);
384
385
uint8x16x4_t vLane1 = vld4q_u8(src0_row + sj);
386
uint8x16x4_t vLane2 = vld4q_u8(src1_row + sj);
387
uint8x16x4_t vLane3 = vld4q_u8(src2_row + sj);
388
uint8x16x4_t vLane4 = vld4q_u8(src3_row + sj);
389
390
uint16x8_t vSum_0 = vaddl_u8(vget_low_u8(vLane1.val[0]), vget_low_u8(vLane1.val[1]));
391
vSum_0 = vaddq_u16(vSum_0, vaddl_u8(vget_low_u8(vLane1.val[2]), vget_low_u8(vLane1.val[3])));
392
vSum_0 = vaddq_u16(vSum_0, vaddl_u8(vget_low_u8(vLane2.val[0]), vget_low_u8(vLane2.val[1])));
393
vSum_0 = vaddq_u16(vSum_0, vaddl_u8(vget_low_u8(vLane2.val[2]), vget_low_u8(vLane2.val[3])));
394
vSum_0 = vaddq_u16(vSum_0, vaddl_u8(vget_low_u8(vLane3.val[0]), vget_low_u8(vLane3.val[1])));
395
vSum_0 = vaddq_u16(vSum_0, vaddl_u8(vget_low_u8(vLane3.val[2]), vget_low_u8(vLane3.val[3])));
396
vSum_0 = vaddq_u16(vSum_0, vaddl_u8(vget_low_u8(vLane4.val[0]), vget_low_u8(vLane4.val[1])));
397
vSum_0 = vaddq_u16(vSum_0, vaddl_u8(vget_low_u8(vLane4.val[2]), vget_low_u8(vLane4.val[3])));
398
399
uint16x8_t vSum_1 = vaddl_u8(vget_high_u8(vLane1.val[0]), vget_high_u8(vLane1.val[1]));
400
vSum_1 = vaddq_u16(vSum_1, vaddl_u8(vget_high_u8(vLane1.val[2]), vget_high_u8(vLane1.val[3])));
401
vSum_1 = vaddq_u16(vSum_1, vaddl_u8(vget_high_u8(vLane2.val[0]), vget_high_u8(vLane2.val[1])));
402
vSum_1 = vaddq_u16(vSum_1, vaddl_u8(vget_high_u8(vLane2.val[2]), vget_high_u8(vLane2.val[3])));
403
vSum_1 = vaddq_u16(vSum_1, vaddl_u8(vget_high_u8(vLane3.val[0]), vget_high_u8(vLane3.val[1])));
404
vSum_1 = vaddq_u16(vSum_1, vaddl_u8(vget_high_u8(vLane3.val[2]), vget_high_u8(vLane3.val[3])));
405
vSum_1 = vaddq_u16(vSum_1, vaddl_u8(vget_high_u8(vLane4.val[0]), vget_high_u8(vLane4.val[1])));
406
vSum_1 = vaddq_u16(vSum_1, vaddl_u8(vget_high_u8(vLane4.val[2]), vget_high_u8(vLane4.val[3])));
407
408
uint8x8_t vRes_0 = areaDownsamplingDivision<opencv_like,4>(vSum_0);
409
uint8x8_t vRes_1 = areaDownsamplingDivision<opencv_like,4>(vSum_1);
410
411
vst1q_u8(dst_row + dj, vcombine_u8(vRes_0, vRes_1));
412
}
413
#endif
414
415
for ( ; dj < roiw8; dj += 8, sj += 32)
416
{
417
internal::prefetch(src0_row + sj);
418
internal::prefetch(src1_row + sj);
419
internal::prefetch(src2_row + sj);
420
internal::prefetch(src3_row + sj);
421
422
uint8x8x4_t vLane1 = vld4_u8(src0_row + sj);
423
uint8x8x4_t vLane2 = vld4_u8(src1_row + sj);
424
uint8x8x4_t vLane3 = vld4_u8(src2_row + sj);
425
uint8x8x4_t vLane4 = vld4_u8(src3_row + sj);
426
427
uint16x8_t vSum = vaddl_u8(vLane1.val[0], vLane1.val[1]);
428
vSum = vaddq_u16(vSum, vaddl_u8(vLane1.val[2], vLane1.val[3]));
429
vSum = vaddq_u16(vSum, vaddl_u8(vLane2.val[0], vLane2.val[1]));
430
vSum = vaddq_u16(vSum, vaddl_u8(vLane2.val[2], vLane2.val[3]));
431
vSum = vaddq_u16(vSum, vaddl_u8(vLane3.val[0], vLane3.val[1]));
432
vSum = vaddq_u16(vSum, vaddl_u8(vLane3.val[2], vLane3.val[3]));
433
vSum = vaddq_u16(vSum, vaddl_u8(vLane4.val[0], vLane4.val[1]));
434
vSum = vaddq_u16(vSum, vaddl_u8(vLane4.val[2], vLane4.val[3]));
435
436
vst1_u8(dst_row + dj, (areaDownsamplingDivision<opencv_like,4>(vSum)));
437
}
438
439
for ( ; dj < dsize.width; ++dj, sj += 4)
440
{
441
dst_row[dj] = areaDownsamplingDivision<opencv_like,4>(
442
(u16)src0_row[sj] + src0_row[sj + 1] + src0_row[sj + 2] + src0_row[sj + 3] +
443
src1_row[sj] + src1_row[sj + 1] + src1_row[sj + 2] + src1_row[sj + 3] +
444
src2_row[sj] + src2_row[sj + 1] + src2_row[sj + 2] + src2_row[sj + 3] +
445
src3_row[sj] + src3_row[sj + 1] + src3_row[sj + 2] + src3_row[sj + 3]);
446
}
447
}
448
}
449
}
450
else if (channels == 4)
451
{
452
if ((wr == 2.0f) && (hr == 2.0f))
453
{
454
#ifndef __ANDROID__
455
size_t roiw4 = dsize.width >= 3 ? (dsize.width - 3) << 2 : 0;
456
#endif
457
size_t roiw2 = dsize.width >= 1 ? (dsize.width - 1) << 2 : 0;
458
459
for (size_t i = 0; i < dsize.height; ++i)
460
{
461
const u8 * src0_row = internal::getRowPtr(srcBase, srcStride, i << 1);
462
const u8 * src1_row = internal::getRowPtr(srcBase, srcStride, (i << 1) + 1);
463
u8 * dst_row = internal::getRowPtr(dstBase, dstStride, i);
464
size_t sj = 0, dj = 0;
465
466
#ifndef __ANDROID__
467
for ( ; dj < roiw4; dj += 16, sj += 32)
468
{
469
internal::prefetch(src0_row + sj);
470
internal::prefetch(src1_row + sj);
471
472
uint8x8_t vRes_0, vRes_1;
473
474
{
475
uint8x16_t vLane1 = vld1q_u8(src0_row + sj);
476
uint8x16_t vLane2 = vld1q_u8(src1_row + sj);
477
478
uint16x8_t vLane_l = vaddl_u8(vget_low_u8(vLane1), vget_low_u8(vLane2));
479
uint16x8_t vLane_h = vaddl_u8(vget_high_u8(vLane1), vget_high_u8(vLane2));
480
481
uint16x4_t vSum_l = vadd_u16(vget_low_u16(vLane_l), vget_high_u16(vLane_l));
482
uint16x4_t vSum_h = vadd_u16(vget_low_u16(vLane_h), vget_high_u16(vLane_h));
483
484
vRes_0 = areaDownsamplingDivision<opencv_like,2>(vcombine_u16(vSum_l, vSum_h));
485
}
486
487
{
488
uint8x16_t vLane1 = vld1q_u8(src0_row + sj + 16);
489
uint8x16_t vLane2 = vld1q_u8(src1_row + sj + 16);
490
491
uint16x8_t vLane_l = vaddl_u8(vget_low_u8(vLane1), vget_low_u8(vLane2));
492
uint16x8_t vLane_h = vaddl_u8(vget_high_u8(vLane1), vget_high_u8(vLane2));
493
494
uint16x4_t vSum_l = vadd_u16(vget_low_u16(vLane_l), vget_high_u16(vLane_l));
495
uint16x4_t vSum_h = vadd_u16(vget_low_u16(vLane_h), vget_high_u16(vLane_h));
496
497
vRes_1 = areaDownsamplingDivision<opencv_like,2>(vcombine_u16(vSum_l, vSum_h));
498
}
499
500
vst1q_u8(dst_row + dj, vcombine_u8(vRes_0, vRes_1));
501
}
502
#endif
503
504
for ( ; dj < roiw2; dj += 8, sj += 16)
505
{
506
internal::prefetch(src0_row + sj);
507
internal::prefetch(src1_row + sj);
508
509
uint8x16_t vLane1 = vld1q_u8(src0_row + sj);
510
uint8x16_t vLane2 = vld1q_u8(src1_row + sj);
511
512
uint16x8_t vLane_l = vaddl_u8(vget_low_u8(vLane1), vget_low_u8(vLane2));
513
uint16x8_t vLane_h = vaddl_u8(vget_high_u8(vLane1), vget_high_u8(vLane2));
514
515
uint16x4_t vSum_l = vadd_u16(vget_low_u16(vLane_l), vget_high_u16(vLane_l));
516
uint16x4_t vSum_h = vadd_u16(vget_low_u16(vLane_h), vget_high_u16(vLane_h));
517
518
uint8x8_t vRes = areaDownsamplingDivision<opencv_like,2>(vcombine_u16(vSum_l, vSum_h));
519
vst1_u8(dst_row + dj, vRes);
520
}
521
522
for (size_t dwidth = dsize.width << 2; dj < dwidth; dj += 4, sj += 8)
523
{
524
dst_row[dj ] = areaDownsamplingDivision<opencv_like,2>(
525
(u16)src0_row[sj ] + src0_row[sj + 4] +
526
src1_row[sj ] + src1_row[sj + 4]);
527
dst_row[dj + 1] = areaDownsamplingDivision<opencv_like,2>(
528
(u16)src0_row[sj + 1] + src0_row[sj + 5] +
529
src1_row[sj + 1] + src1_row[sj + 5]);
530
dst_row[dj + 2] = areaDownsamplingDivision<opencv_like,2>(
531
(u16)src0_row[sj + 2] + src0_row[sj + 6] +
532
src1_row[sj + 2] + src1_row[sj + 6]);
533
dst_row[dj + 3] = areaDownsamplingDivision<opencv_like,2>(
534
(u16)src0_row[sj + 3] + src0_row[sj + 7] +
535
src1_row[sj + 3] + src1_row[sj + 7]);
536
}
537
}
538
}
539
else if ((wr == 0.5f) && (hr == 0.5f))
540
{
541
#ifndef __ANDROID__
542
size_t roiw32 = dsize.width >= 31 ? (dsize.width - 31) << 2 : 0;
543
#endif
544
size_t roiw16 = dsize.width >= 15 ? (dsize.width - 15) << 2 : 0;
545
546
for (size_t i = 0; i < dsize.height; i += 2)
547
{
548
const u8 * src_row = internal::getRowPtr(srcBase, srcStride, i >> 1);
549
u8 * dst0_row = internal::getRowPtr(dstBase, dstStride, i);
550
u8 * dst1_row = internal::getRowPtr(dstBase, dstStride, std::min(i + 1, dsize.height - 1));
551
size_t sj = 0, dj = 0;
552
553
#ifndef __ANDROID__
554
for ( ; dj < roiw32; dj += 128, sj += 64)
555
{
556
internal::prefetch(src_row + sj);
557
558
uint8x16x4_t v_src = vld4q_u8(src_row + sj);
559
uint8x16x2_t v_c0 = vzipq_u8(v_src.val[0], v_src.val[0]);
560
uint8x16x2_t v_c1 = vzipq_u8(v_src.val[1], v_src.val[1]);
561
uint8x16x2_t v_c2 = vzipq_u8(v_src.val[2], v_src.val[2]);
562
uint8x16x2_t v_c3 = vzipq_u8(v_src.val[3], v_src.val[3]);
563
564
uint8x16x4_t v_dst;
565
v_dst.val[0] = v_c0.val[0];
566
v_dst.val[1] = v_c1.val[0];
567
v_dst.val[2] = v_c2.val[0];
568
v_dst.val[3] = v_c3.val[0];
569
vst4q_u8(dst0_row + dj, v_dst);
570
vst4q_u8(dst1_row + dj, v_dst);
571
572
v_dst.val[0] = v_c0.val[1];
573
v_dst.val[1] = v_c1.val[1];
574
v_dst.val[2] = v_c2.val[1];
575
v_dst.val[3] = v_c3.val[1];
576
vst4q_u8(dst0_row + dj + 64, v_dst);
577
vst4q_u8(dst1_row + dj + 64, v_dst);
578
}
579
#endif
580
581
for ( ; dj < roiw16; dj += 64, sj += 32)
582
{
583
internal::prefetch(src_row + sj);
584
585
uint8x8x4_t v_src = vld4_u8(src_row + sj);
586
uint8x8x2_t v_c0 = vzip_u8(v_src.val[0], v_src.val[0]);
587
uint8x8x2_t v_c1 = vzip_u8(v_src.val[1], v_src.val[1]);
588
uint8x8x2_t v_c2 = vzip_u8(v_src.val[2], v_src.val[2]);
589
uint8x8x2_t v_c3 = vzip_u8(v_src.val[3], v_src.val[3]);
590
591
uint8x16x4_t v_dst;
592
v_dst.val[0] = vcombine_u8(v_c0.val[0], v_c0.val[1]);
593
v_dst.val[1] = vcombine_u8(v_c1.val[0], v_c1.val[1]);
594
v_dst.val[2] = vcombine_u8(v_c2.val[0], v_c2.val[1]);
595
v_dst.val[3] = vcombine_u8(v_c3.val[0], v_c3.val[1]);
596
vst4q_u8(dst0_row + dj, v_dst);
597
vst4q_u8(dst1_row + dj, v_dst);
598
}
599
600
for (size_t dwidth = dsize.width << 2; dj < dwidth; dj += 8, sj += 4)
601
{
602
u8 src_val = src_row[sj];
603
dst0_row[dj] = dst0_row[dj + 4] = src_val;
604
dst1_row[dj] = dst1_row[dj + 4] = src_val;
605
606
src_val = src_row[sj + 1];
607
dst0_row[dj + 1] = dst0_row[dj + 5] = src_val;
608
dst1_row[dj + 1] = dst1_row[dj + 5] = src_val;
609
610
src_val = src_row[sj + 2];
611
dst0_row[dj + 2] = dst0_row[dj + 6] = src_val;
612
dst1_row[dj + 2] = dst1_row[dj + 6] = src_val;
613
614
src_val = src_row[sj + 3];
615
dst0_row[dj + 3] = dst0_row[dj + 7] = src_val;
616
dst1_row[dj + 3] = dst1_row[dj + 7] = src_val;
617
}
618
}
619
}
620
else //if ((hr == 4.0f) && (wr == 4.0f)) //the only scale that lasts after isSupported check
621
{
622
size_t roiw4 = dsize.width >= 3 ? (dsize.width - 3) << 2 : 0;
623
size_t roiw2 = dsize.width >= 1 ? (dsize.width - 1) << 2 : 0;
624
625
for (size_t i = 0; i < dsize.height; ++i)
626
{
627
const u8 * src0_row = internal::getRowPtr(srcBase, srcStride, i << 2);
628
const u8 * src1_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 1);
629
const u8 * src2_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 2);
630
const u8 * src3_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 3);
631
u8 * dst_row = internal::getRowPtr(dstBase, dstStride, i);
632
size_t sj = 0, dj = 0;
633
634
for ( ; dj < roiw4; dj += 16, sj += 64)
635
{
636
internal::prefetch(src0_row + sj);
637
internal::prefetch(src1_row + sj);
638
internal::prefetch(src2_row + sj);
639
internal::prefetch(src3_row + sj);
640
641
uint8x16_t vLane10 = vld1q_u8(src0_row + sj), vLane11 = vld1q_u8(src0_row + sj + 16);
642
uint8x16_t vLane20 = vld1q_u8(src1_row + sj), vLane21 = vld1q_u8(src1_row + sj + 16);
643
uint8x16_t vLane30 = vld1q_u8(src2_row + sj), vLane31 = vld1q_u8(src2_row + sj + 16);
644
uint8x16_t vLane40 = vld1q_u8(src3_row + sj), vLane41 = vld1q_u8(src3_row + sj + 16);
645
646
uint16x8_t v_part_0, v_part_1;
647
{
648
uint16x8_t v_sum0 = vaddl_u8(vget_low_u8(vLane10), vget_high_u8(vLane10));
649
v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane20), vget_high_u8(vLane20)));
650
v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane30), vget_high_u8(vLane30)));
651
v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane40), vget_high_u8(vLane40)));
652
653
uint16x8_t v_sum1 = vaddl_u8(vget_low_u8(vLane11), vget_high_u8(vLane11));
654
v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane21), vget_high_u8(vLane21)));
655
v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane31), vget_high_u8(vLane31)));
656
v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane41), vget_high_u8(vLane41)));
657
658
v_part_0 = vcombine_u16(vadd_u16(vget_low_u16(v_sum0), vget_high_u16(v_sum0)),
659
vadd_u16(vget_low_u16(v_sum1), vget_high_u16(v_sum1)));
660
}
661
662
vLane10 = vld1q_u8(src0_row + sj + 32);
663
vLane11 = vld1q_u8(src0_row + sj + 48);
664
vLane20 = vld1q_u8(src1_row + sj + 32);
665
vLane21 = vld1q_u8(src1_row + sj + 48);
666
vLane30 = vld1q_u8(src2_row + sj + 32);
667
vLane31 = vld1q_u8(src2_row + sj + 48);
668
vLane40 = vld1q_u8(src3_row + sj + 32);
669
vLane41 = vld1q_u8(src3_row + sj + 48);
670
671
{
672
uint16x8_t v_sum0 = vaddl_u8(vget_low_u8(vLane10), vget_high_u8(vLane10));
673
v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane20), vget_high_u8(vLane20)));
674
v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane30), vget_high_u8(vLane30)));
675
v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane40), vget_high_u8(vLane40)));
676
677
uint16x8_t v_sum1 = vaddl_u8(vget_low_u8(vLane11), vget_high_u8(vLane11));
678
v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane21), vget_high_u8(vLane21)));
679
v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane31), vget_high_u8(vLane31)));
680
v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane41), vget_high_u8(vLane41)));
681
682
v_part_1 = vcombine_u16(vadd_u16(vget_low_u16(v_sum0), vget_high_u16(v_sum0)),
683
vadd_u16(vget_low_u16(v_sum1), vget_high_u16(v_sum1)));
684
}
685
686
vst1q_u8(dst_row + dj, vcombine_u8(areaDownsamplingDivision<opencv_like,4>(v_part_0),
687
areaDownsamplingDivision<opencv_like,4>(v_part_1)));
688
}
689
690
for ( ; dj < roiw2; dj += 8, sj += 32)
691
{
692
uint8x16_t vLane10 = vld1q_u8(src0_row + sj), vLane11 = vld1q_u8(src0_row + sj + 16);
693
uint8x16_t vLane20 = vld1q_u8(src1_row + sj), vLane21 = vld1q_u8(src1_row + sj + 16);
694
uint8x16_t vLane30 = vld1q_u8(src2_row + sj), vLane31 = vld1q_u8(src2_row + sj + 16);
695
uint8x16_t vLane40 = vld1q_u8(src3_row + sj), vLane41 = vld1q_u8(src3_row + sj + 16);
696
697
uint16x8_t v_sum0 = vaddl_u8(vget_low_u8(vLane10), vget_high_u8(vLane10));
698
v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane20), vget_high_u8(vLane20)));
699
v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane30), vget_high_u8(vLane30)));
700
v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane40), vget_high_u8(vLane40)));
701
702
uint16x8_t v_sum1 = vaddl_u8(vget_low_u8(vLane11), vget_high_u8(vLane11));
703
v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane21), vget_high_u8(vLane21)));
704
v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane31), vget_high_u8(vLane31)));
705
v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane41), vget_high_u8(vLane41)));
706
707
uint16x8_t v_sum = vcombine_u16(vadd_u16(vget_low_u16(v_sum0), vget_high_u16(v_sum0)),
708
vadd_u16(vget_low_u16(v_sum1), vget_high_u16(v_sum1)));
709
710
vst1_u8(dst_row + dj, (areaDownsamplingDivision<opencv_like,4>(v_sum)));
711
}
712
713
for (size_t dwidth = dsize.width << 2; dj < dwidth; dj += 4, sj += 16)
714
{
715
dst_row[dj ] = areaDownsamplingDivision<opencv_like,4>(
716
(u16)src0_row[sj ] + src0_row[sj + 4] +
717
src0_row[sj + 8] + src0_row[sj + 12] +
718
src1_row[sj ] + src1_row[sj + 4] +
719
src1_row[sj + 8] + src1_row[sj + 12] +
720
src2_row[sj ] + src2_row[sj + 4] +
721
src2_row[sj + 8] + src2_row[sj + 12] +
722
src3_row[sj ] + src3_row[sj + 4] +
723
src3_row[sj + 8] + src3_row[sj + 12]);
724
725
dst_row[dj + 1] = areaDownsamplingDivision<opencv_like,4>(
726
(u16)src0_row[sj + 1] + src0_row[sj + 5] +
727
src0_row[sj + 9] + src0_row[sj + 13] +
728
src1_row[sj + 1] + src1_row[sj + 5] +
729
src1_row[sj + 9] + src1_row[sj + 13] +
730
src2_row[sj + 1] + src2_row[sj + 5] +
731
src2_row[sj + 9] + src2_row[sj + 13] +
732
src3_row[sj + 1] + src3_row[sj + 5] +
733
src3_row[sj + 9] + src3_row[sj + 13]);
734
735
dst_row[dj + 2] = areaDownsamplingDivision<opencv_like,4>(
736
(u16)src0_row[sj + 2] + src0_row[sj + 6] +
737
src0_row[sj + 10] + src0_row[sj + 14] +
738
src1_row[sj + 2] + src1_row[sj + 6] +
739
src1_row[sj + 10] + src1_row[sj + 14] +
740
src2_row[sj + 2] + src2_row[sj + 6] +
741
src2_row[sj + 10] + src2_row[sj + 14] +
742
src3_row[sj + 2] + src3_row[sj + 6] +
743
src3_row[sj + 10] + src3_row[sj + 14]);
744
745
dst_row[dj + 3] = areaDownsamplingDivision<opencv_like,4>(
746
(u16)src0_row[sj + 3] + src0_row[sj + 7] +
747
src0_row[sj + 11] + src0_row[sj + 15] +
748
src1_row[sj + 3] + src1_row[sj + 7] +
749
src1_row[sj + 11] + src1_row[sj + 15] +
750
src2_row[sj + 3] + src2_row[sj + 7] +
751
src2_row[sj + 11] + src2_row[sj + 15] +
752
src3_row[sj + 3] + src3_row[sj + 7] +
753
src3_row[sj + 11] + src3_row[sj + 15]);
754
}
755
}
756
}
757
}
758
else if (channels == 3)
759
{
760
if ((wr == 2.0f) && (wr == 2.0f))
761
{
762
#ifndef __ANDROID__
763
size_t roiw16 = dsize.width >= 15 ? (dsize.width - 15) * 3 : 0;
764
#endif
765
size_t roiw8 = dsize.width >= 7 ? (dsize.width - 7) * 3 : 0;
766
767
for (size_t i = 0; i < dsize.height; ++i)
768
{
769
const u8 * src0_row = internal::getRowPtr(srcBase, srcStride, i << 1);
770
const u8 * src1_row = internal::getRowPtr(srcBase, srcStride, (i << 1) + 1);
771
u8 * dst_row = internal::getRowPtr(dstBase, dstStride, i);
772
size_t sj = 0, dj = 0;
773
774
#ifndef __ANDROID__
775
for ( ; dj < roiw16; dj += 48, sj += 96)
776
{
777
internal::prefetch(src0_row + sj);
778
internal::prefetch(src1_row + sj);
779
780
uint8x16x3_t vLane1 = vld3q_u8(src0_row + sj);
781
uint8x16x3_t vLane2 = vld3q_u8(src1_row + sj);
782
783
uint8x8x3_t v_dst0, v_dst1;
784
{
785
uint16x8_t v_el0 = vpaddlq_u8(vLane1.val[0]);
786
uint16x8_t v_el1 = vpaddlq_u8(vLane1.val[1]);
787
uint16x8_t v_el2 = vpaddlq_u8(vLane1.val[2]);
788
v_el0 = vpadalq_u8(v_el0, vLane2.val[0]);
789
v_el1 = vpadalq_u8(v_el1, vLane2.val[1]);
790
v_el2 = vpadalq_u8(v_el2, vLane2.val[2]);
791
792
v_dst0.val[0] = areaDownsamplingDivision<opencv_like,2>(v_el0);
793
v_dst0.val[1] = areaDownsamplingDivision<opencv_like,2>(v_el1);
794
v_dst0.val[2] = areaDownsamplingDivision<opencv_like,2>(v_el2);
795
}
796
797
vLane1 = vld3q_u8(src0_row + sj + 48);
798
vLane2 = vld3q_u8(src1_row + sj + 48);
799
{
800
uint16x8_t v_el0 = vpaddlq_u8(vLane1.val[0]);
801
uint16x8_t v_el1 = vpaddlq_u8(vLane1.val[1]);
802
uint16x8_t v_el2 = vpaddlq_u8(vLane1.val[2]);
803
v_el0 = vpadalq_u8(v_el0, vLane2.val[0]);
804
v_el1 = vpadalq_u8(v_el1, vLane2.val[1]);
805
v_el2 = vpadalq_u8(v_el2, vLane2.val[2]);
806
807
v_dst1.val[0] = areaDownsamplingDivision<opencv_like,2>(v_el0);
808
v_dst1.val[1] = areaDownsamplingDivision<opencv_like,2>(v_el1);
809
v_dst1.val[2] = areaDownsamplingDivision<opencv_like,2>(v_el2);
810
}
811
812
uint8x16x3_t v_dst;
813
v_dst.val[0] = vcombine_u8(v_dst0.val[0], v_dst1.val[0]);
814
v_dst.val[1] = vcombine_u8(v_dst0.val[1], v_dst1.val[1]);
815
v_dst.val[2] = vcombine_u8(v_dst0.val[2], v_dst1.val[2]);
816
817
vst3q_u8(dst_row + dj, v_dst);
818
}
819
#endif
820
821
for ( ; dj < roiw8; dj += 24, sj += 48)
822
{
823
internal::prefetch(src0_row + sj);
824
internal::prefetch(src1_row + sj);
825
826
uint8x16x3_t vLane1 = vld3q_u8(src0_row + sj);
827
uint8x16x3_t vLane2 = vld3q_u8(src1_row + sj);
828
829
uint16x8_t v_el0 = vpaddlq_u8(vLane1.val[0]);
830
uint16x8_t v_el1 = vpaddlq_u8(vLane1.val[1]);
831
uint16x8_t v_el2 = vpaddlq_u8(vLane1.val[2]);
832
v_el0 = vpadalq_u8(v_el0, vLane2.val[0]);
833
v_el1 = vpadalq_u8(v_el1, vLane2.val[1]);
834
v_el2 = vpadalq_u8(v_el2, vLane2.val[2]);
835
836
uint8x8x3_t v_dst;
837
v_dst.val[0] = areaDownsamplingDivision<opencv_like,2>(v_el0);
838
v_dst.val[1] = areaDownsamplingDivision<opencv_like,2>(v_el1);
839
v_dst.val[2] = areaDownsamplingDivision<opencv_like,2>(v_el2);
840
841
vst3_u8(dst_row + dj, v_dst);
842
}
843
844
for (size_t dwidth = dsize.width * 3; dj < dwidth; dj += 3, sj += 6)
845
{
846
dst_row[dj ] = areaDownsamplingDivision<opencv_like,2>(
847
(u16)src0_row[sj ] + src0_row[sj + 3] +
848
src1_row[sj ] + src1_row[sj + 3]);
849
dst_row[dj + 1] = areaDownsamplingDivision<opencv_like,2>(
850
(u16)src0_row[sj + 1] + src0_row[sj + 4] +
851
src1_row[sj + 1] + src1_row[sj + 4]);
852
dst_row[dj + 2] = areaDownsamplingDivision<opencv_like,2>(
853
(u16)src0_row[sj + 2] + src0_row[sj + 5] +
854
src1_row[sj + 2] + src1_row[sj + 5]);
855
}
856
}
857
}
858
else if ((wr == 0.5f) && (hr == 0.5f))
859
{
860
#ifndef __ANDROID__
861
size_t roiw32 = dsize.width >= 31 ? (dsize.width - 31) * 3 : 0;
862
#endif
863
size_t roiw16 = dsize.width >= 15 ? (dsize.width - 15) * 3 : 0;
864
865
for (size_t i = 0; i < dsize.height; i += 2)
866
{
867
const u8 * src_row = internal::getRowPtr(srcBase, srcStride, i >> 1);
868
u8 * dst0_row = internal::getRowPtr(dstBase, dstStride, i);
869
u8 * dst1_row = internal::getRowPtr(dstBase, dstStride, std::min(i + 1, dsize.height - 1));
870
size_t sj = 0, dj = 0;
871
872
#ifndef __ANDROID__
873
for ( ; dj < roiw32; dj += 96, sj += 48)
874
{
875
internal::prefetch(src_row + sj);
876
877
uint8x16x3_t v_src = vld3q_u8(src_row + sj);
878
uint8x16x2_t v_c0 = vzipq_u8(v_src.val[0], v_src.val[0]);
879
uint8x16x2_t v_c1 = vzipq_u8(v_src.val[1], v_src.val[1]);
880
uint8x16x2_t v_c2 = vzipq_u8(v_src.val[2], v_src.val[2]);
881
882
uint8x16x3_t v_dst;
883
v_dst.val[0] = v_c0.val[0];
884
v_dst.val[1] = v_c1.val[0];
885
v_dst.val[2] = v_c2.val[0];
886
vst3q_u8(dst0_row + dj, v_dst);
887
vst3q_u8(dst1_row + dj, v_dst);
888
889
v_dst.val[0] = v_c0.val[1];
890
v_dst.val[1] = v_c1.val[1];
891
v_dst.val[2] = v_c2.val[1];
892
vst3q_u8(dst0_row + dj + 48, v_dst);
893
vst3q_u8(dst1_row + dj + 48, v_dst);
894
}
895
#endif
896
897
for ( ; dj < roiw16; dj += 48, sj += 24)
898
{
899
internal::prefetch(src_row + sj);
900
901
uint8x8x3_t v_src = vld3_u8(src_row + sj);
902
uint8x8x2_t v_c0 = vzip_u8(v_src.val[0], v_src.val[0]);
903
uint8x8x2_t v_c1 = vzip_u8(v_src.val[1], v_src.val[1]);
904
uint8x8x2_t v_c2 = vzip_u8(v_src.val[2], v_src.val[2]);
905
906
uint8x16x3_t v_dst;
907
v_dst.val[0] = vcombine_u8(v_c0.val[0], v_c0.val[1]);
908
v_dst.val[1] = vcombine_u8(v_c1.val[0], v_c1.val[1]);
909
v_dst.val[2] = vcombine_u8(v_c2.val[0], v_c2.val[1]);
910
vst3q_u8(dst0_row + dj, v_dst);
911
vst3q_u8(dst1_row + dj, v_dst);
912
}
913
914
for (size_t dwidth = dsize.width * 3; dj < dwidth; dj += 6, sj += 3)
915
{
916
u8 src_val = src_row[sj];
917
dst0_row[dj] = dst0_row[dj + 3] = src_val;
918
dst1_row[dj] = dst1_row[dj + 3] = src_val;
919
920
src_val = src_row[sj + 1];
921
dst0_row[dj + 1] = dst0_row[dj + 4] = src_val;
922
dst1_row[dj + 1] = dst1_row[dj + 4] = src_val;
923
924
src_val = src_row[sj + 2];
925
dst0_row[dj + 2] = dst0_row[dj + 5] = src_val;
926
dst1_row[dj + 2] = dst1_row[dj + 5] = src_val;
927
}
928
}
929
}
930
else //if ((hr == 4.0f) && (wr == 4.0f)) //the only scale that lasts after isSupported check
931
{
932
#ifndef __ANDROID__
933
size_t roiw8 = dsize.width >= 7 ? (dsize.width - 7) * 3 : 0;
934
#endif
935
936
for (size_t i = 0; i < dsize.height; ++i)
937
{
938
const u8 * src0_row = internal::getRowPtr(srcBase, srcStride, i << 2);
939
const u8 * src1_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 1);
940
const u8 * src2_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 2);
941
const u8 * src3_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 3);
942
u8 * dst_row = internal::getRowPtr(dstBase, dstStride, i);
943
size_t sj = 0, dj = 0;
944
945
#ifndef __ANDROID__
946
for ( ; dj < roiw8; dj += 24, sj += 96)
947
{
948
internal::prefetch(src0_row + sj);
949
internal::prefetch(src1_row + sj);
950
internal::prefetch(src2_row + sj);
951
internal::prefetch(src3_row + sj);
952
953
uint8x16x3_t vLane10 = vld3q_u8(src0_row + sj), vLane11 = vld3q_u8(src0_row + sj + 48);
954
uint8x16x3_t vLane20 = vld3q_u8(src1_row + sj), vLane21 = vld3q_u8(src1_row + sj + 48);
955
uint8x16x3_t vLane30 = vld3q_u8(src2_row + sj), vLane31 = vld3q_u8(src2_row + sj + 48);
956
uint8x16x3_t vLane40 = vld3q_u8(src3_row + sj), vLane41 = vld3q_u8(src3_row + sj + 48);
957
958
uint8x8x3_t v_dst;
959
960
// channel 0
961
{
962
uint16x8_t v_lane0 = vpaddlq_u8(vLane10.val[0]);
963
uint16x8_t v_lane1 = vpaddlq_u8(vLane20.val[0]);
964
uint16x8_t v_lane2 = vpaddlq_u8(vLane30.val[0]);
965
uint16x8_t v_lane3 = vpaddlq_u8(vLane40.val[0]);
966
v_lane0 = vaddq_u16(v_lane0, v_lane1);
967
v_lane0 = vaddq_u16(v_lane0, v_lane2);
968
v_lane0 = vaddq_u16(v_lane0, v_lane3);
969
970
uint16x8_t v_lane0_ = vpaddlq_u8(vLane11.val[0]);
971
uint16x8_t v_lane1_ = vpaddlq_u8(vLane21.val[0]);
972
uint16x8_t v_lane2_ = vpaddlq_u8(vLane31.val[0]);
973
uint16x8_t v_lane3_ = vpaddlq_u8(vLane41.val[0]);
974
v_lane0_ = vaddq_u16(v_lane0_, v_lane1_);
975
v_lane0_ = vaddq_u16(v_lane0_, v_lane2_);
976
v_lane0_ = vaddq_u16(v_lane0_, v_lane3_);
977
978
v_dst.val[0] = areaDownsamplingDivision<opencv_like,4>(
979
vcombine_u16(vmovn_u32(vpaddlq_u16(v_lane0)),
980
vmovn_u32(vpaddlq_u16(v_lane0_))));
981
}
982
983
// channel 1
984
{
985
uint16x8_t v_lane0 = vpaddlq_u8(vLane10.val[1]);
986
uint16x8_t v_lane1 = vpaddlq_u8(vLane20.val[1]);
987
uint16x8_t v_lane2 = vpaddlq_u8(vLane30.val[1]);
988
uint16x8_t v_lane3 = vpaddlq_u8(vLane40.val[1]);
989
v_lane0 = vaddq_u16(v_lane0, v_lane1);
990
v_lane0 = vaddq_u16(v_lane0, v_lane2);
991
v_lane0 = vaddq_u16(v_lane0, v_lane3);
992
993
uint16x8_t v_lane0_ = vpaddlq_u8(vLane11.val[1]);
994
uint16x8_t v_lane1_ = vpaddlq_u8(vLane21.val[1]);
995
uint16x8_t v_lane2_ = vpaddlq_u8(vLane31.val[1]);
996
uint16x8_t v_lane3_ = vpaddlq_u8(vLane41.val[1]);
997
v_lane0_ = vaddq_u16(v_lane0_, v_lane1_);
998
v_lane0_ = vaddq_u16(v_lane0_, v_lane2_);
999
v_lane0_ = vaddq_u16(v_lane0_, v_lane3_);
1000
1001
v_dst.val[1] = areaDownsamplingDivision<opencv_like,4>(
1002
vcombine_u16(vmovn_u32(vpaddlq_u16(v_lane0)),
1003
vmovn_u32(vpaddlq_u16(v_lane0_))));
1004
}
1005
1006
// channel 2
1007
{
1008
uint16x8_t v_lane0 = vpaddlq_u8(vLane10.val[2]);
1009
uint16x8_t v_lane1 = vpaddlq_u8(vLane20.val[2]);
1010
uint16x8_t v_lane2 = vpaddlq_u8(vLane30.val[2]);
1011
uint16x8_t v_lane3 = vpaddlq_u8(vLane40.val[2]);
1012
v_lane0 = vaddq_u16(v_lane0, v_lane1);
1013
v_lane0 = vaddq_u16(v_lane0, v_lane2);
1014
v_lane0 = vaddq_u16(v_lane0, v_lane3);
1015
1016
uint16x8_t v_lane0_ = vpaddlq_u8(vLane11.val[2]);
1017
uint16x8_t v_lane1_ = vpaddlq_u8(vLane21.val[2]);
1018
uint16x8_t v_lane2_ = vpaddlq_u8(vLane31.val[2]);
1019
uint16x8_t v_lane3_ = vpaddlq_u8(vLane41.val[2]);
1020
v_lane0_ = vaddq_u16(v_lane0_, v_lane1_);
1021
v_lane0_ = vaddq_u16(v_lane0_, v_lane2_);
1022
v_lane0_ = vaddq_u16(v_lane0_, v_lane3_);
1023
1024
v_dst.val[2] = areaDownsamplingDivision<opencv_like,4>(
1025
vcombine_u16(vmovn_u32(vpaddlq_u16(v_lane0)),
1026
vmovn_u32(vpaddlq_u16(v_lane0_))));
1027
}
1028
1029
vst3_u8(dst_row + dj, v_dst);
1030
}
1031
#endif
1032
1033
for (size_t dwidth = dsize.width * 3; dj < dwidth; dj += 3, sj += 12)
1034
{
1035
dst_row[dj ] = areaDownsamplingDivision<opencv_like,4>(
1036
(u16)src0_row[sj ] + src0_row[sj + 3] +
1037
src0_row[sj + 6] + src0_row[sj + 9] +
1038
src1_row[sj ] + src1_row[sj + 3] +
1039
src1_row[sj + 6] + src1_row[sj + 9] +
1040
src2_row[sj ] + src2_row[sj + 3] +
1041
src2_row[sj + 6] + src2_row[sj + 9] +
1042
src3_row[sj ] + src3_row[sj + 3] +
1043
src3_row[sj + 6] + src3_row[sj + 9]);
1044
1045
dst_row[dj + 1] = areaDownsamplingDivision<opencv_like,4>(
1046
(u16)src0_row[sj + 1] + src0_row[sj + 4] +
1047
src0_row[sj + 7] + src0_row[sj + 10] +
1048
src1_row[sj + 1] + src1_row[sj + 4] +
1049
src1_row[sj + 7] + src1_row[sj + 10] +
1050
src2_row[sj + 1] + src2_row[sj + 4] +
1051
src2_row[sj + 7] + src2_row[sj + 10] +
1052
src3_row[sj + 1] + src3_row[sj + 4] +
1053
src3_row[sj + 7] + src3_row[sj + 10]);
1054
1055
dst_row[dj + 2] = areaDownsamplingDivision<opencv_like,4>(
1056
(u16)src0_row[sj + 2] + src0_row[sj + 5] +
1057
src0_row[sj + 8] + src0_row[sj + 11] +
1058
src1_row[sj + 2] + src1_row[sj + 5] +
1059
src1_row[sj + 8] + src1_row[sj + 11] +
1060
src2_row[sj + 2] + src2_row[sj + 5] +
1061
src2_row[sj + 8] + src2_row[sj + 11] +
1062
src3_row[sj + 2] + src3_row[sj + 5] +
1063
src3_row[sj + 8] + src3_row[sj + 11]);
1064
}
1065
}
1066
}
1067
}
1068
#else
1069
(void)dsize;
1070
(void)srcBase;
1071
(void)srcStride;
1072
(void)dstBase;
1073
(void)dstStride;
1074
(void)wr;
1075
(void)hr;
1076
#endif
1077
(void)ssize;
1078
}
1079
1080
/*
 * Area-averaging resize that matches OpenCV's rounding behaviour.
 * Thin wrapper: dispatches to resizeAreaRounding with the
 * opencv_like template flag set to true.
 */
void resizeAreaOpenCV(const Size2D &ssize, const Size2D &dsize,
                      const u8 * srcBase, ptrdiff_t srcStride,
                      u8 * dstBase, ptrdiff_t dstStride,
                      f32 wr, f32 hr, u32 channels)
{
    resizeAreaRounding<true>(ssize, dsize, srcBase, srcStride, dstBase, dstStride, wr, hr, channels);
}
1087
1088
/*
 * Area-averaging resize with carotene's native rounding.
 * Thin wrapper: dispatches to resizeAreaRounding with the
 * opencv_like template flag set to false.
 */
void resizeArea(const Size2D &ssize, const Size2D &dsize,
                const u8 * srcBase, ptrdiff_t srcStride,
                u8 * dstBase, ptrdiff_t dstStride,
                f32 wr, f32 hr, u32 channels)
{
    resizeAreaRounding<false>(ssize, dsize, srcBase, srcStride, dstBase, dstStride, wr, hr, channels);
}
1095
1096
#ifdef CAROTENE_NEON
1097
1098
namespace {
1099
1100
/*
 * One bilinear interpolation step producing 8 destination pixels.
 *
 * vr1, vr2      : 16 source pixels each, from two adjacent source rows.
 * vlutl, vluth  : table-lookup indexes selecting, for each of the 8
 *                 output lanes, the low/high horizontal neighbour.
 * vrw           : row (vertical) weight, replicated per lane.
 * vcw0, vcw1    : column (horizontal) weights for the low and high
 *                 4-lane halves.
 *
 * Per row it computes  h + (l - h) * cw  in f32 (since the difference
 * l - h is formed first, only one multiply-accumulate per half is
 * needed), then blends the two rows as  r2 + (r1 - r2) * rw,  and
 * finally narrows f32 -> u32 -> u16 -> u8.
 */
uint8x8_t resizeLinearStep(uint8x16_t vr1, uint8x16_t vr2,
                           uint8x8_t vlutl, uint8x8_t vluth,
                           float32x4_t vrw, float32x4_t vcw0, float32x4_t vcw1)
{
    // Gather the left/right horizontal neighbours for both rows.
    uint8x8_t vr1l = internal::vqtbl1_u8(vr1, vlutl);
    uint8x8_t vr1h = internal::vqtbl1_u8(vr1, vluth);
    uint8x8_t vr2l = internal::vqtbl1_u8(vr2, vlutl);
    uint8x8_t vr2h = internal::vqtbl1_u8(vr2, vluth);

    // Widen the "high" neighbours; they are the interpolation base.
    uint16x8_t v1hw = vmovl_u8(vr1h);
    uint16x8_t v2hw = vmovl_u8(vr2h);

    // Signed horizontal differences (l - h) for each row.
    int16x8_t v1df = vreinterpretq_s16_u16(vsubl_u8(vr1l, vr1h));
    int16x8_t v2df = vreinterpretq_s16_u16(vsubl_u8(vr2l, vr2h));

    // Promote the bases to f32, split into low/high 4-lane halves.
    float32x4_t v1L = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v1hw)));
    float32x4_t v1H = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v1hw)));
    float32x4_t v2L = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v2hw)));
    float32x4_t v2H = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v2hw)));

    // Horizontal blend: base + diff * column_weight.
    v1L = vmlaq_f32(v1L, vcvtq_f32_s32(vmovl_s16(vget_low_s16(v1df))), vcw0);
    v1H = vmlaq_f32(v1H, vcvtq_f32_s32(vmovl_s16(vget_high_s16(v1df))), vcw1);
    v2L = vmlaq_f32(v2L, vcvtq_f32_s32(vmovl_s16(vget_low_s16(v2df))), vcw0);
    v2H = vmlaq_f32(v2H, vcvtq_f32_s32(vmovl_s16(vget_high_s16(v2df))), vcw1);

    // Vertical blend: row2 + (row1 - row2) * row_weight.
    float32x4_t vdiffL = vsubq_f32(v1L, v2L);
    float32x4_t vdiffH = vsubq_f32(v1H, v2H);

    float32x4_t vL = vmlaq_f32(v2L, vdiffL, vrw);
    float32x4_t vH = vmlaq_f32(v2H, vdiffH, vrw);
    // Narrow back down to 8-bit pixels (f32 -> u32 -> u16 -> u8).
    uint16x4_t vL_ = vmovn_u32(vcvtq_u32_f32(vL));
    uint16x4_t vH_ = vmovn_u32(vcvtq_u32_f32(vH));
    return vmovn_u16(vcombine_u16(vL_, vH_));
}
1134
1135
} // namespace
1136
1137
namespace {
1138
1139
/*
 * Bilinear u8 resize core. Works on the destination in 8-row strips:
 *
 *  1. Vertical pass: for each strip, pick the two bracketing source
 *     rows per destination row and blend them with 7-bit fixed-point
 *     weights -- (l1 * w + l2 * (128 - w) + 64) / 128 -- then transpose
 *     8x8 tiles into the scratch buffer 'buf' so columns become
 *     contiguous.
 *  2. Horizontal pass: blend pairs of transposed columns (pointers
 *     precomputed in 'gcols', weights in 'gcweight') with the same
 *     fixed-point scheme, transpose back, and store 8 output rows.
 *
 * Tails that are not multiples of 8 are handled by re-running the last
 * full 8-wide/8-high chunk via the goto labels; this overlaps already
 * computed pixels, which is harmless since results are identical.
 *
 * Assumes ssize.width >= 8 and dsize >= 8x8 so the tail backtracking
 * (width - 8 / height - 8) stays in range -- TODO confirm against the
 * callers' isSupported checks.
 */
void resize_bilinear_rows(const Size2D &ssize, const Size2D &dsize,
                          const u8 * srcBase, ptrdiff_t srcStride,
                          u8 * dstBase, ptrdiff_t dstStride,
                          f32 hr, const u8** gcols, u8* gcweight, u8* buf)
{
    f32 scale_y_offset = 0.5f * hr - 0.5f;  // center-aligned sampling offset

    // Sizes rounded down to multiples of 8 for the vectorized main loops.
    size_t dst_h8 = dsize.height & ~7;
    size_t dst_w8 = dsize.width & ~7;
    size_t src_w8 = ssize.width & ~7;

    size_t r = 0;
    for (; r < dst_h8; r += 8)
    {
resize8u_xystretch:
        // For each of the 8 destination rows: pointers to the two
        // bracketing source rows (rows[2i], rows[2i+1]) and the 7-bit
        // weight of the upper row.
        const u8* rows[16];
        u8 rweight[8];

        for (u32 i = 0; i < 8; ++i)
        {
            f32 w = (i + r) * hr + scale_y_offset;
            ptrdiff_t src_row = floorf(w);
            ptrdiff_t src_row2 = src_row + 1;

            rweight[i] = (u8)((src_row2-w) * 128);

            // Clamp to the valid source row range at the image borders.
            if (src_row < 0)
                src_row = 0;
            if (src_row2 >= (ptrdiff_t)ssize.height)
                src_row2 = ssize.height-1;

            rows[2 * i] = srcBase + src_row * srcStride;
            rows[2 * i + 1] = srcBase + src_row2 * srcStride;
        }

        // Broadcast the row weights (w) and their complements (128 - w).
        uint8x8_t vr0w = vdup_n_u8(rweight[0]);
        uint8x8_t vr1w = vdup_n_u8(rweight[1]);
        uint8x8_t vr2w = vdup_n_u8(rweight[2]);
        uint8x8_t vr3w = vdup_n_u8(rweight[3]);
        uint8x8_t vr4w = vdup_n_u8(rweight[4]);
        uint8x8_t vr5w = vdup_n_u8(rweight[5]);
        uint8x8_t vr6w = vdup_n_u8(rweight[6]);
        uint8x8_t vr7w = vdup_n_u8(rweight[7]);

        uint8x8_t vr0w2 = vdup_n_u8(128 - rweight[0]);
        uint8x8_t vr1w2 = vdup_n_u8(128 - rweight[1]);
        uint8x8_t vr2w2 = vdup_n_u8(128 - rweight[2]);
        uint8x8_t vr3w2 = vdup_n_u8(128 - rweight[3]);
        uint8x8_t vr4w2 = vdup_n_u8(128 - rweight[4]);
        uint8x8_t vr5w2 = vdup_n_u8(128 - rweight[5]);
        uint8x8_t vr6w2 = vdup_n_u8(128 - rweight[6]);
        uint8x8_t vr7w2 = vdup_n_u8(128 - rweight[7]);

        // ---- Vertical pass over 8-pixel column chunks ----
        size_t col = 0;
        for(; col < src_w8; col += 8)
        {
            internal::prefetch(rows[3] + col);
            internal::prefetch(rows[7] + col);
            internal::prefetch(rows[11] + col);
            internal::prefetch(rows[15] + col);
resize8u_ystretch:
            // NOTE: loads and multiply-accumulates below are deliberately
            // interleaved across the 8 rows for instruction-level
            // parallelism; do not reorder casually.
            uint8x8_t vsrc0l1 = vld1_u8(rows[0] + col);
            uint8x8_t vsrc0l2 = vld1_u8(rows[1] + col);
            uint8x8_t vsrc1l1 = vld1_u8(rows[2] + col);
            uint8x8_t vsrc1l2 = vld1_u8(rows[3] + col);

            // (l1 * w + l2 * (128 - w) + 64) / 128
            uint16x8_t vdst0l = vmull_u8(vsrc0l1, vr0w);
            uint16x8_t vdst1l = vmull_u8(vsrc1l1, vr1w);

            uint8x8_t vsrc2l1 = vld1_u8(rows[4] + col);
            uint8x8_t vsrc2l2 = vld1_u8(rows[5] + col);
            uint8x8_t vsrc3l1 = vld1_u8(rows[6] + col);
            uint8x8_t vsrc3l2 = vld1_u8(rows[7] + col);

            vdst0l = vmlal_u8(vdst0l, vsrc0l2, vr0w2);
            vdst1l = vmlal_u8(vdst1l, vsrc1l2, vr1w2);
            uint16x8_t vdst2l = vmull_u8(vsrc2l1, vr2w);
            uint16x8_t vdst3l = vmull_u8(vsrc3l1, vr3w);

            uint8x8_t vsrc4l1 = vld1_u8(rows[8] + col);
            uint8x8_t vsrc4l2 = vld1_u8(rows[9] + col);
            uint8x8_t vsrc5l1 = vld1_u8(rows[10] + col);
            uint8x8_t vsrc5l2 = vld1_u8(rows[11] + col);

            vdst2l = vmlal_u8(vdst2l, vsrc2l2, vr2w2);
            vdst3l = vmlal_u8(vdst3l, vsrc3l2, vr3w2);
            uint16x8_t vdst4l = vmull_u8(vsrc4l1, vr4w);
            uint16x8_t vdst5l = vmull_u8(vsrc5l1, vr5w);

            uint8x8_t vsrc6l1 = vld1_u8(rows[12] + col);
            uint8x8_t vsrc6l2 = vld1_u8(rows[13] + col);
            uint8x8_t vsrc7l1 = vld1_u8(rows[14] + col);
            uint8x8_t vsrc7l2 = vld1_u8(rows[15] + col);

            // vrshrn_n_u16(x, 7) performs the rounding "+64, /128" step.
            uint8x8_t vdst0 = vrshrn_n_u16(vdst0l, 7);
            uint8x8_t vdst1 = vrshrn_n_u16(vdst1l, 7);
            vdst4l = vmlal_u8(vdst4l, vsrc4l2, vr4w2);
            vdst5l = vmlal_u8(vdst5l, vsrc5l2, vr5w2);
            uint16x8_t vdst6l = vmull_u8(vsrc6l1, vr6w);
            uint16x8_t vdst7l = vmull_u8(vsrc7l1, vr7w);

            uint8x8_t vdst2 = vrshrn_n_u16(vdst2l, 7);
            uint8x8_t vdst3 = vrshrn_n_u16(vdst3l, 7);
            vdst6l = vmlal_u8(vdst6l, vsrc6l2, vr6w2);
            vdst7l = vmlal_u8(vdst7l, vsrc7l2, vr7w2);

            uint8x8_t vdst4 = vrshrn_n_u16(vdst4l, 7);
            uint8x8_t vdst5 = vrshrn_n_u16(vdst5l, 7);
            uint8x8_t vdst6 = vrshrn_n_u16(vdst6l, 7);
            uint8x8_t vdst7 = vrshrn_n_u16(vdst7l, 7);

            // == 8x8 matrix transpose ==
            // Three trn stages (8-bit, 16-bit, 32-bit) turn 8 row
            // vectors into 8 column vectors.

            //00 01 02 03 04 05 06 07  d0
            //10 11 12 13 14 15 16 17  d1
            //20 21 22 23 24 25 26 27  d2
            //30 31 32 33 34 35 36 37  d3
            //40 41 42 43 44 45 46 47  d4
            //50 51 52 53 54 55 56 57  d5
            //60 61 62 63 64 65 66 67  d6
            //70 71 72 73 74 75 76 77  d7

            uint8x8x2_t vdst10t = vtrn_u8(vdst0, vdst1);
            uint8x8x2_t vdst32t = vtrn_u8(vdst2, vdst3);
            uint8x8x2_t vdst54t = vtrn_u8(vdst4, vdst5);
            uint8x8x2_t vdst76t = vtrn_u8(vdst6, vdst7);

            uint8x16_t vd1d0 = vcombine_u8(vdst10t.val[0], vdst10t.val[1]);
            uint8x16_t vd3d2 = vcombine_u8(vdst32t.val[0], vdst32t.val[1]);
            uint8x16_t vd5d4 = vcombine_u8(vdst54t.val[0], vdst54t.val[1]);
            uint8x16_t vd7d6 = vcombine_u8(vdst76t.val[0], vdst76t.val[1]);

            //00 10 02 12 04 14 06 16  d0
            //01 11 03 13 05 15 07 17  d1
            //20 30 22 32 24 34 26 36  d2
            //21 31 23 33 25 35 27 37  d3
            //40 50 42 52 44 54 46 56  d4
            //41 51 43 53 45 55 47 57  d5
            //60 70 62 72 64 74 66 76  d6
            //61 71 63 73 65 75 67 77  d7

            uint16x8x2_t vq1q0t = vtrnq_u16((uint16x8_t)vd1d0, (uint16x8_t)vd3d2);
            uint16x8x2_t vq3q2t = vtrnq_u16((uint16x8_t)vd5d4, (uint16x8_t)vd7d6);

            //00 10 20 30 04 14 24 34  d0
            //01 11 21 31 05 15 25 35  d1
            //02 12 22 32 06 16 26 36  d2
            //03 13 23 33 07 17 27 37  d3
            //40 50 60 70 44 54 64 74  d4
            //41 51 61 71 45 55 65 75  d5
            //42 52 62 72 46 56 66 76  d6
            //43 53 63 73 47 57 67 77  d7

            uint32x4x2_t vq2q0t = vtrnq_u32((uint32x4_t)vq1q0t.val[0], (uint32x4_t)vq3q2t.val[0]);
            uint32x4x2_t vq3q1t = vtrnq_u32((uint32x4_t)vq1q0t.val[1], (uint32x4_t)vq3q2t.val[1]);

            //00 10 20 30 40 50 60 70  d0
            //01 11 21 31 41 51 61 71  d1
            //02 12 22 32 42 52 62 72  d2
            //03 13 23 33 43 53 63 73  d3
            //04 14 24 34 44 54 64 74  d4
            //05 15 25 35 45 55 65 75  d5
            //06 16 26 36 46 56 66 76  d6
            //07 17 27 37 47 57 67 77  d7

            // Each source column now occupies 8 consecutive bytes in buf.
            vst1q_u8(buf + col * 8 + 0, (uint8x16_t)vq2q0t.val[0]);
            vst1q_u8(buf + col * 8 + 16, (uint8x16_t)vq3q1t.val[0]);
            vst1q_u8(buf + col * 8 + 32, (uint8x16_t)vq2q0t.val[1]);
            vst1q_u8(buf + col * 8 + 48, (uint8x16_t)vq3q1t.val[1]);
        }

        // Width tail: redo the last full 8-column chunk (overlap is fine).
        if (col < ssize.width)
        {
            col = ssize.width - 8;
            goto resize8u_ystretch;
        }

        // ---- Horizontal pass: blend transposed columns from buf ----
        u8* dst_data = dstBase + r * dstStride;
        const u8** cols = gcols;
        u8* cweight = gcweight;

        size_t dcol = 0;
        for (; dcol < dst_w8; dcol += 8, cols += 16, cweight += 8)
        {
            internal::prefetch(cols[0], 64*4);
resize8u_xstretch:
            // Column weights (w) and complements (128 - w).
            uint8x8_t vc0w = vdup_n_u8(cweight[0]);
            uint8x8_t vc1w = vdup_n_u8(cweight[1]);
            uint8x8_t vc2w = vdup_n_u8(cweight[2]);
            uint8x8_t vc3w = vdup_n_u8(cweight[3]);
            uint8x8_t vc4w = vdup_n_u8(cweight[4]);
            uint8x8_t vc5w = vdup_n_u8(cweight[5]);
            uint8x8_t vc6w = vdup_n_u8(cweight[6]);
            uint8x8_t vc7w = vdup_n_u8(cweight[7]);

            uint8x8_t vc0w2 = vdup_n_u8(128 - cweight[0]);
            uint8x8_t vc1w2 = vdup_n_u8(128 - cweight[1]);
            uint8x8_t vc2w2 = vdup_n_u8(128 - cweight[2]);
            uint8x8_t vc3w2 = vdup_n_u8(128 - cweight[3]);
            uint8x8_t vc4w2 = vdup_n_u8(128 - cweight[4]);
            uint8x8_t vc5w2 = vdup_n_u8(128 - cweight[5]);
            uint8x8_t vc6w2 = vdup_n_u8(128 - cweight[6]);
            uint8x8_t vc7w2 = vdup_n_u8(128 - cweight[7]);

            // cols[2k] / cols[2k+1] point at the left/right transposed
            // source columns for destination column k.
            uint8x8_t vsrc0l1 = vld1_u8(cols[0]);
            uint8x8_t vsrc0l2 = vld1_u8(cols[1]);
            uint8x8_t vsrc1l1 = vld1_u8(cols[2]);
            uint8x8_t vsrc1l2 = vld1_u8(cols[3]);
            uint8x8_t vsrc2l1 = vld1_u8(cols[4]);
            uint8x8_t vsrc2l2 = vld1_u8(cols[5]);
            uint8x8_t vsrc3l1 = vld1_u8(cols[6]);
            uint8x8_t vsrc3l2 = vld1_u8(cols[7]);
            uint8x8_t vsrc4l1 = vld1_u8(cols[8]);
            uint8x8_t vsrc4l2 = vld1_u8(cols[9]);
            uint8x8_t vsrc5l1 = vld1_u8(cols[10]);
            uint8x8_t vsrc5l2 = vld1_u8(cols[11]);
            uint8x8_t vsrc6l1 = vld1_u8(cols[12]);
            uint8x8_t vsrc6l2 = vld1_u8(cols[13]);
            uint8x8_t vsrc7l1 = vld1_u8(cols[14]);
            uint8x8_t vsrc7l2 = vld1_u8(cols[15]);

            // (l1 * w + l2 * (128 - w) + 64) / 128
            uint16x8_t vdst0l = vmull_u8(vsrc0l1, vc0w);
            uint16x8_t vdst1l = vmull_u8(vsrc1l1, vc1w);
            uint16x8_t vdst2l = vmull_u8(vsrc2l1, vc2w);
            uint16x8_t vdst3l = vmull_u8(vsrc3l1, vc3w);
            uint16x8_t vdst4l = vmull_u8(vsrc4l1, vc4w);
            uint16x8_t vdst5l = vmull_u8(vsrc5l1, vc5w);
            uint16x8_t vdst6l = vmull_u8(vsrc6l1, vc6w);
            uint16x8_t vdst7l = vmull_u8(vsrc7l1, vc7w);

            vdst0l = vmlal_u8(vdst0l, vsrc0l2, vc0w2);
            vdst1l = vmlal_u8(vdst1l, vsrc1l2, vc1w2);
            vdst2l = vmlal_u8(vdst2l, vsrc2l2, vc2w2);
            vdst3l = vmlal_u8(vdst3l, vsrc3l2, vc3w2);
            vdst4l = vmlal_u8(vdst4l, vsrc4l2, vc4w2);
            vdst5l = vmlal_u8(vdst5l, vsrc5l2, vc5w2);
            vdst6l = vmlal_u8(vdst6l, vsrc6l2, vc6w2);
            vdst7l = vmlal_u8(vdst7l, vsrc7l2, vc7w2);

            uint8x8_t vdst0 = vrshrn_n_u16(vdst0l, 7);
            uint8x8_t vdst1 = vrshrn_n_u16(vdst1l, 7);
            uint8x8_t vdst2 = vrshrn_n_u16(vdst2l, 7);
            uint8x8_t vdst3 = vrshrn_n_u16(vdst3l, 7);
            uint8x8_t vdst4 = vrshrn_n_u16(vdst4l, 7);
            uint8x8_t vdst5 = vrshrn_n_u16(vdst5l, 7);
            uint8x8_t vdst6 = vrshrn_n_u16(vdst6l, 7);
            uint8x8_t vdst7 = vrshrn_n_u16(vdst7l, 7);

            // == 8x8 matrix transpose == (back to row-major for the store)
            uint8x8x2_t vdst10t = vtrn_u8(vdst0, vdst1);
            uint8x8x2_t vdst32t = vtrn_u8(vdst2, vdst3);
            uint8x8x2_t vdst54t = vtrn_u8(vdst4, vdst5);
            uint8x8x2_t vdst76t = vtrn_u8(vdst6, vdst7);
            uint8x16_t vd1d0 = vcombine_u8(vdst10t.val[0], vdst10t.val[1]);
            uint8x16_t vd3d2 = vcombine_u8(vdst32t.val[0], vdst32t.val[1]);
            uint8x16_t vd5d4 = vcombine_u8(vdst54t.val[0], vdst54t.val[1]);
            uint8x16_t vd7d6 = vcombine_u8(vdst76t.val[0], vdst76t.val[1]);
            uint16x8x2_t vq1q0t = vtrnq_u16((uint16x8_t)vd1d0, (uint16x8_t)vd3d2);
            uint16x8x2_t vq3q2t = vtrnq_u16((uint16x8_t)vd5d4, (uint16x8_t)vd7d6);
            uint32x4x2_t vq2q0t = vtrnq_u32((uint32x4_t)vq1q0t.val[0], (uint32x4_t)vq3q2t.val[0]);
            uint32x4x2_t vq3q1t = vtrnq_u32((uint32x4_t)vq1q0t.val[1], (uint32x4_t)vq3q2t.val[1]);

            //save results
            vst1_u8(dst_data + 0 * dstStride + dcol, (uint8x8_t)vget_low_u32(vq2q0t.val[0]));
            vst1_u8(dst_data + 1 * dstStride + dcol, (uint8x8_t)vget_high_u32(vq2q0t.val[0]));
            vst1_u8(dst_data + 2 * dstStride + dcol, (uint8x8_t)vget_low_u32(vq3q1t.val[0]));
            vst1_u8(dst_data + 3 * dstStride + dcol, (uint8x8_t)vget_high_u32(vq3q1t.val[0]));
            vst1_u8(dst_data + 4 * dstStride + dcol, (uint8x8_t)vget_low_u32(vq2q0t.val[1]));
            vst1_u8(dst_data + 5 * dstStride + dcol, (uint8x8_t)vget_high_u32(vq2q0t.val[1]));
            vst1_u8(dst_data + 6 * dstStride + dcol, (uint8x8_t)vget_low_u32(vq3q1t.val[1]));
            vst1_u8(dst_data + 7 * dstStride + dcol, (uint8x8_t)vget_high_u32(vq3q1t.val[1]));
        }

        // Destination width tail: redo the last full 8-column chunk.
        if (dcol < dsize.width)
        {
            dcol = dsize.width - 8;
            cols = gcols + dcol * 2;
            cweight = gcweight + dcol;
            goto resize8u_xstretch;
        }
    }

    // Destination height tail: redo the last full 8-row strip.
    if (r < dsize.height)
    {
        r = dsize.height - 8;
        goto resize8u_xystretch;
    }
}
1429
1430
// Helper bundling the NEON constants used by resizeLinearOpenCVchan to
// compute per-destination-column source indexes; specialized by channel count.
template <int channels> struct resizeLinearInternals;
// Single-channel specialization: the running column index vi advances
// 4 per iteration (one int32x4 lane per destination column), and
// clamped indexes are converted to byte offsets into the transposed
// scratch buffer (each source column occupies 8 bytes there).
template <> struct resizeLinearInternals<1>
{
    int32x4_t vc_upd;  // per-call index increment (4 columns)
    int32x4_t vc0;     // zero: lower clamp bound
    int32x4_t vcmax;   // srccols - 1: upper clamp bound

    // Initializes vi to the first four column indexes {0,1,2,3}.
    inline resizeLinearInternals(int32x4_t & vi, u32 srccols)
    {
        vc_upd = vdupq_n_s32(4);
        vc0 = vdupq_n_s32(0);
        vcmax = vdupq_n_s32(srccols-1);

        s32 tmp0123[] = {0, 1, 2, 3 };
        vi = vld1q_s32(tmp0123);
    }
    // Clamps the high/low neighbour indexes into [0, srccols-1], scales
    // them to byte offsets (<< 3 == * 8 bytes per column), and steps vi.
    inline void updateIndexes(int32x4_t & vi, int32x4_t & vsrch, int32x4_t & vsrcl)
    {
        vsrch = vminq_s32(vsrch, vcmax);
        vsrcl = vmaxq_s32(vsrcl, vc0);
        vsrcl = vminq_s32(vsrcl, vcmax);//for safe tail
        vsrch = vshlq_n_s32(vsrch, 3);
        vsrcl = vshlq_n_s32(vsrcl, 3);
        vi = vaddq_s32(vi, vc_upd);
    }
};
1456
// Four-channel specialization: vi counts destination pixels one at a
// time (all four lanes share the pixel index); converted offsets are
// scaled by 32 bytes per pixel (<< 5) with per-lane offsets {0,8,16,24}
// so each lane addresses a different channel slot in the scratch buffer.
template <> struct resizeLinearInternals<4>
{
    int32x4_t vc_upd;   // per-call pixel-index increment (1)
    int32x4_t vc0;      // zero: lower clamp bound
    int32x4_t vcmax;    // srccols - 1: upper clamp bound
    int32x4_t v0123x8;  // per-lane channel offsets {0, 8, 16, 24} bytes

    // Initializes vi to pixel index 0 in all lanes.
    inline resizeLinearInternals(int32x4_t & vi, u32 srccols)
    {
        vc_upd = vdupq_n_s32(1);
        vc0 = vdupq_n_s32(0);
        vcmax = vdupq_n_s32(srccols-1);
        s32 tmp0123x8[] = {0, 8, 16, 24};
        v0123x8 = vld1q_s32(tmp0123x8);

        vi = vc0;
    }
    // Clamps neighbour indexes, scales to byte offsets (<< 5 == * 32),
    // adds the channel-lane offsets, and steps vi to the next pixel.
    // NOTE(review): unlike the 1-channel case there is no upper clamp on
    // vsrcl here -- presumably guaranteed by the caller; confirm.
    inline void updateIndexes(int32x4_t & vi, int32x4_t & vsrch, int32x4_t & vsrcl)
    {
        vsrch = vminq_s32(vsrch, vcmax);
        vsrcl = vmaxq_s32(vsrcl, vc0);
        vsrch = vshlq_n_s32(vsrch, 5);
        vsrcl = vshlq_n_s32(vsrcl, 5);
        vsrch = vaddq_s32(vsrch, v0123x8);
        vsrcl = vaddq_s32(vsrcl, v0123x8);
        vi = vaddq_s32(vi, vc_upd);
    }
};
1484
1485
/*
 * OpenCV-compatible bilinear resize driver for u8 images with a
 * compile-time channel count.
 *
 * Precomputes, for every destination column, the pair of source-column
 * pointers (into the transposed scratch buffer) and the 7-bit blend
 * weight, processing 8 destination columns per loop iteration (two
 * int32x4 groups of 4), then delegates the actual row/column blending
 * to resize_bilinear_rows.
 *
 * Indexes are kept in 32-bit lanes, so source dimensions beyond 2^31-1
 * columns are not supported (see the original note below).
 */
template <int channels>
void resizeLinearOpenCVchan(const Size2D &_ssize, const Size2D &_dsize,
                            const u8 * srcBase, ptrdiff_t srcStride,
                            u8 * dstBase, ptrdiff_t dstStride,
                            f32 wr, f32 hr)
{
    float scale_x_offset = 0.5f * wr - 0.5f;  // center-aligned sampling offset

    // Work in "element" coordinates: width scaled by channel count.
    Size2D ssize(_ssize.width*channels, _ssize.height);
    Size2D dsize(_dsize.width*channels, _dsize.height);

    // All buffers are rounded up to multiples of 8 for the vector loops.
    std::vector<u8> gcweight((dsize.width + 7) & ~7);
    std::vector<const u8*> gcols(((dsize.width + 7) & ~7) * 2);
    std::vector<u8> buf(((ssize.width + 7) & ~7) * 8); // (8 rows) x (width of src)

    float32x4_t vscale_x = vdupq_n_f32(wr);
    float32x4_t vscale_x_offset = vdupq_n_f32(scale_x_offset);
    int32x4_t vc1 = vdupq_n_s32(1);
    float32x4_t vc128f = vdupq_n_f32(128.0f);

    int32x4_t vi;
    resizeLinearInternals<channels> indexes(vi, _ssize.width);//u32 is used to store indexes
    //so we could get issues on src image dimensions greater than (2^32-1)

    for (size_t dcol = 0; dcol < dsize.width; dcol += 8)
    {
        // idx[0..7]  = low-neighbour buffer offsets,
        // idx[8..15] = high-neighbour buffer offsets.
        s32 idx[16];

        // First group of 4 columns: compute w = dcol*wr + offset, split
        // into ceil (vsrch) / ceil-1 (vsrcl) neighbour indexes; vmask is
        // -1 where the truncated value fell below w (i.e. w was not an
        // exact integer), which the subtraction turns into +1.
        float32x4_t vif = vcvtq_f32_s32(vi);
        float32x4_t vw = vmlaq_f32(vscale_x_offset, vscale_x, vif);
        int32x4_t vwi = vcvtq_s32_f32(vw);
        float32x4_t vwif = vcvtq_f32_s32(vwi);
        int32x4_t vmask = (int32x4_t)vcltq_f32(vwif, vw);
        int32x4_t vsrch = vsubq_s32(vwi, vmask);
        int32x4_t vsrcl = vsubq_s32(vsrch, vc1);
        float32x4_t vsrchf = vcvtq_f32_s32(vsrch);
        float32x4_t vw2 = vsubq_f32(vsrchf, vw);

        // Fixed-point weight: (high_index - w) * 128.
        vw2 = vmulq_f32(vw2, vc128f);
        uint32x4_t vw32u = vcvtq_u32_f32(vw2);
        uint16x4_t vw16ul = vmovn_u32(vw32u);
        indexes.updateIndexes(vi, vsrch, vsrcl);

        vst1q_s32(idx + 0, vsrcl);
        vst1q_s32(idx + 8, vsrch);

        // Second group of 4 columns: identical computation on the
        // advanced index vector.
        vif = vcvtq_f32_s32(vi);
        vw = vmlaq_f32(vscale_x_offset, vscale_x, vif);
        vwi = vcvtq_s32_f32(vw);
        vwif = vcvtq_f32_s32(vwi);
        vmask = (int32x4_t)vcltq_f32(vwif, vw);
        vsrch = vsubq_s32(vwi, vmask);
        vsrcl = vsubq_s32(vsrch, vc1);
        vsrchf = vcvtq_f32_s32(vsrch);
        vw2 = vsubq_f32(vsrchf, vw);

        vw2 = vmulq_f32(vw2, vc128f);
        vw32u = vcvtq_u32_f32(vw2);
        indexes.updateIndexes(vi, vsrch, vsrcl);

        uint16x4_t vw16uh = vmovn_u32(vw32u);

        vst1q_s32(idx + 4, vsrcl);
        vst1q_s32(idx + 12, vsrch);

        // Narrow the 8 weights to u8.
        uint8x8_t vw8u = vmovn_u16(vcombine_u16(vw16ul, vw16uh));

        // Translate buffer offsets into column pointer pairs
        // (low neighbour, high neighbour) for resize_bilinear_rows.
        for (u32 i = 0; i < 8; ++i)
        {
            gcols[dcol * 2 + i*2] = &buf[idx[i]];
            gcols[dcol * 2 + i*2 + 1] = &buf[idx[i + 8]];
        }

        vst1_u8(&gcweight[dcol], vw8u);
    }

    resize_bilinear_rows(ssize, dsize, srcBase, srcStride, dstBase, dstStride, hr, &gcols[0], &gcweight[0], &buf[0]);
}
1563
1564
void downsample_bilinear_8uc1(const Size2D &ssize, const Size2D &dsize,
1565
const u8 * srcBase, ptrdiff_t srcStride,
1566
u8 * dstBase, ptrdiff_t dstStride,
1567
f32 wr, f32 hr)
1568
{
1569
internal::assertSupportedConfiguration(wr <= 2.f && hr <= 2.f);
1570
1571
enum { SHIFT_BITS = 11 };
1572
1573
f32 scale_x_offset = 0.5f * wr - 0.5f;
1574
f32 scale_y_offset = 0.5f * hr - 0.5f;
1575
1576
std::vector<s32> _buf(dsize.height*(2*(sizeof(ptrdiff_t)/sizeof(s32))+1)+1);
1577
ptrdiff_t* buf = (ptrdiff_t*)&_buf[0];
1578
s32* buf2 = (s32*)buf+2*(sizeof(ptrdiff_t)/sizeof(s32))*dsize.height;
1579
for(size_t row = 0; row < (size_t)dsize.height; ++row)
1580
{
1581
f32 r = row * hr + scale_y_offset;
1582
ptrdiff_t src_row = floorf(r);
1583
ptrdiff_t src_row2 = src_row + 1;
1584
1585
f32 rweight = src_row2 - r;
1586
buf2[row] = floorf(rweight * (1 << SHIFT_BITS) + 0.5f);
1587
buf[0 * dsize.height + row] = std::max<ptrdiff_t>(0, src_row);
1588
buf[1 * dsize.height + row] = std::min((ptrdiff_t)ssize.height-1, src_row2);
1589
}
1590
1591
#define USE_CORRECT_VERSION 0
1592
1593
ptrdiff_t col = 0;
1594
/***********************************************/
1595
for(; col <= (ptrdiff_t)dsize.width-16; col+=16)
1596
{
1597
ptrdiff_t col1[16];
1598
ptrdiff_t col2[16];
1599
s16 cwi[16];
1600
1601
for(s32 k = 0; k < 16; ++k)
1602
{
1603
f32 c = (col + k) * wr + scale_x_offset;
1604
col1[k] = (ptrdiff_t)c;
1605
col2[k] = col1[k] + 1;
1606
1607
cwi[k] = (short)floorf((col2[k] - c) * (1 << SHIFT_BITS) + 0.5f);
1608
1609
if(col1[k] < 0) col1[k] = 0;
1610
if(col2[k] >= (ptrdiff_t)ssize.width) col2[k] = ssize.width-1;
1611
}
1612
1613
ptrdiff_t x = std::min(col1[0], (ptrdiff_t)ssize.width-16);
1614
ptrdiff_t y = std::min(col1[8], (ptrdiff_t)ssize.width-16);
1615
u8 lutl[16];
1616
u8 luth[16];
1617
for(s32 k = 0; k < 8; ++k)
1618
{
1619
lutl[k] = (u8)(col1[k] - x);
1620
luth[k] = (u8)(col2[k] - x);
1621
lutl[k+8] = (u8)(col1[k+8] - y);
1622
luth[k+8] = (u8)(col2[k+8] - y);
1623
}
1624
1625
uint8x8_t vlutl = vld1_u8(lutl);
1626
uint8x8_t vluth = vld1_u8(luth);
1627
int16x8_t vcw = vld1q_s16(cwi);
1628
1629
uint8x8_t vlutl_ = vld1_u8(lutl+8);
1630
uint8x8_t vluth_ = vld1_u8(luth+8);
1631
int16x8_t vcw_ = vld1q_s16(cwi+8);
1632
1633
for(ptrdiff_t row = 0; row < (ptrdiff_t)dsize.height; ++row)
1634
{
1635
#if USE_CORRECT_VERSION
1636
int32x4_t vrw = vdupq_n_s32(buf2[row]);
1637
#else
1638
int16x8_t vrw = vdupq_n_s16((int16_t)buf2[row]);
1639
int16x8_t vrW = vdupq_n_s16((int16_t)((1 << SHIFT_BITS) - buf2[row]));
1640
#endif
1641
1642
internal::prefetch(internal::getRowPtr(srcBase, srcStride, buf[1*dsize.height + row]) + x, 2*srcStride);
1643
internal::prefetch(internal::getRowPtr(srcBase, srcStride, buf[1*dsize.height + row]) + x, 3*srcStride);
1644
1645
{
1646
union { uint8x16_t v; uint8x8x2_t w; } vr1 = { vld1q_u8(internal::getRowPtr(srcBase, srcStride, buf[0*dsize.height + row]) + x) };
1647
union { uint8x16_t v; uint8x8x2_t w; } vr2 = { vld1q_u8(internal::getRowPtr(srcBase, srcStride, buf[1*dsize.height + row]) + x) };
1648
1649
uint8x8_t vr1l = vtbl2_u8(vr1.w, vlutl);
1650
uint8x8_t vr1h = vtbl2_u8(vr1.w, vluth);
1651
uint8x8_t vr2l = vtbl2_u8(vr2.w, vlutl);
1652
uint8x8_t vr2h = vtbl2_u8(vr2.w, vluth);
1653
1654
uint16x8_t v1hw = vmovl_u8(vr1h);
1655
uint16x8_t v2hw = vmovl_u8(vr2h);
1656
1657
int16x8_t v1df = vreinterpretq_s16_u16(vsubl_u8(vr1l, vr1h));
1658
int16x8_t v2df = vreinterpretq_s16_u16(vsubl_u8(vr2l, vr2h));
1659
1660
int32x4_t v1L = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(v1hw), SHIFT_BITS));
1661
int32x4_t v1H = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(v1hw), SHIFT_BITS));
1662
int32x4_t v2L = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(v2hw), SHIFT_BITS));
1663
int32x4_t v2H = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(v2hw), SHIFT_BITS));
1664
1665
v1L = vmlal_s16(v1L, vget_low_s16(v1df), vget_low_s16(vcw));
1666
v1H = vmlal_s16(v1H, vget_high_s16(v1df), vget_high_s16(vcw));
1667
v2L = vmlal_s16(v2L, vget_low_s16(v2df), vget_low_s16(vcw));
1668
v2H = vmlal_s16(v2H, vget_high_s16(v2df), vget_high_s16(vcw));
1669
1670
#if USE_CORRECT_VERSION
1671
/* correct version */
1672
int32x4_t vL = vshlq_n_s32(v2L, SHIFT_BITS);
1673
int32x4_t vH = vshlq_n_s32(v2H, SHIFT_BITS);
1674
int32x4_t vdiffL = vsubq_s32(v1L, v2L);
1675
int32x4_t vdiffH = vsubq_s32(v1H, v2H);
1676
1677
vL = vmlaq_s32(vL, vdiffL, vrw);
1678
vH = vmlaq_s32(vH, vdiffH, vrw);
1679
uint16x4_t vL_ = vqrshrun_n_s32(vL, 2*SHIFT_BITS - 8);
1680
uint16x4_t vH_ = vqrshrun_n_s32(vH, 2*SHIFT_BITS - 8);
1681
uint8x8_t vres = vrshrn_n_u16(vcombine_u16(vL_, vH_), 8);
1682
vst1_u8(internal::getRowPtr(dstBase, dstStride, row) + col, vres);
1683
#else
1684
/* ugly version matching to OpenCV's SSE optimization */
1685
int16x4_t v1Ls = vshrn_n_s32(v1L, 4);
1686
int16x4_t v1Hs = vshrn_n_s32(v1H, 4);
1687
int16x4_t v2Ls = vshrn_n_s32(v2L, 4);
1688
int16x4_t v2Hs = vshrn_n_s32(v2H, 4);
1689
1690
int16x8_t v1s = vqdmulhq_s16(vcombine_s16(v1Ls, v1Hs), vrw);
1691
int16x8_t v2s = vqdmulhq_s16(vcombine_s16(v2Ls, v2Hs), vrW);
1692
1693
int16x8_t vsum = vaddq_s16(vshrq_n_s16(v1s,1), vshrq_n_s16(v2s,1));
1694
uint8x8_t vres = vqrshrun_n_s16(vsum, 2);
1695
1696
vst1_u8(internal::getRowPtr(dstBase, dstStride, row) + col, vres);
1697
#endif
1698
}
1699
1700
{
1701
union { uint8x16_t v; uint8x8x2_t w; } vr1 = { vld1q_u8(internal::getRowPtr(srcBase, srcStride, buf[0*dsize.height + row]) + y) };
1702
union { uint8x16_t v; uint8x8x2_t w; } vr2 = { vld1q_u8(internal::getRowPtr(srcBase, srcStride, buf[1*dsize.height + row]) + y) };
1703
1704
uint8x8_t vr1l = vtbl2_u8(vr1.w, vlutl_);
1705
uint8x8_t vr1h = vtbl2_u8(vr1.w, vluth_);
1706
uint8x8_t vr2l = vtbl2_u8(vr2.w, vlutl_);
1707
uint8x8_t vr2h = vtbl2_u8(vr2.w, vluth_);
1708
1709
uint16x8_t v1hw = vmovl_u8(vr1h);
1710
uint16x8_t v2hw = vmovl_u8(vr2h);
1711
1712
int16x8_t v1df = vreinterpretq_s16_u16(vsubl_u8(vr1l, vr1h));
1713
int16x8_t v2df = vreinterpretq_s16_u16(vsubl_u8(vr2l, vr2h));
1714
1715
int32x4_t v1L = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(v1hw), SHIFT_BITS));
1716
int32x4_t v1H = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(v1hw), SHIFT_BITS));
1717
int32x4_t v2L = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(v2hw), SHIFT_BITS));
1718
int32x4_t v2H = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(v2hw), SHIFT_BITS));
1719
1720
v1L = vmlal_s16(v1L, vget_low_s16(v1df), vget_low_s16(vcw_));
1721
v1H = vmlal_s16(v1H, vget_high_s16(v1df), vget_high_s16(vcw_));
1722
v2L = vmlal_s16(v2L, vget_low_s16(v2df), vget_low_s16(vcw_));
1723
v2H = vmlal_s16(v2H, vget_high_s16(v2df), vget_high_s16(vcw_));
1724
1725
#if USE_CORRECT_VERSION
1726
/* correct version */
1727
int32x4_t vL = vshlq_n_s32(v2L, SHIFT_BITS);
1728
int32x4_t vH = vshlq_n_s32(v2H, SHIFT_BITS);
1729
int32x4_t vdiffL = vsubq_s32(v1L, v2L);
1730
int32x4_t vdiffH = vsubq_s32(v1H, v2H);
1731
1732
vL = vmlaq_s32(vL, vdiffL, vrw);
1733
vH = vmlaq_s32(vH, vdiffH, vrw);
1734
uint16x4_t vL_ = vqrshrun_n_s32(vL, 2*SHIFT_BITS - 8);
1735
uint16x4_t vH_ = vqrshrun_n_s32(vH, 2*SHIFT_BITS - 8);
1736
uint8x8_t vres = vrshrn_n_u16(vcombine_u16(vL_, vH_), 8);
1737
vst1_u8(internal::getRowPtr(dstBase, dstStride, row) + col + 8, vres);
1738
#else
1739
/* ugly version matching to OpenCV's SSE optimization */
1740
int16x4_t v1Ls = vshrn_n_s32(v1L, 4);
1741
int16x4_t v1Hs = vshrn_n_s32(v1H, 4);
1742
int16x4_t v2Ls = vshrn_n_s32(v2L, 4);
1743
int16x4_t v2Hs = vshrn_n_s32(v2H, 4);
1744
1745
int16x8_t v1s = vqdmulhq_s16(vcombine_s16(v1Ls, v1Hs), vrw);
1746
int16x8_t v2s = vqdmulhq_s16(vcombine_s16(v2Ls, v2Hs), vrW);
1747
1748
int16x8_t vsum = vaddq_s16(vshrq_n_s16(v1s,1), vshrq_n_s16(v2s,1));
1749
uint8x8_t vres = vqrshrun_n_s16(vsum, 2);
1750
1751
vst1_u8(internal::getRowPtr(dstBase, dstStride, row) + col + 8, vres);
1752
#endif
1753
}
1754
}
1755
}
1756
/***********************************************/
1757
for(; col <= (ptrdiff_t)dsize.width-8; col+=8)
1758
{
1759
downsample_bilinear_8uc1_col_loop8:
1760
ptrdiff_t col1[8];
1761
ptrdiff_t col2[8];
1762
s16 cwi[8];
1763
1764
for(s32 k = 0; k < 8; ++k)
1765
{
1766
f32 c = (col + k) * wr + scale_x_offset;
1767
col1[k] = (ptrdiff_t)c;
1768
col2[k] = col1[k] + 1;
1769
1770
cwi[k] = (s16)floorf((col2[k] - c) * (1 << SHIFT_BITS) + 0.5f);
1771
1772
if(col1[k] < 0) col1[k] = 0;
1773
if(col2[k] >= (ptrdiff_t)ssize.width) col2[k] = (ptrdiff_t)ssize.width-1;
1774
}
1775
1776
ptrdiff_t x = std::min(col1[0], (ptrdiff_t)ssize.width-16);
1777
u8 lutl[8];
1778
u8 luth[8];
1779
for(s32 k = 0; k < 8; ++k)
1780
{
1781
lutl[k] = (u8)(col1[k] - x);
1782
luth[k] = (u8)(col2[k] - x);
1783
}
1784
1785
uint8x8_t vlutl = vld1_u8(lutl);
1786
uint8x8_t vluth = vld1_u8(luth);
1787
int16x8_t vcw = vld1q_s16(cwi);
1788
1789
for(ptrdiff_t row = 0; row < (ptrdiff_t)dsize.height; ++row)
1790
{
1791
#if USE_CORRECT_VERSION
1792
int32x4_t vrw = vdupq_n_s32(buf2[row]);
1793
#else
1794
int16x8_t vrw = vdupq_n_s16((int16_t)buf2[row]);
1795
int16x8_t vrW = vdupq_n_s16((int16_t)((1 << SHIFT_BITS) - buf2[row]));
1796
#endif
1797
1798
internal::prefetch(internal::getRowPtr(srcBase, srcStride, buf[1*dsize.height + row]) + x, 2*srcStride);
1799
internal::prefetch(internal::getRowPtr(srcBase, srcStride, buf[1*dsize.height + row]) + x, 3*srcStride);
1800
1801
union { uint8x16_t v; uint8x8x2_t w; } vr1 = { vld1q_u8(internal::getRowPtr(srcBase, srcStride, buf[0*dsize.height + row]) + x) };
1802
union { uint8x16_t v; uint8x8x2_t w; } vr2 = { vld1q_u8(internal::getRowPtr(srcBase, srcStride, buf[1*dsize.height + row]) + x) };
1803
1804
uint8x8_t vr1l = vtbl2_u8(vr1.w, vlutl);
1805
uint8x8_t vr1h = vtbl2_u8(vr1.w, vluth);
1806
uint8x8_t vr2l = vtbl2_u8(vr2.w, vlutl);
1807
uint8x8_t vr2h = vtbl2_u8(vr2.w, vluth);
1808
1809
uint16x8_t v1hw = vmovl_u8(vr1h);
1810
uint16x8_t v2hw = vmovl_u8(vr2h);
1811
1812
int16x8_t v1df = vreinterpretq_s16_u16(vsubl_u8(vr1l, vr1h));
1813
int16x8_t v2df = vreinterpretq_s16_u16(vsubl_u8(vr2l, vr2h));
1814
1815
int32x4_t v1L = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(v1hw), SHIFT_BITS));
1816
int32x4_t v1H = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(v1hw), SHIFT_BITS));
1817
int32x4_t v2L = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(v2hw), SHIFT_BITS));
1818
int32x4_t v2H = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(v2hw), SHIFT_BITS));
1819
1820
v1L = vmlal_s16(v1L, vget_low_s16(v1df), vget_low_s16(vcw));
1821
v1H = vmlal_s16(v1H, vget_high_s16(v1df), vget_high_s16(vcw));
1822
v2L = vmlal_s16(v2L, vget_low_s16(v2df), vget_low_s16(vcw));
1823
v2H = vmlal_s16(v2H, vget_high_s16(v2df), vget_high_s16(vcw));
1824
1825
#if USE_CORRECT_VERSION
1826
/* correct version */
1827
int32x4_t vL = vshlq_n_s32(v2L, SHIFT_BITS);
1828
int32x4_t vH = vshlq_n_s32(v2H, SHIFT_BITS);
1829
int32x4_t vdiffL = vsubq_s32(v1L, v2L);
1830
int32x4_t vdiffH = vsubq_s32(v1H, v2H);
1831
1832
vL = vmlaq_s32(vL, vdiffL, vrw);
1833
vH = vmlaq_s32(vH, vdiffH, vrw);
1834
uint16x4_t vL_ = vqrshrun_n_s32(vL, 2*SHIFT_BITS - 8);
1835
uint16x4_t vH_ = vqrshrun_n_s32(vH, 2*SHIFT_BITS - 8);
1836
uint8x8_t vres = vrshrn_n_u16(vcombine_u16(vL_, vH_), 8);
1837
vst1_u8(internal::getRowPtr(dstBase, dstStride, row) + col, vres);
1838
#else
1839
/* ugly version matching to OpenCV's SSE optimization */
1840
int16x4_t v1Ls = vshrn_n_s32(v1L, 4);
1841
int16x4_t v1Hs = vshrn_n_s32(v1H, 4);
1842
int16x4_t v2Ls = vshrn_n_s32(v2L, 4);
1843
int16x4_t v2Hs = vshrn_n_s32(v2H, 4);
1844
1845
int16x8_t v1s = vqdmulhq_s16(vcombine_s16(v1Ls, v1Hs), vrw);
1846
int16x8_t v2s = vqdmulhq_s16(vcombine_s16(v2Ls, v2Hs), vrW);
1847
1848
int16x8_t vsum = vaddq_s16(vshrq_n_s16(v1s,1), vshrq_n_s16(v2s,1));
1849
uint8x8_t vres = vqrshrun_n_s16(vsum, 2);
1850
1851
vst1_u8(internal::getRowPtr(dstBase, dstStride, row) + col, vres);
1852
#endif
1853
}
1854
}
1855
if (col < (ptrdiff_t)dsize.width)
1856
{
1857
col = dsize.width - 8;
1858
goto downsample_bilinear_8uc1_col_loop8;
1859
}
1860
}
1861
1862
} // namespace
1863
1864
#endif
1865
1866
void resizeLinearOpenCV(const Size2D &ssize, const Size2D &dsize,
1867
const u8 * srcBase, ptrdiff_t srcStride,
1868
u8 * dstBase, ptrdiff_t dstStride,
1869
f32 wr, f32 hr, u32 channels)
1870
{
1871
internal::assertSupportedConfiguration(wr > 0 && hr > 0 &&
1872
(dsize.width - 0.5) * wr - 0.5 < ssize.width &&
1873
(dsize.height - 0.5) * hr - 0.5 < ssize.height && // Ensure we have enough source data
1874
(dsize.width + 0.5) * wr + 0.5 >= ssize.width &&
1875
(dsize.height + 0.5) * hr + 0.5 >= ssize.height && // Ensure source isn't too big
1876
isResizeLinearOpenCVSupported(ssize, dsize, channels));
1877
#ifdef CAROTENE_NEON
1878
if(1 == channels)
1879
{
1880
if (wr <= 1.f && hr <= 1.f)
1881
resizeLinearOpenCVchan<1>(ssize, dsize, srcBase, srcStride, dstBase, dstStride, wr, hr);
1882
else if (wr <= 2.0f && hr <= 2.0f && ssize.width >= 16)
1883
downsample_bilinear_8uc1(ssize, dsize, srcBase, srcStride, dstBase, dstStride, wr, hr);
1884
else
1885
resizeLinearOpenCVchan<1>(ssize, dsize, srcBase, srcStride, dstBase, dstStride, wr, hr);
1886
}
1887
else if(4 == channels)
1888
resizeLinearOpenCVchan<4>(ssize, dsize, srcBase, srcStride, dstBase, dstStride, wr, hr);
1889
#else
1890
(void)ssize;
1891
(void)dsize;
1892
(void)srcBase;
1893
(void)srcStride;
1894
(void)dstBase;
1895
(void)dstStride;
1896
(void)wr;
1897
(void)hr;
1898
(void)channels;
1899
#endif
1900
}
1901
1902
// Bilinear resize for u8 images with 1, 3 or 4 interleaved channels (NEON).
//
// Strategy: vertical interpolation coefficients are precomputed once per
// destination row; horizontal interpolation is vectorized 16 (then 8)
// destination pixels at a time by gathering the left/right source samples
// with vtbl2_u8 lookup tables built from per-column offsets, then blending
// in f32 inside resizeLinearStep (defined earlier in this file).
//
//   ssize/dsize       : source / destination sizes
//   srcBase/srcStride : source pixels and row stride in bytes
//   dstBase/dstStride : destination pixels and row stride in bytes
//   wr, hr            : width / height scale ratios (src/dst)
//   channels          : 1, 3 or 4 (other values are rejected by
//                       isResizeLinearSupported below)
void resizeLinear(const Size2D &ssize, const Size2D &dsize,
                  const u8 * srcBase, ptrdiff_t srcStride,
                  u8 * dstBase, ptrdiff_t dstStride,
                  f32 wr, f32 hr, u32 channels)
{
    internal::assertSupportedConfiguration(wr > 0 && hr > 0 &&
                                           (dsize.width - 0.5) * wr - 0.5 < ssize.width &&
                                           (dsize.height - 0.5) * hr - 0.5 < ssize.height &&  // Ensure we have enough source data
                                           (dsize.width + 0.5) * wr + 0.5 >= ssize.width &&
                                           (dsize.height + 0.5) * hr + 0.5 >= ssize.height && // Ensure source isn't too big
                                           isResizeLinearSupported(ssize, dsize,
                                                                   wr, hr, channels));
#ifdef CAROTENE_NEON
    // Half-pixel-center mapping: dst index d maps to source coordinate
    // d * scale + 0.5*scale - 0.5 (same convention OpenCV uses).
    f32 scale_x = wr;
    f32 scale_x_offset = 0.5f * scale_x - 0.5f;
    f32 scale_y = hr;
    f32 scale_y_offset = 0.5f * scale_y - 0.5f;

    // buf holds, per destination row, the two (clamped) source row indices;
    // coeff holds the weight of the upper source row.
    // NOTE(review): only 2*dsize.height entries of _buf are used; the
    // "* 3 + 1" sizing appears to be inherited from a sibling kernel.
    std::vector<ptrdiff_t> _buf(dsize.height * 3 + 1);
    std::vector<f32> coeff(dsize.height);
    ptrdiff_t * buf = &_buf[0];

    for (size_t row = 0; row < dsize.height; ++row)
    {
        f32 r = row * scale_y + scale_y_offset;
        ptrdiff_t src_row = floorf(r);
        ptrdiff_t src_row2 = src_row + 1;

        // rweight is the fraction attributed to src_row (distance to src_row2).
        f32 rweight = src_row2 - r;
        buf[0 * dsize.height + row] = std::max<ptrdiff_t>(0, src_row);
        buf[1 * dsize.height + row] = std::min<ptrdiff_t>(ssize.height - 1, src_row2);
        coeff[row] = rweight;
    }

    // ---- Main loop: 16 destination columns per iteration --------------------
    size_t col = 0;
    for ( ; col + 16 <= dsize.width; col += 16)
    {
        // Per-column left/right source indices and the left-sample weight.
        ptrdiff_t col1[16], col2[16];
        f32 cwi[16];

        for(s32 k = 0; k < 16; ++k)
        {
            f32 c = (col + k) * scale_x + scale_x_offset;
            col1[k] = floorf(c);
            col2[k] = col1[k] + 1;

            cwi[k] = col2[k] - c;

            // Clamp to the valid source range at the borders.
            if (col1[k] < 0)
                col1[k] = 0;
            if (col2[k] >= (ptrdiff_t)ssize.width)
                col2[k] = ssize.width - 1;
        }

        // Base offsets for two 16-byte source loads; clamped so the loads
        // never run past the end of the source row. With wr <= 2 (enforced
        // by isResizeLinearSupported — TODO confirm) all 8 columns of each
        // half fall within the corresponding 16 loaded bytes.
        ptrdiff_t x = std::min<ptrdiff_t>(col1[0], ssize.width - 16);
        ptrdiff_t y = std::min<ptrdiff_t>(col1[8], ssize.width - 16);
        u8 lutl[16], luth[16];

        // vtbl2 lookup tables: byte offsets of the left (lutl) and right
        // (luth) samples relative to the loaded 16-byte window.
        for (s32 k = 0; k < 8; ++k)
        {
            lutl[k] = (u8)(col1[k] - x);
            luth[k] = (u8)(col2[k] - x);
            lutl[k + 8] = (u8)(col1[k + 8] - y);
            luth[k + 8] = (u8)(col2[k + 8] - y);
        }

        // LUTs/weights for the first 8 columns (window at x)...
        uint8x8_t vlutl = vld1_u8(lutl);
        uint8x8_t vluth = vld1_u8(luth);
        float32x4_t vcw0 = vld1q_f32(cwi);
        float32x4_t vcw1 = vld1q_f32(cwi + 4);

        // ...and for the second 8 columns (window at y).
        uint8x8_t vlutl_ = vld1_u8(lutl + 8);
        uint8x8_t vluth_ = vld1_u8(luth + 8);
        float32x4_t vcw0_ = vld1q_f32(cwi + 8);
        float32x4_t vcw1_ = vld1q_f32(cwi + 12);

        if (channels == 1)
        {
            for (size_t row = 0; row < dsize.height; ++row)
            {
                // Vertical weight of the upper source row, broadcast to a vector.
                float32x4_t vrw = vdupq_n_f32(coeff[row]);

                const u8 * srow0 = internal::getRowPtr(srcBase, srcStride, buf[0 * dsize.height + row]);
                const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, buf[1 * dsize.height + row]);
                u8 * drow = internal::getRowPtr(dstBase, dstStride, row);

                internal::prefetch(srow0 + x + 2 * srcStride);
                internal::prefetch(srow1 + x + 2 * srcStride);

                // Two independent 8-pixel blends, combined into one 16-byte store.
                uint8x8_t vres0 = resizeLinearStep(vld1q_u8(srow0 + x), vld1q_u8(srow1 + x),
                                                   vlutl, vluth,
                                                   vrw, vcw0, vcw1);

                uint8x8_t vres1 = resizeLinearStep(vld1q_u8(srow0 + y), vld1q_u8(srow1 + y),
                                                   vlutl_, vluth_,
                                                   vrw, vcw0_, vcw1_);

                vst1q_u8(drow + col, vcombine_u8(vres0, vres1));
            }
        }
        else if (channels == 3)
        {
            for (size_t row = 0; row < dsize.height; ++row)
            {
                float32x4_t vrw = vdupq_n_f32(coeff[row]);

                const u8 * srow0 = internal::getRowPtr(srcBase, srcStride, buf[0 * dsize.height + row]);
                const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, buf[1 * dsize.height + row]);
                u8 * drow = internal::getRowPtr(dstBase, dstStride, row);

                internal::prefetch(srow0 + x + 2 * srcStride);
                internal::prefetch(srow1 + x + 2 * srcStride);

                // De-interleave RGB so each plane can reuse the 1-channel LUTs.
                uint8x16x3_t v_src10 = vld3q_u8(srow0 + (x * 3));
                uint8x16x3_t v_src20 = vld3q_u8(srow1 + (x * 3));

                uint8x16x3_t v_src11 = vld3q_u8(srow0 + (y * 3));
                uint8x16x3_t v_src21 = vld3q_u8(srow1 + (y * 3));

                uint8x16x3_t v_dst;

                v_dst.val[0] = vcombine_u8(resizeLinearStep(v_src10.val[0], v_src20.val[0], vlutl, vluth, vrw, vcw0, vcw1),
                                           resizeLinearStep(v_src11.val[0], v_src21.val[0], vlutl_, vluth_, vrw, vcw0_, vcw1_));
                v_dst.val[1] = vcombine_u8(resizeLinearStep(v_src10.val[1], v_src20.val[1], vlutl, vluth, vrw, vcw0, vcw1),
                                           resizeLinearStep(v_src11.val[1], v_src21.val[1], vlutl_, vluth_, vrw, vcw0_, vcw1_));
                v_dst.val[2] = vcombine_u8(resizeLinearStep(v_src10.val[2], v_src20.val[2], vlutl, vluth, vrw, vcw0, vcw1),
                                           resizeLinearStep(v_src11.val[2], v_src21.val[2], vlutl_, vluth_, vrw, vcw0_, vcw1_));

                vst3q_u8(drow + (col * 3), v_dst);
            }
        }
        else if (channels == 4)
        {
            for (size_t row = 0; row < dsize.height; ++row)
            {
                float32x4_t vrw = vdupq_n_f32(coeff[row]);

                const u8 * srow0 = internal::getRowPtr(srcBase, srcStride, buf[0 * dsize.height + row]);
                const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, buf[1 * dsize.height + row]);
                u8 * drow = internal::getRowPtr(dstBase, dstStride, row);

                internal::prefetch(srow0 + x + 2 * srcStride);
                internal::prefetch(srow1 + x + 2 * srcStride);

                // De-interleave RGBA; x << 2 == x * 4 bytes per pixel.
                uint8x16x4_t v_src10 = vld4q_u8(srow0 + (x << 2));
                uint8x16x4_t v_src20 = vld4q_u8(srow1 + (x << 2));

                uint8x16x4_t v_src11 = vld4q_u8(srow0 + (y << 2));
                uint8x16x4_t v_src21 = vld4q_u8(srow1 + (y << 2));

                uint8x16x4_t v_dst;

                v_dst.val[0] = vcombine_u8(resizeLinearStep(v_src10.val[0], v_src20.val[0], vlutl, vluth, vrw, vcw0, vcw1),
                                           resizeLinearStep(v_src11.val[0], v_src21.val[0], vlutl_, vluth_, vrw, vcw0_, vcw1_));
                v_dst.val[1] = vcombine_u8(resizeLinearStep(v_src10.val[1], v_src20.val[1], vlutl, vluth, vrw, vcw0, vcw1),
                                           resizeLinearStep(v_src11.val[1], v_src21.val[1], vlutl_, vluth_, vrw, vcw0_, vcw1_));
                v_dst.val[2] = vcombine_u8(resizeLinearStep(v_src10.val[2], v_src20.val[2], vlutl, vluth, vrw, vcw0, vcw1),
                                           resizeLinearStep(v_src11.val[2], v_src21.val[2], vlutl_, vluth_, vrw, vcw0_, vcw1_));
                v_dst.val[3] = vcombine_u8(resizeLinearStep(v_src10.val[3], v_src20.val[3], vlutl, vluth, vrw, vcw0, vcw1),
                                           resizeLinearStep(v_src11.val[3], v_src21.val[3], vlutl_, vluth_, vrw, vcw0_, vcw1_));

                vst4q_u8(drow + (col << 2), v_dst);
            }
        }
    }

    // ---- Tail loop: 8 destination columns per iteration ---------------------
    // Same scheme as above but with a single 16-byte source window per row.
    for ( ; col + 8 <= dsize.width; col += 8)
    {
// Re-entered via goto below to process the final (possibly overlapping)
// 8-column group. Label name is shared with downsample_bilinear_8uc1 above;
// labels have function scope, so there is no clash.
downsample_bilinear_8uc1_col_loop8:
        ptrdiff_t col1[8], col2[8];
        f32 cwi[8];

        for (s32 k = 0; k < 8; ++k)
        {
            f32 c = (col + k) * scale_x + scale_x_offset;
            col1[k] = floorf(c);
            col2[k] = col1[k] + 1;

            cwi[k] = col2[k] - c;

            if (col1[k] < 0)
                col1[k] = 0;
            if (col2[k] >= (ptrdiff_t)ssize.width)
                col2[k] = ssize.width - 1;
        }

        // Clamp the 16-byte window so it stays inside the source row.
        ptrdiff_t x = std::min<ptrdiff_t>(col1[0], ssize.width - 16);
        u8 lutl[8], luth[8];
        for (s32 k = 0; k < 8; ++k)
        {
            lutl[k] = (u8)(col1[k] - x);
            luth[k] = (u8)(col2[k] - x);
        }

        uint8x8_t vlutl = vld1_u8(lutl);
        uint8x8_t vluth = vld1_u8(luth);
        float32x4_t vcw0 = vld1q_f32(cwi);
        float32x4_t vcw1 = vld1q_f32(cwi + 4);

        if (channels == 1)
        {
            for (size_t row = 0; row < dsize.height; ++row)
            {
                float32x4_t vrw = vdupq_n_f32(coeff[row]);

                const u8 * srow0 = internal::getRowPtr(srcBase, srcStride, buf[0 * dsize.height + row]);
                const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, buf[1 * dsize.height + row]);
                u8 * drow = internal::getRowPtr(dstBase, dstStride, row);

                internal::prefetch(srow0 + x + 2 * srcStride);
                internal::prefetch(srow1 + x + 2 * srcStride);

                uint8x8_t vres = resizeLinearStep(vld1q_u8(srow0 + x), vld1q_u8(srow1 + x),
                                                  vlutl, vluth,
                                                  vrw, vcw0, vcw1);
                vst1_u8(drow + col, vres);
            }
        }
        else if (channels == 3)
        {
            for (size_t row = 0; row < dsize.height; ++row)
            {
                float32x4_t vrw = vdupq_n_f32(coeff[row]);

                const u8 * srow0 = internal::getRowPtr(srcBase, srcStride, buf[0 * dsize.height + row]);
                const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, buf[1 * dsize.height + row]);
                u8 * drow = internal::getRowPtr(dstBase, dstStride, row);

                internal::prefetch(srow0 + x + 2 * srcStride);
                internal::prefetch(srow1 + x + 2 * srcStride);

                uint8x16x3_t v_src1 = vld3q_u8(srow0 + (x * 3));
                uint8x16x3_t v_src2 = vld3q_u8(srow1 + (x * 3));

                uint8x8x3_t v_dst;

                v_dst.val[0] = resizeLinearStep(v_src1.val[0], v_src2.val[0], vlutl, vluth, vrw, vcw0, vcw1);
                v_dst.val[1] = resizeLinearStep(v_src1.val[1], v_src2.val[1], vlutl, vluth, vrw, vcw0, vcw1);
                v_dst.val[2] = resizeLinearStep(v_src1.val[2], v_src2.val[2], vlutl, vluth, vrw, vcw0, vcw1);

                vst3_u8(drow + (col * 3), v_dst);
            }
        }
        else if (channels == 4)
        {
            for (size_t row = 0; row < dsize.height; ++row)
            {
                float32x4_t vrw = vdupq_n_f32(coeff[row]);

                const u8 * srow0 = internal::getRowPtr(srcBase, srcStride, buf[0 * dsize.height + row]);
                const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, buf[1 * dsize.height + row]);
                u8 * drow = internal::getRowPtr(dstBase, dstStride, row);

                internal::prefetch(srow0 + x + 2 * srcStride);
                internal::prefetch(srow1 + x + 2 * srcStride);

                uint8x16x4_t v_src1 = vld4q_u8(srow0 + (x << 2));
                uint8x16x4_t v_src2 = vld4q_u8(srow1 + (x << 2));

                uint8x8x4_t v_dst;

                v_dst.val[0] = resizeLinearStep(v_src1.val[0], v_src2.val[0], vlutl, vluth, vrw, vcw0, vcw1);
                v_dst.val[1] = resizeLinearStep(v_src1.val[1], v_src2.val[1], vlutl, vluth, vrw, vcw0, vcw1);
                v_dst.val[2] = resizeLinearStep(v_src1.val[2], v_src2.val[2], vlutl, vluth, vrw, vcw0, vcw1);
                v_dst.val[3] = resizeLinearStep(v_src1.val[3], v_src2.val[3], vlutl, vluth, vrw, vcw0, vcw1);

                vst4_u8(drow + (col << 2), v_dst);
            }
        }
    }

    // Handle the remaining (< 8) columns by redoing the last 8-wide group,
    // shifted so it ends exactly at dsize.width; some columns are written
    // twice with identical values.
    // NOTE(review): col = dsize.width - 8 assumes dsize.width >= 8 (size_t
    // would otherwise wrap) — presumably guaranteed by isResizeLinearSupported;
    // TODO confirm.
    if (col < dsize.width)
    {
        col = dsize.width - 8;
        goto downsample_bilinear_8uc1_col_loop8;
    }

#else
    (void)ssize;
    (void)dsize;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)wr;
    (void)hr;
    (void)channels;
#endif
}
2191
2192
} // namespace CAROTENE_NS
2193
2194