Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/3rdparty/carotene/src/resize.cpp
16337 views
1
/*
2
* By downloading, copying, installing or using the software you agree to this license.
3
* If you do not agree to this license, do not download, install,
4
* copy or use the software.
5
*
6
*
7
* License Agreement
8
* For Open Source Computer Vision Library
9
* (3-clause BSD License)
10
*
11
* Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
12
* Third party copyrights are property of their respective owners.
13
*
14
* Redistribution and use in source and binary forms, with or without modification,
15
* are permitted provided that the following conditions are met:
16
*
17
* * Redistributions of source code must retain the above copyright notice,
18
* this list of conditions and the following disclaimer.
19
*
20
* * Redistributions in binary form must reproduce the above copyright notice,
21
* this list of conditions and the following disclaimer in the documentation
22
* and/or other materials provided with the distribution.
23
*
24
* * Neither the names of the copyright holders nor the names of the contributors
25
* may be used to endorse or promote products derived from this software
26
* without specific prior written permission.
27
*
28
* This software is provided by the copyright holders and contributors "as is" and
29
* any express or implied warranties, including, but not limited to, the implied
30
* warranties of merchantability and fitness for a particular purpose are disclaimed.
31
* In no event shall copyright holders or contributors be liable for any direct,
32
* indirect, incidental, special, exemplary, or consequential damages
33
* (including, but not limited to, procurement of substitute goods or services;
34
* loss of use, data, or profits; or business interruption) however caused
35
* and on any theory of liability, whether in contract, strict liability,
36
* or tort (including negligence or otherwise) arising in any way out of
37
* the use of this software, even if advised of the possibility of such damage.
38
*/
39
40
#include "common.hpp"
41
#include "vtransform.hpp"
42
43
#include <cmath>
44
#include <vector>
45
#include <algorithm>
46
47
namespace CAROTENE_NS {
48
49
bool isResizeNearestNeighborSupported(const Size2D &ssize, u32 elemSize)
50
{
51
#if SIZE_MAX <= UINT32_MAX
52
(void)ssize;
53
#endif
54
bool supportedElemSize = (elemSize == 1) || (elemSize == 3) || (elemSize == 4);
55
return isSupportedConfiguration()
56
#if SIZE_MAX > UINT32_MAX
57
&& !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF)// Restrict image size since internally used resizeGeneric performs
58
// index evaluation with u32
59
#endif
60
&& supportedElemSize;
61
}
62
63
bool isResizeAreaSupported(f32 wr, f32 hr, u32 channels)
{
    // The area implementation only handles uniform scaling (hr == wr) by
    // 2x or 4x (downsampling) or 0.5x (upsampling) for 1-, 3- and 4-channel
    // images. The original code duplicated the identical ratio check per
    // channel count (and the 1-channel branch used an inconsistent plain
    // 0.5 double literal; 0.5 is exactly representable in f32, so collapsing
    // to 0.5f does not change behavior).
    bool supportedChannels = (channels == 1) || (channels == 3) || (channels == 4);
    bool supportedRatio = supportedChannels && (hr == wr) &&
                          ((wr == 2.0f) || (wr == 4.0f) || (wr == 0.5f));

    return isSupportedConfiguration() && supportedRatio;
}
76
77
bool isResizeLinearSupported(const Size2D &ssize, const Size2D &dsize,
78
f32 wr, f32 hr, u32 channels)
79
{
80
if ((wr <= 2.0f) && (hr <= 2.0f))
81
{
82
bool channelsSupport = (channels == 1) || (channels == 3) || (channels == 4);
83
return (ssize.width >= 16) && (dsize.height >= 8) &&
84
(dsize.width >= 8) && channelsSupport;
85
}
86
87
return false;
88
}
89
90
bool isResizeLinearOpenCVSupported(const Size2D &ssize, const Size2D &dsize, u32 channels)
91
{
92
switch(channels)
93
{
94
case 1:
95
if (ssize.width >= 8
96
#if SIZE_MAX > UINT32_MAX
97
&& !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF)// Restrict image size since internal index evaluation
98
// is performed with u32
99
#endif
100
&& dsize.width >= 8 && dsize.height >= 8)
101
return isSupportedConfiguration();
102
return false;
103
case 4:
104
if (ssize.width >= 2
105
#if SIZE_MAX > UINT32_MAX
106
&& !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF)// Restrict image size since internal index evaluation
107
// is performed with u32
108
#endif
109
&& dsize.width >= 2 && dsize.height >= 8
110
&& (2*dsize.width != ssize.width || 2*dsize.height != ssize.height)) // 2x downscaling is performed as area in OpenCV which differs from this implementation
111
return isSupportedConfiguration();
112
default:
113
return false;
114
};
115
}
116
117
#ifdef CAROTENE_NEON
118
119
namespace {
120
121
// Fills _ofs with one source-column index per destination column for
// nearest-neighbor resize: ofs[x] = floor((x + 0.5) * ratio).
// Returns a raw pointer into _ofs for convenient indexing by the caller.
// NOTE(review): indices are computed in u32, so results are only valid while
// (size + 0.5) * ratio fits in 32 bits — callers gate this via the
// isResize*Supported checks.
u32 * calcLUT(size_t size, f32 ratio,
              std::vector<u32> & _ofs)
{
    _ofs.resize(size);
    u32 * ofs = &_ofs[0];

    // Vector loop bounds: roiw8/roiw4 guarantee 8 (resp. 4) elements remain.
    size_t roiw8 = size >= 7 ? size - 7 : 0;
    size_t roiw4 = size >= 3 ? size - 3 : 0;
    size_t x = 0;

    f32 indeces[4] = { 0, 1, 2, 3 };
    float32x4_t v_index = vld1q_f32(indeces), v_inc = vdupq_n_f32(4);
    float32x4_t v_05 = vdupq_n_f32(0.5f), v_ratio = vdupq_n_f32(ratio);

    // Two unrolled 4-lane iterations per pass: (index + 0.5) * ratio,
    // then vcvtq_u32_f32 truncates toward zero — equal to floor for the
    // non-negative values produced here.
    for ( ; x < roiw8; x += 8)
    {
        float32x4_t v_dstf = vmulq_f32(vaddq_f32(v_index, v_05), v_ratio);
        vst1q_u32(ofs + x, vcvtq_u32_f32(v_dstf));
        v_index = vaddq_f32(v_index, v_inc);

        v_dstf = vmulq_f32(vaddq_f32(v_index, v_05), v_ratio);
        vst1q_u32(ofs + x + 4, vcvtq_u32_f32(v_dstf));
        v_index = vaddq_f32(v_index, v_inc);
    }

    // Single 4-lane pass for a remaining group of at least 4 elements.
    for ( ; x < roiw4; x += 4)
    {
        float32x4_t v_dstf = vmulq_f32(vaddq_f32(v_index, v_05), v_ratio);
        vst1q_u32(ofs + x, vcvtq_u32_f32(v_dstf));
        v_index = vaddq_f32(v_index, v_inc);
    }

    // Scalar tail: same mapping with an explicit floorf.
    for ( ; x < size; ++x)
    {
        ofs[x] = static_cast<u32>(floorf((x + 0.5f) * ratio));
    }

    return ofs;
}
160
161
template <typename T>
162
void resizeGeneric(const Size2D &dsize,
163
const void * srcBase, ptrdiff_t srcStride,
164
void * dstBase, ptrdiff_t dstStride,
165
f32 wr, f32 hr)
166
{
167
std::vector<u32> _x_ofs;
168
u32 * x_ofs = calcLUT(dsize.width, wr, _x_ofs);//32bit LUT is used so we could get issues on src image dimensions greater than (2^32-1)
169
170
for (size_t dst_y = 0; dst_y < dsize.height; ++dst_y)
171
{
172
size_t src_y = static_cast<size_t>(floorf((dst_y + 0.5f) * hr));
173
const T * src = internal::getRowPtr(static_cast<const T *>(srcBase), srcStride, src_y);
174
T * dst = internal::getRowPtr(static_cast<T *>(dstBase), dstStride, dst_y);
175
176
for (size_t dst_x = 0; dst_x < dsize.width; ++dst_x)
177
{
178
internal::prefetch(src + dst_x);
179
dst[dst_x] = src[x_ofs[dst_x]];
180
}
181
}
182
}
183
184
typedef struct _24bit_
185
{
186
u8 a[3];
187
} _24bit;
188
189
} // namespace
190
191
192
#endif
193
194
void resizeNearestNeighbor(const Size2D &ssize, const Size2D &dsize,
195
const void * srcBase, ptrdiff_t srcStride,
196
void * dstBase, ptrdiff_t dstStride,
197
f32 wr, f32 hr, u32 elemSize)
198
{
199
internal::assertSupportedConfiguration(wr > 0 && hr > 0 &&
200
(dsize.width - 0.5) * wr < ssize.width &&
201
(dsize.height - 0.5) * hr < ssize.height && // Ensure we have enough source data
202
(dsize.width + 0.5) * wr >= ssize.width &&
203
(dsize.height + 0.5) * hr >= ssize.height && // Ensure source isn't too big
204
isResizeNearestNeighborSupported(ssize, elemSize));
205
#ifdef CAROTENE_NEON
206
207
if (elemSize == 1)
208
{
209
resizeGeneric<u8>(dsize,
210
srcBase, srcStride,
211
dstBase, dstStride,
212
wr, hr);
213
}
214
else if (elemSize == 3)
215
{
216
resizeGeneric<_24bit>(dsize,
217
srcBase, srcStride,
218
dstBase, dstStride,
219
wr, hr);
220
}
221
else if (elemSize == 4)
222
{
223
resizeGeneric<u32>(dsize,
224
srcBase, srcStride,
225
dstBase, dstStride,
226
wr, hr);
227
}
228
229
#else
230
(void)dsize;
231
(void)srcBase;
232
(void)srcStride;
233
(void)dstBase;
234
(void)dstStride;
235
(void)wr;
236
(void)hr;
237
#endif
238
}
239
240
#ifdef CAROTENE_NEON
241
// Normalizes packed 16-bit area sums: truncating division by 2^shiftsize,
// narrowed back to 8 bits. This primary template is the non-OpenCV path;
// the <true,2>/<true,4> specializations below add OpenCV-style rounding.
template <bool opencv_like, int shiftsize>
inline uint8x8_t areaDownsamplingDivision(uint16x8_t data)
{
    return vshrn_n_u16(data, shiftsize);
}
246
// OpenCV-compatible 2x2 area normalization: divide the 16-bit sums by 4
// with round-to-nearest (vrshrn adds the rounding bias before shifting).
template <>
inline uint8x8_t areaDownsamplingDivision<true,2>(uint16x8_t data)
{
    // rounding
    return vrshrn_n_u16(data,2);
}
252
// OpenCV-compatible 4x4 area normalization: divide the 16-bit sums by 16
// with round-half-to-even semantics, narrowed to 8 bits.
template <>
inline uint8x8_t areaDownsamplingDivision<true,4>(uint16x8_t data)
{
    // bankers rounding
    // vbicq_u16(vdupq_n_u16(1<<4), data) is 16 exactly when bit 4 of data is
    // clear (i.e. the quotient's LSB would be even); shifted right by 4 it
    // becomes a 0/1 correction subtracted before the rounding shift, so an
    // exact .5 tie rounds toward the even quotient instead of always up.
    return vrshrn_n_u16(vqsubq_u16(data, vshrq_n_u16(vbicq_u16(vdupq_n_u16(1<<4), data), 4)),4);
}
258
259
template <bool opencv_like, int shiftsize>
260
inline u8 areaDownsamplingDivision(u16 data)
261
{
262
return data >> shiftsize;
263
}
264
template <>
265
inline u8 areaDownsamplingDivision<true,2>(u16 data)
266
{
267
// rounding
268
return (data + 2) >> 2;
269
}
270
// Scalar OpenCV-compatible divide-by-16 with round-half-to-even semantics,
// mirroring the NEON <true,4> specialization.
template <>
inline u8 areaDownsamplingDivision<true,4>(u16 data)
{
    // bankers rounding
    // (((1<<4) & ~data) >> 4) is 1 exactly when bit 4 of data is clear (even
    // quotient LSB); subtracting it before the +8 half-divisor bias makes an
    // exact .5 tie round to the even result rather than always upward.
    return (data - (((1<<4) & ~data) >> 4) + 8) >> 4;
}
276
#endif
277
278
template <bool opencv_like>
279
inline void resizeAreaRounding(const Size2D &ssize, const Size2D &dsize,
280
const u8 * srcBase, ptrdiff_t srcStride,
281
u8 * dstBase, ptrdiff_t dstStride,
282
f32 wr, f32 hr, u32 channels)
283
{
284
internal::assertSupportedConfiguration(isResizeAreaSupported(wr, hr, channels) &&
285
std::abs(dsize.width * wr - ssize.width) < 0.1 &&
286
std::abs(dsize.height * hr - ssize.height) < 0.1);
287
#ifdef CAROTENE_NEON
288
if (channels == 1)
289
{
290
if ((wr == 2.0f) && (hr == 2.0f))
291
{
292
size_t roiw8 = dsize.width >= 7 ? dsize.width - 7 : 0;
293
294
for (size_t i = 0; i < dsize.height; ++i)
295
{
296
const u8 * src0_row = internal::getRowPtr(srcBase, srcStride, i << 1);
297
const u8 * src1_row = internal::getRowPtr(srcBase, srcStride, (i << 1) + 1);
298
u8 * dst_row = internal::getRowPtr(dstBase, dstStride, i);
299
size_t sj = 0, dj = 0;
300
301
for ( ; dj < roiw8; dj += 8, sj += 16)
302
{
303
internal::prefetch(src0_row + sj);
304
internal::prefetch(src1_row + sj);
305
306
uint16x8_t vSum1 = vpaddlq_u8(vld1q_u8(src0_row + sj));
307
uint16x8_t vSum2 = vpaddlq_u8(vld1q_u8(src1_row + sj));
308
uint8x8_t vRes1 = areaDownsamplingDivision<opencv_like,2>(vaddq_u16(vSum1, vSum2));
309
310
vst1_u8(dst_row + dj, vRes1);
311
}
312
313
for ( ; dj < dsize.width; ++dj, sj += 2)
314
{
315
dst_row[dj] = areaDownsamplingDivision<opencv_like,2>(
316
(u16)src0_row[sj] + src0_row[sj + 1] +
317
src1_row[sj] + src1_row[sj + 1]);
318
}
319
}
320
}
321
else if ((wr == 0.5f) && (hr == 0.5f))
322
{
323
size_t roiw32 = dsize.width >= 31 ? dsize.width - 31 : 0;
324
size_t roiw16 = dsize.width >= 15 ? dsize.width - 15 : 0;
325
326
for (size_t i = 0; i < dsize.height; i += 2)
327
{
328
const u8 * src_row = internal::getRowPtr(srcBase, srcStride, i >> 1);
329
u8 * dst0_row = internal::getRowPtr(dstBase, dstStride, i);
330
u8 * dst1_row = internal::getRowPtr(dstBase, dstStride, std::min(i + 1, dsize.height - 1));
331
size_t sj = 0, dj = 0;
332
333
for ( ; dj < roiw32; dj += 32, sj += 16)
334
{
335
internal::prefetch(src_row + sj);
336
337
uint8x16x2_t v_dst;
338
v_dst.val[0] = v_dst.val[1] = vld1q_u8(src_row + sj);
339
340
vst2q_u8(dst0_row + dj, v_dst);
341
vst2q_u8(dst1_row + dj, v_dst);
342
}
343
344
for ( ; dj < roiw16; dj += 16, sj += 8)
345
{
346
uint8x8x2_t v_dst;
347
v_dst.val[0] = v_dst.val[1] = vld1_u8(src_row + sj);
348
349
vst2_u8(dst0_row + dj, v_dst);
350
vst2_u8(dst1_row + dj, v_dst);
351
}
352
353
for ( ; dj < dsize.width; dj += 2, ++sj)
354
{
355
u8 src_val = src_row[sj];
356
dst0_row[dj] = dst0_row[dj + 1] = src_val;
357
dst1_row[dj] = dst1_row[dj + 1] = src_val;
358
}
359
}
360
}
361
else //if ((wr == 4.0f) && (hr == 4.0f)) //the only scale that lasts after isSupported check
362
{
363
#ifndef __ANDROID__
364
size_t roiw16 = dsize.width >= 15 ? dsize.width - 15 : 0;
365
#endif
366
size_t roiw8 = dsize.width >= 7 ? dsize.width - 7 : 0;
367
368
for (size_t i = 0; i < dsize.height; ++i)
369
{
370
const u8 * src0_row = internal::getRowPtr(srcBase, srcStride, i << 2);
371
const u8 * src1_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 1);
372
const u8 * src2_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 2);
373
const u8 * src3_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 3);
374
u8 * dst_row = internal::getRowPtr(dstBase, dstStride, i);
375
size_t sj = 0, dj = 0;
376
377
#ifndef __ANDROID__
378
for ( ; dj < roiw16; dj += 16, sj += 64)
379
{
380
internal::prefetch(src0_row + sj);
381
internal::prefetch(src1_row + sj);
382
internal::prefetch(src2_row + sj);
383
internal::prefetch(src3_row + sj);
384
385
uint8x16x4_t vLane1 = vld4q_u8(src0_row + sj);
386
uint8x16x4_t vLane2 = vld4q_u8(src1_row + sj);
387
uint8x16x4_t vLane3 = vld4q_u8(src2_row + sj);
388
uint8x16x4_t vLane4 = vld4q_u8(src3_row + sj);
389
390
uint16x8_t vSum_0 = vaddl_u8(vget_low_u8(vLane1.val[0]), vget_low_u8(vLane1.val[1]));
391
vSum_0 = vaddq_u16(vSum_0, vaddl_u8(vget_low_u8(vLane1.val[2]), vget_low_u8(vLane1.val[3])));
392
vSum_0 = vaddq_u16(vSum_0, vaddl_u8(vget_low_u8(vLane2.val[0]), vget_low_u8(vLane2.val[1])));
393
vSum_0 = vaddq_u16(vSum_0, vaddl_u8(vget_low_u8(vLane2.val[2]), vget_low_u8(vLane2.val[3])));
394
vSum_0 = vaddq_u16(vSum_0, vaddl_u8(vget_low_u8(vLane3.val[0]), vget_low_u8(vLane3.val[1])));
395
vSum_0 = vaddq_u16(vSum_0, vaddl_u8(vget_low_u8(vLane3.val[2]), vget_low_u8(vLane3.val[3])));
396
vSum_0 = vaddq_u16(vSum_0, vaddl_u8(vget_low_u8(vLane4.val[0]), vget_low_u8(vLane4.val[1])));
397
vSum_0 = vaddq_u16(vSum_0, vaddl_u8(vget_low_u8(vLane4.val[2]), vget_low_u8(vLane4.val[3])));
398
399
uint16x8_t vSum_1 = vaddl_u8(vget_high_u8(vLane1.val[0]), vget_high_u8(vLane1.val[1]));
400
vSum_1 = vaddq_u16(vSum_1, vaddl_u8(vget_high_u8(vLane1.val[2]), vget_high_u8(vLane1.val[3])));
401
vSum_1 = vaddq_u16(vSum_1, vaddl_u8(vget_high_u8(vLane2.val[0]), vget_high_u8(vLane2.val[1])));
402
vSum_1 = vaddq_u16(vSum_1, vaddl_u8(vget_high_u8(vLane2.val[2]), vget_high_u8(vLane2.val[3])));
403
vSum_1 = vaddq_u16(vSum_1, vaddl_u8(vget_high_u8(vLane3.val[0]), vget_high_u8(vLane3.val[1])));
404
vSum_1 = vaddq_u16(vSum_1, vaddl_u8(vget_high_u8(vLane3.val[2]), vget_high_u8(vLane3.val[3])));
405
vSum_1 = vaddq_u16(vSum_1, vaddl_u8(vget_high_u8(vLane4.val[0]), vget_high_u8(vLane4.val[1])));
406
vSum_1 = vaddq_u16(vSum_1, vaddl_u8(vget_high_u8(vLane4.val[2]), vget_high_u8(vLane4.val[3])));
407
408
uint8x8_t vRes_0 = areaDownsamplingDivision<opencv_like,4>(vSum_0);
409
uint8x8_t vRes_1 = areaDownsamplingDivision<opencv_like,4>(vSum_1);
410
411
vst1q_u8(dst_row + dj, vcombine_u8(vRes_0, vRes_1));
412
}
413
#endif
414
415
for ( ; dj < roiw8; dj += 8, sj += 32)
416
{
417
internal::prefetch(src0_row + sj);
418
internal::prefetch(src1_row + sj);
419
internal::prefetch(src2_row + sj);
420
internal::prefetch(src3_row + sj);
421
422
uint8x8x4_t vLane1 = vld4_u8(src0_row + sj);
423
uint8x8x4_t vLane2 = vld4_u8(src1_row + sj);
424
uint8x8x4_t vLane3 = vld4_u8(src2_row + sj);
425
uint8x8x4_t vLane4 = vld4_u8(src3_row + sj);
426
427
uint16x8_t vSum = vaddl_u8(vLane1.val[0], vLane1.val[1]);
428
vSum = vaddq_u16(vSum, vaddl_u8(vLane1.val[2], vLane1.val[3]));
429
vSum = vaddq_u16(vSum, vaddl_u8(vLane2.val[0], vLane2.val[1]));
430
vSum = vaddq_u16(vSum, vaddl_u8(vLane2.val[2], vLane2.val[3]));
431
vSum = vaddq_u16(vSum, vaddl_u8(vLane3.val[0], vLane3.val[1]));
432
vSum = vaddq_u16(vSum, vaddl_u8(vLane3.val[2], vLane3.val[3]));
433
vSum = vaddq_u16(vSum, vaddl_u8(vLane4.val[0], vLane4.val[1]));
434
vSum = vaddq_u16(vSum, vaddl_u8(vLane4.val[2], vLane4.val[3]));
435
436
vst1_u8(dst_row + dj, (areaDownsamplingDivision<opencv_like,4>(vSum)));
437
}
438
439
for ( ; dj < dsize.width; ++dj, sj += 4)
440
{
441
dst_row[dj] = areaDownsamplingDivision<opencv_like,4>(
442
(u16)src0_row[sj] + src0_row[sj + 1] + src0_row[sj + 2] + src0_row[sj + 3] +
443
src1_row[sj] + src1_row[sj + 1] + src1_row[sj + 2] + src1_row[sj + 3] +
444
src2_row[sj] + src2_row[sj + 1] + src2_row[sj + 2] + src2_row[sj + 3] +
445
src3_row[sj] + src3_row[sj + 1] + src3_row[sj + 2] + src3_row[sj + 3]);
446
}
447
}
448
}
449
}
450
else if (channels == 4)
451
{
452
if ((wr == 2.0f) && (hr == 2.0f))
453
{
454
#ifndef __ANDROID__
455
size_t roiw4 = dsize.width >= 3 ? (dsize.width - 3) << 2 : 0;
456
#endif
457
size_t roiw2 = dsize.width >= 1 ? (dsize.width - 1) << 2 : 0;
458
459
for (size_t i = 0; i < dsize.height; ++i)
460
{
461
const u8 * src0_row = internal::getRowPtr(srcBase, srcStride, i << 1);
462
const u8 * src1_row = internal::getRowPtr(srcBase, srcStride, (i << 1) + 1);
463
u8 * dst_row = internal::getRowPtr(dstBase, dstStride, i);
464
size_t sj = 0, dj = 0;
465
466
#ifndef __ANDROID__
467
for ( ; dj < roiw4; dj += 16, sj += 32)
468
{
469
internal::prefetch(src0_row + sj);
470
internal::prefetch(src1_row + sj);
471
472
uint8x8_t vRes_0, vRes_1;
473
474
{
475
uint8x16_t vLane1 = vld1q_u8(src0_row + sj);
476
uint8x16_t vLane2 = vld1q_u8(src1_row + sj);
477
478
uint16x8_t vLane_l = vaddl_u8(vget_low_u8(vLane1), vget_low_u8(vLane2));
479
uint16x8_t vLane_h = vaddl_u8(vget_high_u8(vLane1), vget_high_u8(vLane2));
480
481
uint16x4_t vSum_l = vadd_u16(vget_low_u16(vLane_l), vget_high_u16(vLane_l));
482
uint16x4_t vSum_h = vadd_u16(vget_low_u16(vLane_h), vget_high_u16(vLane_h));
483
484
vRes_0 = areaDownsamplingDivision<opencv_like,2>(vcombine_u16(vSum_l, vSum_h));
485
}
486
487
{
488
uint8x16_t vLane1 = vld1q_u8(src0_row + sj + 16);
489
uint8x16_t vLane2 = vld1q_u8(src1_row + sj + 16);
490
491
uint16x8_t vLane_l = vaddl_u8(vget_low_u8(vLane1), vget_low_u8(vLane2));
492
uint16x8_t vLane_h = vaddl_u8(vget_high_u8(vLane1), vget_high_u8(vLane2));
493
494
uint16x4_t vSum_l = vadd_u16(vget_low_u16(vLane_l), vget_high_u16(vLane_l));
495
uint16x4_t vSum_h = vadd_u16(vget_low_u16(vLane_h), vget_high_u16(vLane_h));
496
497
vRes_1 = areaDownsamplingDivision<opencv_like,2>(vcombine_u16(vSum_l, vSum_h));
498
}
499
500
vst1q_u8(dst_row + dj, vcombine_u8(vRes_0, vRes_1));
501
}
502
#endif
503
504
for ( ; dj < roiw2; dj += 8, sj += 16)
505
{
506
internal::prefetch(src0_row + sj);
507
internal::prefetch(src1_row + sj);
508
509
uint8x16_t vLane1 = vld1q_u8(src0_row + sj);
510
uint8x16_t vLane2 = vld1q_u8(src1_row + sj);
511
512
uint16x8_t vLane_l = vaddl_u8(vget_low_u8(vLane1), vget_low_u8(vLane2));
513
uint16x8_t vLane_h = vaddl_u8(vget_high_u8(vLane1), vget_high_u8(vLane2));
514
515
uint16x4_t vSum_l = vadd_u16(vget_low_u16(vLane_l), vget_high_u16(vLane_l));
516
uint16x4_t vSum_h = vadd_u16(vget_low_u16(vLane_h), vget_high_u16(vLane_h));
517
518
uint8x8_t vRes = areaDownsamplingDivision<opencv_like,2>(vcombine_u16(vSum_l, vSum_h));
519
vst1_u8(dst_row + dj, vRes);
520
}
521
522
for (size_t dwidth = dsize.width << 2; dj < dwidth; dj += 4, sj += 8)
523
{
524
dst_row[dj ] = areaDownsamplingDivision<opencv_like,2>(
525
(u16)src0_row[sj ] + src0_row[sj + 4] +
526
src1_row[sj ] + src1_row[sj + 4]);
527
dst_row[dj + 1] = areaDownsamplingDivision<opencv_like,2>(
528
(u16)src0_row[sj + 1] + src0_row[sj + 5] +
529
src1_row[sj + 1] + src1_row[sj + 5]);
530
dst_row[dj + 2] = areaDownsamplingDivision<opencv_like,2>(
531
(u16)src0_row[sj + 2] + src0_row[sj + 6] +
532
src1_row[sj + 2] + src1_row[sj + 6]);
533
dst_row[dj + 3] = areaDownsamplingDivision<opencv_like,2>(
534
(u16)src0_row[sj + 3] + src0_row[sj + 7] +
535
src1_row[sj + 3] + src1_row[sj + 7]);
536
}
537
}
538
}
539
else if ((wr == 0.5f) && (hr == 0.5f))
540
{
541
#ifndef __ANDROID__
542
size_t roiw32 = dsize.width >= 31 ? (dsize.width - 31) << 2 : 0;
543
#endif
544
size_t roiw16 = dsize.width >= 15 ? (dsize.width - 15) << 2 : 0;
545
546
for (size_t i = 0; i < dsize.height; i += 2)
547
{
548
const u8 * src_row = internal::getRowPtr(srcBase, srcStride, i >> 1);
549
u8 * dst0_row = internal::getRowPtr(dstBase, dstStride, i);
550
u8 * dst1_row = internal::getRowPtr(dstBase, dstStride, std::min(i + 1, dsize.height - 1));
551
size_t sj = 0, dj = 0;
552
553
#ifndef __ANDROID__
554
for ( ; dj < roiw32; dj += 128, sj += 64)
555
{
556
internal::prefetch(src_row + sj);
557
558
uint8x16x4_t v_src = vld4q_u8(src_row + sj);
559
uint8x16x2_t v_c0 = vzipq_u8(v_src.val[0], v_src.val[0]);
560
uint8x16x2_t v_c1 = vzipq_u8(v_src.val[1], v_src.val[1]);
561
uint8x16x2_t v_c2 = vzipq_u8(v_src.val[2], v_src.val[2]);
562
uint8x16x2_t v_c3 = vzipq_u8(v_src.val[3], v_src.val[3]);
563
564
uint8x16x4_t v_dst;
565
v_dst.val[0] = v_c0.val[0];
566
v_dst.val[1] = v_c1.val[0];
567
v_dst.val[2] = v_c2.val[0];
568
v_dst.val[3] = v_c3.val[0];
569
vst4q_u8(dst0_row + dj, v_dst);
570
vst4q_u8(dst1_row + dj, v_dst);
571
572
v_dst.val[0] = v_c0.val[1];
573
v_dst.val[1] = v_c1.val[1];
574
v_dst.val[2] = v_c2.val[1];
575
v_dst.val[3] = v_c3.val[1];
576
vst4q_u8(dst0_row + dj + 64, v_dst);
577
vst4q_u8(dst1_row + dj + 64, v_dst);
578
}
579
#endif
580
581
for ( ; dj < roiw16; dj += 64, sj += 32)
582
{
583
internal::prefetch(src_row + sj);
584
585
uint8x8x4_t v_src = vld4_u8(src_row + sj);
586
uint8x8x2_t v_c0 = vzip_u8(v_src.val[0], v_src.val[0]);
587
uint8x8x2_t v_c1 = vzip_u8(v_src.val[1], v_src.val[1]);
588
uint8x8x2_t v_c2 = vzip_u8(v_src.val[2], v_src.val[2]);
589
uint8x8x2_t v_c3 = vzip_u8(v_src.val[3], v_src.val[3]);
590
591
uint8x16x4_t v_dst;
592
v_dst.val[0] = vcombine_u8(v_c0.val[0], v_c0.val[1]);
593
v_dst.val[1] = vcombine_u8(v_c1.val[0], v_c1.val[1]);
594
v_dst.val[2] = vcombine_u8(v_c2.val[0], v_c2.val[1]);
595
v_dst.val[3] = vcombine_u8(v_c3.val[0], v_c3.val[1]);
596
vst4q_u8(dst0_row + dj, v_dst);
597
vst4q_u8(dst1_row + dj, v_dst);
598
}
599
600
for (size_t dwidth = dsize.width << 2; dj < dwidth; dj += 8, sj += 4)
601
{
602
u8 src_val = src_row[sj];
603
dst0_row[dj] = dst0_row[dj + 4] = src_val;
604
dst1_row[dj] = dst1_row[dj + 4] = src_val;
605
606
src_val = src_row[sj + 1];
607
dst0_row[dj + 1] = dst0_row[dj + 5] = src_val;
608
dst1_row[dj + 1] = dst1_row[dj + 5] = src_val;
609
610
src_val = src_row[sj + 2];
611
dst0_row[dj + 2] = dst0_row[dj + 6] = src_val;
612
dst1_row[dj + 2] = dst1_row[dj + 6] = src_val;
613
614
src_val = src_row[sj + 3];
615
dst0_row[dj + 3] = dst0_row[dj + 7] = src_val;
616
dst1_row[dj + 3] = dst1_row[dj + 7] = src_val;
617
}
618
}
619
}
620
else //if ((hr == 4.0f) && (wr == 4.0f)) //the only scale that lasts after isSupported check
621
{
622
size_t roiw4 = dsize.width >= 3 ? (dsize.width - 3) << 2 : 0;
623
size_t roiw2 = dsize.width >= 1 ? (dsize.width - 1) << 2 : 0;
624
625
for (size_t i = 0; i < dsize.height; ++i)
626
{
627
const u8 * src0_row = internal::getRowPtr(srcBase, srcStride, i << 2);
628
const u8 * src1_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 1);
629
const u8 * src2_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 2);
630
const u8 * src3_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 3);
631
u8 * dst_row = internal::getRowPtr(dstBase, dstStride, i);
632
size_t sj = 0, dj = 0;
633
634
for ( ; dj < roiw4; dj += 16, sj += 64)
635
{
636
internal::prefetch(src0_row + sj);
637
internal::prefetch(src1_row + sj);
638
internal::prefetch(src2_row + sj);
639
internal::prefetch(src3_row + sj);
640
641
uint8x16_t vLane10 = vld1q_u8(src0_row + sj), vLane11 = vld1q_u8(src0_row + sj + 16);
642
uint8x16_t vLane20 = vld1q_u8(src1_row + sj), vLane21 = vld1q_u8(src1_row + sj + 16);
643
uint8x16_t vLane30 = vld1q_u8(src2_row + sj), vLane31 = vld1q_u8(src2_row + sj + 16);
644
uint8x16_t vLane40 = vld1q_u8(src3_row + sj), vLane41 = vld1q_u8(src3_row + sj + 16);
645
646
uint16x8_t v_part_0, v_part_1;
647
{
648
uint16x8_t v_sum0 = vaddl_u8(vget_low_u8(vLane10), vget_high_u8(vLane10));
649
v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane20), vget_high_u8(vLane20)));
650
v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane30), vget_high_u8(vLane30)));
651
v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane40), vget_high_u8(vLane40)));
652
653
uint16x8_t v_sum1 = vaddl_u8(vget_low_u8(vLane11), vget_high_u8(vLane11));
654
v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane21), vget_high_u8(vLane21)));
655
v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane31), vget_high_u8(vLane31)));
656
v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane41), vget_high_u8(vLane41)));
657
658
v_part_0 = vcombine_u16(vadd_u16(vget_low_u16(v_sum0), vget_high_u16(v_sum0)),
659
vadd_u16(vget_low_u16(v_sum1), vget_high_u16(v_sum1)));
660
}
661
662
vLane10 = vld1q_u8(src0_row + sj + 32);
663
vLane11 = vld1q_u8(src0_row + sj + 48);
664
vLane20 = vld1q_u8(src1_row + sj + 32);
665
vLane21 = vld1q_u8(src1_row + sj + 48);
666
vLane30 = vld1q_u8(src2_row + sj + 32);
667
vLane31 = vld1q_u8(src2_row + sj + 48);
668
vLane40 = vld1q_u8(src3_row + sj + 32);
669
vLane41 = vld1q_u8(src3_row + sj + 48);
670
671
{
672
uint16x8_t v_sum0 = vaddl_u8(vget_low_u8(vLane10), vget_high_u8(vLane10));
673
v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane20), vget_high_u8(vLane20)));
674
v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane30), vget_high_u8(vLane30)));
675
v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane40), vget_high_u8(vLane40)));
676
677
uint16x8_t v_sum1 = vaddl_u8(vget_low_u8(vLane11), vget_high_u8(vLane11));
678
v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane21), vget_high_u8(vLane21)));
679
v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane31), vget_high_u8(vLane31)));
680
v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane41), vget_high_u8(vLane41)));
681
682
v_part_1 = vcombine_u16(vadd_u16(vget_low_u16(v_sum0), vget_high_u16(v_sum0)),
683
vadd_u16(vget_low_u16(v_sum1), vget_high_u16(v_sum1)));
684
}
685
686
vst1q_u8(dst_row + dj, vcombine_u8(areaDownsamplingDivision<opencv_like,4>(v_part_0),
687
areaDownsamplingDivision<opencv_like,4>(v_part_1)));
688
}
689
690
for ( ; dj < roiw2; dj += 8, sj += 32)
691
{
692
uint8x16_t vLane10 = vld1q_u8(src0_row + sj), vLane11 = vld1q_u8(src0_row + sj + 16);
693
uint8x16_t vLane20 = vld1q_u8(src1_row + sj), vLane21 = vld1q_u8(src1_row + sj + 16);
694
uint8x16_t vLane30 = vld1q_u8(src2_row + sj), vLane31 = vld1q_u8(src2_row + sj + 16);
695
uint8x16_t vLane40 = vld1q_u8(src3_row + sj), vLane41 = vld1q_u8(src3_row + sj + 16);
696
697
uint16x8_t v_sum0 = vaddl_u8(vget_low_u8(vLane10), vget_high_u8(vLane10));
698
v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane20), vget_high_u8(vLane20)));
699
v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane30), vget_high_u8(vLane30)));
700
v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane40), vget_high_u8(vLane40)));
701
702
uint16x8_t v_sum1 = vaddl_u8(vget_low_u8(vLane11), vget_high_u8(vLane11));
703
v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane21), vget_high_u8(vLane21)));
704
v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane31), vget_high_u8(vLane31)));
705
v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane41), vget_high_u8(vLane41)));
706
707
uint16x8_t v_sum = vcombine_u16(vadd_u16(vget_low_u16(v_sum0), vget_high_u16(v_sum0)),
708
vadd_u16(vget_low_u16(v_sum1), vget_high_u16(v_sum1)));
709
710
vst1_u8(dst_row + dj, (areaDownsamplingDivision<opencv_like,4>(v_sum)));
711
}
712
713
for (size_t dwidth = dsize.width << 2; dj < dwidth; dj += 4, sj += 16)
714
{
715
dst_row[dj ] = areaDownsamplingDivision<opencv_like,4>(
716
(u16)src0_row[sj ] + src0_row[sj + 4] +
717
src0_row[sj + 8] + src0_row[sj + 12] +
718
src1_row[sj ] + src1_row[sj + 4] +
719
src1_row[sj + 8] + src1_row[sj + 12] +
720
src2_row[sj ] + src2_row[sj + 4] +
721
src2_row[sj + 8] + src2_row[sj + 12] +
722
src3_row[sj ] + src3_row[sj + 4] +
723
src3_row[sj + 8] + src3_row[sj + 12]);
724
725
dst_row[dj + 1] = areaDownsamplingDivision<opencv_like,4>(
726
(u16)src0_row[sj + 1] + src0_row[sj + 5] +
727
src0_row[sj + 9] + src0_row[sj + 13] +
728
src1_row[sj + 1] + src1_row[sj + 5] +
729
src1_row[sj + 9] + src1_row[sj + 13] +
730
src2_row[sj + 1] + src2_row[sj + 5] +
731
src2_row[sj + 9] + src2_row[sj + 13] +
732
src3_row[sj + 1] + src3_row[sj + 5] +
733
src3_row[sj + 9] + src3_row[sj + 13]);
734
735
dst_row[dj + 2] = areaDownsamplingDivision<opencv_like,4>(
736
(u16)src0_row[sj + 2] + src0_row[sj + 6] +
737
src0_row[sj + 10] + src0_row[sj + 14] +
738
src1_row[sj + 2] + src1_row[sj + 6] +
739
src1_row[sj + 10] + src1_row[sj + 14] +
740
src2_row[sj + 2] + src2_row[sj + 6] +
741
src2_row[sj + 10] + src2_row[sj + 14] +
742
src3_row[sj + 2] + src3_row[sj + 6] +
743
src3_row[sj + 10] + src3_row[sj + 14]);
744
745
dst_row[dj + 3] = areaDownsamplingDivision<opencv_like,4>(
746
(u16)src0_row[sj + 3] + src0_row[sj + 7] +
747
src0_row[sj + 11] + src0_row[sj + 15] +
748
src1_row[sj + 3] + src1_row[sj + 7] +
749
src1_row[sj + 11] + src1_row[sj + 15] +
750
src2_row[sj + 3] + src2_row[sj + 7] +
751
src2_row[sj + 11] + src2_row[sj + 15] +
752
src3_row[sj + 3] + src3_row[sj + 7] +
753
src3_row[sj + 11] + src3_row[sj + 15]);
754
}
755
}
756
}
757
}
758
else if (channels == 3)
759
{
760
if ((wr == 2.0f) && (wr == 2.0f))
761
{
762
#ifndef __ANDROID__
763
size_t roiw16 = dsize.width >= 15 ? (dsize.width - 15) * 3 : 0;
764
#endif
765
size_t roiw8 = dsize.width >= 7 ? (dsize.width - 7) * 3 : 0;
766
767
for (size_t i = 0; i < dsize.height; ++i)
768
{
769
const u8 * src0_row = internal::getRowPtr(srcBase, srcStride, i << 1);
770
const u8 * src1_row = internal::getRowPtr(srcBase, srcStride, (i << 1) + 1);
771
u8 * dst_row = internal::getRowPtr(dstBase, dstStride, i);
772
size_t sj = 0, dj = 0;
773
774
#ifndef __ANDROID__
775
for ( ; dj < roiw16; dj += 48, sj += 96)
776
{
777
internal::prefetch(src0_row + sj);
778
internal::prefetch(src1_row + sj);
779
780
uint8x16x3_t vLane1 = vld3q_u8(src0_row + sj);
781
uint8x16x3_t vLane2 = vld3q_u8(src1_row + sj);
782
783
uint8x8x3_t v_dst0, v_dst1;
784
{
785
uint16x8_t v_el0 = vpaddlq_u8(vLane1.val[0]);
786
uint16x8_t v_el1 = vpaddlq_u8(vLane1.val[1]);
787
uint16x8_t v_el2 = vpaddlq_u8(vLane1.val[2]);
788
v_el0 = vpadalq_u8(v_el0, vLane2.val[0]);
789
v_el1 = vpadalq_u8(v_el1, vLane2.val[1]);
790
v_el2 = vpadalq_u8(v_el2, vLane2.val[2]);
791
792
v_dst0.val[0] = areaDownsamplingDivision<opencv_like,2>(v_el0);
793
v_dst0.val[1] = areaDownsamplingDivision<opencv_like,2>(v_el1);
794
v_dst0.val[2] = areaDownsamplingDivision<opencv_like,2>(v_el2);
795
}
796
797
vLane1 = vld3q_u8(src0_row + sj + 48);
798
vLane2 = vld3q_u8(src1_row + sj + 48);
799
{
800
uint16x8_t v_el0 = vpaddlq_u8(vLane1.val[0]);
801
uint16x8_t v_el1 = vpaddlq_u8(vLane1.val[1]);
802
uint16x8_t v_el2 = vpaddlq_u8(vLane1.val[2]);
803
v_el0 = vpadalq_u8(v_el0, vLane2.val[0]);
804
v_el1 = vpadalq_u8(v_el1, vLane2.val[1]);
805
v_el2 = vpadalq_u8(v_el2, vLane2.val[2]);
806
807
v_dst1.val[0] = areaDownsamplingDivision<opencv_like,2>(v_el0);
808
v_dst1.val[1] = areaDownsamplingDivision<opencv_like,2>(v_el1);
809
v_dst1.val[2] = areaDownsamplingDivision<opencv_like,2>(v_el2);
810
}
811
812
uint8x16x3_t v_dst;
813
v_dst.val[0] = vcombine_u8(v_dst0.val[0], v_dst1.val[0]);
814
v_dst.val[1] = vcombine_u8(v_dst0.val[1], v_dst1.val[1]);
815
v_dst.val[2] = vcombine_u8(v_dst0.val[2], v_dst1.val[2]);
816
817
vst3q_u8(dst_row + dj, v_dst);
818
}
819
#endif
820
821
for ( ; dj < roiw8; dj += 24, sj += 48)
822
{
823
internal::prefetch(src0_row + sj);
824
internal::prefetch(src1_row + sj);
825
826
uint8x16x3_t vLane1 = vld3q_u8(src0_row + sj);
827
uint8x16x3_t vLane2 = vld3q_u8(src1_row + sj);
828
829
uint16x8_t v_el0 = vpaddlq_u8(vLane1.val[0]);
830
uint16x8_t v_el1 = vpaddlq_u8(vLane1.val[1]);
831
uint16x8_t v_el2 = vpaddlq_u8(vLane1.val[2]);
832
v_el0 = vpadalq_u8(v_el0, vLane2.val[0]);
833
v_el1 = vpadalq_u8(v_el1, vLane2.val[1]);
834
v_el2 = vpadalq_u8(v_el2, vLane2.val[2]);
835
836
uint8x8x3_t v_dst;
837
v_dst.val[0] = areaDownsamplingDivision<opencv_like,2>(v_el0);
838
v_dst.val[1] = areaDownsamplingDivision<opencv_like,2>(v_el1);
839
v_dst.val[2] = areaDownsamplingDivision<opencv_like,2>(v_el2);
840
841
vst3_u8(dst_row + dj, v_dst);
842
}
843
844
for (size_t dwidth = dsize.width * 3; dj < dwidth; dj += 3, sj += 6)
845
{
846
dst_row[dj ] = areaDownsamplingDivision<opencv_like,2>(
847
(u16)src0_row[sj ] + src0_row[sj + 3] +
848
src1_row[sj ] + src1_row[sj + 3]);
849
dst_row[dj + 1] = areaDownsamplingDivision<opencv_like,2>(
850
(u16)src0_row[sj + 1] + src0_row[sj + 4] +
851
src1_row[sj + 1] + src1_row[sj + 4]);
852
dst_row[dj + 2] = areaDownsamplingDivision<opencv_like,2>(
853
(u16)src0_row[sj + 2] + src0_row[sj + 5] +
854
src1_row[sj + 2] + src1_row[sj + 5]);
855
}
856
}
857
}
858
else if ((wr == 0.5f) && (hr == 0.5f))
859
{
860
#ifndef __ANDROID__
861
size_t roiw32 = dsize.width >= 31 ? (dsize.width - 31) * 3 : 0;
862
#endif
863
size_t roiw16 = dsize.width >= 15 ? (dsize.width - 15) * 3 : 0;
864
865
for (size_t i = 0; i < dsize.height; i += 2)
866
{
867
const u8 * src_row = internal::getRowPtr(srcBase, srcStride, i >> 1);
868
u8 * dst0_row = internal::getRowPtr(dstBase, dstStride, i);
869
u8 * dst1_row = internal::getRowPtr(dstBase, dstStride, std::min(i + 1, dsize.height - 1));
870
size_t sj = 0, dj = 0;
871
872
#ifndef __ANDROID__
873
for ( ; dj < roiw32; dj += 96, sj += 48)
874
{
875
internal::prefetch(src_row + sj);
876
877
uint8x16x3_t v_src = vld3q_u8(src_row + sj);
878
uint8x16x2_t v_c0 = vzipq_u8(v_src.val[0], v_src.val[0]);
879
uint8x16x2_t v_c1 = vzipq_u8(v_src.val[1], v_src.val[1]);
880
uint8x16x2_t v_c2 = vzipq_u8(v_src.val[2], v_src.val[2]);
881
882
uint8x16x3_t v_dst;
883
v_dst.val[0] = v_c0.val[0];
884
v_dst.val[1] = v_c1.val[0];
885
v_dst.val[2] = v_c2.val[0];
886
vst3q_u8(dst0_row + dj, v_dst);
887
vst3q_u8(dst1_row + dj, v_dst);
888
889
v_dst.val[0] = v_c0.val[1];
890
v_dst.val[1] = v_c1.val[1];
891
v_dst.val[2] = v_c2.val[1];
892
vst3q_u8(dst0_row + dj + 48, v_dst);
893
vst3q_u8(dst1_row + dj + 48, v_dst);
894
}
895
#endif
896
897
for ( ; dj < roiw16; dj += 48, sj += 24)
898
{
899
internal::prefetch(src_row + sj);
900
901
uint8x8x3_t v_src = vld3_u8(src_row + sj);
902
uint8x8x2_t v_c0 = vzip_u8(v_src.val[0], v_src.val[0]);
903
uint8x8x2_t v_c1 = vzip_u8(v_src.val[1], v_src.val[1]);
904
uint8x8x2_t v_c2 = vzip_u8(v_src.val[2], v_src.val[2]);
905
906
uint8x16x3_t v_dst;
907
v_dst.val[0] = vcombine_u8(v_c0.val[0], v_c0.val[1]);
908
v_dst.val[1] = vcombine_u8(v_c1.val[0], v_c1.val[1]);
909
v_dst.val[2] = vcombine_u8(v_c2.val[0], v_c2.val[1]);
910
vst3q_u8(dst0_row + dj, v_dst);
911
vst3q_u8(dst1_row + dj, v_dst);
912
}
913
914
for (size_t dwidth = dsize.width * 3; dj < dwidth; dj += 6, sj += 3)
915
{
916
u8 src_val = src_row[sj];
917
dst0_row[dj] = dst0_row[dj + 3] = src_val;
918
dst1_row[dj] = dst1_row[dj + 3] = src_val;
919
920
src_val = src_row[sj + 1];
921
dst0_row[dj + 1] = dst0_row[dj + 4] = src_val;
922
dst1_row[dj + 1] = dst1_row[dj + 4] = src_val;
923
924
src_val = src_row[sj + 2];
925
dst0_row[dj + 2] = dst0_row[dj + 5] = src_val;
926
dst1_row[dj + 2] = dst1_row[dj + 5] = src_val;
927
}
928
}
929
}
930
else //if ((hr == 4.0f) && (wr == 4.0f)) //the only scale that lasts after isSupported check
931
{
932
#ifndef __ANDROID__
933
size_t roiw8 = dsize.width >= 7 ? (dsize.width - 7) * 3 : 0;
934
#endif
935
936
for (size_t i = 0; i < dsize.height; ++i)
937
{
938
const u8 * src0_row = internal::getRowPtr(srcBase, srcStride, i << 2);
939
const u8 * src1_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 1);
940
const u8 * src2_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 2);
941
const u8 * src3_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 3);
942
u8 * dst_row = internal::getRowPtr(dstBase, dstStride, i);
943
size_t sj = 0, dj = 0;
944
945
#ifndef __ANDROID__
946
for ( ; dj < roiw8; dj += 24, sj += 96)
947
{
948
internal::prefetch(src0_row + sj);
949
internal::prefetch(src1_row + sj);
950
internal::prefetch(src2_row + sj);
951
internal::prefetch(src3_row + sj);
952
953
uint8x16x3_t vLane10 = vld3q_u8(src0_row + sj), vLane11 = vld3q_u8(src0_row + sj + 48);
954
uint8x16x3_t vLane20 = vld3q_u8(src1_row + sj), vLane21 = vld3q_u8(src1_row + sj + 48);
955
uint8x16x3_t vLane30 = vld3q_u8(src2_row + sj), vLane31 = vld3q_u8(src2_row + sj + 48);
956
uint8x16x3_t vLane40 = vld3q_u8(src3_row + sj), vLane41 = vld3q_u8(src3_row + sj + 48);
957
958
uint8x8x3_t v_dst;
959
960
// channel 0
961
{
962
uint16x8_t v_lane0 = vpaddlq_u8(vLane10.val[0]);
963
uint16x8_t v_lane1 = vpaddlq_u8(vLane20.val[0]);
964
uint16x8_t v_lane2 = vpaddlq_u8(vLane30.val[0]);
965
uint16x8_t v_lane3 = vpaddlq_u8(vLane40.val[0]);
966
v_lane0 = vaddq_u16(v_lane0, v_lane1);
967
v_lane0 = vaddq_u16(v_lane0, v_lane2);
968
v_lane0 = vaddq_u16(v_lane0, v_lane3);
969
970
uint16x8_t v_lane0_ = vpaddlq_u8(vLane11.val[0]);
971
uint16x8_t v_lane1_ = vpaddlq_u8(vLane21.val[0]);
972
uint16x8_t v_lane2_ = vpaddlq_u8(vLane31.val[0]);
973
uint16x8_t v_lane3_ = vpaddlq_u8(vLane41.val[0]);
974
v_lane0_ = vaddq_u16(v_lane0_, v_lane1_);
975
v_lane0_ = vaddq_u16(v_lane0_, v_lane2_);
976
v_lane0_ = vaddq_u16(v_lane0_, v_lane3_);
977
978
v_dst.val[0] = areaDownsamplingDivision<opencv_like,4>(
979
vcombine_u16(vmovn_u32(vpaddlq_u16(v_lane0)),
980
vmovn_u32(vpaddlq_u16(v_lane0_))));
981
}
982
983
// channel 1
984
{
985
uint16x8_t v_lane0 = vpaddlq_u8(vLane10.val[1]);
986
uint16x8_t v_lane1 = vpaddlq_u8(vLane20.val[1]);
987
uint16x8_t v_lane2 = vpaddlq_u8(vLane30.val[1]);
988
uint16x8_t v_lane3 = vpaddlq_u8(vLane40.val[1]);
989
v_lane0 = vaddq_u16(v_lane0, v_lane1);
990
v_lane0 = vaddq_u16(v_lane0, v_lane2);
991
v_lane0 = vaddq_u16(v_lane0, v_lane3);
992
993
uint16x8_t v_lane0_ = vpaddlq_u8(vLane11.val[1]);
994
uint16x8_t v_lane1_ = vpaddlq_u8(vLane21.val[1]);
995
uint16x8_t v_lane2_ = vpaddlq_u8(vLane31.val[1]);
996
uint16x8_t v_lane3_ = vpaddlq_u8(vLane41.val[1]);
997
v_lane0_ = vaddq_u16(v_lane0_, v_lane1_);
998
v_lane0_ = vaddq_u16(v_lane0_, v_lane2_);
999
v_lane0_ = vaddq_u16(v_lane0_, v_lane3_);
1000
1001
v_dst.val[1] = areaDownsamplingDivision<opencv_like,4>(
1002
vcombine_u16(vmovn_u32(vpaddlq_u16(v_lane0)),
1003
vmovn_u32(vpaddlq_u16(v_lane0_))));
1004
}
1005
1006
// channel 2
1007
{
1008
uint16x8_t v_lane0 = vpaddlq_u8(vLane10.val[2]);
1009
uint16x8_t v_lane1 = vpaddlq_u8(vLane20.val[2]);
1010
uint16x8_t v_lane2 = vpaddlq_u8(vLane30.val[2]);
1011
uint16x8_t v_lane3 = vpaddlq_u8(vLane40.val[2]);
1012
v_lane0 = vaddq_u16(v_lane0, v_lane1);
1013
v_lane0 = vaddq_u16(v_lane0, v_lane2);
1014
v_lane0 = vaddq_u16(v_lane0, v_lane3);
1015
1016
uint16x8_t v_lane0_ = vpaddlq_u8(vLane11.val[2]);
1017
uint16x8_t v_lane1_ = vpaddlq_u8(vLane21.val[2]);
1018
uint16x8_t v_lane2_ = vpaddlq_u8(vLane31.val[2]);
1019
uint16x8_t v_lane3_ = vpaddlq_u8(vLane41.val[2]);
1020
v_lane0_ = vaddq_u16(v_lane0_, v_lane1_);
1021
v_lane0_ = vaddq_u16(v_lane0_, v_lane2_);
1022
v_lane0_ = vaddq_u16(v_lane0_, v_lane3_);
1023
1024
v_dst.val[2] = areaDownsamplingDivision<opencv_like,4>(
1025
vcombine_u16(vmovn_u32(vpaddlq_u16(v_lane0)),
1026
vmovn_u32(vpaddlq_u16(v_lane0_))));
1027
}
1028
1029
vst3_u8(dst_row + dj, v_dst);
1030
}
1031
#endif
1032
1033
for (size_t dwidth = dsize.width * 3; dj < dwidth; dj += 3, sj += 12)
1034
{
1035
dst_row[dj ] = areaDownsamplingDivision<opencv_like,4>(
1036
(u16)src0_row[sj ] + src0_row[sj + 3] +
1037
src0_row[sj + 6] + src0_row[sj + 9] +
1038
src1_row[sj ] + src1_row[sj + 3] +
1039
src1_row[sj + 6] + src1_row[sj + 9] +
1040
src2_row[sj ] + src2_row[sj + 3] +
1041
src2_row[sj + 6] + src2_row[sj + 9] +
1042
src3_row[sj ] + src3_row[sj + 3] +
1043
src3_row[sj + 6] + src3_row[sj + 9]);
1044
1045
dst_row[dj + 1] = areaDownsamplingDivision<opencv_like,4>(
1046
(u16)src0_row[sj + 1] + src0_row[sj + 4] +
1047
src0_row[sj + 7] + src0_row[sj + 10] +
1048
src1_row[sj + 1] + src1_row[sj + 4] +
1049
src1_row[sj + 7] + src1_row[sj + 10] +
1050
src2_row[sj + 1] + src2_row[sj + 4] +
1051
src2_row[sj + 7] + src2_row[sj + 10] +
1052
src3_row[sj + 1] + src3_row[sj + 4] +
1053
src3_row[sj + 7] + src3_row[sj + 10]);
1054
1055
dst_row[dj + 2] = areaDownsamplingDivision<opencv_like,4>(
1056
(u16)src0_row[sj + 2] + src0_row[sj + 5] +
1057
src0_row[sj + 8] + src0_row[sj + 11] +
1058
src1_row[sj + 2] + src1_row[sj + 5] +
1059
src1_row[sj + 8] + src1_row[sj + 11] +
1060
src2_row[sj + 2] + src2_row[sj + 5] +
1061
src2_row[sj + 8] + src2_row[sj + 11] +
1062
src3_row[sj + 2] + src3_row[sj + 5] +
1063
src3_row[sj + 8] + src3_row[sj + 11]);
1064
}
1065
}
1066
}
1067
}
1068
#else
1069
(void)dsize;
1070
(void)srcBase;
1071
(void)srcStride;
1072
(void)dstBase;
1073
(void)dstStride;
1074
(void)wr;
1075
(void)hr;
1076
#endif
1077
(void)ssize;
1078
}
1079
1080
/*
 * Area-averaging resize that matches OpenCV's rounding behaviour.
 * Thin wrapper: dispatches to resizeAreaRounding with the
 * opencv_like template flag set to true.
 */
void resizeAreaOpenCV(const Size2D &ssize, const Size2D &dsize,
                      const u8 * srcBase, ptrdiff_t srcStride,
                      u8 * dstBase, ptrdiff_t dstStride,
                      f32 wr, f32 hr, u32 channels)
{
    resizeAreaRounding<true>(ssize, dsize, srcBase, srcStride, dstBase, dstStride, wr, hr, channels);
}
1087
1088
/*
 * Area-averaging resize with carotene's native rounding.
 * Thin wrapper: dispatches to resizeAreaRounding with the
 * opencv_like template flag set to false.
 */
void resizeArea(const Size2D &ssize, const Size2D &dsize,
                const u8 * srcBase, ptrdiff_t srcStride,
                u8 * dstBase, ptrdiff_t dstStride,
                f32 wr, f32 hr, u32 channels)
{
    resizeAreaRounding<false>(ssize, dsize, srcBase, srcStride, dstBase, dstStride, wr, hr, channels);
}
1095
1096
#ifdef CAROTENE_NEON
1097
1098
namespace {
1099
1100
/*
 * One bilinear interpolation step producing 8 destination pixels.
 *
 * vr1, vr2      : 16 source pixels each, from two adjacent source rows.
 * vlutl, vluth  : table-lookup indexes selecting, for each of the 8
 *                 output lanes, the low/high horizontal neighbour.
 * vrw           : row (vertical) weight, replicated per lane.
 * vcw0, vcw1    : column (horizontal) weights for the low and high
 *                 4-lane halves.
 *
 * Per row it computes  h + (l - h) * cw  in f32 (since the difference
 * l - h is formed first, only one multiply-accumulate per half is
 * needed), then blends the two rows as  r2 + (r1 - r2) * rw,  and
 * finally narrows f32 -> u32 -> u16 -> u8.
 */
uint8x8_t resizeLinearStep(uint8x16_t vr1, uint8x16_t vr2,
                           uint8x8_t vlutl, uint8x8_t vluth,
                           float32x4_t vrw, float32x4_t vcw0, float32x4_t vcw1)
{
    // Gather the left/right horizontal neighbours for both rows.
    uint8x8_t vr1l = internal::vqtbl1_u8(vr1, vlutl);
    uint8x8_t vr1h = internal::vqtbl1_u8(vr1, vluth);
    uint8x8_t vr2l = internal::vqtbl1_u8(vr2, vlutl);
    uint8x8_t vr2h = internal::vqtbl1_u8(vr2, vluth);

    // Widen the "high" neighbours; they are the interpolation base.
    uint16x8_t v1hw = vmovl_u8(vr1h);
    uint16x8_t v2hw = vmovl_u8(vr2h);

    // Signed horizontal differences (l - h) for each row.
    int16x8_t v1df = vreinterpretq_s16_u16(vsubl_u8(vr1l, vr1h));
    int16x8_t v2df = vreinterpretq_s16_u16(vsubl_u8(vr2l, vr2h));

    // Promote the bases to f32, split into low/high 4-lane halves.
    float32x4_t v1L = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v1hw)));
    float32x4_t v1H = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v1hw)));
    float32x4_t v2L = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v2hw)));
    float32x4_t v2H = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v2hw)));

    // Horizontal blend: base + diff * column_weight.
    v1L = vmlaq_f32(v1L, vcvtq_f32_s32(vmovl_s16(vget_low_s16(v1df))), vcw0);
    v1H = vmlaq_f32(v1H, vcvtq_f32_s32(vmovl_s16(vget_high_s16(v1df))), vcw1);
    v2L = vmlaq_f32(v2L, vcvtq_f32_s32(vmovl_s16(vget_low_s16(v2df))), vcw0);
    v2H = vmlaq_f32(v2H, vcvtq_f32_s32(vmovl_s16(vget_high_s16(v2df))), vcw1);

    // Vertical blend: row2 + (row1 - row2) * row_weight.
    float32x4_t vdiffL = vsubq_f32(v1L, v2L);
    float32x4_t vdiffH = vsubq_f32(v1H, v2H);

    float32x4_t vL = vmlaq_f32(v2L, vdiffL, vrw);
    float32x4_t vH = vmlaq_f32(v2H, vdiffH, vrw);
    // Narrow back down to 8-bit pixels (f32 -> u32 -> u16 -> u8).
    uint16x4_t vL_ = vmovn_u32(vcvtq_u32_f32(vL));
    uint16x4_t vH_ = vmovn_u32(vcvtq_u32_f32(vH));
    return vmovn_u16(vcombine_u16(vL_, vH_));
}
1134
1135
} // namespace
1136
1137
namespace {
1138
1139
/*
 * Bilinear u8 resize core. Works on the destination in 8-row strips:
 *
 *  1. Vertical pass: for each strip, pick the two bracketing source
 *     rows per destination row and blend them with 7-bit fixed-point
 *     weights -- (l1 * w + l2 * (128 - w) + 64) / 128 -- then transpose
 *     8x8 tiles into the scratch buffer 'buf' so columns become
 *     contiguous.
 *  2. Horizontal pass: blend pairs of transposed columns (pointers
 *     precomputed in 'gcols', weights in 'gcweight') with the same
 *     fixed-point scheme, transpose back, and store 8 output rows.
 *
 * Tails that are not multiples of 8 are handled by re-running the last
 * full 8-wide/8-high chunk via the goto labels; this overlaps already
 * computed pixels, which is harmless since results are identical.
 *
 * Assumes ssize.width >= 8 and dsize >= 8x8 so the tail backtracking
 * (width - 8 / height - 8) stays in range -- TODO confirm against the
 * callers' isSupported checks.
 */
void resize_bilinear_rows(const Size2D &ssize, const Size2D &dsize,
                          const u8 * srcBase, ptrdiff_t srcStride,
                          u8 * dstBase, ptrdiff_t dstStride,
                          f32 hr, const u8** gcols, u8* gcweight, u8* buf)
{
    f32 scale_y_offset = 0.5f * hr - 0.5f;  // center-aligned sampling offset

    // Sizes rounded down to multiples of 8 for the vectorized main loops.
    size_t dst_h8 = dsize.height & ~7;
    size_t dst_w8 = dsize.width & ~7;
    size_t src_w8 = ssize.width & ~7;

    size_t r = 0;
    for (; r < dst_h8; r += 8)
    {
resize8u_xystretch:
        // For each of the 8 destination rows: pointers to the two
        // bracketing source rows (rows[2i], rows[2i+1]) and the 7-bit
        // weight of the upper row.
        const u8* rows[16];
        u8 rweight[8];

        for (u32 i = 0; i < 8; ++i)
        {
            f32 w = (i + r) * hr + scale_y_offset;
            ptrdiff_t src_row = floorf(w);
            ptrdiff_t src_row2 = src_row + 1;

            rweight[i] = (u8)((src_row2-w) * 128);

            // Clamp to the valid source row range at the image borders.
            if (src_row < 0)
                src_row = 0;
            if (src_row2 >= (ptrdiff_t)ssize.height)
                src_row2 = ssize.height-1;

            rows[2 * i] = srcBase + src_row * srcStride;
            rows[2 * i + 1] = srcBase + src_row2 * srcStride;
        }

        // Broadcast the row weights (w) and their complements (128 - w).
        uint8x8_t vr0w = vdup_n_u8(rweight[0]);
        uint8x8_t vr1w = vdup_n_u8(rweight[1]);
        uint8x8_t vr2w = vdup_n_u8(rweight[2]);
        uint8x8_t vr3w = vdup_n_u8(rweight[3]);
        uint8x8_t vr4w = vdup_n_u8(rweight[4]);
        uint8x8_t vr5w = vdup_n_u8(rweight[5]);
        uint8x8_t vr6w = vdup_n_u8(rweight[6]);
        uint8x8_t vr7w = vdup_n_u8(rweight[7]);

        uint8x8_t vr0w2 = vdup_n_u8(128 - rweight[0]);
        uint8x8_t vr1w2 = vdup_n_u8(128 - rweight[1]);
        uint8x8_t vr2w2 = vdup_n_u8(128 - rweight[2]);
        uint8x8_t vr3w2 = vdup_n_u8(128 - rweight[3]);
        uint8x8_t vr4w2 = vdup_n_u8(128 - rweight[4]);
        uint8x8_t vr5w2 = vdup_n_u8(128 - rweight[5]);
        uint8x8_t vr6w2 = vdup_n_u8(128 - rweight[6]);
        uint8x8_t vr7w2 = vdup_n_u8(128 - rweight[7]);

        // ---- Vertical pass over 8-pixel column chunks ----
        size_t col = 0;
        for(; col < src_w8; col += 8)
        {
            internal::prefetch(rows[3] + col);
            internal::prefetch(rows[7] + col);
            internal::prefetch(rows[11] + col);
            internal::prefetch(rows[15] + col);
resize8u_ystretch:
            // NOTE: loads and multiply-accumulates below are deliberately
            // interleaved across the 8 rows for instruction-level
            // parallelism; do not reorder casually.
            uint8x8_t vsrc0l1 = vld1_u8(rows[0] + col);
            uint8x8_t vsrc0l2 = vld1_u8(rows[1] + col);
            uint8x8_t vsrc1l1 = vld1_u8(rows[2] + col);
            uint8x8_t vsrc1l2 = vld1_u8(rows[3] + col);

            // (l1 * w + l2 * (128 - w) + 64) / 128
            uint16x8_t vdst0l = vmull_u8(vsrc0l1, vr0w);
            uint16x8_t vdst1l = vmull_u8(vsrc1l1, vr1w);

            uint8x8_t vsrc2l1 = vld1_u8(rows[4] + col);
            uint8x8_t vsrc2l2 = vld1_u8(rows[5] + col);
            uint8x8_t vsrc3l1 = vld1_u8(rows[6] + col);
            uint8x8_t vsrc3l2 = vld1_u8(rows[7] + col);

            vdst0l = vmlal_u8(vdst0l, vsrc0l2, vr0w2);
            vdst1l = vmlal_u8(vdst1l, vsrc1l2, vr1w2);
            uint16x8_t vdst2l = vmull_u8(vsrc2l1, vr2w);
            uint16x8_t vdst3l = vmull_u8(vsrc3l1, vr3w);

            uint8x8_t vsrc4l1 = vld1_u8(rows[8] + col);
            uint8x8_t vsrc4l2 = vld1_u8(rows[9] + col);
            uint8x8_t vsrc5l1 = vld1_u8(rows[10] + col);
            uint8x8_t vsrc5l2 = vld1_u8(rows[11] + col);

            vdst2l = vmlal_u8(vdst2l, vsrc2l2, vr2w2);
            vdst3l = vmlal_u8(vdst3l, vsrc3l2, vr3w2);
            uint16x8_t vdst4l = vmull_u8(vsrc4l1, vr4w);
            uint16x8_t vdst5l = vmull_u8(vsrc5l1, vr5w);

            uint8x8_t vsrc6l1 = vld1_u8(rows[12] + col);
            uint8x8_t vsrc6l2 = vld1_u8(rows[13] + col);
            uint8x8_t vsrc7l1 = vld1_u8(rows[14] + col);
            uint8x8_t vsrc7l2 = vld1_u8(rows[15] + col);

            // vrshrn_n_u16(x, 7) performs the rounding "+64, /128" step.
            uint8x8_t vdst0 = vrshrn_n_u16(vdst0l, 7);
            uint8x8_t vdst1 = vrshrn_n_u16(vdst1l, 7);
            vdst4l = vmlal_u8(vdst4l, vsrc4l2, vr4w2);
            vdst5l = vmlal_u8(vdst5l, vsrc5l2, vr5w2);
            uint16x8_t vdst6l = vmull_u8(vsrc6l1, vr6w);
            uint16x8_t vdst7l = vmull_u8(vsrc7l1, vr7w);

            uint8x8_t vdst2 = vrshrn_n_u16(vdst2l, 7);
            uint8x8_t vdst3 = vrshrn_n_u16(vdst3l, 7);
            vdst6l = vmlal_u8(vdst6l, vsrc6l2, vr6w2);
            vdst7l = vmlal_u8(vdst7l, vsrc7l2, vr7w2);

            uint8x8_t vdst4 = vrshrn_n_u16(vdst4l, 7);
            uint8x8_t vdst5 = vrshrn_n_u16(vdst5l, 7);
            uint8x8_t vdst6 = vrshrn_n_u16(vdst6l, 7);
            uint8x8_t vdst7 = vrshrn_n_u16(vdst7l, 7);

            // == 8x8 matrix transpose ==
            // Three trn stages (8-bit, 16-bit, 32-bit) turn 8 row
            // vectors into 8 column vectors.

            //00 01 02 03 04 05 06 07  d0
            //10 11 12 13 14 15 16 17  d1
            //20 21 22 23 24 25 26 27  d2
            //30 31 32 33 34 35 36 37  d3
            //40 41 42 43 44 45 46 47  d4
            //50 51 52 53 54 55 56 57  d5
            //60 61 62 63 64 65 66 67  d6
            //70 71 72 73 74 75 76 77  d7

            uint8x8x2_t vdst10t = vtrn_u8(vdst0, vdst1);
            uint8x8x2_t vdst32t = vtrn_u8(vdst2, vdst3);
            uint8x8x2_t vdst54t = vtrn_u8(vdst4, vdst5);
            uint8x8x2_t vdst76t = vtrn_u8(vdst6, vdst7);

            uint8x16_t vd1d0 = vcombine_u8(vdst10t.val[0], vdst10t.val[1]);
            uint8x16_t vd3d2 = vcombine_u8(vdst32t.val[0], vdst32t.val[1]);
            uint8x16_t vd5d4 = vcombine_u8(vdst54t.val[0], vdst54t.val[1]);
            uint8x16_t vd7d6 = vcombine_u8(vdst76t.val[0], vdst76t.val[1]);

            //00 10 02 12 04 14 06 16  d0
            //01 11 03 13 05 15 07 17  d1
            //20 30 22 32 24 34 26 36  d2
            //21 31 23 33 25 35 27 37  d3
            //40 50 42 52 44 54 46 56  d4
            //41 51 43 53 45 55 47 57  d5
            //60 70 62 72 64 74 66 76  d6
            //61 71 63 73 65 75 67 77  d7

            uint16x8x2_t vq1q0t = vtrnq_u16((uint16x8_t)vd1d0, (uint16x8_t)vd3d2);
            uint16x8x2_t vq3q2t = vtrnq_u16((uint16x8_t)vd5d4, (uint16x8_t)vd7d6);

            //00 10 20 30 04 14 24 34  d0
            //01 11 21 31 05 15 25 35  d1
            //02 12 22 32 06 16 26 36  d2
            //03 13 23 33 07 17 27 37  d3
            //40 50 60 70 44 54 64 74  d4
            //41 51 61 71 45 55 65 75  d5
            //42 52 62 72 46 56 66 76  d6
            //43 53 63 73 47 57 67 77  d7

            uint32x4x2_t vq2q0t = vtrnq_u32((uint32x4_t)vq1q0t.val[0], (uint32x4_t)vq3q2t.val[0]);
            uint32x4x2_t vq3q1t = vtrnq_u32((uint32x4_t)vq1q0t.val[1], (uint32x4_t)vq3q2t.val[1]);

            //00 10 20 30 40 50 60 70  d0
            //01 11 21 31 41 51 61 71  d1
            //02 12 22 32 42 52 62 72  d2
            //03 13 23 33 43 53 63 73  d3
            //04 14 24 34 44 54 64 74  d4
            //05 15 25 35 45 55 65 75  d5
            //06 16 26 36 46 56 66 76  d6
            //07 17 27 37 47 57 67 77  d7

            // Each source column now occupies 8 consecutive bytes in buf.
            vst1q_u8(buf + col * 8 + 0, (uint8x16_t)vq2q0t.val[0]);
            vst1q_u8(buf + col * 8 + 16, (uint8x16_t)vq3q1t.val[0]);
            vst1q_u8(buf + col * 8 + 32, (uint8x16_t)vq2q0t.val[1]);
            vst1q_u8(buf + col * 8 + 48, (uint8x16_t)vq3q1t.val[1]);
        }

        // Width tail: redo the last full 8-column chunk (overlap is fine).
        if (col < ssize.width)
        {
            col = ssize.width - 8;
            goto resize8u_ystretch;
        }

        // ---- Horizontal pass: blend transposed columns from buf ----
        u8* dst_data = dstBase + r * dstStride;
        const u8** cols = gcols;
        u8* cweight = gcweight;

        size_t dcol = 0;
        for (; dcol < dst_w8; dcol += 8, cols += 16, cweight += 8)
        {
            internal::prefetch(cols[0], 64*4);
resize8u_xstretch:
            // Column weights (w) and complements (128 - w).
            uint8x8_t vc0w = vdup_n_u8(cweight[0]);
            uint8x8_t vc1w = vdup_n_u8(cweight[1]);
            uint8x8_t vc2w = vdup_n_u8(cweight[2]);
            uint8x8_t vc3w = vdup_n_u8(cweight[3]);
            uint8x8_t vc4w = vdup_n_u8(cweight[4]);
            uint8x8_t vc5w = vdup_n_u8(cweight[5]);
            uint8x8_t vc6w = vdup_n_u8(cweight[6]);
            uint8x8_t vc7w = vdup_n_u8(cweight[7]);

            uint8x8_t vc0w2 = vdup_n_u8(128 - cweight[0]);
            uint8x8_t vc1w2 = vdup_n_u8(128 - cweight[1]);
            uint8x8_t vc2w2 = vdup_n_u8(128 - cweight[2]);
            uint8x8_t vc3w2 = vdup_n_u8(128 - cweight[3]);
            uint8x8_t vc4w2 = vdup_n_u8(128 - cweight[4]);
            uint8x8_t vc5w2 = vdup_n_u8(128 - cweight[5]);
            uint8x8_t vc6w2 = vdup_n_u8(128 - cweight[6]);
            uint8x8_t vc7w2 = vdup_n_u8(128 - cweight[7]);

            // cols[2k] / cols[2k+1] point at the left/right transposed
            // source columns for destination column k.
            uint8x8_t vsrc0l1 = vld1_u8(cols[0]);
            uint8x8_t vsrc0l2 = vld1_u8(cols[1]);
            uint8x8_t vsrc1l1 = vld1_u8(cols[2]);
            uint8x8_t vsrc1l2 = vld1_u8(cols[3]);
            uint8x8_t vsrc2l1 = vld1_u8(cols[4]);
            uint8x8_t vsrc2l2 = vld1_u8(cols[5]);
            uint8x8_t vsrc3l1 = vld1_u8(cols[6]);
            uint8x8_t vsrc3l2 = vld1_u8(cols[7]);
            uint8x8_t vsrc4l1 = vld1_u8(cols[8]);
            uint8x8_t vsrc4l2 = vld1_u8(cols[9]);
            uint8x8_t vsrc5l1 = vld1_u8(cols[10]);
            uint8x8_t vsrc5l2 = vld1_u8(cols[11]);
            uint8x8_t vsrc6l1 = vld1_u8(cols[12]);
            uint8x8_t vsrc6l2 = vld1_u8(cols[13]);
            uint8x8_t vsrc7l1 = vld1_u8(cols[14]);
            uint8x8_t vsrc7l2 = vld1_u8(cols[15]);

            // (l1 * w + l2 * (128 - w) + 64) / 128
            uint16x8_t vdst0l = vmull_u8(vsrc0l1, vc0w);
            uint16x8_t vdst1l = vmull_u8(vsrc1l1, vc1w);
            uint16x8_t vdst2l = vmull_u8(vsrc2l1, vc2w);
            uint16x8_t vdst3l = vmull_u8(vsrc3l1, vc3w);
            uint16x8_t vdst4l = vmull_u8(vsrc4l1, vc4w);
            uint16x8_t vdst5l = vmull_u8(vsrc5l1, vc5w);
            uint16x8_t vdst6l = vmull_u8(vsrc6l1, vc6w);
            uint16x8_t vdst7l = vmull_u8(vsrc7l1, vc7w);

            vdst0l = vmlal_u8(vdst0l, vsrc0l2, vc0w2);
            vdst1l = vmlal_u8(vdst1l, vsrc1l2, vc1w2);
            vdst2l = vmlal_u8(vdst2l, vsrc2l2, vc2w2);
            vdst3l = vmlal_u8(vdst3l, vsrc3l2, vc3w2);
            vdst4l = vmlal_u8(vdst4l, vsrc4l2, vc4w2);
            vdst5l = vmlal_u8(vdst5l, vsrc5l2, vc5w2);
            vdst6l = vmlal_u8(vdst6l, vsrc6l2, vc6w2);
            vdst7l = vmlal_u8(vdst7l, vsrc7l2, vc7w2);

            uint8x8_t vdst0 = vrshrn_n_u16(vdst0l, 7);
            uint8x8_t vdst1 = vrshrn_n_u16(vdst1l, 7);
            uint8x8_t vdst2 = vrshrn_n_u16(vdst2l, 7);
            uint8x8_t vdst3 = vrshrn_n_u16(vdst3l, 7);
            uint8x8_t vdst4 = vrshrn_n_u16(vdst4l, 7);
            uint8x8_t vdst5 = vrshrn_n_u16(vdst5l, 7);
            uint8x8_t vdst6 = vrshrn_n_u16(vdst6l, 7);
            uint8x8_t vdst7 = vrshrn_n_u16(vdst7l, 7);

            // == 8x8 matrix transpose == (back to row-major for the store)
            uint8x8x2_t vdst10t = vtrn_u8(vdst0, vdst1);
            uint8x8x2_t vdst32t = vtrn_u8(vdst2, vdst3);
            uint8x8x2_t vdst54t = vtrn_u8(vdst4, vdst5);
            uint8x8x2_t vdst76t = vtrn_u8(vdst6, vdst7);
            uint8x16_t vd1d0 = vcombine_u8(vdst10t.val[0], vdst10t.val[1]);
            uint8x16_t vd3d2 = vcombine_u8(vdst32t.val[0], vdst32t.val[1]);
            uint8x16_t vd5d4 = vcombine_u8(vdst54t.val[0], vdst54t.val[1]);
            uint8x16_t vd7d6 = vcombine_u8(vdst76t.val[0], vdst76t.val[1]);
            uint16x8x2_t vq1q0t = vtrnq_u16((uint16x8_t)vd1d0, (uint16x8_t)vd3d2);
            uint16x8x2_t vq3q2t = vtrnq_u16((uint16x8_t)vd5d4, (uint16x8_t)vd7d6);
            uint32x4x2_t vq2q0t = vtrnq_u32((uint32x4_t)vq1q0t.val[0], (uint32x4_t)vq3q2t.val[0]);
            uint32x4x2_t vq3q1t = vtrnq_u32((uint32x4_t)vq1q0t.val[1], (uint32x4_t)vq3q2t.val[1]);

            //save results
            vst1_u8(dst_data + 0 * dstStride + dcol, (uint8x8_t)vget_low_u32(vq2q0t.val[0]));
            vst1_u8(dst_data + 1 * dstStride + dcol, (uint8x8_t)vget_high_u32(vq2q0t.val[0]));
            vst1_u8(dst_data + 2 * dstStride + dcol, (uint8x8_t)vget_low_u32(vq3q1t.val[0]));
            vst1_u8(dst_data + 3 * dstStride + dcol, (uint8x8_t)vget_high_u32(vq3q1t.val[0]));
            vst1_u8(dst_data + 4 * dstStride + dcol, (uint8x8_t)vget_low_u32(vq2q0t.val[1]));
            vst1_u8(dst_data + 5 * dstStride + dcol, (uint8x8_t)vget_high_u32(vq2q0t.val[1]));
            vst1_u8(dst_data + 6 * dstStride + dcol, (uint8x8_t)vget_low_u32(vq3q1t.val[1]));
            vst1_u8(dst_data + 7 * dstStride + dcol, (uint8x8_t)vget_high_u32(vq3q1t.val[1]));
        }

        // Destination width tail: redo the last full 8-column chunk.
        if (dcol < dsize.width)
        {
            dcol = dsize.width - 8;
            cols = gcols + dcol * 2;
            cweight = gcweight + dcol;
            goto resize8u_xstretch;
        }
    }

    // Destination height tail: redo the last full 8-row strip.
    if (r < dsize.height)
    {
        r = dsize.height - 8;
        goto resize8u_xystretch;
    }
}
1429
1430
// Helper bundling the NEON constants used by resizeLinearOpenCVchan to
// compute per-destination-column source indexes; specialized by channel count.
template <int channels> struct resizeLinearInternals;
// Single-channel specialization: the running column index vi advances
// 4 per iteration (one int32x4 lane per destination column), and
// clamped indexes are converted to byte offsets into the transposed
// scratch buffer (each source column occupies 8 bytes there).
template <> struct resizeLinearInternals<1>
{
    int32x4_t vc_upd;  // per-call index increment (4 columns)
    int32x4_t vc0;     // zero: lower clamp bound
    int32x4_t vcmax;   // srccols - 1: upper clamp bound

    // Initializes vi to the first four column indexes {0,1,2,3}.
    inline resizeLinearInternals(int32x4_t & vi, u32 srccols)
    {
        vc_upd = vdupq_n_s32(4);
        vc0 = vdupq_n_s32(0);
        vcmax = vdupq_n_s32(srccols-1);

        s32 tmp0123[] = {0, 1, 2, 3 };
        vi = vld1q_s32(tmp0123);
    }
    // Clamps the high/low neighbour indexes into [0, srccols-1], scales
    // them to byte offsets (<< 3 == * 8 bytes per column), and steps vi.
    inline void updateIndexes(int32x4_t & vi, int32x4_t & vsrch, int32x4_t & vsrcl)
    {
        vsrch = vminq_s32(vsrch, vcmax);
        vsrcl = vmaxq_s32(vsrcl, vc0);
        vsrcl = vminq_s32(vsrcl, vcmax);//for safe tail
        vsrch = vshlq_n_s32(vsrch, 3);
        vsrcl = vshlq_n_s32(vsrcl, 3);
        vi = vaddq_s32(vi, vc_upd);
    }
};
1456
// Four-channel specialization: vi counts destination pixels one at a
// time (all four lanes share the pixel index); converted offsets are
// scaled by 32 bytes per pixel (<< 5) with per-lane offsets {0,8,16,24}
// so each lane addresses a different channel slot in the scratch buffer.
template <> struct resizeLinearInternals<4>
{
    int32x4_t vc_upd;   // per-call pixel-index increment (1)
    int32x4_t vc0;      // zero: lower clamp bound
    int32x4_t vcmax;    // srccols - 1: upper clamp bound
    int32x4_t v0123x8;  // per-lane channel offsets {0, 8, 16, 24} bytes

    // Initializes vi to pixel index 0 in all lanes.
    inline resizeLinearInternals(int32x4_t & vi, u32 srccols)
    {
        vc_upd = vdupq_n_s32(1);
        vc0 = vdupq_n_s32(0);
        vcmax = vdupq_n_s32(srccols-1);
        s32 tmp0123x8[] = {0, 8, 16, 24};
        v0123x8 = vld1q_s32(tmp0123x8);

        vi = vc0;
    }
    // Clamps neighbour indexes, scales to byte offsets (<< 5 == * 32),
    // adds the channel-lane offsets, and steps vi to the next pixel.
    // NOTE(review): unlike the 1-channel case there is no upper clamp on
    // vsrcl here -- presumably guaranteed by the caller; confirm.
    inline void updateIndexes(int32x4_t & vi, int32x4_t & vsrch, int32x4_t & vsrcl)
    {
        vsrch = vminq_s32(vsrch, vcmax);
        vsrcl = vmaxq_s32(vsrcl, vc0);
        vsrch = vshlq_n_s32(vsrch, 5);
        vsrcl = vshlq_n_s32(vsrcl, 5);
        vsrch = vaddq_s32(vsrch, v0123x8);
        vsrcl = vaddq_s32(vsrcl, v0123x8);
        vi = vaddq_s32(vi, vc_upd);
    }
};
1484
1485
/*
 * OpenCV-compatible bilinear resize driver for u8 images with a
 * compile-time channel count.
 *
 * Precomputes, for every destination column, the pair of source-column
 * pointers (into the transposed scratch buffer) and the 7-bit blend
 * weight, processing 8 destination columns per loop iteration (two
 * int32x4 groups of 4), then delegates the actual row/column blending
 * to resize_bilinear_rows.
 *
 * Indexes are kept in 32-bit lanes, so source dimensions beyond 2^31-1
 * columns are not supported (see the original note below).
 */
template <int channels>
void resizeLinearOpenCVchan(const Size2D &_ssize, const Size2D &_dsize,
                            const u8 * srcBase, ptrdiff_t srcStride,
                            u8 * dstBase, ptrdiff_t dstStride,
                            f32 wr, f32 hr)
{
    float scale_x_offset = 0.5f * wr - 0.5f;  // center-aligned sampling offset

    // Work in "element" coordinates: width scaled by channel count.
    Size2D ssize(_ssize.width*channels, _ssize.height);
    Size2D dsize(_dsize.width*channels, _dsize.height);

    // All buffers are rounded up to multiples of 8 for the vector loops.
    std::vector<u8> gcweight((dsize.width + 7) & ~7);
    std::vector<const u8*> gcols(((dsize.width + 7) & ~7) * 2);
    std::vector<u8> buf(((ssize.width + 7) & ~7) * 8); // (8 rows) x (width of src)

    float32x4_t vscale_x = vdupq_n_f32(wr);
    float32x4_t vscale_x_offset = vdupq_n_f32(scale_x_offset);
    int32x4_t vc1 = vdupq_n_s32(1);
    float32x4_t vc128f = vdupq_n_f32(128.0f);

    int32x4_t vi;
    resizeLinearInternals<channels> indexes(vi, _ssize.width);//u32 is used to store indexes
    //so we could get issues on src image dimensions greater than (2^32-1)

    for (size_t dcol = 0; dcol < dsize.width; dcol += 8)
    {
        // idx[0..7]  = low-neighbour buffer offsets,
        // idx[8..15] = high-neighbour buffer offsets.
        s32 idx[16];

        // First group of 4 columns: compute w = dcol*wr + offset, split
        // into ceil (vsrch) / ceil-1 (vsrcl) neighbour indexes; vmask is
        // -1 where the truncated value fell below w (i.e. w was not an
        // exact integer), which the subtraction turns into +1.
        float32x4_t vif = vcvtq_f32_s32(vi);
        float32x4_t vw = vmlaq_f32(vscale_x_offset, vscale_x, vif);
        int32x4_t vwi = vcvtq_s32_f32(vw);
        float32x4_t vwif = vcvtq_f32_s32(vwi);
        int32x4_t vmask = (int32x4_t)vcltq_f32(vwif, vw);
        int32x4_t vsrch = vsubq_s32(vwi, vmask);
        int32x4_t vsrcl = vsubq_s32(vsrch, vc1);
        float32x4_t vsrchf = vcvtq_f32_s32(vsrch);
        float32x4_t vw2 = vsubq_f32(vsrchf, vw);

        // Fixed-point weight: (high_index - w) * 128.
        vw2 = vmulq_f32(vw2, vc128f);
        uint32x4_t vw32u = vcvtq_u32_f32(vw2);
        uint16x4_t vw16ul = vmovn_u32(vw32u);
        indexes.updateIndexes(vi, vsrch, vsrcl);

        vst1q_s32(idx + 0, vsrcl);
        vst1q_s32(idx + 8, vsrch);

        // Second group of 4 columns: identical computation on the
        // advanced index vector.
        vif = vcvtq_f32_s32(vi);
        vw = vmlaq_f32(vscale_x_offset, vscale_x, vif);
        vwi = vcvtq_s32_f32(vw);
        vwif = vcvtq_f32_s32(vwi);
        vmask = (int32x4_t)vcltq_f32(vwif, vw);
        vsrch = vsubq_s32(vwi, vmask);
        vsrcl = vsubq_s32(vsrch, vc1);
        vsrchf = vcvtq_f32_s32(vsrch);
        vw2 = vsubq_f32(vsrchf, vw);

        vw2 = vmulq_f32(vw2, vc128f);
        vw32u = vcvtq_u32_f32(vw2);
        indexes.updateIndexes(vi, vsrch, vsrcl);

        uint16x4_t vw16uh = vmovn_u32(vw32u);

        vst1q_s32(idx + 4, vsrcl);
        vst1q_s32(idx + 12, vsrch);

        // Narrow the 8 weights to u8.
        uint8x8_t vw8u = vmovn_u16(vcombine_u16(vw16ul, vw16uh));

        // Translate buffer offsets into column pointer pairs
        // (low neighbour, high neighbour) for resize_bilinear_rows.
        for (u32 i = 0; i < 8; ++i)
        {
            gcols[dcol * 2 + i*2] = &buf[idx[i]];
            gcols[dcol * 2 + i*2 + 1] = &buf[idx[i + 8]];
        }

        vst1_u8(&gcweight[dcol], vw8u);
    }

    resize_bilinear_rows(ssize, dsize, srcBase, srcStride, dstBase, dstStride, hr, &gcols[0], &gcweight[0], &buf[0]);
}
1563
1564
void downsample_bilinear_8uc1(const Size2D &ssize, const Size2D &dsize,
1565
const u8 * srcBase, ptrdiff_t srcStride,
1566
u8 * dstBase, ptrdiff_t dstStride,
1567
f32 wr, f32 hr)
1568
{
1569
internal::assertSupportedConfiguration(wr <= 2.f && hr <= 2.f);
1570
1571
enum { SHIFT_BITS = 11 };
1572
1573
f32 scale_x_offset = 0.5f * wr - 0.5f;
1574
f32 scale_y_offset = 0.5f * hr - 0.5f;
1575
1576
std::vector<s32> _buf(dsize.height*(2*(sizeof(ptrdiff_t)/sizeof(s32))+1)+1);
1577
ptrdiff_t* buf = (ptrdiff_t*)&_buf[0];
1578
s32* buf2 = (s32*)buf+2*(sizeof(ptrdiff_t)/sizeof(s32))*dsize.height;
1579
for(size_t row = 0; row < (size_t)dsize.height; ++row)
1580
{
1581
f32 r = row * hr + scale_y_offset;
1582
ptrdiff_t src_row = floorf(r);
1583
ptrdiff_t src_row2 = src_row + 1;
1584
1585
f32 rweight = src_row2 - r;
1586
buf2[row] = floorf(rweight * (1 << SHIFT_BITS) + 0.5f);
1587
buf[0 * dsize.height + row] = std::max<ptrdiff_t>(0, src_row);
1588
buf[1 * dsize.height + row] = std::min((ptrdiff_t)ssize.height-1, src_row2);
1589
}
1590
1591
#define USE_CORRECT_VERSION 0
1592
1593
ptrdiff_t col = 0;
1594
/***********************************************/
1595
for(; col <= (ptrdiff_t)dsize.width-16; col+=16)
1596
{
1597
ptrdiff_t col1[16];
1598
ptrdiff_t col2[16];
1599
s16 cwi[16];
1600
1601
for(s32 k = 0; k < 16; ++k)
1602
{
1603
f32 c = (col + k) * wr + scale_x_offset;
1604
col1[k] = (ptrdiff_t)c;
1605
col2[k] = col1[k] + 1;
1606
1607
cwi[k] = (short)floorf((col2[k] - c) * (1 << SHIFT_BITS) + 0.5f);
1608
1609
if(col1[k] < 0) col1[k] = 0;
1610
if(col2[k] >= (ptrdiff_t)ssize.width) col2[k] = ssize.width-1;
1611
}
1612
1613
ptrdiff_t x = std::min(col1[0], (ptrdiff_t)ssize.width-16);
1614
ptrdiff_t y = std::min(col1[8], (ptrdiff_t)ssize.width-16);
1615
u8 lutl[16];
1616
u8 luth[16];
1617
for(s32 k = 0; k < 8; ++k)
1618
{
1619
lutl[k] = (u8)(col1[k] - x);
1620
luth[k] = (u8)(col2[k] - x);
1621
lutl[k+8] = (u8)(col1[k+8] - y);
1622
luth[k+8] = (u8)(col2[k+8] - y);
1623
}
1624
1625
uint8x8_t vlutl = vld1_u8(lutl);
1626
uint8x8_t vluth = vld1_u8(luth);
1627
int16x8_t vcw = vld1q_s16(cwi);
1628
1629
uint8x8_t vlutl_ = vld1_u8(lutl+8);
1630
uint8x8_t vluth_ = vld1_u8(luth+8);
1631
int16x8_t vcw_ = vld1q_s16(cwi+8);
1632
1633
for(ptrdiff_t row = 0; row < (ptrdiff_t)dsize.height; ++row)
1634
{
1635
#if USE_CORRECT_VERSION
1636
int32x4_t vrw = vdupq_n_s32(buf2[row]);
1637
#else
1638
int16x8_t vrw = vdupq_n_s16((int16_t)buf2[row]);
1639
int16x8_t vrW = vdupq_n_s16((int16_t)((1 << SHIFT_BITS) - buf2[row]));
1640
#endif
1641
1642
internal::prefetch(internal::getRowPtr(srcBase, srcStride, buf[1*dsize.height + row]) + x, 2*srcStride);
1643
internal::prefetch(internal::getRowPtr(srcBase, srcStride, buf[1*dsize.height + row]) + x, 3*srcStride);
1644
1645
{
1646
union { uint8x16_t v; uint8x8x2_t w; } vr1 = { vld1q_u8(internal::getRowPtr(srcBase, srcStride, buf[0*dsize.height + row]) + x) };
1647
union { uint8x16_t v; uint8x8x2_t w; } vr2 = { vld1q_u8(internal::getRowPtr(srcBase, srcStride, buf[1*dsize.height + row]) + x) };
1648
1649
uint8x8_t vr1l = vtbl2_u8(vr1.w, vlutl);
1650
uint8x8_t vr1h = vtbl2_u8(vr1.w, vluth);
1651
uint8x8_t vr2l = vtbl2_u8(vr2.w, vlutl);
1652
uint8x8_t vr2h = vtbl2_u8(vr2.w, vluth);
1653
1654
uint16x8_t v1hw = vmovl_u8(vr1h);
1655
uint16x8_t v2hw = vmovl_u8(vr2h);
1656
1657
int16x8_t v1df = vreinterpretq_s16_u16(vsubl_u8(vr1l, vr1h));
1658
int16x8_t v2df = vreinterpretq_s16_u16(vsubl_u8(vr2l, vr2h));
1659
1660
int32x4_t v1L = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(v1hw), SHIFT_BITS));
1661
int32x4_t v1H = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(v1hw), SHIFT_BITS));
1662
int32x4_t v2L = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(v2hw), SHIFT_BITS));
1663
int32x4_t v2H = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(v2hw), SHIFT_BITS));
1664
1665
v1L = vmlal_s16(v1L, vget_low_s16(v1df), vget_low_s16(vcw));
1666
v1H = vmlal_s16(v1H, vget_high_s16(v1df), vget_high_s16(vcw));
1667
v2L = vmlal_s16(v2L, vget_low_s16(v2df), vget_low_s16(vcw));
1668
v2H = vmlal_s16(v2H, vget_high_s16(v2df), vget_high_s16(vcw));
1669
1670
#if USE_CORRECT_VERSION
1671
/* correct version */
1672
int32x4_t vL = vshlq_n_s32(v2L, SHIFT_BITS);
1673
int32x4_t vH = vshlq_n_s32(v2H, SHIFT_BITS);
1674
int32x4_t vdiffL = vsubq_s32(v1L, v2L);
1675
int32x4_t vdiffH = vsubq_s32(v1H, v2H);
1676
1677
vL = vmlaq_s32(vL, vdiffL, vrw);
1678
vH = vmlaq_s32(vH, vdiffH, vrw);
1679
uint16x4_t vL_ = vqrshrun_n_s32(vL, 2*SHIFT_BITS - 8);
1680
uint16x4_t vH_ = vqrshrun_n_s32(vH, 2*SHIFT_BITS - 8);
1681
uint8x8_t vres = vrshrn_n_u16(vcombine_u16(vL_, vH_), 8);
1682
vst1_u8(internal::getRowPtr(dstBase, dstStride, row) + col, vres);
1683
#else
1684
/* ugly version matching to OpenCV's SSE optimization */
1685
int16x4_t v1Ls = vshrn_n_s32(v1L, 4);
1686
int16x4_t v1Hs = vshrn_n_s32(v1H, 4);
1687
int16x4_t v2Ls = vshrn_n_s32(v2L, 4);
1688
int16x4_t v2Hs = vshrn_n_s32(v2H, 4);
1689
1690
int16x8_t v1s = vqdmulhq_s16(vcombine_s16(v1Ls, v1Hs), vrw);
1691
int16x8_t v2s = vqdmulhq_s16(vcombine_s16(v2Ls, v2Hs), vrW);
1692
1693
int16x8_t vsum = vaddq_s16(vshrq_n_s16(v1s,1), vshrq_n_s16(v2s,1));
1694
uint8x8_t vres = vqrshrun_n_s16(vsum, 2);
1695
1696
vst1_u8(internal::getRowPtr(dstBase, dstStride, row) + col, vres);
1697
#endif
1698
}
1699
1700
{
1701
union { uint8x16_t v; uint8x8x2_t w; } vr1 = { vld1q_u8(internal::getRowPtr(srcBase, srcStride, buf[0*dsize.height + row]) + y) };
1702
union { uint8x16_t v; uint8x8x2_t w; } vr2 = { vld1q_u8(internal::getRowPtr(srcBase, srcStride, buf[1*dsize.height + row]) + y) };
1703
1704
uint8x8_t vr1l = vtbl2_u8(vr1.w, vlutl_);
1705
uint8x8_t vr1h = vtbl2_u8(vr1.w, vluth_);
1706
uint8x8_t vr2l = vtbl2_u8(vr2.w, vlutl_);
1707
uint8x8_t vr2h = vtbl2_u8(vr2.w, vluth_);
1708
1709
uint16x8_t v1hw = vmovl_u8(vr1h);
1710
uint16x8_t v2hw = vmovl_u8(vr2h);
1711
1712
int16x8_t v1df = vreinterpretq_s16_u16(vsubl_u8(vr1l, vr1h));
1713
int16x8_t v2df = vreinterpretq_s16_u16(vsubl_u8(vr2l, vr2h));
1714
1715
int32x4_t v1L = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(v1hw), SHIFT_BITS));
1716
int32x4_t v1H = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(v1hw), SHIFT_BITS));
1717
int32x4_t v2L = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(v2hw), SHIFT_BITS));
1718
int32x4_t v2H = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(v2hw), SHIFT_BITS));
1719
1720
v1L = vmlal_s16(v1L, vget_low_s16(v1df), vget_low_s16(vcw_));
1721
v1H = vmlal_s16(v1H, vget_high_s16(v1df), vget_high_s16(vcw_));
1722
v2L = vmlal_s16(v2L, vget_low_s16(v2df), vget_low_s16(vcw_));
1723
v2H = vmlal_s16(v2H, vget_high_s16(v2df), vget_high_s16(vcw_));
1724
1725
#if USE_CORRECT_VERSION
1726
/* correct version */
1727
int32x4_t vL = vshlq_n_s32(v2L, SHIFT_BITS);
1728
int32x4_t vH = vshlq_n_s32(v2H, SHIFT_BITS);
1729
int32x4_t vdiffL = vsubq_s32(v1L, v2L);
1730
int32x4_t vdiffH = vsubq_s32(v1H, v2H);
1731
1732
vL = vmlaq_s32(vL, vdiffL, vrw);
1733
vH = vmlaq_s32(vH, vdiffH, vrw);
1734
uint16x4_t vL_ = vqrshrun_n_s32(vL, 2*SHIFT_BITS - 8);
1735
uint16x4_t vH_ = vqrshrun_n_s32(vH, 2*SHIFT_BITS - 8);
1736
uint8x8_t vres = vrshrn_n_u16(vcombine_u16(vL_, vH_), 8);
1737
vst1_u8(internal::getRowPtr(dstBase, dstStride, row) + col + 8, vres);
1738
#else
1739
/* ugly version matching to OpenCV's SSE optimization */
1740
int16x4_t v1Ls = vshrn_n_s32(v1L, 4);
1741
int16x4_t v1Hs = vshrn_n_s32(v1H, 4);
1742
int16x4_t v2Ls = vshrn_n_s32(v2L, 4);
1743
int16x4_t v2Hs = vshrn_n_s32(v2H, 4);
1744
1745
int16x8_t v1s = vqdmulhq_s16(vcombine_s16(v1Ls, v1Hs), vrw);
1746
int16x8_t v2s = vqdmulhq_s16(vcombine_s16(v2Ls, v2Hs), vrW);
1747
1748
int16x8_t vsum = vaddq_s16(vshrq_n_s16(v1s,1), vshrq_n_s16(v2s,1));
1749
uint8x8_t vres = vqrshrun_n_s16(vsum, 2);
1750
1751
vst1_u8(internal::getRowPtr(dstBase, dstStride, row) + col + 8, vres);
1752
#endif
1753
}
1754
}
1755
}
1756
/***********************************************/
1757
for(; col <= (ptrdiff_t)dsize.width-8; col+=8)
1758
{
1759
downsample_bilinear_8uc1_col_loop8:
1760
ptrdiff_t col1[8];
1761
ptrdiff_t col2[8];
1762
s16 cwi[8];
1763
1764
for(s32 k = 0; k < 8; ++k)
1765
{
1766
f32 c = (col + k) * wr + scale_x_offset;
1767
col1[k] = (ptrdiff_t)c;
1768
col2[k] = col1[k] + 1;
1769
1770
cwi[k] = (s16)floorf((col2[k] - c) * (1 << SHIFT_BITS) + 0.5f);
1771
1772
if(col1[k] < 0) col1[k] = 0;
1773
if(col2[k] >= (ptrdiff_t)ssize.width) col2[k] = (ptrdiff_t)ssize.width-1;
1774
}
1775
1776
ptrdiff_t x = std::min(col1[0], (ptrdiff_t)ssize.width-16);
1777
u8 lutl[8];
1778
u8 luth[8];
1779
for(s32 k = 0; k < 8; ++k)
1780
{
1781
lutl[k] = (u8)(col1[k] - x);
1782
luth[k] = (u8)(col2[k] - x);
1783
}
1784
1785
uint8x8_t vlutl = vld1_u8(lutl);
1786
uint8x8_t vluth = vld1_u8(luth);
1787
int16x8_t vcw = vld1q_s16(cwi);
1788
1789
for(ptrdiff_t row = 0; row < (ptrdiff_t)dsize.height; ++row)
1790
{
1791
#if USE_CORRECT_VERSION
1792
int32x4_t vrw = vdupq_n_s32(buf2[row]);
1793
#else
1794
int16x8_t vrw = vdupq_n_s16((int16_t)buf2[row]);
1795
int16x8_t vrW = vdupq_n_s16((int16_t)((1 << SHIFT_BITS) - buf2[row]));
1796
#endif
1797
1798
internal::prefetch(internal::getRowPtr(srcBase, srcStride, buf[1*dsize.height + row]) + x, 2*srcStride);
1799
internal::prefetch(internal::getRowPtr(srcBase, srcStride, buf[1*dsize.height + row]) + x, 3*srcStride);
1800
1801
union { uint8x16_t v; uint8x8x2_t w; } vr1 = { vld1q_u8(internal::getRowPtr(srcBase, srcStride, buf[0*dsize.height + row]) + x) };
1802
union { uint8x16_t v; uint8x8x2_t w; } vr2 = { vld1q_u8(internal::getRowPtr(srcBase, srcStride, buf[1*dsize.height + row]) + x) };
1803
1804
uint8x8_t vr1l = vtbl2_u8(vr1.w, vlutl);
1805
uint8x8_t vr1h = vtbl2_u8(vr1.w, vluth);
1806
uint8x8_t vr2l = vtbl2_u8(vr2.w, vlutl);
1807
uint8x8_t vr2h = vtbl2_u8(vr2.w, vluth);
1808
1809
uint16x8_t v1hw = vmovl_u8(vr1h);
1810
uint16x8_t v2hw = vmovl_u8(vr2h);
1811
1812
int16x8_t v1df = vreinterpretq_s16_u16(vsubl_u8(vr1l, vr1h));
1813
int16x8_t v2df = vreinterpretq_s16_u16(vsubl_u8(vr2l, vr2h));
1814
1815
int32x4_t v1L = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(v1hw), SHIFT_BITS));
1816
int32x4_t v1H = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(v1hw), SHIFT_BITS));
1817
int32x4_t v2L = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(v2hw), SHIFT_BITS));
1818
int32x4_t v2H = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(v2hw), SHIFT_BITS));
1819
1820
v1L = vmlal_s16(v1L, vget_low_s16(v1df), vget_low_s16(vcw));
1821
v1H = vmlal_s16(v1H, vget_high_s16(v1df), vget_high_s16(vcw));
1822
v2L = vmlal_s16(v2L, vget_low_s16(v2df), vget_low_s16(vcw));
1823
v2H = vmlal_s16(v2H, vget_high_s16(v2df), vget_high_s16(vcw));
1824
1825
#if USE_CORRECT_VERSION
1826
/* correct version */
1827
int32x4_t vL = vshlq_n_s32(v2L, SHIFT_BITS);
1828
int32x4_t vH = vshlq_n_s32(v2H, SHIFT_BITS);
1829
int32x4_t vdiffL = vsubq_s32(v1L, v2L);
1830
int32x4_t vdiffH = vsubq_s32(v1H, v2H);
1831
1832
vL = vmlaq_s32(vL, vdiffL, vrw);
1833
vH = vmlaq_s32(vH, vdiffH, vrw);
1834
uint16x4_t vL_ = vqrshrun_n_s32(vL, 2*SHIFT_BITS - 8);
1835
uint16x4_t vH_ = vqrshrun_n_s32(vH, 2*SHIFT_BITS - 8);
1836
uint8x8_t vres = vrshrn_n_u16(vcombine_u16(vL_, vH_), 8);
1837
vst1_u8(internal::getRowPtr(dstBase, dstStride, row) + col, vres);
1838
#else
1839
/* ugly version matching to OpenCV's SSE optimization */
1840
int16x4_t v1Ls = vshrn_n_s32(v1L, 4);
1841
int16x4_t v1Hs = vshrn_n_s32(v1H, 4);
1842
int16x4_t v2Ls = vshrn_n_s32(v2L, 4);
1843
int16x4_t v2Hs = vshrn_n_s32(v2H, 4);
1844
1845
int16x8_t v1s = vqdmulhq_s16(vcombine_s16(v1Ls, v1Hs), vrw);
1846
int16x8_t v2s = vqdmulhq_s16(vcombine_s16(v2Ls, v2Hs), vrW);
1847
1848
int16x8_t vsum = vaddq_s16(vshrq_n_s16(v1s,1), vshrq_n_s16(v2s,1));
1849
uint8x8_t vres = vqrshrun_n_s16(vsum, 2);
1850
1851
vst1_u8(internal::getRowPtr(dstBase, dstStride, row) + col, vres);
1852
#endif
1853
}
1854
}
1855
if (col < (ptrdiff_t)dsize.width)
1856
{
1857
col = dsize.width - 8;
1858
goto downsample_bilinear_8uc1_col_loop8;
1859
}
1860
}
1861
1862
} // namespace
1863
1864
#endif
1865
1866
void resizeLinearOpenCV(const Size2D &ssize, const Size2D &dsize,
1867
const u8 * srcBase, ptrdiff_t srcStride,
1868
u8 * dstBase, ptrdiff_t dstStride,
1869
f32 wr, f32 hr, u32 channels)
1870
{
1871
internal::assertSupportedConfiguration(wr > 0 && hr > 0 &&
1872
(dsize.width - 0.5) * wr - 0.5 < ssize.width &&
1873
(dsize.height - 0.5) * hr - 0.5 < ssize.height && // Ensure we have enough source data
1874
(dsize.width + 0.5) * wr + 0.5 >= ssize.width &&
1875
(dsize.height + 0.5) * hr + 0.5 >= ssize.height && // Ensure source isn't too big
1876
isResizeLinearOpenCVSupported(ssize, dsize, channels));
1877
#ifdef CAROTENE_NEON
1878
if(1 == channels)
1879
{
1880
if (wr <= 1.f && hr <= 1.f)
1881
resizeLinearOpenCVchan<1>(ssize, dsize, srcBase, srcStride, dstBase, dstStride, wr, hr);
1882
else if (wr <= 2.0f && hr <= 2.0f && ssize.width >= 16)
1883
downsample_bilinear_8uc1(ssize, dsize, srcBase, srcStride, dstBase, dstStride, wr, hr);
1884
else
1885
resizeLinearOpenCVchan<1>(ssize, dsize, srcBase, srcStride, dstBase, dstStride, wr, hr);
1886
}
1887
else if(4 == channels)
1888
resizeLinearOpenCVchan<4>(ssize, dsize, srcBase, srcStride, dstBase, dstStride, wr, hr);
1889
#else
1890
(void)ssize;
1891
(void)dsize;
1892
(void)srcBase;
1893
(void)srcStride;
1894
(void)dstBase;
1895
(void)dstStride;
1896
(void)wr;
1897
(void)hr;
1898
(void)channels;
1899
#endif
1900
}
1901
1902
// Bilinear resize for u8 images with 1, 3 or 4 interleaved channels (NEON).
//
// Strategy: vertical interpolation coefficients are precomputed once per
// destination row; horizontal interpolation is vectorized 16 (then 8)
// destination pixels at a time by gathering the left/right source samples
// with vtbl2_u8 lookup tables built from per-column offsets, then blending
// in f32 inside resizeLinearStep (defined earlier in this file).
//
//   ssize/dsize       : source / destination sizes
//   srcBase/srcStride : source pixels and row stride in bytes
//   dstBase/dstStride : destination pixels and row stride in bytes
//   wr, hr            : width / height scale ratios (src/dst)
//   channels          : 1, 3 or 4 (other values are rejected by
//                       isResizeLinearSupported below)
void resizeLinear(const Size2D &ssize, const Size2D &dsize,
                  const u8 * srcBase, ptrdiff_t srcStride,
                  u8 * dstBase, ptrdiff_t dstStride,
                  f32 wr, f32 hr, u32 channels)
{
    internal::assertSupportedConfiguration(wr > 0 && hr > 0 &&
                                           (dsize.width - 0.5) * wr - 0.5 < ssize.width &&
                                           (dsize.height - 0.5) * hr - 0.5 < ssize.height &&  // Ensure we have enough source data
                                           (dsize.width + 0.5) * wr + 0.5 >= ssize.width &&
                                           (dsize.height + 0.5) * hr + 0.5 >= ssize.height && // Ensure source isn't too big
                                           isResizeLinearSupported(ssize, dsize,
                                                                   wr, hr, channels));
#ifdef CAROTENE_NEON
    // Half-pixel-center mapping: dst index d maps to source coordinate
    // d * scale + 0.5*scale - 0.5 (same convention OpenCV uses).
    f32 scale_x = wr;
    f32 scale_x_offset = 0.5f * scale_x - 0.5f;
    f32 scale_y = hr;
    f32 scale_y_offset = 0.5f * scale_y - 0.5f;

    // buf holds, per destination row, the two (clamped) source row indices;
    // coeff holds the weight of the upper source row.
    // NOTE(review): only 2*dsize.height entries of _buf are used; the
    // "* 3 + 1" sizing appears to be inherited from a sibling kernel.
    std::vector<ptrdiff_t> _buf(dsize.height * 3 + 1);
    std::vector<f32> coeff(dsize.height);
    ptrdiff_t * buf = &_buf[0];

    for (size_t row = 0; row < dsize.height; ++row)
    {
        f32 r = row * scale_y + scale_y_offset;
        ptrdiff_t src_row = floorf(r);
        ptrdiff_t src_row2 = src_row + 1;

        // rweight is the fraction attributed to src_row (distance to src_row2).
        f32 rweight = src_row2 - r;
        buf[0 * dsize.height + row] = std::max<ptrdiff_t>(0, src_row);
        buf[1 * dsize.height + row] = std::min<ptrdiff_t>(ssize.height - 1, src_row2);
        coeff[row] = rweight;
    }

    // ---- Main loop: 16 destination columns per iteration --------------------
    size_t col = 0;
    for ( ; col + 16 <= dsize.width; col += 16)
    {
        // Per-column left/right source indices and the left-sample weight.
        ptrdiff_t col1[16], col2[16];
        f32 cwi[16];

        for(s32 k = 0; k < 16; ++k)
        {
            f32 c = (col + k) * scale_x + scale_x_offset;
            col1[k] = floorf(c);
            col2[k] = col1[k] + 1;

            cwi[k] = col2[k] - c;

            // Clamp to the valid source range at the borders.
            if (col1[k] < 0)
                col1[k] = 0;
            if (col2[k] >= (ptrdiff_t)ssize.width)
                col2[k] = ssize.width - 1;
        }

        // Base offsets for two 16-byte source loads; clamped so the loads
        // never run past the end of the source row. With wr <= 2 (enforced
        // by isResizeLinearSupported — TODO confirm) all 8 columns of each
        // half fall within the corresponding 16 loaded bytes.
        ptrdiff_t x = std::min<ptrdiff_t>(col1[0], ssize.width - 16);
        ptrdiff_t y = std::min<ptrdiff_t>(col1[8], ssize.width - 16);
        u8 lutl[16], luth[16];

        // vtbl2 lookup tables: byte offsets of the left (lutl) and right
        // (luth) samples relative to the loaded 16-byte window.
        for (s32 k = 0; k < 8; ++k)
        {
            lutl[k] = (u8)(col1[k] - x);
            luth[k] = (u8)(col2[k] - x);
            lutl[k + 8] = (u8)(col1[k + 8] - y);
            luth[k + 8] = (u8)(col2[k + 8] - y);
        }

        // LUTs/weights for the first 8 columns (window at x)...
        uint8x8_t vlutl = vld1_u8(lutl);
        uint8x8_t vluth = vld1_u8(luth);
        float32x4_t vcw0 = vld1q_f32(cwi);
        float32x4_t vcw1 = vld1q_f32(cwi + 4);

        // ...and for the second 8 columns (window at y).
        uint8x8_t vlutl_ = vld1_u8(lutl + 8);
        uint8x8_t vluth_ = vld1_u8(luth + 8);
        float32x4_t vcw0_ = vld1q_f32(cwi + 8);
        float32x4_t vcw1_ = vld1q_f32(cwi + 12);

        if (channels == 1)
        {
            for (size_t row = 0; row < dsize.height; ++row)
            {
                // Vertical weight of the upper source row, broadcast to a vector.
                float32x4_t vrw = vdupq_n_f32(coeff[row]);

                const u8 * srow0 = internal::getRowPtr(srcBase, srcStride, buf[0 * dsize.height + row]);
                const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, buf[1 * dsize.height + row]);
                u8 * drow = internal::getRowPtr(dstBase, dstStride, row);

                internal::prefetch(srow0 + x + 2 * srcStride);
                internal::prefetch(srow1 + x + 2 * srcStride);

                // Two independent 8-pixel blends, combined into one 16-byte store.
                uint8x8_t vres0 = resizeLinearStep(vld1q_u8(srow0 + x), vld1q_u8(srow1 + x),
                                                   vlutl, vluth,
                                                   vrw, vcw0, vcw1);

                uint8x8_t vres1 = resizeLinearStep(vld1q_u8(srow0 + y), vld1q_u8(srow1 + y),
                                                   vlutl_, vluth_,
                                                   vrw, vcw0_, vcw1_);

                vst1q_u8(drow + col, vcombine_u8(vres0, vres1));
            }
        }
        else if (channels == 3)
        {
            for (size_t row = 0; row < dsize.height; ++row)
            {
                float32x4_t vrw = vdupq_n_f32(coeff[row]);

                const u8 * srow0 = internal::getRowPtr(srcBase, srcStride, buf[0 * dsize.height + row]);
                const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, buf[1 * dsize.height + row]);
                u8 * drow = internal::getRowPtr(dstBase, dstStride, row);

                internal::prefetch(srow0 + x + 2 * srcStride);
                internal::prefetch(srow1 + x + 2 * srcStride);

                // De-interleave RGB so each plane can reuse the 1-channel LUTs.
                uint8x16x3_t v_src10 = vld3q_u8(srow0 + (x * 3));
                uint8x16x3_t v_src20 = vld3q_u8(srow1 + (x * 3));

                uint8x16x3_t v_src11 = vld3q_u8(srow0 + (y * 3));
                uint8x16x3_t v_src21 = vld3q_u8(srow1 + (y * 3));

                uint8x16x3_t v_dst;

                v_dst.val[0] = vcombine_u8(resizeLinearStep(v_src10.val[0], v_src20.val[0], vlutl, vluth, vrw, vcw0, vcw1),
                                           resizeLinearStep(v_src11.val[0], v_src21.val[0], vlutl_, vluth_, vrw, vcw0_, vcw1_));
                v_dst.val[1] = vcombine_u8(resizeLinearStep(v_src10.val[1], v_src20.val[1], vlutl, vluth, vrw, vcw0, vcw1),
                                           resizeLinearStep(v_src11.val[1], v_src21.val[1], vlutl_, vluth_, vrw, vcw0_, vcw1_));
                v_dst.val[2] = vcombine_u8(resizeLinearStep(v_src10.val[2], v_src20.val[2], vlutl, vluth, vrw, vcw0, vcw1),
                                           resizeLinearStep(v_src11.val[2], v_src21.val[2], vlutl_, vluth_, vrw, vcw0_, vcw1_));

                vst3q_u8(drow + (col * 3), v_dst);
            }
        }
        else if (channels == 4)
        {
            for (size_t row = 0; row < dsize.height; ++row)
            {
                float32x4_t vrw = vdupq_n_f32(coeff[row]);

                const u8 * srow0 = internal::getRowPtr(srcBase, srcStride, buf[0 * dsize.height + row]);
                const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, buf[1 * dsize.height + row]);
                u8 * drow = internal::getRowPtr(dstBase, dstStride, row);

                internal::prefetch(srow0 + x + 2 * srcStride);
                internal::prefetch(srow1 + x + 2 * srcStride);

                // De-interleave RGBA; x << 2 == x * 4 bytes per pixel.
                uint8x16x4_t v_src10 = vld4q_u8(srow0 + (x << 2));
                uint8x16x4_t v_src20 = vld4q_u8(srow1 + (x << 2));

                uint8x16x4_t v_src11 = vld4q_u8(srow0 + (y << 2));
                uint8x16x4_t v_src21 = vld4q_u8(srow1 + (y << 2));

                uint8x16x4_t v_dst;

                v_dst.val[0] = vcombine_u8(resizeLinearStep(v_src10.val[0], v_src20.val[0], vlutl, vluth, vrw, vcw0, vcw1),
                                           resizeLinearStep(v_src11.val[0], v_src21.val[0], vlutl_, vluth_, vrw, vcw0_, vcw1_));
                v_dst.val[1] = vcombine_u8(resizeLinearStep(v_src10.val[1], v_src20.val[1], vlutl, vluth, vrw, vcw0, vcw1),
                                           resizeLinearStep(v_src11.val[1], v_src21.val[1], vlutl_, vluth_, vrw, vcw0_, vcw1_));
                v_dst.val[2] = vcombine_u8(resizeLinearStep(v_src10.val[2], v_src20.val[2], vlutl, vluth, vrw, vcw0, vcw1),
                                           resizeLinearStep(v_src11.val[2], v_src21.val[2], vlutl_, vluth_, vrw, vcw0_, vcw1_));
                v_dst.val[3] = vcombine_u8(resizeLinearStep(v_src10.val[3], v_src20.val[3], vlutl, vluth, vrw, vcw0, vcw1),
                                           resizeLinearStep(v_src11.val[3], v_src21.val[3], vlutl_, vluth_, vrw, vcw0_, vcw1_));

                vst4q_u8(drow + (col << 2), v_dst);
            }
        }
    }

    // ---- Tail loop: 8 destination columns per iteration ---------------------
    // Same scheme as above but with a single 16-byte source window per row.
    for ( ; col + 8 <= dsize.width; col += 8)
    {
// Re-entered via goto below to process the final (possibly overlapping)
// 8-column group. Label name is shared with downsample_bilinear_8uc1 above;
// labels have function scope, so there is no clash.
downsample_bilinear_8uc1_col_loop8:
        ptrdiff_t col1[8], col2[8];
        f32 cwi[8];

        for (s32 k = 0; k < 8; ++k)
        {
            f32 c = (col + k) * scale_x + scale_x_offset;
            col1[k] = floorf(c);
            col2[k] = col1[k] + 1;

            cwi[k] = col2[k] - c;

            if (col1[k] < 0)
                col1[k] = 0;
            if (col2[k] >= (ptrdiff_t)ssize.width)
                col2[k] = ssize.width - 1;
        }

        // Clamp the 16-byte window so it stays inside the source row.
        ptrdiff_t x = std::min<ptrdiff_t>(col1[0], ssize.width - 16);
        u8 lutl[8], luth[8];
        for (s32 k = 0; k < 8; ++k)
        {
            lutl[k] = (u8)(col1[k] - x);
            luth[k] = (u8)(col2[k] - x);
        }

        uint8x8_t vlutl = vld1_u8(lutl);
        uint8x8_t vluth = vld1_u8(luth);
        float32x4_t vcw0 = vld1q_f32(cwi);
        float32x4_t vcw1 = vld1q_f32(cwi + 4);

        if (channels == 1)
        {
            for (size_t row = 0; row < dsize.height; ++row)
            {
                float32x4_t vrw = vdupq_n_f32(coeff[row]);

                const u8 * srow0 = internal::getRowPtr(srcBase, srcStride, buf[0 * dsize.height + row]);
                const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, buf[1 * dsize.height + row]);
                u8 * drow = internal::getRowPtr(dstBase, dstStride, row);

                internal::prefetch(srow0 + x + 2 * srcStride);
                internal::prefetch(srow1 + x + 2 * srcStride);

                uint8x8_t vres = resizeLinearStep(vld1q_u8(srow0 + x), vld1q_u8(srow1 + x),
                                                  vlutl, vluth,
                                                  vrw, vcw0, vcw1);
                vst1_u8(drow + col, vres);
            }
        }
        else if (channels == 3)
        {
            for (size_t row = 0; row < dsize.height; ++row)
            {
                float32x4_t vrw = vdupq_n_f32(coeff[row]);

                const u8 * srow0 = internal::getRowPtr(srcBase, srcStride, buf[0 * dsize.height + row]);
                const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, buf[1 * dsize.height + row]);
                u8 * drow = internal::getRowPtr(dstBase, dstStride, row);

                internal::prefetch(srow0 + x + 2 * srcStride);
                internal::prefetch(srow1 + x + 2 * srcStride);

                uint8x16x3_t v_src1 = vld3q_u8(srow0 + (x * 3));
                uint8x16x3_t v_src2 = vld3q_u8(srow1 + (x * 3));

                uint8x8x3_t v_dst;

                v_dst.val[0] = resizeLinearStep(v_src1.val[0], v_src2.val[0], vlutl, vluth, vrw, vcw0, vcw1);
                v_dst.val[1] = resizeLinearStep(v_src1.val[1], v_src2.val[1], vlutl, vluth, vrw, vcw0, vcw1);
                v_dst.val[2] = resizeLinearStep(v_src1.val[2], v_src2.val[2], vlutl, vluth, vrw, vcw0, vcw1);

                vst3_u8(drow + (col * 3), v_dst);
            }
        }
        else if (channels == 4)
        {
            for (size_t row = 0; row < dsize.height; ++row)
            {
                float32x4_t vrw = vdupq_n_f32(coeff[row]);

                const u8 * srow0 = internal::getRowPtr(srcBase, srcStride, buf[0 * dsize.height + row]);
                const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, buf[1 * dsize.height + row]);
                u8 * drow = internal::getRowPtr(dstBase, dstStride, row);

                internal::prefetch(srow0 + x + 2 * srcStride);
                internal::prefetch(srow1 + x + 2 * srcStride);

                uint8x16x4_t v_src1 = vld4q_u8(srow0 + (x << 2));
                uint8x16x4_t v_src2 = vld4q_u8(srow1 + (x << 2));

                uint8x8x4_t v_dst;

                v_dst.val[0] = resizeLinearStep(v_src1.val[0], v_src2.val[0], vlutl, vluth, vrw, vcw0, vcw1);
                v_dst.val[1] = resizeLinearStep(v_src1.val[1], v_src2.val[1], vlutl, vluth, vrw, vcw0, vcw1);
                v_dst.val[2] = resizeLinearStep(v_src1.val[2], v_src2.val[2], vlutl, vluth, vrw, vcw0, vcw1);
                v_dst.val[3] = resizeLinearStep(v_src1.val[3], v_src2.val[3], vlutl, vluth, vrw, vcw0, vcw1);

                vst4_u8(drow + (col << 2), v_dst);
            }
        }
    }

    // Handle the remaining (< 8) columns by redoing the last 8-wide group,
    // shifted so it ends exactly at dsize.width; some columns are written
    // twice with identical values.
    // NOTE(review): col = dsize.width - 8 assumes dsize.width >= 8 (size_t
    // would otherwise wrap) — presumably guaranteed by isResizeLinearSupported;
    // TODO confirm.
    if (col < dsize.width)
    {
        col = dsize.width - 8;
        goto downsample_bilinear_8uc1_col_loop8;
    }

#else
    (void)ssize;
    (void)dsize;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)wr;
    (void)hr;
    (void)channels;
#endif
}
2191
2192
} // namespace CAROTENE_NS
2193
2194