Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/3rdparty/carotene/src/remap.cpp
16337 views
1
/*
 * By downloading, copying, installing or using the software you agree to this license.
 * If you do not agree to this license, do not download, install,
 * copy or use the software.
 *
 *
 *                           License Agreement
 *                For Open Source Computer Vision Library
 *                        (3-clause BSD License)
 *
 * Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
 * Third party copyrights are property of their respective owners.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the names of the copyright holders nor the names of the contributors
 *     may be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * This software is provided by the copyright holders and contributors "as is" and
 * any express or implied warranties, including, but not limited to, the implied
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall copyright holders or contributors be liable for any direct,
 * indirect, incidental, special, exemplary, or consequential damages
 * (including, but not limited to, procurement of substitute goods or services;
 * loss of use, data, or profits; or business interruption) however caused
 * and on any theory of liability, whether in contract, strict liability,
 * or tort (including negligence or otherwise) arising in any way out of
 * the use of this software, even if advised of the possibility of such damage.
 */
39
40
#include "remap.hpp"
41
42
namespace CAROTENE_NS {
43
44
#ifdef CAROTENE_NEON
45
46
namespace internal {
47
48
void remapNearestNeighborReplicate(const Size2D size,
49
const u8 * srcBase,
50
const s32 * map,
51
u8 * dstBase, ptrdiff_t dstStride)
52
{
53
for (size_t y = 0; y < size.height; ++y)
54
{
55
const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32), y);
56
u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y);
57
58
for (size_t x = 0; x < size.width; ++x)
59
{
60
dst_row[x] = srcBase[map_row[x]];
61
}
62
}
63
}
64
65
void remapNearestNeighborConst(const Size2D size,
66
const u8 * srcBase,
67
const s32 * map,
68
u8 * dstBase, ptrdiff_t dstStride,
69
u8 borderValue)
70
{
71
for (size_t y = 0; y < size.height; ++y)
72
{
73
const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32), y);
74
u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y);
75
76
for (size_t x = 0; x < size.width; ++x)
77
{
78
s32 src_idx = map_row[x];
79
dst_row[x] = src_idx >= 0 ? srcBase[map_row[x]] : borderValue;
80
}
81
}
82
}
83
84
void remapLinearReplicate(const Size2D size,
85
const u8 * srcBase,
86
const s32 * map,
87
const f32 * coeffs,
88
u8 * dstBase, ptrdiff_t dstStride)
89
{
90
int16x8_t v_zero16 = vdupq_n_s16(0);
91
92
for (size_t y = 0; y < size.height; ++y)
93
{
94
const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32) * 4, y);
95
const f32 * coeff_row = internal::getRowPtr(coeffs, size.width * sizeof(f32) * 2, y);
96
97
u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y);
98
99
size_t x = 0;
100
for ( ; x + 8 < size.width; x += 8)
101
{
102
int16x8_t v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2)]], v_zero16, 0);
103
v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 4]], v_src00, 1);
104
v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 8]], v_src00, 2);
105
v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 12]], v_src00, 3);
106
v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 16]], v_src00, 4);
107
v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 20]], v_src00, 5);
108
v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 24]], v_src00, 6);
109
v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 28]], v_src00, 7);
110
111
int16x8_t v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 1]], v_zero16, 0);
112
v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 5]], v_src01, 1);
113
v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 9]], v_src01, 2);
114
v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 13]], v_src01, 3);
115
v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 17]], v_src01, 4);
116
v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 21]], v_src01, 5);
117
v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 25]], v_src01, 6);
118
v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 29]], v_src01, 7);
119
120
int16x8_t v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 2]], v_zero16, 0);
121
v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 6]], v_src10, 1);
122
v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 10]], v_src10, 2);
123
v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 14]], v_src10, 3);
124
v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 18]], v_src10, 4);
125
v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 22]], v_src10, 5);
126
v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 26]], v_src10, 6);
127
v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 30]], v_src10, 7);
128
129
int16x8_t v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 3]], v_zero16, 0);
130
v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 7]], v_src11, 1);
131
v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 11]], v_src11, 2);
132
v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 15]], v_src11, 3);
133
v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 19]], v_src11, 4);
134
v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 23]], v_src11, 5);
135
v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 27]], v_src11, 6);
136
v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 31]], v_src11, 7);
137
138
// first part
139
float32x4_t v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00)));
140
float32x4_t v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10)));
141
142
float32x4x2_t v_coeff = vld2q_f32(coeff_row + (x << 1));
143
float32x4_t v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src01),
144
vget_low_s16(v_src00))), v_coeff.val[0]);
145
float32x4_t v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src11),
146
vget_low_s16(v_src10))), v_coeff.val[0]);
147
148
float32x4_t v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]);
149
uint16x4_t v_dst0 = vmovn_u32(vcvtq_u32_f32(v_dst));
150
151
// second part
152
v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00)));
153
v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10)));
154
155
v_coeff = vld2q_f32(coeff_row + (x << 1) + 8);
156
v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src01),
157
vget_high_s16(v_src00))), v_coeff.val[0]);
158
v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src11),
159
vget_high_s16(v_src10))), v_coeff.val[0]);
160
161
v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]);
162
uint16x4_t v_dst1 = vmovn_u32(vcvtq_u32_f32(v_dst));
163
164
// store
165
vst1_u8(dst_row + x, vmovn_u16(vcombine_u16(v_dst0, v_dst1)));
166
}
167
168
for ( ; x < size.width; ++x)
169
{
170
s32 src00_index = map_row[(x << 2)];
171
s32 src10_index = map_row[(x << 2) + 2];
172
f32 dst_val_0 = (srcBase[map_row[(x << 2) + 1]] - srcBase[src00_index]) * coeff_row[x << 1] +
173
srcBase[src00_index];
174
f32 dst_val_1 = (srcBase[map_row[(x << 2) + 3]] - srcBase[src10_index]) * coeff_row[x << 1] +
175
srcBase[src10_index];
176
dst_row[x] = floorf((dst_val_1 - dst_val_0) * coeff_row[(x << 1) + 1] + dst_val_0);
177
}
178
}
179
}
180
181
void remapLinearConst(const Size2D size,
182
const u8 * srcBase,
183
const s32 * map,
184
const f32 * coeffs,
185
u8 * dstBase, ptrdiff_t dstStride,
186
u8 borderValue)
187
{
188
int16x8_t v_zero16 = vdupq_n_s16(0);
189
190
for (size_t y = 0; y < size.height; ++y)
191
{
192
const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32) * 4, y);
193
const f32 * coeff_row = internal::getRowPtr(coeffs, size.width * sizeof(f32) * 2, y);
194
195
u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y);
196
197
size_t x = 0;
198
for ( ; x + 8 < size.width; x += 8)
199
{
200
int16x8_t v_src00 = vsetq_lane_s16(map_row[(x << 2)] >= 0 ? srcBase[map_row[(x << 2)]] : borderValue, v_zero16, 0);
201
v_src00 = vsetq_lane_s16(map_row[(x << 2) + 4] >= 0 ? srcBase[map_row[(x << 2) + 4]] : borderValue, v_src00, 1);
202
v_src00 = vsetq_lane_s16(map_row[(x << 2) + 8] >= 0 ? srcBase[map_row[(x << 2) + 8]] : borderValue, v_src00, 2);
203
v_src00 = vsetq_lane_s16(map_row[(x << 2) + 12] >= 0 ? srcBase[map_row[(x << 2) + 12]] : borderValue, v_src00, 3);
204
v_src00 = vsetq_lane_s16(map_row[(x << 2) + 16] >= 0 ? srcBase[map_row[(x << 2) + 16]] : borderValue, v_src00, 4);
205
v_src00 = vsetq_lane_s16(map_row[(x << 2) + 20] >= 0 ? srcBase[map_row[(x << 2) + 20]] : borderValue, v_src00, 5);
206
v_src00 = vsetq_lane_s16(map_row[(x << 2) + 24] >= 0 ? srcBase[map_row[(x << 2) + 24]] : borderValue, v_src00, 6);
207
v_src00 = vsetq_lane_s16(map_row[(x << 2) + 28] >= 0 ? srcBase[map_row[(x << 2) + 28]] : borderValue, v_src00, 7);
208
209
int16x8_t v_src01 = vsetq_lane_s16(map_row[(x << 2) + 1] >= 0 ? srcBase[map_row[(x << 2) + 1]] : borderValue, v_zero16, 0);
210
v_src01 = vsetq_lane_s16(map_row[(x << 2) + 5] >= 0 ? srcBase[map_row[(x << 2) + 5]] : borderValue, v_src01, 1);
211
v_src01 = vsetq_lane_s16(map_row[(x << 2) + 9] >= 0 ? srcBase[map_row[(x << 2) + 9]] : borderValue, v_src01, 2);
212
v_src01 = vsetq_lane_s16(map_row[(x << 2) + 13] >= 0 ? srcBase[map_row[(x << 2) + 13]] : borderValue, v_src01, 3);
213
v_src01 = vsetq_lane_s16(map_row[(x << 2) + 17] >= 0 ? srcBase[map_row[(x << 2) + 17]] : borderValue, v_src01, 4);
214
v_src01 = vsetq_lane_s16(map_row[(x << 2) + 21] >= 0 ? srcBase[map_row[(x << 2) + 21]] : borderValue, v_src01, 5);
215
v_src01 = vsetq_lane_s16(map_row[(x << 2) + 25] >= 0 ? srcBase[map_row[(x << 2) + 25]] : borderValue, v_src01, 6);
216
v_src01 = vsetq_lane_s16(map_row[(x << 2) + 29] >= 0 ? srcBase[map_row[(x << 2) + 29]] : borderValue, v_src01, 7);
217
218
int16x8_t v_src10 = vsetq_lane_s16(map_row[(x << 2) + 2] >= 0 ? srcBase[map_row[(x << 2) + 2]] : borderValue, v_zero16, 0);
219
v_src10 = vsetq_lane_s16(map_row[(x << 2) + 6] >= 0 ? srcBase[map_row[(x << 2) + 6]] : borderValue, v_src10, 1);
220
v_src10 = vsetq_lane_s16(map_row[(x << 2) + 10] >= 0 ? srcBase[map_row[(x << 2) + 10]] : borderValue, v_src10, 2);
221
v_src10 = vsetq_lane_s16(map_row[(x << 2) + 14] >= 0 ? srcBase[map_row[(x << 2) + 14]] : borderValue, v_src10, 3);
222
v_src10 = vsetq_lane_s16(map_row[(x << 2) + 18] >= 0 ? srcBase[map_row[(x << 2) + 18]] : borderValue, v_src10, 4);
223
v_src10 = vsetq_lane_s16(map_row[(x << 2) + 22] >= 0 ? srcBase[map_row[(x << 2) + 22]] : borderValue, v_src10, 5);
224
v_src10 = vsetq_lane_s16(map_row[(x << 2) + 26] >= 0 ? srcBase[map_row[(x << 2) + 26]] : borderValue, v_src10, 6);
225
v_src10 = vsetq_lane_s16(map_row[(x << 2) + 30] >= 0 ? srcBase[map_row[(x << 2) + 30]] : borderValue, v_src10, 7);
226
227
int16x8_t v_src11 = vsetq_lane_s16(map_row[(x << 2) + 3] >= 0 ? srcBase[map_row[(x << 2) + 3]] : borderValue, v_zero16, 0);
228
v_src11 = vsetq_lane_s16(map_row[(x << 2) + 7] >= 0 ? srcBase[map_row[(x << 2) + 7]] : borderValue, v_src11, 1);
229
v_src11 = vsetq_lane_s16(map_row[(x << 2) + 11] >= 0 ? srcBase[map_row[(x << 2) + 11]] : borderValue, v_src11, 2);
230
v_src11 = vsetq_lane_s16(map_row[(x << 2) + 15] >= 0 ? srcBase[map_row[(x << 2) + 15]] : borderValue, v_src11, 3);
231
v_src11 = vsetq_lane_s16(map_row[(x << 2) + 19] >= 0 ? srcBase[map_row[(x << 2) + 19]] : borderValue, v_src11, 4);
232
v_src11 = vsetq_lane_s16(map_row[(x << 2) + 23] >= 0 ? srcBase[map_row[(x << 2) + 23]] : borderValue, v_src11, 5);
233
v_src11 = vsetq_lane_s16(map_row[(x << 2) + 27] >= 0 ? srcBase[map_row[(x << 2) + 27]] : borderValue, v_src11, 6);
234
v_src11 = vsetq_lane_s16(map_row[(x << 2) + 31] >= 0 ? srcBase[map_row[(x << 2) + 31]] : borderValue, v_src11, 7);
235
236
// first part
237
float32x4_t v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00)));
238
float32x4_t v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10)));
239
240
float32x4x2_t v_coeff = vld2q_f32(coeff_row + (x << 1));
241
float32x4_t v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src01),
242
vget_low_s16(v_src00))), v_coeff.val[0]);
243
float32x4_t v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src11),
244
vget_low_s16(v_src10))), v_coeff.val[0]);
245
246
float32x4_t v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]);
247
uint16x4_t v_dst0 = vmovn_u32(vcvtq_u32_f32(v_dst));
248
249
// second part
250
v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00)));
251
v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10)));
252
253
v_coeff = vld2q_f32(coeff_row + (x << 1) + 8);
254
v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src01),
255
vget_high_s16(v_src00))), v_coeff.val[0]);
256
v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src11),
257
vget_high_s16(v_src10))), v_coeff.val[0]);
258
259
v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]);
260
uint16x4_t v_dst1 = vmovn_u32(vcvtq_u32_f32(v_dst));
261
262
// store
263
vst1_u8(dst_row + x, vmovn_u16(vcombine_u16(v_dst0, v_dst1)));
264
}
265
266
for ( ; x < size.width; ++x)
267
{
268
s16 src00 = map_row[(x << 2) + 0] >= 0 ? srcBase[map_row[(x << 2) + 0]] : borderValue;
269
s16 src01 = map_row[(x << 2) + 1] >= 0 ? srcBase[map_row[(x << 2) + 1]] : borderValue;
270
s16 src10 = map_row[(x << 2) + 2] >= 0 ? srcBase[map_row[(x << 2) + 2]] : borderValue;
271
s16 src11 = map_row[(x << 2) + 3] >= 0 ? srcBase[map_row[(x << 2) + 3]] : borderValue;
272
273
f32 dst_val_0 = (src01 - src00) * coeff_row[(x << 1)] + src00;
274
f32 dst_val_1 = (src11 - src10) * coeff_row[(x << 1)] + src10;
275
dst_row[x] = floorf((dst_val_1 - dst_val_0) * coeff_row[(x << 1) + 1] + dst_val_0);
276
}
277
}
278
}
279
280
} // namespace internal
281
282
#endif // CAROTENE_NEON
283
284
bool isRemapNearestNeighborSupported(const Size2D &ssize)
285
{
286
#if SIZE_MAX > UINT32_MAX
287
return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation
288
// is performed with u32
289
isSupportedConfiguration();
290
#else
291
(void)ssize;
292
return isSupportedConfiguration();
293
#endif
294
}
295
296
bool isRemapLinearSupported(const Size2D &ssize)
297
{
298
#if SIZE_MAX > UINT32_MAX
299
return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation
300
// is performed with u32
301
isSupportedConfiguration();
302
#else
303
(void)ssize;
304
return isSupportedConfiguration();
305
#endif
306
}
307
308
void remapNearestNeighbor(const Size2D &ssize, const Size2D &dsize,
309
const u8 * srcBase, ptrdiff_t srcStride,
310
const f32 * tableBase, ptrdiff_t tableStride,
311
u8 * dstBase, ptrdiff_t dstStride,
312
BORDER_MODE borderMode, u8 borderValue)
313
{
314
internal::assertSupportedConfiguration(isRemapNearestNeighborSupported(ssize));
315
#ifdef CAROTENE_NEON
316
using namespace internal;
317
318
s32 _map[BLOCK_SIZE * BLOCK_SIZE + 16];
319
s32 * map = alignPtr(_map, 16);
320
321
int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
322
int32x2_t v_width2 = vdup_n_s32(ssize.width - 1), v_height2 = vdup_n_s32(ssize.height - 1);
323
int32x4_t v_step4 = vdupq_n_s32(srcStride);
324
int32x2_t v_step2 = vdup_n_s32(srcStride);
325
326
if (borderMode == BORDER_MODE_REPLICATE)
327
{
328
int32x4_t v_zero4 = vdupq_n_s32(0);
329
int32x2_t v_zero2 = vdup_n_s32(0);
330
331
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
332
{
333
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
334
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
335
{
336
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
337
338
// compute table
339
for (size_t y = 0; y < blockHeight; ++y)
340
{
341
const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1);
342
s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);
343
344
size_t x = 0;
345
for ( ; x + 8 <= blockWidth; x += 8)
346
{
347
float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1)),
348
v_table1 = vld2q_f32(table_row + (x << 1) + 8);
349
350
int32x4_t v_dst_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_table0.val[0])));
351
int32x4_t v_dst_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_table0.val[1])));
352
int32x4_t v_dst_index = vmlaq_s32(v_dst_x, v_dst_y, v_step4);
353
vst1q_s32(map_row + x, v_dst_index);
354
355
v_dst_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_table1.val[0])));
356
v_dst_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_table1.val[1])));
357
v_dst_index = vmlaq_s32(v_dst_x, v_dst_y, v_step4);
358
vst1q_s32(map_row + x + 4, v_dst_index);
359
}
360
361
for ( ; x + 4 <= blockWidth; x += 4)
362
{
363
float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1));
364
365
int32x4_t v_dst_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_table0.val[0])));
366
int32x4_t v_dst_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_table0.val[1])));
367
int32x4_t v_dst_index = vmlaq_s32(v_dst_x, v_dst_y, v_step4);
368
vst1q_s32(map_row + x, v_dst_index);
369
}
370
371
for ( ; x + 2 <= blockWidth; x += 2)
372
{
373
float32x2x2_t v_table0 = vld2_f32(table_row + (x << 1));
374
375
int32x2_t v_dst_x = vmax_s32(v_zero2, vmin_s32(v_width2, vcvt_s32_f32(v_table0.val[0])));
376
int32x2_t v_dst_y = vmax_s32(v_zero2, vmin_s32(v_height2, vcvt_s32_f32(v_table0.val[1])));
377
int32x2_t v_dst_index = vmla_s32(v_dst_x, v_dst_y, v_step2);
378
vst1_s32(map_row + x, v_dst_index);
379
}
380
381
for ( ; x < blockWidth; ++x)
382
{
383
s32 src_x = std::max(0, std::min<s32>(ssize.width - 1, (s32)floorf(table_row[(x << 1) + 0])));
384
s32 src_y = std::max(0, std::min<s32>(ssize.height - 1, (s32)floorf(table_row[(x << 1) + 1])));
385
map_row[x] = src_y * srcStride + src_x;
386
}
387
}
388
389
// make remap
390
remapNearestNeighborReplicate(Size2D(blockWidth, blockHeight), srcBase, &map[0],
391
getRowPtr(dstBase, dstStride, i) + j, dstStride);
392
}
393
}
394
}
395
else if (borderMode == BORDER_MODE_CONSTANT)
396
{
397
int32x4_t v_m1_4 = vdupq_n_s32(-1);
398
int32x2_t v_m1_2 = vdup_n_s32(-1);
399
float32x4_t v_zero4 = vdupq_n_f32(0.0f);
400
float32x2_t v_zero2 = vdup_n_f32(0.0f);
401
402
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
403
{
404
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
405
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
406
{
407
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
408
409
// compute table
410
for (size_t y = 0; y < blockHeight; ++y)
411
{
412
const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1);
413
s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);
414
415
size_t x = 0;
416
for ( ; x + 8 <= blockWidth; x += 8)
417
{
418
float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1)),
419
v_table1 = vld2q_f32(table_row + (x << 1) + 8);
420
421
int32x4_t v_dst_x = vcvtq_s32_f32(v_table0.val[0]);
422
int32x4_t v_dst_y = vcvtq_s32_f32(v_table0.val[1]);
423
uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table0.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)),
424
vandq_u32(vcgeq_f32(v_table0.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4)));
425
int32x4_t v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4);
426
vst1q_s32(map_row + x, v_dst_index);
427
428
v_dst_x = vcvtq_s32_f32(v_table1.val[0]);
429
v_dst_y = vcvtq_s32_f32(v_table1.val[1]);
430
v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table1.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)),
431
vandq_u32(vcgeq_f32(v_table1.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4)));
432
v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4);
433
vst1q_s32(map_row + x + 4, v_dst_index);
434
}
435
436
for ( ; x + 4 <= blockWidth; x += 4)
437
{
438
float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1));
439
440
int32x4_t v_dst_x = vcvtq_s32_f32(v_table0.val[0]);
441
int32x4_t v_dst_y = vcvtq_s32_f32(v_table0.val[1]);
442
uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table0.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)),
443
vandq_u32(vcgeq_f32(v_table0.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4)));
444
int32x4_t v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4);
445
vst1q_s32(map_row + x, v_dst_index);
446
}
447
448
for ( ; x + 2 <= blockWidth; x += 2)
449
{
450
float32x2x2_t v_table0 = vld2_f32(table_row + (x << 1));
451
452
int32x2_t v_dst_x = vcvt_s32_f32(v_table0.val[0]);
453
int32x2_t v_dst_y = vcvt_s32_f32(v_table0.val[1]);
454
uint32x2_t v_mask = vand_u32(vand_u32(vcge_f32(v_table0.val[0], v_zero2), vcle_s32(v_dst_x, v_width2)),
455
vand_u32(vcge_f32(v_table0.val[1], v_zero2), vcle_s32(v_dst_y, v_height2)));
456
int32x2_t v_dst_index = vbsl_s32(v_mask, vmla_s32(v_dst_x, v_dst_y, v_step2), v_m1_2);
457
vst1_s32(map_row + x, v_dst_index);
458
}
459
460
for ( ; x < blockWidth; ++x)
461
{
462
s32 src_x = (s32)floorf(table_row[(x << 1) + 0]);
463
s32 src_y = (s32)floorf(table_row[(x << 1) + 1]);
464
map_row[x] = (src_x >= 0) && (src_x < (s32)ssize.width) &&
465
(src_y >= 0) && (src_y < (s32)ssize.height) ? src_y * srcStride + src_x : -1;
466
}
467
}
468
469
// make remap
470
remapNearestNeighborConst(Size2D(blockWidth, blockHeight), srcBase, &map[0],
471
getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
472
}
473
}
474
}
475
476
#else
477
(void)ssize;
478
(void)dsize;
479
(void)srcBase;
480
(void)srcStride;
481
(void)tableBase;
482
(void)tableStride;
483
(void)dstBase;
484
(void)dstStride;
485
(void)borderMode;
486
(void)borderValue;
487
#endif
488
}
489
490
void remapLinear(const Size2D &ssize, const Size2D &dsize,
491
const u8 * srcBase, ptrdiff_t srcStride,
492
const f32 * tableBase, ptrdiff_t tableStride,
493
u8 * dstBase, ptrdiff_t dstStride,
494
BORDER_MODE borderMode, u8 borderValue)
495
{
496
internal::assertSupportedConfiguration(isRemapLinearSupported(ssize));
497
#ifdef CAROTENE_NEON
498
using namespace internal;
499
500
s32 _map[((BLOCK_SIZE * BLOCK_SIZE) << 2) + 16];
501
f32 _coeffs[((BLOCK_SIZE * BLOCK_SIZE) << 1) + 16];
502
503
s32 * map = alignPtr(_map, 16);
504
f32 * coeffs = alignPtr(_coeffs, 16);
505
506
int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
507
int32x4_t v_step4 = vdupq_n_s32(srcStride), v_1 = vdupq_n_s32(1);
508
float32x4_t v_zero4f = vdupq_n_f32(0.0f), v_one4f = vdupq_n_f32(1.0f);
509
510
if (borderMode == BORDER_MODE_REPLICATE)
511
{
512
int32x4_t v_zero4 = vdupq_n_s32(0);
513
514
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
515
{
516
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
517
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
518
{
519
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
520
521
// compute table
522
for (size_t y = 0; y < blockHeight; ++y)
523
{
524
const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1);
525
526
s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
527
f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);
528
529
size_t x = 0;
530
for ( ; x + 4 <= blockWidth; x += 4)
531
{
532
float32x4x2_t v_table = vld2q_f32(table_row + (x << 1));
533
534
int32x4_t v_src_x = vcvtq_s32_f32(v_table.val[0]);
535
int32x4_t v_src_y = vcvtq_s32_f32(v_table.val[1]);
536
537
float32x4x2_t v_coeff;
538
v_coeff.val[0] = vsubq_f32(v_table.val[0], vcvtq_f32_s32(v_src_x));
539
v_coeff.val[1] = vsubq_f32(v_table.val[1], vcvtq_f32_s32(v_src_y));
540
uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
541
uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
542
v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
543
v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
544
v_src_x = vbslq_s32(v_maskx, vsubq_s32(v_src_x, v_1), v_src_x);
545
v_src_y = vbslq_s32(v_masky, vsubq_s32(v_src_y, v_1), v_src_y);
546
547
int32x4_t v_dst0_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, v_src_x));
548
int32x4_t v_dst0_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, v_src_y));
549
int32x4_t v_dst1_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vaddq_s32(v_1, v_src_x)));
550
int32x4_t v_dst1_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vaddq_s32(v_1, v_src_y)));
551
552
int32x4x4_t v_dst_index;
553
v_dst_index.val[0] = vmlaq_s32(v_dst0_x, v_dst0_y, v_step4);
554
v_dst_index.val[1] = vmlaq_s32(v_dst1_x, v_dst0_y, v_step4);
555
v_dst_index.val[2] = vmlaq_s32(v_dst0_x, v_dst1_y, v_step4);
556
v_dst_index.val[3] = vmlaq_s32(v_dst1_x, v_dst1_y, v_step4);
557
558
vst2q_f32(coeff_row + (x << 1), v_coeff);
559
vst4q_s32(map_row + (x << 2), v_dst_index);
560
}
561
562
for ( ; x < blockWidth; ++x)
563
{
564
f32 src_x_f = table_row[(x << 1) + 0];
565
f32 src_y_f = table_row[(x << 1) + 1];
566
567
s32 src0_x = (s32)floorf(src_x_f);
568
s32 src0_y = (s32)floorf(src_y_f);
569
570
coeff_row[x << 1] = src_x_f - src0_x;
571
coeff_row[(x << 1) + 1] = src_y_f - src0_y;
572
573
s32 src1_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y + 1));
574
src0_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y));
575
s32 src1_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x + 1));
576
src0_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x));
577
578
map_row[(x << 2) + 0] = src0_y * srcStride + src0_x;
579
map_row[(x << 2) + 1] = src0_y * srcStride + src1_x;
580
map_row[(x << 2) + 2] = src1_y * srcStride + src0_x;
581
map_row[(x << 2) + 3] = src1_y * srcStride + src1_x;
582
}
583
}
584
585
remapLinearReplicate(Size2D(blockWidth, blockHeight),
586
srcBase, &map[0], &coeffs[0],
587
getRowPtr(dstBase, dstStride, i) + j, dstStride);
588
}
589
}
590
}
591
else if (borderMode == BORDER_MODE_CONSTANT)
592
{
593
float32x4_t v_zero4 = vdupq_n_f32(0.0f);
594
int32x4_t v_m1_4 = vdupq_n_s32(-1);
595
596
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
597
{
598
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
599
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
600
{
601
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
602
603
// compute table
604
for (size_t y = 0; y < blockHeight; ++y)
605
{
606
const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1);
607
608
s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
609
f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);
610
611
size_t x = 0;
612
for ( ; x + 4 <= blockWidth; x += 4)
613
{
614
float32x4x2_t v_table = vld2q_f32(table_row + (x << 1));
615
616
int32x4_t v_src_x0 = vcvtq_s32_f32(v_table.val[0]);
617
int32x4_t v_src_y0 = vcvtq_s32_f32(v_table.val[1]);
618
619
float32x4x2_t v_coeff;
620
v_coeff.val[0] = vsubq_f32(v_table.val[0], vcvtq_f32_s32(v_src_x0));
621
v_coeff.val[1] = vsubq_f32(v_table.val[1], vcvtq_f32_s32(v_src_y0));
622
uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
623
uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
624
v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
625
v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
626
v_src_x0 = vbslq_s32(v_maskx, vsubq_s32(v_src_x0, v_1), v_src_x0);
627
v_src_y0 = vbslq_s32(v_masky, vsubq_s32(v_src_y0, v_1), v_src_y0);
628
629
int32x4_t v_src_x1 = vaddq_s32(v_src_x0, v_1);
630
int32x4_t v_src_y1 = vaddq_s32(v_src_y0, v_1);
631
632
int32x4x4_t v_dst_index;
633
v_dst_index.val[0] = vmlaq_s32(v_src_x0, v_src_y0, v_step4);
634
v_dst_index.val[1] = vmlaq_s32(v_src_x1, v_src_y0, v_step4);
635
v_dst_index.val[2] = vmlaq_s32(v_src_x0, v_src_y1, v_step4);
636
v_dst_index.val[3] = vmlaq_s32(v_src_x1, v_src_y1, v_step4);
637
638
uint32x4_t v_mask_x0 = vandq_u32(vcgeq_f32(v_table.val[0], v_zero4), vcleq_s32(v_src_x0, v_width4));
639
uint32x4_t v_mask_x1 = vandq_u32(vcgeq_f32(vaddq_f32(v_table.val[0], v_one4f), v_zero4), vcleq_s32(v_src_x1, v_width4));
640
uint32x4_t v_mask_y0 = vandq_u32(vcgeq_f32(v_table.val[1], v_zero4), vcleq_s32(v_src_y0, v_height4));
641
uint32x4_t v_mask_y1 = vandq_u32(vcgeq_f32(vaddq_f32(v_table.val[1], v_one4f), v_zero4), vcleq_s32(v_src_y1, v_height4));
642
643
v_dst_index.val[0] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y0), v_dst_index.val[0], v_m1_4);
644
v_dst_index.val[1] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y0), v_dst_index.val[1], v_m1_4);
645
v_dst_index.val[2] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y1), v_dst_index.val[2], v_m1_4);
646
v_dst_index.val[3] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y1), v_dst_index.val[3], v_m1_4);
647
648
vst2q_f32(coeff_row + (x << 1), v_coeff);
649
vst4q_s32(map_row + (x << 2), v_dst_index);
650
}
651
652
for ( ; x < blockWidth; ++x)
653
{
654
f32 src_x_f = table_row[(x << 1) + 0];
655
f32 src_y_f = table_row[(x << 1) + 1];
656
657
s32 src0_x = (s32)floorf(src_x_f), src1_x = src0_x + 1;
658
s32 src0_y = (s32)floorf(src_y_f), src1_y = src0_y + 1;
659
660
coeff_row[(x << 1)] = src_x_f - src0_x;
661
coeff_row[(x << 1) + 1] = src_y_f - src0_y;
662
663
map_row[(x << 2) + 0] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
664
(src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src0_x : -1;
665
map_row[(x << 2) + 1] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
666
(src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src1_x : -1;
667
map_row[(x << 2) + 2] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
668
(src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src0_x : -1;
669
map_row[(x << 2) + 3] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
670
(src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src1_x : -1;
671
}
672
}
673
674
remapLinearConst(Size2D(blockWidth, blockHeight),
675
srcBase, &map[0], &coeffs[0],
676
getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
677
}
678
}
679
}
680
#else
681
(void)ssize;
682
(void)dsize;
683
(void)srcBase;
684
(void)srcStride;
685
(void)tableBase;
686
(void)tableStride;
687
(void)dstBase;
688
(void)dstStride;
689
(void)borderMode;
690
(void)borderValue;
691
#endif
692
}
693
694
} // namespace CAROTENE_NS
695
696