CoCalc -- fast.cpp

GitHub Repository: Tetragramm/opencv
Path: blob/master/3rdparty/carotene/src/fast.cpp
¹⁶³³⁷ views
1
/*
2
 * By downloading, copying, installing or using the software you agree to this license.
3
 * If you do not agree to this license, do not download, install,
4
 * copy or use the software.
5
 *
6
 *
7
 *                           License Agreement
8
 *                For Open Source Computer Vision Library
9
 *                        (3-clause BSD License)
10
 *
11
 * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
12
 * Third party copyrights are property of their respective owners.
13
 *
14
 * Redistribution and use in source and binary forms, with or without modification,
15
 * are permitted provided that the following conditions are met:
16
 *
17
 *   * Redistributions of source code must retain the above copyright notice,
18
 *     this list of conditions and the following disclaimer.
19
 *
20
 *   * Redistributions in binary form must reproduce the above copyright notice,
21
 *     this list of conditions and the following disclaimer in the documentation
22
 *     and/or other materials provided with the distribution.
23
 *
24
 *   * Neither the names of the copyright holders nor the names of the contributors
25
 *     may be used to endorse or promote products derived from this software
26
 *     without specific prior written permission.
27
 *
28
 * This software is provided by the copyright holders and contributors "as is" and
29
 * any express or implied warranties, including, but not limited to, the implied
30
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
31
 * In no event shall copyright holders or contributors be liable for any direct,
32
 * indirect, incidental, special, exemplary, or consequential damages
33
 * (including, but not limited to, procurement of substitute goods or services;
34
 * loss of use, data, or profits; or business interruption) however caused
35
 * and on any theory of liability, whether in contract, strict liability,
36
 * or tort (including negligence or otherwise) arising in any way out of
37
 * the use of this software, even if advised of the possibility of such damage.
38
 */
39

40

41
/* This is FAST corner detector, contributed to OpenCV by the author, Edward Rosten.
42
   Below is the original copyright and the references */
43

44
/*
45
Copyright (c) 2006, 2008 Edward Rosten
46
All rights reserved.
47

48
Redistribution and use in source and binary forms, with or without
49
modification, are permitted provided that the following conditions
50
are met:
51

52
 *Redistributions of source code must retain the above copyright
53
  notice, this list of conditions and the following disclaimer.
54

55
 *Redistributions in binary form must reproduce the above copyright
56
  notice, this list of conditions and the following disclaimer in the
57
  documentation and/or other materials provided with the distribution.
58

59
 *Neither the name of the University of Cambridge nor the names of
60
  its contributors may be used to endorse or promote products derived
61
  from this software without specific prior written permission.
62

63
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
64
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
65
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
66
A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
67
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
68
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
69
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
70
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
71
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
72
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
73
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
74
*/
75

76
/*
77
The references are:
78
 * Machine learning for high-speed corner detection,
79
   E. Rosten and T. Drummond, ECCV 2006
80
 * Faster and better: A machine learning approach to corner detection
81
   E. Rosten, R. Porter and T. Drummond, PAMI, 2009
82
*/
83

84
#include "common.hpp"
85

86
#include <vector>
87
#include <cstring>
88

89
namespace CAROTENE_NS {
90

91
#ifdef CAROTENE_NEON
92
namespace
93
{
94

95
void makeOffsets(ptrdiff_t pixel[], ptrdiff_t row_stride)
96
{
97
    pixel[0] = 0 + row_stride * 3;
98
    pixel[1] = 1 + row_stride * 3;
99
    pixel[2] = 2 + row_stride * 2;
100
    pixel[3] = 3 + row_stride * 1;
101
    pixel[4] = 3 + row_stride * 0;
102
    pixel[5] = 3 + row_stride * -1;
103
    pixel[6] = 2 + row_stride * -2;
104
    pixel[7] = 1 + row_stride * -3;
105
    pixel[8] = 0 + row_stride * -3;
106
    pixel[9] = -1 + row_stride * -3;
107
    pixel[10] = -2 + row_stride * -2;
108
    pixel[11] = -3 + row_stride * -1;
109
    pixel[12] = -3 + row_stride * 0;
110
    pixel[13] = -3 + row_stride * 1;
111
    pixel[14] = -2 + row_stride * 2;
112
    pixel[15] = -1 + row_stride * 3;
113
}
114

115
u8 cornerScore(const u8* ptr, const ptrdiff_t pixel[])
116
{
117
    const s32 K = 8, N = 16 + K + 1;
118
    s32 k, v = ptr[0];
119
    s16 d[(N + 7) & ~7];
120
    for( k = 0; k < N; k++ )
121
        d[k] = (s16)(v - ptr[pixel[k]]);
122

123
    int16x8_t q0 = vdupq_n_s16((s16)(-1000));
124
    int16x8_t q1 = vdupq_n_s16((s16)(1000));
125

126
    int16x8_t d0_7   = vld1q_s16(d +  0);
127
    int16x8_t d8_15  = vld1q_s16(d +  8);
128
    int16x8_t d16_23 = vld1q_s16(d + 16);
129
    int16x8_t d24    = vld1q_s16(d + 24);
130

131
    //k == 0
132
    int16x8_t v0k0 = vextq_s16(d0_7, d8_15, 1);
133
    int16x8_t v1k0 = vextq_s16(d0_7, d8_15, 2);
134
    int16x8_t ak0 = vminq_s16(v0k0, v1k0);
135
    int16x8_t bk0 = vmaxq_s16(v0k0, v1k0);
136

137
    v0k0 = vextq_s16(d0_7, d8_15, 3);
138
    ak0 = vminq_s16(ak0, v0k0);
139
    bk0 = vmaxq_s16(bk0, v0k0);
140

141
    v1k0 = vextq_s16(d0_7, d8_15, 4);
142
    ak0 = vminq_s16(ak0, v1k0);
143
    bk0 = vmaxq_s16(bk0, v1k0);
144

145
    v0k0 = vextq_s16(d0_7, d8_15, 5);
146
    ak0 = vminq_s16(ak0, v0k0);
147
    bk0 = vmaxq_s16(bk0, v0k0);
148

149
    v1k0 = vextq_s16(d0_7, d8_15, 6);
150
    ak0 = vminq_s16(ak0, v1k0);
151
    bk0 = vmaxq_s16(bk0, v1k0);
152

153
    v0k0 = vextq_s16(d0_7, d8_15, 7);
154
    ak0 = vminq_s16(ak0, v0k0);
155
    bk0 = vmaxq_s16(bk0, v0k0);
156

157
    ak0 = vminq_s16(ak0, d8_15);
158
    bk0 = vmaxq_s16(bk0, d8_15);
159

160
    q0 = vmaxq_s16(q0, vminq_s16(ak0, d0_7));
161
    q1 = vminq_s16(q1, vmaxq_s16(bk0, d0_7));
162

163
    v1k0 = vextq_s16(d8_15, d16_23, 1);
164
    q0 = vmaxq_s16(q0, vminq_s16(ak0, v1k0));
165
    q1 = vminq_s16(q1, vmaxq_s16(bk0, v1k0));
166

167
    //k == 8
168
    int16x8_t v0k8 = v1k0;
169
    int16x8_t v1k8 = vextq_s16(d8_15, d16_23, 2);
170
    int16x8_t ak8 = vminq_s16(v0k8, v1k8);
171
    int16x8_t bk8 = vmaxq_s16(v0k8, v1k8);
172

173
    v0k8 = vextq_s16(d8_15, d16_23, 3);
174
    ak8 = vminq_s16(ak8, v0k8);
175
    bk8 = vmaxq_s16(bk8, v0k8);
176

177
    v1k8 = vextq_s16(d8_15, d16_23, 4);
178
    ak8 = vminq_s16(ak8, v1k8);
179
    bk8 = vmaxq_s16(bk8, v1k8);
180

181
    v0k8 = vextq_s16(d8_15, d16_23, 5);
182
    ak8 = vminq_s16(ak8, v0k8);
183
    bk8 = vmaxq_s16(bk8, v0k8);
184

185
    v1k8 = vextq_s16(d8_15, d16_23, 6);
186
    ak8 = vminq_s16(ak8, v1k8);
187
    bk8 = vmaxq_s16(bk8, v1k8);
188

189
    v0k8 = vextq_s16(d8_15, d16_23, 7);
190
    ak8 = vminq_s16(ak8, v0k8);
191
    bk8 = vmaxq_s16(bk8, v0k8);
192

193
    ak8 = vminq_s16(ak8, d16_23);
194
    bk8 = vmaxq_s16(bk8, d16_23);
195

196
    q0 = vmaxq_s16(q0, vminq_s16(ak8, d8_15));
197
    q1 = vminq_s16(q1, vmaxq_s16(bk8, d8_15));
198

199
    v1k8 = vextq_s16(d16_23, d24, 1);
200
    q0 = vmaxq_s16(q0, vminq_s16(ak8, v1k8));
201
    q1 = vminq_s16(q1, vmaxq_s16(bk8, v1k8));
202

203
    //fin
204
    int16x8_t q = vmaxq_s16(q0, vsubq_s16(vmovq_n_s16(0), q1));
205
    int16x4_t q2 = vmax_s16(vget_low_s16(q), vget_high_s16(q));
206
    int32x4_t q2w = vmovl_s16(q2);
207
    int32x2_t q4 = vmax_s32(vget_low_s32(q2w), vget_high_s32(q2w));
208
    int32x2_t q8 = vmax_s32(q4, vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(q4), 32)));
209

210
    return (u8)(vget_lane_s32(q8, 0) - 1);
211
}
212

213
} //namespace
214
#endif
215

216
void FAST(const Size2D &size,
217
          u8 *srcBase, ptrdiff_t srcStride,
218
          KeypointStore *keypoints,
219
          u8 threshold, bool nonmax_suppression)
220
{
221
    internal::assertSupportedConfiguration();
222
#ifdef CAROTENE_NEON
223
    //keypoints.clear();
224

225
    const s32 K = 8, N = 16 + K + 1;
226
    ptrdiff_t i, j, k, pixel[N];
227
    makeOffsets(pixel, srcStride);
228
    for(k = 16; k < N; k++)
229
        pixel[k] = pixel[k - 16];
230

231
    uint8x16_t delta = vdupq_n_u8(128);
232
    uint8x16_t t = vdupq_n_u8(threshold);
233
    uint8x16_t K16 = vdupq_n_u8((u8)K);
234

235
    u8 threshold_tab[512];
236
    for( i = -255; i <= 255; i++ )
237
        threshold_tab[i+255] = (u8)(i < -threshold ? 1 : i > threshold ? 2 : 0);
238

239
    std::vector<u8> _buf((size.width+16)*3*(sizeof(ptrdiff_t) + sizeof(u8)) + 128);
240
    u8* buf[3];
241
    buf[0] = &_buf[0]; buf[1] = buf[0] + size.width; buf[2] = buf[1] + size.width;
242
    ptrdiff_t* cpbuf[3];
243
    cpbuf[0] = (ptrdiff_t*)internal::alignPtr(buf[2] + size.width, sizeof(ptrdiff_t)) + 1;
244
    cpbuf[1] = cpbuf[0] + size.width + 1;
245
    cpbuf[2] = cpbuf[1] + size.width + 1;
246
    memset(buf[0], 0, size.width*3);
247

248
    for(i = 3; i < (ptrdiff_t)size.height-2; i++)
249
    {
250
        const u8* ptr = internal::getRowPtr(srcBase, srcStride, i) + 3;
251
        u8* curr = buf[(i - 3)%3];
252
        ptrdiff_t* cornerpos = cpbuf[(i - 3)%3];
253
        memset(curr, 0, size.width);
254
        ptrdiff_t ncorners = 0;
255

256
        if( i < (ptrdiff_t)size.height - 3 )
257
        {
258
            j = 3;
259

260
            for(; j < (ptrdiff_t)size.width - 16 - 3; j += 16, ptr += 16)
261
            {
262
                internal::prefetch(ptr);
263
                internal::prefetch(ptr + pixel[0]);
264
                internal::prefetch(ptr + pixel[2]);
265

266
                uint8x16_t v0 = vld1q_u8(ptr);
267
                int8x16_t v1 = vreinterpretq_s8_u8(veorq_u8(vqsubq_u8(v0, t), delta));
268
                int8x16_t v2 = vreinterpretq_s8_u8(veorq_u8(vqaddq_u8(v0, t), delta));
269

270
                int8x16_t x0 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[0]), delta));
271
                int8x16_t x1 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[4]), delta));
272
                int8x16_t x2 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[8]), delta));
273
                int8x16_t x3 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[12]), delta));
274

275
                uint8x16_t m0 =   vandq_u8(vcgtq_s8(x0, v2), vcgtq_s8(x1, v2));
276
                uint8x16_t m1 =   vandq_u8(vcgtq_s8(v1, x0), vcgtq_s8(v1, x1));
277
                m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x1, v2), vcgtq_s8(x2, v2)));
278
                m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x1), vcgtq_s8(v1, x2)));
279
                m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x2, v2), vcgtq_s8(x3, v2)));
280
                m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x2), vcgtq_s8(v1, x3)));
281
                m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x3, v2), vcgtq_s8(x0, v2)));
282
                m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x3), vcgtq_s8(v1, x0)));
283
                m0 = vorrq_u8(m0, m1);
284

285
                u64 mask[2];
286
                vst1q_u64(mask, vreinterpretq_u64_u8(m0));
287

288
                if( mask[0] == 0 )
289
                {
290
                    if (mask[1] != 0)
291
                    {
292
                        j -= 8;
293
                        ptr -= 8;
294
                    }
295
                    continue;
296
                }
297

298
                uint8x16_t c0 = vmovq_n_u8(0);
299
                uint8x16_t c1 = vmovq_n_u8(0);
300
                uint8x16_t max0 = vmovq_n_u8(0);
301
                uint8x16_t max1 = vmovq_n_u8(0);
302
                for( k = 0; k < N; k++ )
303
                {
304
                    int8x16_t x = vreinterpretq_s8_u8(veorq_u8(vld1q_u8(ptr + pixel[k]), delta));
305
                    m0 = vcgtq_s8(x, v2);
306
                    m1 = vcgtq_s8(v1, x);
307

308
                    c0 = vandq_u8(vsubq_u8(c0, m0), m0);
309
                    c1 = vandq_u8(vsubq_u8(c1, m1), m1);
310

311
                    max0 = vmaxq_u8(max0, c0);
312
                    max1 = vmaxq_u8(max1, c1);
313
                }
314

315
                max0 = vmaxq_u8(max0, max1);
316
                u8 m[16];
317
                vst1q_u8(m, vcgtq_u8(max0, K16));
318

319
                for( k = 0; k < 16; ++k )
320
                    if(m[k])
321
                    {
322
                        cornerpos[ncorners++] = j+k;
323
                        if(nonmax_suppression)
324
                            curr[j+k] = cornerScore(ptr+k, pixel);
325
                    }
326
            }
327

328
            for( ; j < (s32)size.width - 3; j++, ptr++ )
329
            {
330
                s32 v = ptr[0];
331
                const u8* tab = &threshold_tab[0] - v + 255;
332
                s32 d = tab[ptr[pixel[0]]] | tab[ptr[pixel[8]]];
333

334
                if( d == 0 )
335
                    continue;
336

337
                d &= tab[ptr[pixel[2]]] | tab[ptr[pixel[10]]];
338
                d &= tab[ptr[pixel[4]]] | tab[ptr[pixel[12]]];
339
                d &= tab[ptr[pixel[6]]] | tab[ptr[pixel[14]]];
340

341
                if( d == 0 )
342
                    continue;
343

344
                d &= tab[ptr[pixel[1]]] | tab[ptr[pixel[9]]];
345
                d &= tab[ptr[pixel[3]]] | tab[ptr[pixel[11]]];
346
                d &= tab[ptr[pixel[5]]] | tab[ptr[pixel[13]]];
347
                d &= tab[ptr[pixel[7]]] | tab[ptr[pixel[15]]];
348

349
                if( d & 1 )
350
                {
351
                    s32 vt = v - threshold, count = 0;
352

353
                    for( k = 0; k < N; k++ )
354
                    {
355
                        s32 x = ptr[pixel[k]];
356
                        if(x < vt)
357
                        {
358
                            if( ++count > K )
359
                            {
360
                                cornerpos[ncorners++] = j;
361
                                if(nonmax_suppression)
362
                                    curr[j] = cornerScore(ptr, pixel);
363
                                break;
364
                            }
365
                        }
366
                        else
367
                            count = 0;
368
                    }
369
                }
370

371
                if( d & 2 )
372
                {
373
                    s32 vt = v + threshold, count = 0;
374

375
                    for( k = 0; k < N; k++ )
376
                    {
377
                        s32 x = ptr[pixel[k]];
378
                        if(x > vt)
379
                        {
380
                            if( ++count > K )
381
                            {
382
                                cornerpos[ncorners++] = j;
383
                                if(nonmax_suppression)
384
                                    curr[j] = cornerScore(ptr, pixel);
385
                                break;
386
                            }
387
                        }
388
                        else
389
                            count = 0;
390
                    }
391
                }
392
            }
393
        }
394

395
        cornerpos[-1] = ncorners;
396

397
        if( i == 3 )
398
            continue;
399

400
        const u8* prev = buf[(i - 4 + 3)%3];
401
        const u8* pprev = buf[(i - 5 + 3)%3];
402
        cornerpos = cpbuf[(i - 4 + 3)%3];
403
        ncorners = cornerpos[-1];
404

405
        for( k = 0; k < ncorners; k++ )
406
        {
407
            j = cornerpos[k];
408
            s32 score = prev[j];
409
            if( !nonmax_suppression ||
410
                    (score > prev[j+1] && score > prev[j-1] &&
411
                     score > pprev[j-1] && score > pprev[j] && score > pprev[j+1] &&
412
                     score > curr[j-1] && score > curr[j] && score > curr[j+1]) )
413
            {
414
                keypoints->push((f32)j, (f32)(i-1), 7.f, -1, (f32)score);
415
            }
416
        }
417
    }
418
#else
419
    (void)size;
420
    (void)srcBase;
421
    (void)srcStride;
422
    (void)keypoints;
423
    (void)threshold;
424
    (void)nonmax_suppression;
425
#endif
426
}
427

428
} // namespace CAROTENE_NS
429

430
Product

Resources

Company