/* This is the FAST corner detector, contributed to OpenCV by the author, Edward Rosten.
   Below are the original copyright and the references */

/*
Copyright (c) 2006, 2008 Edward Rosten
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of the University of Cambridge nor the names of
      its contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/*
The references are:
* Machine learning for high-speed corner detection,
  E. Rosten and T. Drummond, ECCV 2006
* Faster and better: A machine learning approach to corner detection,
  E. Rosten, R. Porter and T. Drummond, PAMI, 2009
*/

#include "precomp.hpp"
#include "fast.hpp"
#include "opencv2/core/hal/intrin.hpp"

namespace cv
{
namespace opt_AVX2
{

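// AVX2-specialized inner loop of FAST for patternSize == 16. Each call to
// process() advances along one image row 32 pixels at a time, appends
// candidate corner columns to cornerpos, and, when non-max suppression is
// enabled, writes a corner score into the current score row `curr`.
// Columns the vectorized loop does not reach are left to the scalar caller.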
class FAST_t_patternSize16_AVX2_Impl CV_FINAL : public FAST_t_patternSize16_AVX2
{
public:
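    // Note: the raw _threshold is cast straight to char for the SIMD compares,
    // so callers are expected to pass a value already clamped to [0, 255]
    // (the scalar FAST_t driver in fast.cpp clamps before requesting this impl).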
    FAST_t_patternSize16_AVX2_Impl(int _cols, int _threshold, bool _nonmax_suppression, const int* _pixel) :
        cols(_cols), nonmax_suppression(_nonmax_suppression), pixel(_pixel)
    {
        //patternSize = 16
        t256c = (char)_threshold;
        threshold = std::min(std::max(_threshold, 0), 255);
    }

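    // Process one row: `ptr` points at row + j, and both are advanced past the
    // region handled here. Corner columns are appended to cornerpos / ncorners,
    // and scores (when requested) are written to curr.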
    virtual void process(int &j, const uchar* &ptr, uchar* curr, int* cornerpos, int &ncorners) CV_OVERRIDE
    {
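        // Pixels are compared in a "biased signed" domain: XOR-ing unsigned
        // 8-bit values with 0x80 (delta256) lets the signed _mm256_cmpgt_epi8
        // stand in for an unsigned comparison. K16_256 is the arc-length
        // threshold: a corner needs a contiguous run longer than 8 pixels.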
        static const __m256i delta256 = _mm256_broadcastsi128_si256(_mm_set1_epi8((char)(-128))),
                             K16_256 = _mm256_broadcastsi128_si256(_mm_set1_epi8((char)8));
        const __m256i t256 = _mm256_broadcastsi128_si256(_mm_set1_epi8(t256c));
        for (; j < cols - 32 - 3; j += 32, ptr += 32)
        {
            __m256i m0, m1;
            __m256i v0 = _mm256_loadu_si256((const __m256i*)ptr);

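            // Per-lane thresholds in the biased domain: v1 = I(p) - t and
            // v0 = I(p) + t, both computed with unsigned saturation.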
            __m256i v1 = _mm256_xor_si256(_mm256_subs_epu8(v0, t256), delta256);
            v0 = _mm256_xor_si256(_mm256_adds_epu8(v0, t256), delta256);

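            // Coarse rejection using the four circle pixels spaced 90 degrees
            // apart (offsets 0, 4, 8, 12 of the 16-pixel circle): any run of
            // more than 8 contiguous circle pixels that are all brighter than
            // v0 (or all darker than v1) must include two adjacent ones of
            // these four, so lanes failing that test cannot be corners.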
            __m256i x0 = _mm256_sub_epi8(_mm256_loadu_si256((const __m256i*)(ptr + pixel[0])), delta256);
            __m256i x1 = _mm256_sub_epi8(_mm256_loadu_si256((const __m256i*)(ptr + pixel[4])), delta256);
            __m256i x2 = _mm256_sub_epi8(_mm256_loadu_si256((const __m256i*)(ptr + pixel[8])), delta256);
            __m256i x3 = _mm256_sub_epi8(_mm256_loadu_si256((const __m256i*)(ptr + pixel[12])), delta256);

            m0 = _mm256_and_si256(_mm256_cmpgt_epi8(x0, v0), _mm256_cmpgt_epi8(x1, v0));
            m1 = _mm256_and_si256(_mm256_cmpgt_epi8(v1, x0), _mm256_cmpgt_epi8(v1, x1));
            m0 = _mm256_or_si256(m0, _mm256_and_si256(_mm256_cmpgt_epi8(x1, v0), _mm256_cmpgt_epi8(x2, v0)));
            m1 = _mm256_or_si256(m1, _mm256_and_si256(_mm256_cmpgt_epi8(v1, x1), _mm256_cmpgt_epi8(v1, x2)));
            m0 = _mm256_or_si256(m0, _mm256_and_si256(_mm256_cmpgt_epi8(x2, v0), _mm256_cmpgt_epi8(x3, v0)));
            m1 = _mm256_or_si256(m1, _mm256_and_si256(_mm256_cmpgt_epi8(v1, x2), _mm256_cmpgt_epi8(v1, x3)));
            m0 = _mm256_or_si256(m0, _mm256_and_si256(_mm256_cmpgt_epi8(x3, v0), _mm256_cmpgt_epi8(x0, v0)));
            m1 = _mm256_or_si256(m1, _mm256_and_si256(_mm256_cmpgt_epi8(v1, x3), _mm256_cmpgt_epi8(v1, x0)));
            m0 = _mm256_or_si256(m0, m1);

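            // One mask bit per lane; all zero means no candidate among these 32
            // columns. If only the upper 16 lanes hold candidates, step back 16
            // columns so the next 32-wide window starts right at them.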
            unsigned int mask = _mm256_movemask_epi8(m0); //unsigned is important!
            if (mask == 0)
                continue;
            if ((mask & 0xffff) == 0)
            {
                j -= 16;
                ptr -= 16;
                continue;
            }

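            // Full test: walk 25 circle offsets (16 plus a 9-pixel wrap-around)
            // counting per-lane runs of consecutive brighter (c0) / darker (c1)
            // pixels. A comparison mask lane is 0 or -1, so c - m followed by
            // AND with m increments the count on a hit and resets it on a miss;
            // max0/max1 track the longest runs seen.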
            __m256i c0 = _mm256_setzero_si256(), c1 = c0, max0 = c0, max1 = c0;
            for (int k = 0; k < 25; k++)
            {
                __m256i x = _mm256_xor_si256(_mm256_loadu_si256((const __m256i*)(ptr + pixel[k])), delta256);
                m0 = _mm256_cmpgt_epi8(x, v0);
                m1 = _mm256_cmpgt_epi8(v1, x);

                c0 = _mm256_and_si256(_mm256_sub_epi8(c0, m0), m0);
                c1 = _mm256_and_si256(_mm256_sub_epi8(c1, m1), m1);

                max0 = _mm256_max_epu8(max0, c0);
                max1 = _mm256_max_epu8(max1, c1);
            }

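            // A lane is a corner when its longest run exceeds 8, i.e. at least
            // 9 of the 16 circle pixels in a row are all brighter or all darker.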
            max0 = _mm256_max_epu8(max0, max1);
            unsigned int m = _mm256_movemask_epi8(_mm256_cmpgt_epi8(max0, K16_256));

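            // Emit each set bit as a corner at column j + k; optionally compute
            // its score for the later non-max suppression pass.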
            for (int k = 0; m > 0 && k < 32; k++, m >>= 1)
                if (m & 1)
                {
                    cornerpos[ncorners++] = j + k;
                    if (nonmax_suppression)
                    {
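                        // FAST corner score: with d[q] = I(center) - I(circle q),
                        // take the min (and max) of d over every window of 9
                        // consecutive circle pixels; the score is the largest of
                        // these window-mins and negated window-maxes, minus 1.
                        // The 16 window start positions are processed as two
                        // 8-lane v_int16x8 batches via the universal intrinsics.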
                        short d[25];
                        for (int q = 0; q < 25; q++)
                            d[q] = (short)(ptr[k] - ptr[k + pixel[q]]);
                        v_int16x8 q0 = v_setall_s16(-1000), q1 = v_setall_s16(1000);
                        for (int q = 0; q < 16; q += 8)
                        {
                            v_int16x8 v0_ = v_load(d + q + 1);
                            v_int16x8 v1_ = v_load(d + q + 2);
                            v_int16x8 a = v_min(v0_, v1_);
                            v_int16x8 b = v_max(v0_, v1_);
                            v0_ = v_load(d + q + 3);
                            a = v_min(a, v0_);
                            b = v_max(b, v0_);
                            v0_ = v_load(d + q + 4);
                            a = v_min(a, v0_);
                            b = v_max(b, v0_);
                            v0_ = v_load(d + q + 5);
                            a = v_min(a, v0_);
                            b = v_max(b, v0_);
                            v0_ = v_load(d + q + 6);
                            a = v_min(a, v0_);
                            b = v_max(b, v0_);
                            v0_ = v_load(d + q + 7);
                            a = v_min(a, v0_);
                            b = v_max(b, v0_);
                            v0_ = v_load(d + q + 8);
                            a = v_min(a, v0_);
                            b = v_max(b, v0_);
                            v0_ = v_load(d + q);
                            q0 = v_max(q0, v_min(a, v0_));
                            q1 = v_min(q1, v_max(b, v0_));
                            v0_ = v_load(d + q + 9);
                            q0 = v_max(q0, v_min(a, v0_));
                            q1 = v_min(q1, v_max(b, v0_));
                        }
                        q0 = v_max(q0, v_setzero_s16() - q1);
                        curr[j + k] = (uchar)(v_reduce_max(q0) - 1);
                    }
                }
        }
        _mm256_zeroupper();
    }

    virtual ~FAST_t_patternSize16_AVX2_Impl() CV_OVERRIDE {}

private:
    int cols;
    char t256c;
    int threshold;
    bool nonmax_suppression;
    const int* pixel;
};
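// Factory used by the dispatching code in fast.cpp to obtain the AVX2 implementation.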
Ptr<FAST_t_patternSize16_AVX2> FAST_t_patternSize16_AVX2::getImpl(int _cols, int _threshold, bool _nonmax_suppression, const int* _pixel)
{
    return Ptr<FAST_t_patternSize16_AVX2>(new FAST_t_patternSize16_AVX2_Impl(_cols, _threshold, _nonmax_suppression, _pixel));
}

} // namespace opt_AVX2
} // namespace cv