Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/modules/objdetect/src/haar.avx.cpp
16337 views
1
/*M///////////////////////////////////////////////////////////////////////////////////////
2
//
3
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4
//
5
// By downloading, copying, installing or using the software you agree to this license.
6
// If you do not agree to this license, do not download, install,
7
// copy or use the software.
8
//
9
//
10
// Intel License Agreement
11
// For Open Source Computer Vision Library
12
//
13
// Copyright (C) 2000, Intel Corporation, all rights reserved.
14
// Third party copyrights are property of their respective owners.
15
//
16
// Redistribution and use in source and binary forms, with or without modification,
17
// are permitted provided that the following conditions are met:
18
//
19
// * Redistribution's of source code must retain the above copyright notice,
20
// this list of conditions and the following disclaimer.
21
//
22
// * Redistribution's in binary form must reproduce the above copyright notice,
23
// this list of conditions and the following disclaimer in the documentation
24
// and/or other materials provided with the distribution.
25
//
26
// * The name of Intel Corporation may not be used to endorse or promote products
27
// derived from this software without specific prior written permission.
28
//
29
// This software is provided by the copyright holders and contributors "as is" and
30
// any express or implied warranties, including, but not limited to, the implied
31
// warranties of merchantability and fitness for a particular purpose are disclaimed.
32
// In no event shall the Intel Corporation or contributors be liable for any direct,
33
// indirect, incidental, special, exemplary, or consequential damages
34
// (including, but not limited to, procurement of substitute goods or services;
35
// loss of use, data, or profits; or business interruption) however caused
36
// and on any theory of liability, whether in contract, strict liability,
37
// or tort (including negligence or otherwise) arising in any way out of
38
// the use of this software, even if advised of the possibility of such damage.
39
//
40
//M*/
41
42
/* Haar features calculation */
43
44
#include "precomp.hpp"
45
#include "haar.hpp"
46
47
namespace cv_haar_avx
48
{
49
50
// AVX versions of icvEvalHidHaarClassifier. Each call processes 8 CvHidHaarClassifiers. Check for AVX support before invoking!
51
#if CV_HAAR_USE_AVX
52
double icvEvalHidHaarClassifierAVX(CvHidHaarClassifier* classifier,
53
double variance_norm_factor, size_t p_offset)
54
{
55
int CV_DECL_ALIGNED(32) idxV[8] = { 0,0,0,0,0,0,0,0 };
56
uchar flags[8] = { 0,0,0,0,0,0,0,0 };
57
CvHidHaarTreeNode* nodes[8];
58
double res = 0;
59
uchar exitConditionFlag = 0;
60
for (;;)
61
{
62
float CV_DECL_ALIGNED(32) tmp[8] = { 0,0,0,0,0,0,0,0 };
63
nodes[0] = (classifier + 0)->node + idxV[0];
64
nodes[1] = (classifier + 1)->node + idxV[1];
65
nodes[2] = (classifier + 2)->node + idxV[2];
66
nodes[3] = (classifier + 3)->node + idxV[3];
67
nodes[4] = (classifier + 4)->node + idxV[4];
68
nodes[5] = (classifier + 5)->node + idxV[5];
69
nodes[6] = (classifier + 6)->node + idxV[6];
70
nodes[7] = (classifier + 7)->node + idxV[7];
71
72
__m256 t = _mm256_set1_ps(static_cast<float>(variance_norm_factor));
73
74
t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,
75
nodes[6]->threshold,
76
nodes[5]->threshold,
77
nodes[4]->threshold,
78
nodes[3]->threshold,
79
nodes[2]->threshold,
80
nodes[1]->threshold,
81
nodes[0]->threshold));
82
83
__m256 offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[0], p_offset),
84
calc_sumf(nodes[6]->feature.rect[0], p_offset),
85
calc_sumf(nodes[5]->feature.rect[0], p_offset),
86
calc_sumf(nodes[4]->feature.rect[0], p_offset),
87
calc_sumf(nodes[3]->feature.rect[0], p_offset),
88
calc_sumf(nodes[2]->feature.rect[0], p_offset),
89
calc_sumf(nodes[1]->feature.rect[0], p_offset),
90
calc_sumf(nodes[0]->feature.rect[0], p_offset));
91
92
__m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight,
93
nodes[6]->feature.rect[0].weight,
94
nodes[5]->feature.rect[0].weight,
95
nodes[4]->feature.rect[0].weight,
96
nodes[3]->feature.rect[0].weight,
97
nodes[2]->feature.rect[0].weight,
98
nodes[1]->feature.rect[0].weight,
99
nodes[0]->feature.rect[0].weight);
100
101
__m256 sum = _mm256_mul_ps(offset, weight);
102
103
offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[1], p_offset),
104
calc_sumf(nodes[6]->feature.rect[1], p_offset),
105
calc_sumf(nodes[5]->feature.rect[1], p_offset),
106
calc_sumf(nodes[4]->feature.rect[1], p_offset),
107
calc_sumf(nodes[3]->feature.rect[1], p_offset),
108
calc_sumf(nodes[2]->feature.rect[1], p_offset),
109
calc_sumf(nodes[1]->feature.rect[1], p_offset),
110
calc_sumf(nodes[0]->feature.rect[1], p_offset));
111
112
weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight,
113
nodes[6]->feature.rect[1].weight,
114
nodes[5]->feature.rect[1].weight,
115
nodes[4]->feature.rect[1].weight,
116
nodes[3]->feature.rect[1].weight,
117
nodes[2]->feature.rect[1].weight,
118
nodes[1]->feature.rect[1].weight,
119
nodes[0]->feature.rect[1].weight);
120
121
sum = _mm256_add_ps(sum, _mm256_mul_ps(offset, weight));
122
123
if (nodes[0]->feature.rect[2].p0)
124
tmp[0] = calc_sumf(nodes[0]->feature.rect[2], p_offset) * nodes[0]->feature.rect[2].weight;
125
if (nodes[1]->feature.rect[2].p0)
126
tmp[1] = calc_sumf(nodes[1]->feature.rect[2], p_offset) * nodes[1]->feature.rect[2].weight;
127
if (nodes[2]->feature.rect[2].p0)
128
tmp[2] = calc_sumf(nodes[2]->feature.rect[2], p_offset) * nodes[2]->feature.rect[2].weight;
129
if (nodes[3]->feature.rect[2].p0)
130
tmp[3] = calc_sumf(nodes[3]->feature.rect[2], p_offset) * nodes[3]->feature.rect[2].weight;
131
if (nodes[4]->feature.rect[2].p0)
132
tmp[4] = calc_sumf(nodes[4]->feature.rect[2], p_offset) * nodes[4]->feature.rect[2].weight;
133
if (nodes[5]->feature.rect[2].p0)
134
tmp[5] = calc_sumf(nodes[5]->feature.rect[2], p_offset) * nodes[5]->feature.rect[2].weight;
135
if (nodes[6]->feature.rect[2].p0)
136
tmp[6] = calc_sumf(nodes[6]->feature.rect[2], p_offset) * nodes[6]->feature.rect[2].weight;
137
if (nodes[7]->feature.rect[2].p0)
138
tmp[7] = calc_sumf(nodes[7]->feature.rect[2], p_offset) * nodes[7]->feature.rect[2].weight;
139
140
sum = _mm256_add_ps(sum, _mm256_load_ps(tmp));
141
142
__m256 left = _mm256_set_ps(static_cast<float>(nodes[7]->left), static_cast<float>(nodes[6]->left),
143
static_cast<float>(nodes[5]->left), static_cast<float>(nodes[4]->left),
144
static_cast<float>(nodes[3]->left), static_cast<float>(nodes[2]->left),
145
static_cast<float>(nodes[1]->left), static_cast<float>(nodes[0]->left));
146
__m256 right = _mm256_set_ps(static_cast<float>(nodes[7]->right), static_cast<float>(nodes[6]->right),
147
static_cast<float>(nodes[5]->right), static_cast<float>(nodes[4]->right),
148
static_cast<float>(nodes[3]->right), static_cast<float>(nodes[2]->right),
149
static_cast<float>(nodes[1]->right), static_cast<float>(nodes[0]->right));
150
151
_mm256_store_si256((__m256i*)idxV, _mm256_cvttps_epi32(_mm256_blendv_ps(right, left, _mm256_cmp_ps(sum, t, _CMP_LT_OQ))));
152
153
for (int i = 0; i < 8; i++)
154
{
155
if (idxV[i] <= 0)
156
{
157
if (!flags[i])
158
{
159
exitConditionFlag++;
160
flags[i] = 1;
161
res += (classifier + i)->alpha[-idxV[i]];
162
}
163
idxV[i] = 0;
164
}
165
}
166
if (exitConditionFlag == 8)
167
return res;
168
}
169
}
170
171
double icvEvalHidHaarStumpClassifierAVX(CvHidHaarClassifier* classifier,
172
double variance_norm_factor, size_t p_offset)
173
{
174
float CV_DECL_ALIGNED(32) tmp[8] = { 0,0,0,0,0,0,0,0 };
175
CvHidHaarTreeNode* nodes[8];
176
177
nodes[0] = classifier[0].node;
178
nodes[1] = classifier[1].node;
179
nodes[2] = classifier[2].node;
180
nodes[3] = classifier[3].node;
181
nodes[4] = classifier[4].node;
182
nodes[5] = classifier[5].node;
183
nodes[6] = classifier[6].node;
184
nodes[7] = classifier[7].node;
185
186
__m256 t = _mm256_set1_ps(static_cast<float>(variance_norm_factor));
187
188
t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,
189
nodes[6]->threshold,
190
nodes[5]->threshold,
191
nodes[4]->threshold,
192
nodes[3]->threshold,
193
nodes[2]->threshold,
194
nodes[1]->threshold,
195
nodes[0]->threshold));
196
197
__m256 offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[0], p_offset),
198
calc_sumf(nodes[6]->feature.rect[0], p_offset),
199
calc_sumf(nodes[5]->feature.rect[0], p_offset),
200
calc_sumf(nodes[4]->feature.rect[0], p_offset),
201
calc_sumf(nodes[3]->feature.rect[0], p_offset),
202
calc_sumf(nodes[2]->feature.rect[0], p_offset),
203
calc_sumf(nodes[1]->feature.rect[0], p_offset),
204
calc_sumf(nodes[0]->feature.rect[0], p_offset));
205
206
__m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight,
207
nodes[6]->feature.rect[0].weight,
208
nodes[5]->feature.rect[0].weight,
209
nodes[4]->feature.rect[0].weight,
210
nodes[3]->feature.rect[0].weight,
211
nodes[2]->feature.rect[0].weight,
212
nodes[1]->feature.rect[0].weight,
213
nodes[0]->feature.rect[0].weight);
214
215
__m256 sum = _mm256_mul_ps(offset, weight);
216
217
offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[1], p_offset),
218
calc_sumf(nodes[6]->feature.rect[1], p_offset),
219
calc_sumf(nodes[5]->feature.rect[1], p_offset),
220
calc_sumf(nodes[4]->feature.rect[1], p_offset),
221
calc_sumf(nodes[3]->feature.rect[1], p_offset),
222
calc_sumf(nodes[2]->feature.rect[1], p_offset),
223
calc_sumf(nodes[1]->feature.rect[1], p_offset),
224
calc_sumf(nodes[0]->feature.rect[1], p_offset));
225
226
weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight,
227
nodes[6]->feature.rect[1].weight,
228
nodes[5]->feature.rect[1].weight,
229
nodes[4]->feature.rect[1].weight,
230
nodes[3]->feature.rect[1].weight,
231
nodes[2]->feature.rect[1].weight,
232
nodes[1]->feature.rect[1].weight,
233
nodes[0]->feature.rect[1].weight);
234
235
sum = _mm256_add_ps(sum, _mm256_mul_ps(offset, weight));
236
237
if (nodes[0]->feature.rect[2].p0)
238
tmp[0] = calc_sumf(nodes[0]->feature.rect[2], p_offset) * nodes[0]->feature.rect[2].weight;
239
if (nodes[1]->feature.rect[2].p0)
240
tmp[1] = calc_sumf(nodes[1]->feature.rect[2], p_offset) * nodes[1]->feature.rect[2].weight;
241
if (nodes[2]->feature.rect[2].p0)
242
tmp[2] = calc_sumf(nodes[2]->feature.rect[2], p_offset) * nodes[2]->feature.rect[2].weight;
243
if (nodes[3]->feature.rect[2].p0)
244
tmp[3] = calc_sumf(nodes[3]->feature.rect[2], p_offset) * nodes[3]->feature.rect[2].weight;
245
if (nodes[4]->feature.rect[2].p0)
246
tmp[4] = calc_sumf(nodes[4]->feature.rect[2], p_offset) * nodes[4]->feature.rect[2].weight;
247
if (nodes[5]->feature.rect[2].p0)
248
tmp[5] = calc_sumf(nodes[5]->feature.rect[2], p_offset) * nodes[5]->feature.rect[2].weight;
249
if (nodes[6]->feature.rect[2].p0)
250
tmp[6] = calc_sumf(nodes[6]->feature.rect[2], p_offset) * nodes[6]->feature.rect[2].weight;
251
if (nodes[7]->feature.rect[2].p0)
252
tmp[7] = calc_sumf(nodes[7]->feature.rect[2], p_offset) * nodes[7]->feature.rect[2].weight;
253
254
sum = _mm256_add_ps(sum, _mm256_load_ps(tmp));
255
256
__m256 alpha0 = _mm256_set_ps(classifier[7].alpha[0],
257
classifier[6].alpha[0],
258
classifier[5].alpha[0],
259
classifier[4].alpha[0],
260
classifier[3].alpha[0],
261
classifier[2].alpha[0],
262
classifier[1].alpha[0],
263
classifier[0].alpha[0]);
264
__m256 alpha1 = _mm256_set_ps(classifier[7].alpha[1],
265
classifier[6].alpha[1],
266
classifier[5].alpha[1],
267
classifier[4].alpha[1],
268
classifier[3].alpha[1],
269
classifier[2].alpha[1],
270
classifier[1].alpha[1],
271
classifier[0].alpha[1]);
272
273
__m256 outBuf = _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ));
274
outBuf = _mm256_hadd_ps(outBuf, outBuf);
275
outBuf = _mm256_hadd_ps(outBuf, outBuf);
276
_mm256_store_ps(tmp, outBuf);
277
return (tmp[0] + tmp[4]);
278
}
279
280
double icvEvalHidHaarStumpClassifierTwoRectAVX(CvHidHaarClassifier* classifier,
281
double variance_norm_factor, size_t p_offset)
282
{
283
float CV_DECL_ALIGNED(32) buf[8];
284
CvHidHaarTreeNode* nodes[8];
285
nodes[0] = classifier[0].node;
286
nodes[1] = classifier[1].node;
287
nodes[2] = classifier[2].node;
288
nodes[3] = classifier[3].node;
289
nodes[4] = classifier[4].node;
290
nodes[5] = classifier[5].node;
291
nodes[6] = classifier[6].node;
292
nodes[7] = classifier[7].node;
293
294
__m256 t = _mm256_set1_ps(static_cast<float>(variance_norm_factor));
295
t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,
296
nodes[6]->threshold,
297
nodes[5]->threshold,
298
nodes[4]->threshold,
299
nodes[3]->threshold,
300
nodes[2]->threshold,
301
nodes[1]->threshold,
302
nodes[0]->threshold));
303
304
__m256 offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[0], p_offset),
305
calc_sumf(nodes[6]->feature.rect[0], p_offset),
306
calc_sumf(nodes[5]->feature.rect[0], p_offset),
307
calc_sumf(nodes[4]->feature.rect[0], p_offset),
308
calc_sumf(nodes[3]->feature.rect[0], p_offset),
309
calc_sumf(nodes[2]->feature.rect[0], p_offset),
310
calc_sumf(nodes[1]->feature.rect[0], p_offset),
311
calc_sumf(nodes[0]->feature.rect[0], p_offset));
312
313
__m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight,
314
nodes[6]->feature.rect[0].weight,
315
nodes[5]->feature.rect[0].weight,
316
nodes[4]->feature.rect[0].weight,
317
nodes[3]->feature.rect[0].weight,
318
nodes[2]->feature.rect[0].weight,
319
nodes[1]->feature.rect[0].weight,
320
nodes[0]->feature.rect[0].weight);
321
322
__m256 sum = _mm256_mul_ps(offset, weight);
323
324
offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[1], p_offset),
325
calc_sumf(nodes[6]->feature.rect[1], p_offset),
326
calc_sumf(nodes[5]->feature.rect[1], p_offset),
327
calc_sumf(nodes[4]->feature.rect[1], p_offset),
328
calc_sumf(nodes[3]->feature.rect[1], p_offset),
329
calc_sumf(nodes[2]->feature.rect[1], p_offset),
330
calc_sumf(nodes[1]->feature.rect[1], p_offset),
331
calc_sumf(nodes[0]->feature.rect[1], p_offset));
332
333
weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight,
334
nodes[6]->feature.rect[1].weight,
335
nodes[5]->feature.rect[1].weight,
336
nodes[4]->feature.rect[1].weight,
337
nodes[3]->feature.rect[1].weight,
338
nodes[2]->feature.rect[1].weight,
339
nodes[1]->feature.rect[1].weight,
340
nodes[0]->feature.rect[1].weight);
341
342
sum = _mm256_add_ps(sum, _mm256_mul_ps(offset, weight));
343
344
__m256 alpha0 = _mm256_set_ps(classifier[7].alpha[0],
345
classifier[6].alpha[0],
346
classifier[5].alpha[0],
347
classifier[4].alpha[0],
348
classifier[3].alpha[0],
349
classifier[2].alpha[0],
350
classifier[1].alpha[0],
351
classifier[0].alpha[0]);
352
__m256 alpha1 = _mm256_set_ps(classifier[7].alpha[1],
353
classifier[6].alpha[1],
354
classifier[5].alpha[1],
355
classifier[4].alpha[1],
356
classifier[3].alpha[1],
357
classifier[2].alpha[1],
358
classifier[1].alpha[1],
359
classifier[0].alpha[1]);
360
361
_mm256_store_ps(buf, _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ)));
362
return (buf[0] + buf[1] + buf[2] + buf[3] + buf[4] + buf[5] + buf[6] + buf[7]);
363
}
364
365
#endif //CV_HAAR_USE_AVX
366
367
}
368
369
/* End of file. */
370
371