Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/modules/calib3d/src/stereosgbm.cpp
16339 views
1
/*M///////////////////////////////////////////////////////////////////////////////////////
2
//
3
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4
//
5
// By downloading, copying, installing or using the software you agree to this license.
6
// If you do not agree to this license, do not download, install,
7
// copy or use the software.
8
//
9
//
10
// License Agreement
11
// For Open Source Computer Vision Library
12
//
13
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16
// Third party copyrights are property of their respective owners.
17
//
18
// Redistribution and use in source and binary forms, with or without modification,
19
// are permitted provided that the following conditions are met:
20
//
21
// * Redistribution's of source code must retain the above copyright notice,
22
// this list of conditions and the following disclaimer.
23
//
24
// * Redistribution's in binary form must reproduce the above copyright notice,
25
// this list of conditions and the following disclaimer in the documentation
26
// and/or other materials provided with the distribution.
27
//
28
// * The name of the copyright holders may not be used to endorse or promote products
29
// derived from this software without specific prior written permission.
30
//
31
// This software is provided by the copyright holders and contributors "as is" and
32
// any express or implied warranties, including, but not limited to, the implied
33
// warranties of merchantability and fitness for a particular purpose are disclaimed.
34
// In no event shall the Intel Corporation or contributors be liable for any direct,
35
// indirect, incidental, special, exemplary, or consequential damages
36
// (including, but not limited to, procurement of substitute goods or services;
37
// loss of use, data, or profits; or business interruption) however caused
38
// and on any theory of liability, whether in contract, strict liability,
39
// or tort (including negligence or otherwise) arising in any way out of
40
// the use of this software, even if advised of the possibility of such damage.
41
//
42
//M*/
43
44
/*
45
This is a variation of
46
"Stereo Processing by Semiglobal Matching and Mutual Information"
47
by Heiko Hirschmuller.
48
49
We match blocks rather than individual pixels, thus the algorithm is called
50
SGBM (Semi-global block matching)
51
*/
52
53
#include "precomp.hpp"
54
#include <limits.h>
55
#include "opencv2/core/hal/intrin.hpp"
56
57
namespace cv
58
{
59
60
// Basic scalar types used throughout the SGBM implementation.
typedef uchar PixType;   // preprocessed pixel values (clipped derivatives / raw intensities)
typedef short CostType;  // matching costs are accumulated in 16-bit integers
typedef short DispType;  // output disparities, fixed point (scaled by 1 << StereoMatcher::DISP_SHIFT)

// NR is the nominal number of aggregation directions; NR2 = 8 directions are
// actually handled by the Lr loops below (4 per pass, up to 2 passes).
enum { NR = 16, NR2 = NR/2 };
65
66
67
struct StereoSGBMParams
68
{
69
StereoSGBMParams()
70
{
71
minDisparity = numDisparities = 0;
72
SADWindowSize = 0;
73
P1 = P2 = 0;
74
disp12MaxDiff = 0;
75
preFilterCap = 0;
76
uniquenessRatio = 0;
77
speckleWindowSize = 0;
78
speckleRange = 0;
79
mode = StereoSGBM::MODE_SGBM;
80
}
81
82
StereoSGBMParams( int _minDisparity, int _numDisparities, int _SADWindowSize,
83
int _P1, int _P2, int _disp12MaxDiff, int _preFilterCap,
84
int _uniquenessRatio, int _speckleWindowSize, int _speckleRange,
85
int _mode )
86
{
87
minDisparity = _minDisparity;
88
numDisparities = _numDisparities;
89
SADWindowSize = _SADWindowSize;
90
P1 = _P1;
91
P2 = _P2;
92
disp12MaxDiff = _disp12MaxDiff;
93
preFilterCap = _preFilterCap;
94
uniquenessRatio = _uniquenessRatio;
95
speckleWindowSize = _speckleWindowSize;
96
speckleRange = _speckleRange;
97
mode = _mode;
98
}
99
100
int minDisparity;
101
int numDisparities;
102
int SADWindowSize;
103
int preFilterCap;
104
int uniquenessRatio;
105
int P1;
106
int P2;
107
int speckleWindowSize;
108
int speckleRange;
109
int disp12MaxDiff;
110
int mode;
111
};
112
113
// Sentinel for calcPixelCostBT()'s xrange_max: "compute up to the full width1".
static const int DEFAULT_RIGHT_BORDER = -1;
114
/*
115
For each pixel row1[x], max(maxD, 0) <= minX <= x < maxX <= width - max(0, -minD),
116
and for each disparity minD<=d<maxD the function
117
computes the cost (cost[(x-minX)*(maxD - minD) + (d - minD)]), depending on the difference between
118
row1[x] and row2[x-d]. The subpixel algorithm from
119
"Depth Discontinuities by Pixel-to-Pixel Stereo" by Stan Birchfield and C. Tomasi
120
is used, hence the suffix BT.
121
122
the temporary buffer should contain width2*2 elements
123
*/
124
// Computes the Birchfield-Tomasi per-pixel matching cost for one image row
// (see the block comment above for the index/formula conventions).
//   img1, img2   - rectified input pair (1 or 3 channels; see the cn handling below)
//   y            - row index to process
//   minD, maxD   - disparity search range [minD, maxD)
//   cost         - output costs, cost[(x - minX)*D + (d - minD)]
//   buffer       - scratch space (width2*2 min/max values + preprocessed rows)
//   tab, tabOfs  - clipping lookup table and its zero offset (clipTab in the caller)
//   (unnamed int)- unused (kept for signature compatibility with callers passing ftzero)
//   xrange_min/..._max - sub-range of the computable span, used by the parallel version
static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
                            int minD, int maxD, CostType* cost,
                            PixType* buffer, const PixType* tab,
                            int tabOfs, int , int xrange_min = 0, int xrange_max = DEFAULT_RIGHT_BORDER )
{
    int x, c, width = img1.cols, cn = img1.channels();
    // x extent in img1 for which every disparity in [minD, maxD) stays inside img2
    int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
    int D = maxD - minD, width1 = maxX1 - minX1;
    // This minX1/maxX1 correction selects which part of the computable line is
    // actually calculated by this call; the parallel algorithm passes sub-ranges here.
    xrange_min = (xrange_min < 0) ? 0: xrange_min;
    xrange_max = (xrange_max == DEFAULT_RIGHT_BORDER) || (xrange_max > width1) ? width1 : xrange_max;
    maxX1 = minX1 + xrange_max;
    minX1 += xrange_min;
    width1 = maxX1 - minX1;
    // corresponding extent in img2, shifted by the disparity range
    int minX2 = std::max(minX1 - maxD, 0), maxX2 = std::min(maxX1 - minD, width);
    int width2 = maxX2 - minX2;
    const PixType *row1 = img1.ptr<PixType>(y), *row2 = img2.ptr<PixType>(y);
    // buffer layout: first width2*2 entries hold per-pixel min/max of img2 (filled in
    // the c-loop below); prow1/prow2 then hold cn planes of clipped x-derivatives
    // followed by cn planes of raw intensities. prow2 is stored mirrored in x.
    PixType *prow1 = buffer + width2*2, *prow2 = prow1 + width*cn*2;
#if CV_SIMD128
    bool useSIMD = hasSIMD128();
#endif

    tab += tabOfs;

    // initialize the row borders with the table's "zero" value
    for( c = 0; c < cn*2; c++ )
    {
        prow1[width*c] = prow1[width*c + width-1] =
        prow2[width*c] = prow2[width*c + width-1] = tab[0];
    }

    // byte offsets to the previous (n*) and next (s*) image rows, clamped at the borders
    int n1 = y > 0 ? -(int)img1.step : 0, s1 = y < img1.rows-1 ? (int)img1.step : 0;
    int n2 = y > 0 ? -(int)img2.step : 0, s2 = y < img2.rows-1 ? (int)img2.step : 0;

    // common x range that covers both images' needs, kept 1 px inside the borders
    // because the Sobel-like derivative below reads x-1 and x+1
    int minX_cmn = std::min(minX1,minX2)-1;
    int maxX_cmn = std::max(maxX1,maxX2)+1;
    minX_cmn = std::max(minX_cmn, 1);
    maxX_cmn = std::min(maxX_cmn, width - 1);
    if( cn == 1 )
    {
        // grayscale: one derivative plane + one intensity plane per image
        for( x = minX_cmn; x < maxX_cmn; x++ )
        {
            prow1[x] = tab[(row1[x+1] - row1[x-1])*2 + row1[x+n1+1] - row1[x+n1-1] + row1[x+s1+1] - row1[x+s1-1]];
            prow2[width-1-x] = tab[(row2[x+1] - row2[x-1])*2 + row2[x+n2+1] - row2[x+n2-1] + row2[x+s2+1] - row2[x+s2-1]];

            prow1[x+width] = row1[x];
            prow2[width-1-x+width] = row2[x];
        }
    }
    else
    {
        // 3-channel: derivative planes for B, G, R, then raw intensity planes
        for( x = minX_cmn; x < maxX_cmn; x++ )
        {
            prow1[x] = tab[(row1[x*3+3] - row1[x*3-3])*2 + row1[x*3+n1+3] - row1[x*3+n1-3] + row1[x*3+s1+3] - row1[x*3+s1-3]];
            prow1[x+width] = tab[(row1[x*3+4] - row1[x*3-2])*2 + row1[x*3+n1+4] - row1[x*3+n1-2] + row1[x*3+s1+4] - row1[x*3+s1-2]];
            prow1[x+width*2] = tab[(row1[x*3+5] - row1[x*3-1])*2 + row1[x*3+n1+5] - row1[x*3+n1-1] + row1[x*3+s1+5] - row1[x*3+s1-1]];

            prow2[width-1-x] = tab[(row2[x*3+3] - row2[x*3-3])*2 + row2[x*3+n2+3] - row2[x*3+n2-3] + row2[x*3+s2+3] - row2[x*3+s2-3]];
            prow2[width-1-x+width] = tab[(row2[x*3+4] - row2[x*3-2])*2 + row2[x*3+n2+4] - row2[x*3+n2-2] + row2[x*3+s2+4] - row2[x*3+s2-2]];
            prow2[width-1-x+width*2] = tab[(row2[x*3+5] - row2[x*3-1])*2 + row2[x*3+n2+5] - row2[x*3+n2-1] + row2[x*3+s2+5] - row2[x*3+s2-1]];

            prow1[x+width*3] = row1[x*3];
            prow1[x+width*4] = row1[x*3+1];
            prow1[x+width*5] = row1[x*3+2];

            prow2[width-1-x+width*3] = row2[x*3];
            prow2[width-1-x+width*4] = row2[x*3+1];
            prow2[width-1-x+width*5] = row2[x*3+2];
        }
    }

    memset( cost + xrange_min*D, 0, width1*D*sizeof(cost[0]) );

    buffer -= width-1-maxX2;
    cost -= (minX1-xrange_min)*D + minD; // simplify the cost indices inside the loop

    // accumulate the BT dissimilarity over all planes (derivatives first, then
    // raw intensities; the latter contribute with weight 1/4 via diff_scale)
    for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width )
    {
        int diff_scale = c < cn ? 0 : 2;

        // precompute
        //   v0 = min(row2[x-1/2], row2[x], row2[x+1/2]) and
        //   v1 = max(row2[x-1/2], row2[x], row2[x+1/2]) and
        for( x = width-1-maxX2; x < width-1- minX2; x++ )
        {
            int v = prow2[x];
            int vl = x > 0 ? (v + prow2[x-1])/2 : v;
            int vr = x < width-1 ? (v + prow2[x+1])/2 : v;
            int v0 = std::min(vl, vr); v0 = std::min(v0, v);
            int v1 = std::max(vl, vr); v1 = std::max(v1, v);
            buffer[x] = (PixType)v0;
            buffer[x + width2] = (PixType)v1;
        }

        for( x = minX1; x < maxX1; x++ )
        {
            // half-pixel interpolated neighborhood of img1 pixel x
            int u = prow1[x];
            int ul = x > 0 ? (u + prow1[x-1])/2 : u;
            int ur = x < width-1 ? (u + prow1[x+1])/2 : u;
            int u0 = std::min(ul, ur); u0 = std::min(u0, u);
            int u1 = std::max(ul, ur); u1 = std::max(u1, u);

#if CV_SIMD128
            if( useSIMD )
            {
                // vectorized BT cost: 16 disparities per iteration
                v_uint8x16 _u = v_setall_u8((uchar)u), _u0 = v_setall_u8((uchar)u0);
                v_uint8x16 _u1 = v_setall_u8((uchar)u1);

                for( int d = minD; d < maxD; d += 16 )
                {
                    v_uint8x16 _v = v_load(prow2 + width-x-1 + d);
                    v_uint8x16 _v0 = v_load(buffer + width-x-1 + d);
                    v_uint8x16 _v1 = v_load(buffer + width-x-1 + d + width2);
                    // unsigned saturating subtraction implements max(0, a-b)
                    v_uint8x16 c0 = v_max(_u - _v1, _v0 - _u);
                    v_uint8x16 c1 = v_max(_v - _u1, _u0 - _v);
                    v_uint8x16 diff = v_min(c0, c1);

                    v_int16x8 _c0 = v_load_aligned(cost + x*D + d);
                    v_int16x8 _c1 = v_load_aligned(cost + x*D + d + 8);

                    v_uint16x8 diff1,diff2;
                    v_expand(diff,diff1,diff2);
                    v_store_aligned(cost + x*D + d, _c0 + v_reinterpret_as_s16(diff1 >> diff_scale));
                    v_store_aligned(cost + x*D + d + 8, _c1 + v_reinterpret_as_s16(diff2 >> diff_scale));
                }
            }
            else
#endif
            {
                // scalar BT cost: symmetric min over the half-pixel intervals
                for( int d = minD; d < maxD; d++ )
                {
                    int v = prow2[width-x-1 + d];
                    int v0 = buffer[width-x-1 + d];
                    int v1 = buffer[width-x-1 + d + width2];
                    int c0 = std::max(0, u - v1); c0 = std::max(c0, v0 - u);
                    int c1 = std::max(0, v - u1); c1 = std::max(c1, u0 - v);

                    cost[x*D + d] = (CostType)(cost[x*D+d] + (std::min(c0, c1) >> diff_scale));
                }
            }
        }
    }
}
267
268
269
/*
270
computes disparity for "roi" in img1 w.r.t. img2 and write it to disp1buf.
271
that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y).
272
minD <= d < maxD.
273
disp2full is the reverse disparity map, that is:
274
disp2full(x+roi.x,y+roi.y)=d means that img2(x+roi.x, y+roi.y) ~ img1(x+roi.x+d, y+roi.y)
275
276
note that disp1buf will have the same size as the roi and
277
disp2full will have the same size as img1 (or img2).
278
On exit disp2buf is not the final disparity, it is an intermediate result that becomes
279
final after all the tiles are processed.
280
281
the disparity in disp1buf is written with sub-pixel accuracy
282
(4 fractional bits, see StereoSGBM::DISP_SCALE),
283
using quadratic interpolation, while the disparity in disp2buf
284
is written as is, without interpolation.
285
286
disp2cost also has the same size as img1 (or img2).
287
It contains the minimum current cost, used to find the best disparity, corresponding to the minimal cost.
288
*/
289
// Core single-threaded SGBM routine: block-matching cost + semi-global
// aggregation + winner selection with uniqueness/left-right checks.
// See the block comment above for the disp1/disp2 conventions.
//   buffer - reusable scratch Mat; (re)allocated here if too small.
static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                                 Mat& disp1, const StereoSGBMParams& params,
                                 Mat& buffer )
{
#if CV_SIMD128
    // maxDisparity is supposed to multiple of 16, so we can forget doing else
    // LSBTab[i] = index of the least significant set bit of i (0 for i == 0);
    // used below to pick the first lane that attained the minimum cost.
    static const uchar LSBTab[] =
    {
        0, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
    };
    static const v_uint16x8 v_LSB = v_uint16x8(0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);

    bool useSIMD = hasSIMD128();
#endif

    const int ALIGN = 16;
    const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
    const int DISP_SCALE = (1 << DISP_SHIFT);
    const CostType MAX_COST = SHRT_MAX;

    // sanitize parameters: substitute defaults for unset (non-positive) values
    int minD = params.minDisparity, maxD = minD + params.numDisparities;
    Size SADWindowSize;
    SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;
    int ftzero = std::max(params.preFilterCap, 15) | 1;
    int uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
    int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
    int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
    int k, width = disp1.cols, height = disp1.rows;
    int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
    int D = maxD - minD, width1 = maxX1 - minX1;
    int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
    int SW2 = SADWindowSize.width/2, SH2 = SADWindowSize.height/2;
    bool fullDP = params.mode == StereoSGBM::MODE_HH;
    int npasses = fullDP ? 2 : 1;
    const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
    PixType clipTab[TAB_SIZE];

    // clipping table: values clipped to [-ftzero, ftzero] and shifted to be non-negative
    for( k = 0; k < TAB_SIZE; k++ )
        clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);

    if( minX1 >= maxX1 )
    {
        // the disparity range does not fit into the image width at all
        disp1 = Scalar::all(INVALID_DISP_SCALED);
        return;
    }

    CV_Assert( D % 16 == 0 );

    // NR - the number of directions. the loop on x below that computes Lr assumes that NR == 8.
    // if you change NR, please, modify the loop as well.
    int D2 = D+16, NRD2 = NR2*D2;

    // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
    // for 8-way dynamic programming we need the current row and
    // the previous row, i.e. 2 rows in total
    const int NLR = 2;
    const int LrBorder = NLR - 1;

    // for each possible stereo match (img1(x,y) <=> img2(x-d,y))
    // we keep pixel difference cost (C) and the summary cost over NR directions (S).
    // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
    size_t costBufSize = width1*D;
    size_t CSBufSize = costBufSize*(fullDP ? height : 1);
    size_t minLrSize = (width1 + LrBorder*2)*NR2, LrSize = minLrSize*D2;
    int hsumBufNRows = SH2*2 + 2;
    size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[]
        costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff
        CSBufSize*2*sizeof(CostType) + // C, S
        width*16*img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost
        width*(sizeof(CostType) + sizeof(DispType)) + 1024; // disp2cost + disp2

    if( buffer.empty() || !buffer.isContinuous() ||
        buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
        buffer.reserveBuffer(totalBufSize);

    // summary cost over different (nDirs) directions
    CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), ALIGN);
    CostType* Sbuf = Cbuf + CSBufSize;
    CostType* hsumBuf = Sbuf + CSBufSize;
    CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows;

    CostType* disp2cost = pixDiff + costBufSize + (LrSize + minLrSize)*NLR;
    DispType* disp2ptr = (DispType*)(disp2cost + width);
    PixType* tempBuf = (PixType*)(disp2ptr + width);

    // add P2 to every C(x,y). it saves a few operations in the inner loops
    for(k = 0; k < (int)CSBufSize; k++ )
        Cbuf[k] = (CostType)P2;

    // pass 1: top-left to bottom-right; pass 2 (MODE_HH only): bottom-right to top-left
    for( int pass = 1; pass <= npasses; pass++ )
    {
        int x1, y1, x2, y2, dx, dy;

        if( pass == 1 )
        {
            y1 = 0; y2 = height; dy = 1;
            x1 = 0; x2 = width1; dx = 1;
        }
        else
        {
            y1 = height-1; y2 = -1; dy = -1;
            x1 = width1-1; x2 = -1; dx = -1;
        }

        CostType *Lr[NLR]={0}, *minLr[NLR]={0};

        for( k = 0; k < NLR; k++ )
        {
            // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
            // and will occasionally use negative indices with the arrays
            // we need to shift Lr[k] pointers by 1, to give the space for d=-1.
            // however, then the alignment will be imperfect, i.e. bad for SSE,
            // thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment)
            Lr[k] = pixDiff + costBufSize + LrSize*k + NRD2*LrBorder + 8;
            memset( Lr[k] - LrBorder*NRD2 - 8, 0, LrSize*sizeof(CostType) );
            minLr[k] = pixDiff + costBufSize + LrSize*NLR + minLrSize*k + NR2*LrBorder;
            memset( minLr[k] - LrBorder*NR2, 0, minLrSize*sizeof(CostType) );
        }

        for( int y = y1; y != y2; y += dy )
        {
            int x, d;
            DispType* disp1ptr = disp1.ptr<DispType>(y);
            // in fullDP mode C/S are kept for the whole image; otherwise one row is reused
            CostType* C = Cbuf + (!fullDP ? 0 : y*costBufSize);
            CostType* S = Sbuf + (!fullDP ? 0 : y*costBufSize);

            if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
            {
                int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;

                for( k = dy1; k <= dy2; k++ )
                {
                    // hsumBuf is a ring of horizontal block sums, one slot per source row
                    CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;

                    if( k < height )
                    {
                        calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero );

                        // seed the horizontal sum at x == 0 (left border replicated SW2+1 times)
                        memset(hsumAdd, 0, D*sizeof(CostType));
                        for( x = 0; x <= SW2*D; x += D )
                        {
                            int scale = x == 0 ? SW2 + 1 : 1;
                            for( d = 0; d < D; d++ )
                                hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale);
                        }

                        if( y > 0 )
                        {
                            // sliding-window update: C(y) = C(y-1) + hsum(new row) - hsum(old row)
                            const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
                            const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize;

                            for( x = D; x < width1*D; x += D )
                            {
                                const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
                                const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);

#if CV_SIMD128
                                if( useSIMD )
                                {
                                    for( d = 0; d < D; d += 8 )
                                    {
                                        v_int16x8 hv = v_load(hsumAdd + x - D + d);
                                        v_int16x8 Cx = v_load(Cprev + x + d);
                                        v_int16x8 psub = v_load(pixSub + d);
                                        v_int16x8 padd = v_load(pixAdd + d);
                                        hv = (hv - psub + padd);
                                        psub = v_load(hsumSub + x + d);
                                        Cx = Cx - psub + hv;
                                        v_store(hsumAdd + x + d, hv);
                                        v_store(C + x + d, Cx);
                                    }
                                }
                                else
#endif
                                {
                                    for( d = 0; d < D; d++ )
                                    {
                                        int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
                                        C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]);
                                    }
                                }
                            }
                        }
                        else
                        {
                            // first row: only build the horizontal sums, C is filled below
                            for( x = D; x < width1*D; x += D )
                            {
                                const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
                                const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);

                                for( d = 0; d < D; d++ )
                                    hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
                            }
                        }
                    }

                    if( y == 0 )
                    {
                        // top border: the first row's sums are replicated SH2+1 times
                        int scale = k == 0 ? SH2 + 1 : 1;
                        for( x = 0; x < width1*D; x++ )
                            C[x] = (CostType)(C[x] + hsumAdd[x]*scale);
                    }
                }

                // also, clear the S buffer
                for( k = 0; k < width1*D; k++ )
                    S[k] = 0;
            }

            // clear the left and the right borders
            memset( Lr[0] - NRD2*LrBorder - 8, 0, NRD2*LrBorder*sizeof(CostType) );
            memset( Lr[0] + width1*NRD2 - 8, 0, NRD2*LrBorder*sizeof(CostType) );
            memset( minLr[0] - NR2*LrBorder, 0, NR2*LrBorder*sizeof(CostType) );
            memset( minLr[0] + width1*NR2, 0, NR2*LrBorder*sizeof(CostType) );

            /*
            [formula 13 in the paper]
            compute L_r(p, d) = C(p, d) +
            min(L_r(p-r, d),
            L_r(p-r, d-1) + P1,
            L_r(p-r, d+1) + P1,
            min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
            where p = (x,y), r is one of the directions.
            we process all the directions at once:
            0: r=(-dx, 0)
            1: r=(-1, -dy)
            2: r=(0, -dy)
            3: r=(1, -dy)
            4: r=(-2, -dy)
            5: r=(-1, -dy*2)
            6: r=(1, -dy*2)
            7: r=(2, -dy)
            */

            for( x = x1; x != x2; x += dx )
            {
                int xm = x*NR2, xd = xm*D2;

                // delta* = min_k L_r(p-r, k) + P2 for each of the 4 directions of this pass
                int delta0 = minLr[0][xm - dx*NR2] + P2, delta1 = minLr[1][xm - NR2 + 1] + P2;
                int delta2 = minLr[1][xm + 2] + P2, delta3 = minLr[1][xm + NR2 + 3] + P2;

                // Lr_p0..Lr_p3 point to L_r(p-r, .) of the predecessor pixels
                CostType* Lr_p0 = Lr[0] + xd - dx*NRD2;
                CostType* Lr_p1 = Lr[1] + xd - NRD2 + D2;
                CostType* Lr_p2 = Lr[1] + xd + D2*2;
                CostType* Lr_p3 = Lr[1] + xd + NRD2 + D2*3;

                // sentinels at d = -1 and d = D so that d-1/d+1 accesses are safe
                Lr_p0[-1] = Lr_p0[D] = Lr_p1[-1] = Lr_p1[D] =
                Lr_p2[-1] = Lr_p2[D] = Lr_p3[-1] = Lr_p3[D] = MAX_COST;

                CostType* Lr_p = Lr[0] + xd;
                const CostType* Cp = C + x*D;
                CostType* Sp = S + x*D;

#if CV_SIMD128
                if( useSIMD )
                {
                    // vectorized formula 13 over 8 disparities x 4 directions
                    v_int16x8 _P1 = v_setall_s16((short)P1);

                    v_int16x8 _delta0 = v_setall_s16((short)delta0);
                    v_int16x8 _delta1 = v_setall_s16((short)delta1);
                    v_int16x8 _delta2 = v_setall_s16((short)delta2);
                    v_int16x8 _delta3 = v_setall_s16((short)delta3);
                    v_int16x8 _minL0 = v_setall_s16((short)MAX_COST);

                    for( d = 0; d < D; d += 8 )
                    {
                        v_int16x8 Cpd = v_load(Cp + d);
                        v_int16x8 L0, L1, L2, L3;

                        L0 = v_load(Lr_p0 + d);
                        L1 = v_load(Lr_p1 + d);
                        L2 = v_load(Lr_p2 + d);
                        L3 = v_load(Lr_p3 + d);

                        L0 = v_min(L0, (v_load(Lr_p0 + d - 1) + _P1));
                        L0 = v_min(L0, (v_load(Lr_p0 + d + 1) + _P1));

                        L1 = v_min(L1, (v_load(Lr_p1 + d - 1) + _P1));
                        L1 = v_min(L1, (v_load(Lr_p1 + d + 1) + _P1));

                        L2 = v_min(L2, (v_load(Lr_p2 + d - 1) + _P1));
                        L2 = v_min(L2, (v_load(Lr_p2 + d + 1) + _P1));

                        L3 = v_min(L3, (v_load(Lr_p3 + d - 1) + _P1));
                        L3 = v_min(L3, (v_load(Lr_p3 + d + 1) + _P1));

                        L0 = v_min(L0, _delta0);
                        L0 = ((L0 - _delta0) + Cpd);

                        L1 = v_min(L1, _delta1);
                        L1 = ((L1 - _delta1) + Cpd);

                        L2 = v_min(L2, _delta2);
                        L2 = ((L2 - _delta2) + Cpd);

                        L3 = v_min(L3, _delta3);
                        L3 = ((L3 - _delta3) + Cpd);

                        v_store(Lr_p + d, L0);
                        v_store(Lr_p + d + D2, L1);
                        v_store(Lr_p + d + D2*2, L2);
                        v_store(Lr_p + d + D2*3, L3);

                        // Get minimum from in L0-L3
                        v_int16x8 t02L, t02H, t13L, t13H, t0123L, t0123H;
                        v_zip(L0, L2, t02L, t02H); // L0[0] L2[0] L0[1] L2[1]...
                        v_zip(L1, L3, t13L, t13H); // L1[0] L3[0] L1[1] L3[1]...
                        v_int16x8 t02 = v_min(t02L, t02H); // L0[i] L2[i] L0[i] L2[i]...
                        v_int16x8 t13 = v_min(t13L, t13H); // L1[i] L3[i] L1[i] L3[i]...
                        v_zip(t02, t13, t0123L, t0123H); // L0[i] L1[i] L2[i] L3[i]...
                        v_int16x8 t0 = v_min(t0123L, t0123H);
                        _minL0 = v_min(_minL0, t0);

                        v_int16x8 Sval = v_load(Sp + d);

                        L0 = L0 + L1;
                        L2 = L2 + L3;
                        Sval = Sval + L0;
                        Sval = Sval + L2;

                        v_store(Sp + d, Sval);
                    }

                    // reduce the interleaved per-direction minima into minLr[0][xm..xm+3]
                    v_int32x4 minL, minH;
                    v_expand(_minL0, minL, minH);
                    v_pack_store(&minLr[0][xm], v_min(minL, minH));
                }
                else
#endif
                {
                    // scalar formula 13 for the same 4 directions
                    int minL0 = MAX_COST, minL1 = MAX_COST, minL2 = MAX_COST, minL3 = MAX_COST;

                    for( d = 0; d < D; d++ )
                    {
                        int Cpd = Cp[d], L0, L1, L2, L3;

                        L0 = Cpd + std::min((int)Lr_p0[d], std::min(Lr_p0[d-1] + P1, std::min(Lr_p0[d+1] + P1, delta0))) - delta0;
                        L1 = Cpd + std::min((int)Lr_p1[d], std::min(Lr_p1[d-1] + P1, std::min(Lr_p1[d+1] + P1, delta1))) - delta1;
                        L2 = Cpd + std::min((int)Lr_p2[d], std::min(Lr_p2[d-1] + P1, std::min(Lr_p2[d+1] + P1, delta2))) - delta2;
                        L3 = Cpd + std::min((int)Lr_p3[d], std::min(Lr_p3[d-1] + P1, std::min(Lr_p3[d+1] + P1, delta3))) - delta3;

                        Lr_p[d] = (CostType)L0;
                        minL0 = std::min(minL0, L0);

                        Lr_p[d + D2] = (CostType)L1;
                        minL1 = std::min(minL1, L1);

                        Lr_p[d + D2*2] = (CostType)L2;
                        minL2 = std::min(minL2, L2);

                        Lr_p[d + D2*3] = (CostType)L3;
                        minL3 = std::min(minL3, L3);

                        Sp[d] = saturate_cast<CostType>(Sp[d] + L0 + L1 + L2 + L3);
                    }
                    minLr[0][xm] = (CostType)minL0;
                    minLr[0][xm+1] = (CostType)minL1;
                    minLr[0][xm+2] = (CostType)minL2;
                    minLr[0][xm+3] = (CostType)minL3;
                }
            }

            if( pass == npasses )
            {
                // last pass: pick the winning disparity per pixel
                for( x = 0; x < width; x++ )
                {
                    disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED;
                    disp2cost[x] = MAX_COST;
                }

                for( x = width1 - 1; x >= 0; x-- )
                {
                    CostType* Sp = S + x*D;
                    int minS = MAX_COST, bestDisp = -1;

                    if( npasses == 1 )
                    {
                        // single-pass mode: fold the remaining right-to-left horizontal
                        // direction into S while scanning for the minimum
                        int xm = x*NR2, xd = xm*D2;

                        int minL0 = MAX_COST;
                        int delta0 = minLr[0][xm + NR2] + P2;
                        CostType* Lr_p0 = Lr[0] + xd + NRD2;
                        Lr_p0[-1] = Lr_p0[D] = MAX_COST;
                        CostType* Lr_p = Lr[0] + xd;

                        const CostType* Cp = C + x*D;

#if CV_SIMD128
                        if( useSIMD )
                        {
                            v_int16x8 _P1 = v_setall_s16((short)P1);
                            v_int16x8 _delta0 = v_setall_s16((short)delta0);

                            v_int16x8 _minL0 = v_setall_s16((short)minL0);
                            v_int16x8 _minS = v_setall_s16(MAX_COST), _bestDisp = v_setall_s16(-1);
                            v_int16x8 _d8 = v_int16x8(0, 1, 2, 3, 4, 5, 6, 7), _8 = v_setall_s16(8);

                            for( d = 0; d < D; d += 8 )
                            {
                                v_int16x8 Cpd = v_load(Cp + d);
                                v_int16x8 L0 = v_load(Lr_p0 + d);

                                L0 = v_min(L0, v_load(Lr_p0 + d - 1) + _P1);
                                L0 = v_min(L0, v_load(Lr_p0 + d + 1) + _P1);
                                L0 = v_min(L0, _delta0);
                                L0 = L0 - _delta0 + Cpd;

                                v_store(Lr_p + d, L0);
                                _minL0 = v_min(_minL0, L0);
                                L0 = L0 + v_load(Sp + d);
                                v_store(Sp + d, L0);

                                // track per-lane running minimum and its disparity index
                                v_int16x8 mask = _minS > L0;
                                _minS = v_min(_minS, L0);
                                _bestDisp = _bestDisp ^ ((_bestDisp ^ _d8) & mask);
                                _d8 += _8;
                            }
                            short bestDispBuf[8];
                            v_store(bestDispBuf, _bestDisp);

                            v_int32x4 min32L, min32H;
                            v_expand(_minL0, min32L, min32H);
                            minLr[0][xm] = (CostType)std::min(v_reduce_min(min32L), v_reduce_min(min32H));

                            v_expand(_minS, min32L, min32H);
                            minS = std::min(v_reduce_min(min32L), v_reduce_min(min32H));

                            // select the lowest lane index among those equal to minS
                            v_int16x8 ss = v_setall_s16((short)minS);
                            v_uint16x8 minMask = v_reinterpret_as_u16(ss == _minS);
                            v_uint16x8 minBit = minMask & v_LSB;

                            v_uint32x4 minBitL, minBitH;
                            v_expand(minBit, minBitL, minBitH);

                            int idx = v_reduce_sum(minBitL) + v_reduce_sum(minBitH);
                            bestDisp = bestDispBuf[LSBTab[idx]];
                        }
                        else
#endif
                        {
                            for( d = 0; d < D; d++ )
                            {
                                int L0 = Cp[d] + std::min((int)Lr_p0[d], std::min(Lr_p0[d-1] + P1, std::min(Lr_p0[d+1] + P1, delta0))) - delta0;

                                Lr_p[d] = (CostType)L0;
                                minL0 = std::min(minL0, L0);

                                int Sval = Sp[d] = saturate_cast<CostType>(Sp[d] + L0);
                                if( Sval < minS )
                                {
                                    minS = Sval;
                                    bestDisp = d;
                                }
                            }
                            minLr[0][xm] = (CostType)minL0;
                        }
                    }
                    else
                    {
                        // two-pass (MODE_HH): S is complete, just find its minimum
#if CV_SIMD128
                        if( useSIMD )
                        {
                            v_int16x8 _minS = v_setall_s16(MAX_COST), _bestDisp = v_setall_s16(-1);
                            v_int16x8 _d8 = v_int16x8(0, 1, 2, 3, 4, 5, 6, 7), _8 = v_setall_s16(8);

                            for( d = 0; d < D; d+= 8 )
                            {
                                v_int16x8 L0 = v_load(Sp + d);
                                v_int16x8 mask = L0 < _minS;
                                _minS = v_min( L0, _minS );
                                _bestDisp = _bestDisp ^ ((_bestDisp ^ _d8) & mask);
                                _d8 = _d8 + _8;
                            }
                            v_int32x4 _d0, _d1;
                            v_expand(_minS, _d0, _d1);
                            minS = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1));
                            v_int16x8 v_mask = v_setall_s16((short)minS) == _minS;

                            // non-minimal lanes are replaced by SHRT_MAX so the reduce
                            // picks the smallest disparity that attains minS
                            _bestDisp = (_bestDisp & v_mask) | (v_setall_s16(SHRT_MAX) & ~v_mask);
                            v_expand(_bestDisp, _d0, _d1);
                            bestDisp = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1));
                        }
                        else
#endif
                        {
                            for( d = 0; d < D; d++ )
                            {
                                int Sval = Sp[d];
                                if( Sval < minS )
                                {
                                    minS = Sval;
                                    bestDisp = d;
                                }
                            }
                        }
                    }

                    // uniqueness check: reject the pixel if some non-neighboring
                    // disparity comes within uniquenessRatio percent of the minimum
                    for( d = 0; d < D; d++ )
                    {
                        if( Sp[d]*(100 - uniquenessRatio) < minS*100 && std::abs(bestDisp - d) > 1 )
                            break;
                    }
                    if( d < D )
                        continue;
                    d = bestDisp;
                    // update the reverse (right-view) disparity map used for the LR check
                    int _x2 = x + minX1 - d - minD;
                    if( disp2cost[_x2] > minS )
                    {
                        disp2cost[_x2] = (CostType)minS;
                        disp2ptr[_x2] = (DispType)(d + minD);
                    }

                    if( 0 < d && d < D-1 )
                    {
                        // do subpixel quadratic interpolation:
                        // fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
                        // then find minimum of the parabola.
                        int denom2 = std::max(Sp[d-1] + Sp[d+1] - 2*Sp[d], 1);
                        d = d*DISP_SCALE + ((Sp[d-1] - Sp[d+1])*DISP_SCALE + denom2)/(denom2*2);
                    }
                    else
                        d *= DISP_SCALE;
                    disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE);
                }

                // left-right consistency check against disp2
                for( x = minX1; x < maxX1; x++ )
                {
                    // we round the computed disparity both towards -inf and +inf and check
                    // if either of the corresponding disparities in disp2 is consistent.
                    // This is to give the computed disparity a chance to look valid if it is.
                    int d1 = disp1ptr[x];
                    if( d1 == INVALID_DISP_SCALED )
                        continue;
                    int _d = d1 >> DISP_SHIFT;
                    int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT;
                    int _x = x - _d, x_ = x - d_;
                    if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&
                        0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )
                        disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
                }
            }

            // now shift the cyclic buffers
            std::swap( Lr[0], Lr[1] );
            std::swap( minLr[0], minLr[1] );
        }
    }
}
844
845
////////////////////////////////////////////////////////////////////////////////////////////
846
struct CalcVerticalSums: public ParallelLoopBody
847
{
848
// Precomputes all per-job constants for the parallel vertical-aggregation pass.
//   _img1, _img2 - rectified input pair (referenced, not copied)
//   params       - SGBM parameters; defaulted here the same way as in computeDisparitySGBM
//   alignedBuf   - shared, pre-aligned cost storage: C, then S, then hsumBuf
//   _clipTab     - precomputed clipping table passed on to calcPixelCostBT
// NOTE(review): member declarations are outside this view; layout claims below
// are based on the assignments visible here.
CalcVerticalSums(const Mat& _img1, const Mat& _img2, const StereoSGBMParams& params,
                 CostType* alignedBuf, PixType* _clipTab): img1(_img1), img2(_img2), clipTab(_clipTab)
{
    minD = params.minDisparity;
    maxD = minD + params.numDisparities;
    // half window sizes (square SAD window, default 5 when unset)
    SW2 = SH2 = (params.SADWindowSize > 0 ? params.SADWindowSize : 5)/2;
    ftzero = std::max(params.preFilterCap, 15) | 1;
    // smoothness penalties, with P2 forced strictly greater than P1
    P1 = params.P1 > 0 ? params.P1 : 2;
    P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
    height = img1.rows;
    width = img1.cols;
    int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
    D = maxD - minD;
    width1 = maxX1 - minX1;
    D2 = D + 16;        // padded disparity stride (room for d = -1 / d = D sentinels)
    costBufSize = width1*D;
    CSBufSize = costBufSize*height;  // C and S are kept for the whole image here
    minLrSize = width1;
    LrSize = minLrSize*D2;
    hsumBufNRows = SH2*2 + 2;        // ring-buffer depth for horizontal block sums
    // carve the shared buffer: C, S, then the hsum ring
    Cbuf = alignedBuf;
    Sbuf = Cbuf + CSBufSize;
    hsumBuf = Sbuf + CSBufSize;
    useSIMD = hasSIMD128();
}
873
874
// Vertical SGBM cost aggregation over the column stripe [range.start, range.end).
// Pass 1 walks rows top-to-bottom (direction r=(0,+1)) and also builds the raw
// matching-cost volume C (box-filtered Birchfield-Tomasi costs); pass 2 walks
// bottom-to-top (r=(0,-1)) and reuses C. Both passes accumulate the per-direction
// path costs L_r into the shared summary buffer S.
void operator()(const Range& range) const CV_OVERRIDE
{
    static const CostType MAX_COST = SHRT_MAX;
    static const int ALIGN = 16;
    static const int TAB_OFS = 256*4;
    static const int npasses = 2;
    int x1 = range.start, x2 = range.end, k;
    // Per-thread scratch: one row of pixel-wise costs covering the stripe plus an
    // SW2-column margin on each side, followed by the temporary buffer used by
    // calcPixelCostBT; +32 bytes absorbs the alignPtr shift.
    size_t pixDiffSize = ((x2 - x1) + 2*SW2)*D;
    size_t auxBufsSize = pixDiffSize*sizeof(CostType) + //pixdiff size
            width*16*img1.channels()*sizeof(PixType) + 32; //tempBuf
    Mat auxBuff;
    auxBuff.create(1, (int)auxBufsSize, CV_8U);
    CostType* pixDiff = (CostType*)alignPtr(auxBuff.ptr(), ALIGN);
    PixType* tempBuf = (PixType*)(pixDiff + pixDiffSize);

    // Simplification of index calculation: bias pixDiff so it can be indexed with
    // absolute (image) x coordinates of this stripe instead of stripe-local ones.
    pixDiff -= (x1>SW2 ? (x1 - SW2): 0)*D;

    for( int pass = 1; pass <= npasses; pass++ )
    {
        int y1, y2, dy;

        if( pass == 1 )
        {
            y1 = 0; y2 = height; dy = 1;
        }
        else
        {
            y1 = height-1; y2 = -1; dy = -1;
        }

        CostType *Lr[NLR]={0}, *minLr[NLR]={0};

        for( k = 0; k < NLR; k++ )
        {
            // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
            // and will occasionally use negative indices with the arrays
            // we need to shift Lr[k] pointers by 1, to give the space for d=-1.
            // however, then the alignment will be imperfect, i.e. bad for SSE,
            // thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment)
            Lr[k] = hsumBuf + costBufSize*hsumBufNRows + LrSize*k + 8;
            memset( Lr[k] + x1*D2 - 8, 0, (x2-x1)*D2*sizeof(CostType) );
            minLr[k] = hsumBuf + costBufSize*hsumBufNRows + LrSize*NLR + minLrSize*k;
            memset( minLr[k] + x1, 0, (x2-x1)*sizeof(CostType) );
        }

        for( int y = y1; y != y2; y += dy )
        {
            int x, d;
            CostType* C = Cbuf + y*costBufSize;
            CostType* S = Sbuf + y*costBufSize;

            if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
            {
                // On the first row prime the hsum ring buffer with rows 0..SH2;
                // afterwards only the new bottom row (y+SH2) of the window is added.
                int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;

                for( k = dy1; k <= dy2; k++ )
                {
                    // hsumBuf is a ring of hsumBufNRows horizontally box-filtered rows
                    CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;

                    if( k < height )
                    {
                        calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero, x1 - SW2, x2 + SW2);

                        // seed the horizontal window sum at column x1, replicating
                        // border columns (xbord clamps into [0, width1-1])
                        memset(hsumAdd + x1*D, 0, D*sizeof(CostType));
                        for( x = (x1 - SW2)*D; x <= (x1 + SW2)*D; x += D )
                        {
                            int xbord = x <= 0 ? 0 : (x > (width1 - 1)*D? (width1 - 1)*D : x);
                            for( d = 0; d < D; d++ )
                                hsumAdd[x1*D + d] = (CostType)(hsumAdd[x1*D + d] + pixDiff[xbord + d]);
                        }

                        if( y > 0 )
                        {
                            // sliding-window update: C(y) = C(y-1) + hsum(y+SH2) - hsum(y-SH2-1)
                            const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
                            const CostType* Cprev = C - costBufSize;

                            for( d = 0; d < D; d++ )
                                C[x1*D + d] = (CostType)(Cprev[x1*D + d] + hsumAdd[x1*D + d] - hsumSub[x1*D + d]);

                            for( x = (x1+1)*D; x < x2*D; x += D )
                            {
                                // horizontal sliding window over pixDiff
                                const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
                                const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);

#if CV_SIMD128
                                if( useSIMD )
                                {
                                    for( d = 0; d < D; d += 8 )
                                    {
                                        v_int16x8 hv = v_load(hsumAdd + x - D + d);
                                        v_int16x8 Cx = v_load(Cprev + x + d);
                                        v_int16x8 psub = v_load(pixSub + d);
                                        v_int16x8 padd = v_load(pixAdd + d);
                                        hv = (hv - psub + padd);
                                        psub = v_load(hsumSub + x + d);
                                        Cx = Cx - psub + hv;
                                        v_store(hsumAdd + x + d, hv);
                                        v_store(C + x + d, Cx);
                                    }
                                }
                                else
#endif
                                {
                                    for( d = 0; d < D; d++ )
                                    {
                                        int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
                                        C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]);
                                    }
                                }
                            }
                        }
                        else
                        {
                            // y == 0: only build the horizontal sums; C is filled by
                            // the scaled accumulation below
                            for( x = (x1+1)*D; x < x2*D; x += D )
                            {
                                const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
                                const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);

                                for( d = 0; d < D; d++ )
                                    hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
                            }
                        }
                    }

                    if( y == 0 )
                    {
                        // replicate the top border row SH2+1 times (k == 0) so the
                        // vertical window is fully populated at the image top
                        int scale = k == 0 ? SH2 + 1 : 1;
                        for( x = x1*D; x < x2*D; x++ )
                            C[x] = (CostType)(C[x] + hsumAdd[x]*scale);
                    }
                }

                // also, clear the S buffer
                for( k = x1*D; k < x2*D; k++ )
                    S[k] = 0;
            }

            // [formula 13 in the paper]
            // compute L_r(p, d) = C(p, d) +
            // min(L_r(p-r, d),
            // L_r(p-r, d-1) + P1,
            // L_r(p-r, d+1) + P1,
            // min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
            // where p = (x,y), r is one of the directions.
            // we process one directions on first pass and other on second:
            // r=(0, dy), where dy=1 on first pass and dy=-1 on second
            for( x = x1; x != x2; x++ )
            {
                int xd = x*D2;

                int delta = minLr[1][x] + P2;

                CostType* Lr_ppr = Lr[1] + xd;

                // sentinels so the d-1 / d+1 reads at the range ends never win the min
                Lr_ppr[-1] = Lr_ppr[D] = MAX_COST;

                CostType* Lr_p = Lr[0] + xd;
                const CostType* Cp = C + x*D;
                CostType* Sp = S + x*D;

#if CV_SIMD128
                if( useSIMD )
                {
                    v_int16x8 _P1 = v_setall_s16((short)P1);

                    v_int16x8 _delta = v_setall_s16((short)delta);
                    v_int16x8 _minL = v_setall_s16((short)MAX_COST);

                    for( d = 0; d < D; d += 8 )
                    {
                        v_int16x8 Cpd = v_load(Cp + d);
                        v_int16x8 L;

                        L = v_load(Lr_ppr + d);

                        L = v_min(L, (v_load(Lr_ppr + d - 1) + _P1));
                        L = v_min(L, (v_load(Lr_ppr + d + 1) + _P1));

                        L = v_min(L, _delta);
                        L = ((L - _delta) + Cpd);

                        v_store(Lr_p + d, L);

                        // Get minimum from in L-L3
                        _minL = v_min(_minL, L);

                        v_int16x8 Sval = v_load(Sp + d);

                        Sval = Sval + L;

                        v_store(Sp + d, Sval);
                    }

                    v_int32x4 min1, min2, min12;
                    v_expand(_minL, min1, min2);
                    min12 = v_min(min1,min2);
                    minLr[0][x] = (CostType)v_reduce_min(min12);
                }
                else
#endif
                {
                    int minL = MAX_COST;

                    for( d = 0; d < D; d++ )
                    {
                        int Cpd = Cp[d], L;

                        L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;

                        Lr_p[d] = (CostType)L;
                        minL = std::min(minL, L);

                        Sp[d] = saturate_cast<CostType>(Sp[d] + L);
                    }
                    minLr[0][x] = (CostType)minL;
                }
            }

            // now shift the cyclic buffers
            std::swap( Lr[0], Lr[1] );
            std::swap( minLr[0], minLr[1] );
        }
    }
}
1100
// number of L_r / minL_r line buffers kept: current row + previous row of the DP
static const int NLR = 2;
const Mat& img1;        // left input image
const Mat& img2;        // right input image
CostType* Cbuf;         // raw matching cost volume C(x,d), one row per image row (shared)
CostType* Sbuf;         // aggregated summary cost volume S(x,d) (shared)
CostType* hsumBuf;      // ring buffer of horizontally box-filtered cost rows
PixType* clipTab;       // clipping lookup table passed to calcPixelCostBT
int minD;               // minimum disparity
int maxD;               // maximum disparity (exclusive)
int D;                  // number of disparities tested (maxD - minD)
int D2;                 // D + 16: padded so reads at d-1/d+1 stay in bounds and aligned
int SH2;                // half of the matching-window height
int SW2;                // half of the matching-window width
int width;              // image width
int width1;             // number of columns with a full disparity search range
int height;             // image height
int P1;                 // penalty for |disparity change| == 1
int P2;                 // penalty for larger disparity jumps (>= P1+1)
size_t costBufSize;     // elements in one row of C/S: width1*D
size_t CSBufSize;       // elements in the whole C (or S) volume: costBufSize*height
size_t minLrSize;       // elements in one minLr line: width1
size_t LrSize;          // elements in one Lr line: minLrSize*D2
size_t hsumBufNRows;    // rows kept in the hsum ring buffer: SH2*2 + 2
int ftzero;             // prefilter cap used by the BT cost
bool useSIMD;           // runtime SIMD128 availability
};
1126
1127
// Parallel body that completes SGBM aggregation along the two horizontal
// directions (left-to-right, then right-to-left) for every row in its range,
// accumulating into the shared S volume. During the right-to-left pass it also
// performs the winner-takes-all disparity selection, uniqueness filtering,
// sub-pixel (quadratic) refinement, and a left-right consistency check, writing
// the final per-row result into disp1. Rows are independent, so the range can be
// split freely across threads.
struct CalcHorizontalSums: public ParallelLoopBody
{
    // Precomputes all per-run constants from the algorithm parameters (applying
    // the same defaulting rules as the other SGBM code paths) and adopts the
    // shared C/S storage laid out by computeDisparitySGBM_HH4 in alignedBuf.
    CalcHorizontalSums(const Mat& _img1, const Mat& _img2, Mat& _disp1, const StereoSGBMParams& params,
                       CostType* alignedBuf): img1(_img1), img2(_img2), disp1(_disp1)
    {
        minD = params.minDisparity;
        maxD = minD + params.numDisparities;
        P1 = params.P1 > 0 ? params.P1 : 2;
        P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
        uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
        disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
        height = img1.rows;
        width = img1.cols;
        minX1 = std::max(maxD, 0);
        maxX1 = width + std::min(minD, 0);
        INVALID_DISP = minD - 1;
        INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
        D = maxD - minD;
        width1 = maxX1 - minX1;
        costBufSize = width1*D;
        CSBufSize = costBufSize*height;
        D2 = D + 16;
        // only two Lr lines are needed: previous pixel and current pixel (ping-pong)
        LrSize = 2 * D2;
        Cbuf = alignedBuf;
        Sbuf = Cbuf + CSBufSize;
        useSIMD = hasSIMD128();
    }

    void operator()(const Range& range) const CV_OVERRIDE
    {
        int y1 = range.start, y2 = range.end;
        // per-thread scratch: two Lr lines + right-view cost/disparity rows for the
        // pseudo left-right check; +32 bytes absorbs alignment and the 8-short shift
        size_t auxBufsSize = LrSize * sizeof(CostType) + width*(sizeof(CostType) + sizeof(DispType)) + 32;

        Mat auxBuff;
        auxBuff.create(1, (int)auxBufsSize, CV_8U);
        // +8 shorts (16 bytes) keeps room for the d=-1 sentinel while preserving alignment
        CostType *Lr = ((CostType*)alignPtr(auxBuff.ptr(), ALIGN)) + 8;
        CostType* disp2cost = Lr + LrSize;
        DispType* disp2ptr = (DispType*)(disp2cost + width);

        CostType minLr;

        for( int y = y1; y != y2; y++)
        {
            int x, d;
            DispType* disp1ptr = disp1.ptr<DispType>(y);
            CostType* C = Cbuf + y*costBufSize;
            CostType* S = Sbuf + y*costBufSize;

            for( x = 0; x < width; x++ )
            {
                disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED;
                disp2cost[x] = MAX_COST;
            }

            // clear buffers; sentinels at d=-1 and d=D make the border reads lose the min
            memset( Lr - 8, 0, LrSize*sizeof(CostType) );
            Lr[-1] = Lr[D] = Lr[D2 - 1] = Lr[D2 + D] = MAX_COST;

            minLr = 0;
            // [formula 13 in the paper]
            // compute L_r(p, d) = C(p, d) +
            // min(L_r(p-r, d),
            // L_r(p-r, d-1) + P1,
            // L_r(p-r, d+1) + P1,
            // min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
            // where p = (x,y), r is one of the directions.
            // we process all the directions at once:
            // we process one directions on first pass and other on second:
            // r=(dx, 0), where dx=1 on first pass and dx=-1 on second
            for( x = 0; x != width1; x++)
            {
                int delta = minLr + P2;

                // ping-pong between the two Lr halves based on column parity
                CostType* Lr_ppr = Lr + ((x&1)? 0 : D2);

                CostType* Lr_p = Lr + ((x&1)? D2 :0);
                const CostType* Cp = C + x*D;
                CostType* Sp = S + x*D;

#if CV_SIMD128
                if( useSIMD )
                {
                    v_int16x8 _P1 = v_setall_s16((short)P1);

                    v_int16x8 _delta = v_setall_s16((short)delta);
                    v_int16x8 _minL = v_setall_s16((short)MAX_COST);

                    for( d = 0; d < D; d += 8 )
                    {
                        v_int16x8 Cpd = v_load(Cp + d);
                        v_int16x8 L;

                        L = v_load(Lr_ppr + d);

                        L = v_min(L, (v_load(Lr_ppr + d - 1) + _P1));
                        L = v_min(L, (v_load(Lr_ppr + d + 1) + _P1));

                        L = v_min(L, _delta);
                        L = ((L - _delta) + Cpd);

                        v_store(Lr_p + d, L);

                        // Get minimum from in L-L3
                        _minL = v_min(_minL, L);

                        v_int16x8 Sval = v_load(Sp + d);

                        Sval = Sval + L;

                        v_store(Sp + d, Sval);
                    }

                    v_int32x4 min1, min2, min12;
                    v_expand(_minL, min1, min2);
                    min12 = v_min(min1,min2);
                    minLr = (CostType)v_reduce_min(min12);
                }
                else
#endif
                {
                    minLr = MAX_COST;
                    for( d = 0; d < D; d++ )
                    {
                        int Cpd = Cp[d], L;

                        L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;

                        Lr_p[d] = (CostType)L;
                        minLr = (CostType)std::min((int)minLr, L);

                        Sp[d] = saturate_cast<CostType>(Sp[d] + L);
                    }
                }
            }

            // reset the Lr ping-pong buffers for the right-to-left pass
            memset( Lr - 8, 0, LrSize*sizeof(CostType) );
            Lr[-1] = Lr[D] = Lr[D2 - 1] = Lr[D2 + D] = MAX_COST;

            minLr = 0;

            // right-to-left aggregation; S is now complete per pixel, so the winning
            // disparity (minimum S) is selected in the same sweep
            for( x = width1-1; x != -1; x--)
            {
                int delta = minLr + P2;

                CostType* Lr_ppr = Lr + ((x&1)? 0 :D2);

                CostType* Lr_p = Lr + ((x&1)? D2 :0);
                const CostType* Cp = C + x*D;
                CostType* Sp = S + x*D;
                int minS = MAX_COST, bestDisp = -1;
                minLr = MAX_COST;

#if CV_SIMD128
                if( useSIMD )
                {
                    v_int16x8 _P1 = v_setall_s16((short)P1);

                    v_int16x8 _delta = v_setall_s16((short)delta);
                    v_int16x8 _minL = v_setall_s16((short)MAX_COST);

                    // per-lane running minimum of S and the disparity index that achieved it
                    v_int16x8 _minS = v_setall_s16(MAX_COST), _bestDisp = v_setall_s16(-1);
                    v_int16x8 _d8 = v_int16x8(0, 1, 2, 3, 4, 5, 6, 7), _8 = v_setall_s16(8);

                    for( d = 0; d < D; d+= 8 )
                    {
                        v_int16x8 Cpd = v_load(Cp + d);
                        v_int16x8 L;

                        L = v_load(Lr_ppr + d);

                        L = v_min(L, (v_load(Lr_ppr + d - 1) + _P1));
                        L = v_min(L, (v_load(Lr_ppr + d + 1) + _P1));

                        L = v_min(L, _delta);
                        L = ((L - _delta) + Cpd);

                        v_store(Lr_p + d, L);

                        // Get minimum from in L-L3
                        _minL = v_min(_minL, L);

                        v_int16x8 Sval = v_load(Sp + d);

                        Sval = Sval + L;

                        // blend in the new disparity index where Sval improved the minimum
                        v_int16x8 mask = Sval < _minS;
                        _minS = v_min( Sval, _minS );
                        _bestDisp = _bestDisp ^ ((_bestDisp ^ _d8) & mask);
                        _d8 = _d8 + _8;

                        v_store(Sp + d, Sval);
                    }
                    v_int32x4 min1, min2, min12;
                    v_expand(_minL, min1, min2);
                    min12 = v_min(min1,min2);
                    minLr = (CostType)v_reduce_min(min12);

                    v_int32x4 _d0, _d1;
                    v_expand(_minS, _d0, _d1);
                    minS = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1));
                    v_int16x8 v_mask = v_setall_s16((short)minS) == _minS;

                    // keep only the lanes that hold the global minimum, take the smallest index
                    _bestDisp = (_bestDisp & v_mask) | (v_setall_s16(SHRT_MAX) & ~v_mask);
                    v_expand(_bestDisp, _d0, _d1);
                    bestDisp = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1));
                }
                else
#endif
                {
                    for( d = 0; d < D; d++ )
                    {
                        int Cpd = Cp[d], L;

                        L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;

                        Lr_p[d] = (CostType)L;
                        minLr = (CostType)std::min((int)minLr, L);

                        Sp[d] = saturate_cast<CostType>(Sp[d] + L);
                        if( Sp[d] < minS )
                        {
                            minS = Sp[d];
                            bestDisp = d;
                        }
                    }
                }
                //Some postprocessing procedures and saving
                // uniqueness check: reject the pixel if a non-adjacent disparity is
                // almost as good as the winner (within uniquenessRatio percent)
                for( d = 0; d < D; d++ )
                {
                    if( Sp[d]*(100 - uniquenessRatio) < minS*100 && std::abs(bestDisp - d) > 1 )
                        break;
                }
                if( d < D )
                    continue;
                d = bestDisp;
                // record the best match seen so far for the corresponding right-image
                // column (used by the left-right check below)
                int _x2 = x + minX1 - d - minD;
                if( disp2cost[_x2] > minS )
                {
                    disp2cost[_x2] = (CostType)minS;
                    disp2ptr[_x2] = (DispType)(d + minD);
                }

                if( 0 < d && d < D-1 )
                {
                    // do subpixel quadratic interpolation:
                    // fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
                    // then find minimum of the parabola.
                    int denom2 = std::max(Sp[d-1] + Sp[d+1] - 2*Sp[d], 1);
                    d = d*DISP_SCALE + ((Sp[d-1] - Sp[d+1])*DISP_SCALE + denom2)/(denom2*2);
                }
                else
                    d *= DISP_SCALE;
                disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE);
            }
            //Left-right check sanity procedure
            for( x = minX1; x < maxX1; x++ )
            {
                // we round the computed disparity both towards -inf and +inf and check
                // if either of the corresponding disparities in disp2 is consistent.
                // This is to give the computed disparity a chance to look valid if it is.
                int d1 = disp1ptr[x];
                if( d1 == INVALID_DISP_SCALED )
                    continue;
                int _d = d1 >> DISP_SHIFT;
                int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT;
                int _x = x - _d, x_ = x - d_;
                if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&
                    0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )
                    disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
            }
        }
    }

    static const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
    static const int DISP_SCALE = (1 << DISP_SHIFT);
    static const CostType MAX_COST = SHRT_MAX;
    static const int ALIGN = 16;
    const Mat& img1;         // left input image
    const Mat& img2;         // right input image
    Mat& disp1;              // output disparity map (DispType, scaled by DISP_SCALE)
    CostType* Cbuf;          // shared raw cost volume (filled by CalcVerticalSums)
    CostType* Sbuf;          // shared aggregated cost volume
    int minD;                // minimum disparity
    int maxD;                // maximum disparity (exclusive)
    int D;                   // number of disparities tested
    int D2;                  // D + 16 (border/alignment padding)
    int width;               // image width
    int width1;              // columns with a full disparity search range
    int height;              // image height
    int P1;                  // small-jump smoothness penalty
    int P2;                  // large-jump smoothness penalty (>= P1+1)
    int minX1;               // first column with a full search range
    int maxX1;               // one past the last such column
    size_t costBufSize;      // elements per C/S row: width1*D
    size_t CSBufSize;        // elements in the whole C (or S) volume
    size_t LrSize;           // elements in the two ping-pong Lr lines: 2*D2
    int INVALID_DISP;        // sentinel disparity: minD - 1
    int INVALID_DISP_SCALED; // sentinel after DISP_SCALE scaling
    int uniquenessRatio;     // margin (percent) for the uniqueness check
    int disp12MaxDiff;       // max allowed left-right disparity difference
    bool useSIMD;            // runtime SIMD128 availability
};
1429
/*
 computes disparity for "roi" in img1 w.r.t. img2 and writes it to disp1buf.
 that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y).
 minD <= d < maxD.

 note that disp1buf will have the same size as the roi.
 On exit disp1buf is not the final disparity; it is an intermediate result that becomes
 final after all the tiles are processed.

 the disparity in disp1buf is written with sub-pixel accuracy
 (4 fractional bits, see StereoSGBM::DISP_SCALE),
 using quadratic interpolation, while the disparity in disp2buf
 is written as is, without interpolation.
*/
1443
// 4-direction ("HH4") SGBM driver: validates/defaults the parameters, lays out
// the shared C/S cost volumes plus per-direction scratch in `buffer`, then runs
// the vertical aggregation (CalcVerticalSums, parallel over columns) followed by
// the horizontal aggregation + disparity selection (CalcHorizontalSums, parallel
// over rows). disp1 receives the sub-pixel-scaled result.
static void computeDisparitySGBM_HH4( const Mat& img1, const Mat& img2,
                                      Mat& disp1, const StereoSGBMParams& params,
                                      Mat& buffer )
{
    const int ALIGN = 16;
    const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
    const int DISP_SCALE = (1 << DISP_SHIFT);
    int minD = params.minDisparity, maxD = minD + params.numDisparities;
    Size SADWindowSize;
    SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;
    // prefilter cap: at least 15 and forced odd
    int ftzero = std::max(params.preFilterCap, 15) | 1;
    int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
    int k, width = disp1.cols, height = disp1.rows;
    int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
    int D = maxD - minD, width1 = maxX1 - minX1;
    int SH2 = SADWindowSize.height/2;
    int INVALID_DISP = minD - 1;
    int INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
    const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
    PixType clipTab[TAB_SIZE];

    // lookup table mapping a (biased) derivative value to [0, 2*ftzero]
    for( k = 0; k < TAB_SIZE; k++ )
        clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);

    // no column has a full disparity search range -> nothing to match
    if( minX1 >= maxX1 )
    {
        disp1 = Scalar::all(INVALID_DISP_SCALED);
        return;
    }

    CV_Assert( D % 16 == 0 );

    int D2 = D+16;

    // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
    // for dynamic programming we need the current row and
    // the previous row, i.e. 2 rows in total
    const int NLR = 2;

    // for each possible stereo match (img1(x,y) <=> img2(x-d,y))
    // we keep pixel difference cost (C) and the summary cost over 4 directions (S).
    // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
    size_t costBufSize = width1*D;
    size_t CSBufSize = costBufSize*height;
    size_t minLrSize = width1 , LrSize = minLrSize*D2;
    int hsumBufNRows = SH2*2 + 2;
    size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[]
        costBufSize*hsumBufNRows*sizeof(CostType) + // hsumBuf
        CSBufSize*2*sizeof(CostType) + 1024; // C, S

    if( buffer.empty() || !buffer.isContinuous() ||
        buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
    {
        buffer.reserveBuffer(totalBufSize);
    }

    // summary cost over different (nDirs) directions
    CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), ALIGN);

    // add P2 to every C(x,y). it saves a few operations in the inner loops
    for(k = 0; k < (int)CSBufSize; k++ )
        Cbuf[k] = (CostType)P2;

    // both bodies share Cbuf: vertical pass fills C and seeds S,
    // horizontal pass finishes S and writes disp1
    parallel_for_(Range(0,width1),CalcVerticalSums(img1, img2, params, Cbuf, clipTab),8);
    parallel_for_(Range(0,height),CalcHorizontalSums(img1, img2, disp1, params, Cbuf),8);

}
1510
1511
//////////////////////////////////////////////////////////////////////////////////////////////////////
1512
1513
// Carves a single scratch Mat into all working buffers of the 3-way SGBM
// pipeline (cost-volume line, box-filter ring, aggregation and LRC buffers).
// Declared here so SGBM3WayMainLoop can use it; defined below.
void getBufferPointers(Mat& buffer, int width, int width1, int D, int num_ch, int SH2, int P2,
                       CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff,
                       PixType*& tmpBuf, CostType*& horPassCostVolume,
                       CostType*& vertPassCostVolume, CostType*& vertPassMin, CostType*& rightPassBuf,
                       CostType*& disp2CostBuf, short*& disp2Buf);
1518
1519
// Parallel body of the 3-way (MODE_SGBM_3WAY) matcher: each invocation processes
// one horizontal stripe of the image, aggregating costs along three path
// directions (left, top, right). Member definitions follow below.
struct SGBM3WayMainLoop : public ParallelLoopBody
{
    Mat* buffers;            // per-stripe scratch Mats (see getBufferPointers)
    const Mat *img1, *img2;  // left / right input images
    Mat* dst_disp;           // per-stripe output disparity Mats

    int nstripes, stripe_sz; // stripe count and rows per stripe
    int stripe_overlap;      // extra rows shared between neighbouring stripes

    int width,height;
    int minD, maxD, D;       // disparity search range; D = maxD - minD
    int minX1, maxX1, width1; // columns with a full search range

    int SW2, SH2;            // matching-window half width / half height
    int P1, P2;              // smoothness penalties (P2 >= P1+1)
    int uniquenessRatio, disp12MaxDiff; // post-filtering thresholds

    int costBufSize, hsumBufNRows; // width1*D and SH2*2+2
    int TAB_OFS, ftzero;     // clipTab offset and prefilter cap

#if CV_SIMD128
    bool useSIMD;            // runtime SIMD128 availability
#endif

    PixType* clipTab;        // clipping lookup table for the BT cost

    SGBM3WayMainLoop(Mat *_buffers, const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, PixType* _clipTab, int _nstripes, int _stripe_overlap);
    void getRawMatchingCost(CostType* C, CostType* hsumBuf, CostType* pixDiff, PixType* tmpBuf, int y, int src_start_idx) const;
    void operator () (const Range& range) const CV_OVERRIDE;
};
1549
1550
// Precomputes every per-run constant of the 3-way SGBM main loop from the
// algorithm parameters, applying the same validation/defaulting rules as the
// other SGBM code paths.
SGBM3WayMainLoop::SGBM3WayMainLoop(Mat *_buffers, const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, PixType* _clipTab, int _nstripes, int _stripe_overlap):
    buffers(_buffers), img1(&_img1), img2(&_img2), dst_disp(_dst_disp), clipTab(_clipTab)
{
    // stripe partitioning of the image rows (stripe_sz depends on nstripes)
    nstripes = _nstripes;
    stripe_overlap = _stripe_overlap;
    stripe_sz = (int)ceil(img1->rows/(double)nstripes);

    // image geometry and disparity search range
    width = img1->cols;
    height = img1->rows;
    minD = params.minDisparity;
    maxD = minD + params.numDisparities;
    D = maxD - minD;
    minX1 = std::max(maxD, 0);
    maxX1 = width + std::min(minD, 0);
    width1 = maxX1 - minX1;
    CV_Assert( D % 16 == 0 );

    // matching-window half sizes and smoothness penalties (with defaults)
    SW2 = SH2 = params.SADWindowSize > 0 ? params.SADWindowSize/2 : 1;
    P1 = params.P1 > 0 ? params.P1 : 2;
    P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);

    // post-filtering thresholds (with defaults)
    uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
    disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;

    // derived buffer geometry and the prefilter cap (at least 15, forced odd)
    costBufSize = width1*D;
    hsumBufNRows = SH2*2 + 2;
    TAB_OFS = 256*4;
    ftzero = std::max(params.preFilterCap, 15) | 1;

#if CV_SIMD128
    useSIMD = hasSIMD128();
#endif
}
1577
1578
void getBufferPointers(Mat& buffer, int width, int width1, int D, int num_ch, int SH2, int P2,
1579
CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff,
1580
PixType*& tmpBuf, CostType*& horPassCostVolume,
1581
CostType*& vertPassCostVolume, CostType*& vertPassMin, CostType*& rightPassBuf,
1582
CostType*& disp2CostBuf, short*& disp2Buf)
1583
{
1584
// allocating all the required memory:
1585
int costVolumeLineSize = width1*D;
1586
int width1_ext = width1+2;
1587
int costVolumeLineSize_ext = width1_ext*D;
1588
int hsumBufNRows = SH2*2 + 2;
1589
1590
// main buffer to store matching costs for the current line:
1591
int curCostVolumeLineSize = costVolumeLineSize*sizeof(CostType);
1592
1593
// auxiliary buffers for the raw matching cost computation:
1594
int hsumBufSize = costVolumeLineSize*hsumBufNRows*sizeof(CostType);
1595
int pixDiffSize = costVolumeLineSize*sizeof(CostType);
1596
int tmpBufSize = width*16*num_ch*sizeof(PixType);
1597
1598
// auxiliary buffers for the matching cost aggregation:
1599
int horPassCostVolumeSize = costVolumeLineSize_ext*sizeof(CostType); // buffer for the 2-pass horizontal cost aggregation
1600
int vertPassCostVolumeSize = costVolumeLineSize_ext*sizeof(CostType); // buffer for the vertical cost aggregation
1601
int vertPassMinSize = width1_ext*sizeof(CostType); // buffer for storing minimum costs from the previous line
1602
int rightPassBufSize = D*sizeof(CostType); // additional small buffer for the right-to-left pass
1603
1604
// buffers for the pseudo-LRC check:
1605
int disp2CostBufSize = width*sizeof(CostType);
1606
int disp2BufSize = width*sizeof(short);
1607
1608
// sum up the sizes of all the buffers:
1609
size_t totalBufSize = curCostVolumeLineSize +
1610
hsumBufSize +
1611
pixDiffSize +
1612
tmpBufSize +
1613
horPassCostVolumeSize +
1614
vertPassCostVolumeSize +
1615
vertPassMinSize +
1616
rightPassBufSize +
1617
disp2CostBufSize +
1618
disp2BufSize +
1619
16; //to compensate for the alignPtr shifts
1620
1621
if( buffer.empty() || !buffer.isContinuous() || buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
1622
buffer.reserveBuffer(totalBufSize);
1623
1624
// set up all the pointers:
1625
curCostVolumeLine = (CostType*)alignPtr(buffer.ptr(), 16);
1626
hsumBuf = curCostVolumeLine + costVolumeLineSize;
1627
pixDiff = hsumBuf + costVolumeLineSize*hsumBufNRows;
1628
tmpBuf = (PixType*)(pixDiff + costVolumeLineSize);
1629
horPassCostVolume = (CostType*)(tmpBuf + width*16*num_ch);
1630
vertPassCostVolume = horPassCostVolume + costVolumeLineSize_ext;
1631
rightPassBuf = vertPassCostVolume + costVolumeLineSize_ext;
1632
vertPassMin = rightPassBuf + D;
1633
disp2CostBuf = vertPassMin + width1_ext;
1634
disp2Buf = disp2CostBuf + width;
1635
1636
// initialize memory:
1637
memset(buffer.ptr(),0,totalBufSize);
1638
for(int i=0;i<costVolumeLineSize;i++)
1639
curCostVolumeLine[i] = (CostType)P2; //such initialization simplifies the cost aggregation loops a bit
1640
}
1641
1642
// performing block matching and building raw cost-volume for the current row
1643
// performing block matching and building raw cost-volume for the current row
// C receives SADWindowSize-box-filtered Birchfield-Tomasi costs for row y of the
// stripe that starts at src_start_idx. hsumBuf is a ring of hsumBufNRows
// horizontally pre-summed rows, so each new row only adds the window's incoming
// bottom row and subtracts its outgoing top row.
void SGBM3WayMainLoop::getRawMatchingCost(CostType* C, // target cost-volume row
                                          CostType* hsumBuf, CostType* pixDiff, PixType* tmpBuf, //buffers
                                          int y, int src_start_idx) const
{
    int x, d;
    // on the stripe's first row prime the ring with rows src_start_idx..+SH2;
    // afterwards only the new bottom row (y + SH2) of the vertical window is added
    int dy1 = (y == src_start_idx) ? src_start_idx : y + SH2, dy2 = (y == src_start_idx) ? src_start_idx+SH2 : dy1;

    for(int k = dy1; k <= dy2; k++ )
    {
        CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;
        if( k < height )
        {
            calcPixelCostBT( *img1, *img2, k, minD, maxD, pixDiff, tmpBuf, clipTab, TAB_OFS, ftzero );

            // seed the horizontal window sum at x == 0, replicating the left border
            // (the x == 0 term is weighted SW2+1 times)
            memset(hsumAdd, 0, D*sizeof(CostType));
            for(x = 0; x <= SW2*D; x += D )
            {
                int scale = x == 0 ? SW2 + 1 : 1;

                for( d = 0; d < D; d++ )
                    hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale);
            }

            if( y > src_start_idx )
            {
                // sliding-window update: C(y) += hsum(y+SH2) - hsum(y-SH2-1)
                const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, src_start_idx) % hsumBufNRows)*costBufSize;

                for( x = D; x < width1*D; x += D )
                {
                    // horizontal sliding window over pixDiff (clamped at the borders)
                    const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
                    const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);

#if CV_SIMD128
                    if(useSIMD)
                    {
                        v_int16x8 hv_reg;
                        for( d = 0; d < D; d+=8 )
                        {
                            hv_reg = v_load_aligned(hsumAdd+x-D+d) + (v_load_aligned(pixAdd+d) - v_load_aligned(pixSub+d));
                            v_store_aligned(hsumAdd+x+d,hv_reg);
                            v_store_aligned(C+x+d,v_load_aligned(C+x+d)+(hv_reg-v_load_aligned(hsumSub+x+d)));
                        }
                    }
                    else
#endif
                    {
                        for( d = 0; d < D; d++ )
                        {
                            int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
                            C[x + d] = (CostType)(C[x + d] + hv - hsumSub[x + d]);
                        }
                    }
                }
            }
            else
            {
                // first stripe row: only build the horizontal sums; C is filled by
                // the scaled accumulation below
                for( x = D; x < width1*D; x += D )
                {
                    const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
                    const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);

                    for( d = 0; d < D; d++ )
                        hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
                }
            }
        }

        if( y == src_start_idx )
        {
            // replicate the top border row SH2+1 times so the vertical window is
            // fully populated on the stripe's first row
            int scale = k == src_start_idx ? SH2 + 1 : 1;
            for( x = 0; x < width1*D; x++ )
                C[x] = (CostType)(C[x] + hsumAdd[x]*scale);
        }
    }
}
1718
1719
#if CV_SIMD128
// define some additional reduce operations:
// Returns the smallest index stored in `pos` among the lanes of `val` that equal
// `min_val`; losing lanes are replaced by SHRT_MAX before the reduction so they
// cannot win.
inline short min_pos(const v_int16x8& val, const v_int16x8& pos, const short min_val)
{
    const v_int16x8 is_minimum = (v_setall_s16(min_val) == val);
    const v_int16x8 candidates = v_select(is_minimum, pos, v_setall_s16(SHRT_MAX));
    return v_reduce_min(candidates);
}
#endif
1730
1731
// performing SGM cost accumulation from left to right (result is stored in leftBuf) and
1732
// in-place cost accumulation from top to bottom (result is stored in topBuf)
1733
// performing SGM cost accumulation from left to right (result is stored in leftBuf) and
// in-place cost accumulation from top to bottom (result is stored in topBuf)
//
// costs         raw matching costs C(p, d) for the current pixel (D entries)
// leftBuf_prev  L_left of the previous pixel in the row; leftBuf receives L_left(p)
// topBuf        in/out: L_top of the pixel above, overwritten with L_top(p)
// leftMinCost / topMinCost  in/out: min over d of the previous L, updated to the
//                           minima of the newly computed lines
// The SIMD path keeps a 3-register sliding window (src0|src1|src2) so the d-1 and
// d+1 neighbours can be formed with v_extract lane shifts; out-of-range
// neighbours are SHRT_MAX so they never win the min.
inline void accumulateCostsLeftTop(CostType* leftBuf, CostType* leftBuf_prev, CostType* topBuf, CostType* costs,
                                   CostType& leftMinCost, CostType& topMinCost, int D, int P1, int P2)
{
#if CV_SIMD128
    if(hasSIMD128())
    {
        v_int16x8 P1_reg = v_setall_s16(cv::saturate_cast<CostType>(P1));

        v_int16x8 leftMinCostP2_reg = v_setall_s16(cv::saturate_cast<CostType>(leftMinCost+P2));
        v_int16x8 leftMinCost_new_reg = v_setall_s16(SHRT_MAX);
        v_int16x8 src0_leftBuf = v_setall_s16(SHRT_MAX);
        v_int16x8 src1_leftBuf = v_load_aligned(leftBuf_prev);

        v_int16x8 topMinCostP2_reg = v_setall_s16(cv::saturate_cast<CostType>(topMinCost+P2));
        v_int16x8 topMinCost_new_reg = v_setall_s16(SHRT_MAX);
        v_int16x8 src0_topBuf = v_setall_s16(SHRT_MAX);
        v_int16x8 src1_topBuf = v_load_aligned(topBuf);

        v_int16x8 src2;
        v_int16x8 src_shifted_left,src_shifted_right;
        v_int16x8 res;

        for(int i=0;i<D-8;i+=8)
        {
            //process leftBuf:
            //lookahead load:
            src2 = v_load_aligned(leftBuf_prev+i+8);

            //get shifted versions of the current block and add P1:
            src_shifted_left  = v_extract<7> (src0_leftBuf,src1_leftBuf) + P1_reg;
            src_shifted_right = v_extract<1> (src1_leftBuf,src2        ) + P1_reg;

            // process and save current block:
            res = v_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg);
            leftMinCost_new_reg = v_min(leftMinCost_new_reg,res);
            v_store_aligned(leftBuf+i, res);

            //update src buffers:
            src0_leftBuf = src1_leftBuf;
            src1_leftBuf = src2;

            //process topBuf:
            //lookahead load:
            src2 = v_load_aligned(topBuf+i+8);

            //get shifted versions of the current block and add P1:
            src_shifted_left  = v_extract<7> (src0_topBuf,src1_topBuf) + P1_reg;
            src_shifted_right = v_extract<1> (src1_topBuf,src2       ) + P1_reg;

            // process and save current block:
            res = v_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg);
            topMinCost_new_reg = v_min(topMinCost_new_reg,res);
            v_store_aligned(topBuf+i, res);

            //update src buffers:
            src0_topBuf = src1_topBuf;
            src1_topBuf = src2;
        }

        // a bit different processing for the last cycle of the loop
        // (no lookahead block exists, so the d+1 neighbour is SHRT_MAX):
        //process leftBuf:
        src2 = v_setall_s16(SHRT_MAX);
        src_shifted_left  = v_extract<7> (src0_leftBuf,src1_leftBuf) + P1_reg;
        src_shifted_right = v_extract<1> (src1_leftBuf,src2        ) + P1_reg;

        res = v_load_aligned(costs+D-8) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg);
        leftMinCost = v_reduce_min(v_min(leftMinCost_new_reg,res));
        v_store_aligned(leftBuf+D-8, res);

        //process topBuf:
        src2 = v_setall_s16(SHRT_MAX);
        src_shifted_left  = v_extract<7> (src0_topBuf,src1_topBuf) + P1_reg;
        src_shifted_right = v_extract<1> (src1_topBuf,src2       ) + P1_reg;

        res = v_load_aligned(costs+D-8) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg);
        topMinCost = v_reduce_min(v_min(topMinCost_new_reg,res));
        v_store_aligned(topBuf+D-8, res);
    }
    else
#endif
    {
        CostType leftMinCost_new = SHRT_MAX;
        CostType topMinCost_new = SHRT_MAX;
        int leftMinCost_P2 = leftMinCost + P2;
        int topMinCost_P2 = topMinCost + P2;
        // previous-d values, kept in locals because topBuf is updated in place
        CostType leftBuf_prev_i_minus_1 = SHRT_MAX;
        CostType topBuf_i_minus_1 = SHRT_MAX;
        CostType tmp;

        for(int i=0;i<D-1;i++)
        {
            leftBuf[i] = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(leftBuf_prev_i_minus_1+P1,leftBuf_prev[i+1]+P1),std::min((int)leftBuf_prev[i],leftMinCost_P2))-leftMinCost_P2);
            leftBuf_prev_i_minus_1 = leftBuf_prev[i];
            leftMinCost_new = std::min(leftMinCost_new,leftBuf[i]);

            tmp = topBuf[i];
            topBuf[i] = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(topBuf_i_minus_1+P1,topBuf[i+1]+P1),std::min((int)topBuf[i],topMinCost_P2))-topMinCost_P2);
            topBuf_i_minus_1 = tmp;
            topMinCost_new = std::min(topMinCost_new,topBuf[i]);
        }

        // last disparity has no d+1 neighbour
        leftBuf[D-1] = cv::saturate_cast<CostType>(costs[D-1] + std::min(leftBuf_prev_i_minus_1+P1,std::min((int)leftBuf_prev[D-1],leftMinCost_P2))-leftMinCost_P2);
        leftMinCost = std::min(leftMinCost_new,leftBuf[D-1]);

        topBuf[D-1] = cv::saturate_cast<CostType>(costs[D-1] + std::min(topBuf_i_minus_1+P1,std::min((int)topBuf[D-1],topMinCost_P2))-topMinCost_P2);
        topMinCost = std::min(topMinCost_new,topBuf[D-1]);
    }
}
1841
1842
// performing in-place SGM cost accumulation from right to left (the result is stored in rightBuf) and
// summing rightBuf, topBuf, leftBuf together (the result is stored in leftBuf), as well as finding the
// optimal disparity value with minimum accumulated cost
// Parameters:
//   rightBuf     - accumulated costs from the right-pass direction (updated in place)
//   topBuf       - accumulated costs from the top-pass direction (read only)
//   leftBuf      - accumulated costs from the left-pass direction; on exit holds the
//                  three-direction sum rightBuf+topBuf+leftBuf
//   costs        - raw matching costs for the current pixel, D entries
//   rightMinCost - in: minimum right-pass cost of the previous pixel; out: minimum of this pixel
//   D            - number of disparities (the SIMD path assumes D is a multiple of 8, D >= 16)
//   P1, P2       - SGM smoothness penalties (P1 for +/-1 disparity change, P2 for larger jumps)
//   optimal_disp - out: disparity index with the minimum summed cost
//   min_cost     - out: that minimum summed cost
inline void accumulateCostsRight(CostType* rightBuf, CostType* topBuf, CostType* leftBuf, CostType* costs,
                                 CostType& rightMinCost, int D, int P1, int P2, int& optimal_disp, CostType& min_cost)
{
#if CV_SIMD128
    if(hasSIMD128())
    {
        v_int16x8 P1_reg = v_setall_s16(cv::saturate_cast<CostType>(P1));

        // rightMinCost+P2 caps the transition cost for disparity jumps larger than 1:
        v_int16x8 rightMinCostP2_reg = v_setall_s16(cv::saturate_cast<CostType>(rightMinCost+P2));
        v_int16x8 rightMinCost_new_reg = v_setall_s16(SHRT_MAX);
        // src0/src1/src2 form a sliding three-block window over rightBuf so the
        // d-1 and d+1 neighbours can be produced with v_extract:
        v_int16x8 src0_rightBuf = v_setall_s16(SHRT_MAX);
        v_int16x8 src1_rightBuf = v_load(rightBuf);

        v_int16x8 src2;
        v_int16x8 src_shifted_left,src_shifted_right;
        v_int16x8 res;

        // per-lane running minimum of the summed cost and the disparity index it occurred at:
        v_int16x8 min_sum_cost_reg = v_setall_s16(SHRT_MAX);
        v_int16x8 min_sum_pos_reg = v_setall_s16(0);
        v_int16x8 loop_idx(0,1,2,3,4,5,6,7);
        v_int16x8 eight_reg = v_setall_s16(8);

        for(int i=0;i<D-8;i+=8)
        {
            //lookahead load:
            src2 = v_load_aligned(rightBuf+i+8);

            //get shifted versions of the current block and add P1:
            src_shifted_left = v_extract<7> (src0_rightBuf,src1_rightBuf) + P1_reg;
            src_shifted_right = v_extract<1> (src1_rightBuf,src2 ) + P1_reg;

            // process and save current block (standard SGM recurrence, normalized by rightMinCost+P2):
            res = v_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg);
            rightMinCost_new_reg = v_min(rightMinCost_new_reg,res);
            v_store_aligned(rightBuf+i, res);

            // compute and save total cost:
            res = res + v_load_aligned(leftBuf+i) + v_load_aligned(topBuf+i);
            v_store_aligned(leftBuf+i, res);

            // track disparity value with the minimum cost:
            // (when a lane's minimum is updated, blend the current index into min_sum_pos_reg)
            min_sum_cost_reg = v_min(min_sum_cost_reg,res);
            min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (loop_idx - min_sum_pos_reg));
            loop_idx = loop_idx+eight_reg;

            //update src:
            src0_rightBuf = src1_rightBuf;
            src1_rightBuf = src2;
        }

        // a bit different processing for the last cycle of the loop:
        // (there is no block after the last one, so pad the lookahead with SHRT_MAX)
        src2 = v_setall_s16(SHRT_MAX);
        src_shifted_left = v_extract<7> (src0_rightBuf,src1_rightBuf) + P1_reg;
        src_shifted_right = v_extract<1> (src1_rightBuf,src2 ) + P1_reg;

        res = v_load_aligned(costs+D-8) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg);
        rightMinCost = v_reduce_min(v_min(rightMinCost_new_reg,res));
        v_store_aligned(rightBuf+D-8, res);

        res = res + v_load_aligned(leftBuf+D-8) + v_load_aligned(topBuf+D-8);
        v_store_aligned(leftBuf+D-8, res);

        min_sum_cost_reg = v_min(min_sum_cost_reg,res);
        min_cost = v_reduce_min(min_sum_cost_reg);
        min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (loop_idx - min_sum_pos_reg));
        // reduce the per-lane minima/positions to the single best disparity:
        optimal_disp = min_pos(min_sum_cost_reg,min_sum_pos_reg, min_cost);
    }
    else
#endif
    {
        // scalar fallback: identical recurrence, one disparity at a time
        CostType rightMinCost_new = SHRT_MAX;
        int rightMinCost_P2 = rightMinCost + P2;
        CostType rightBuf_i_minus_1 = SHRT_MAX;  // rightBuf[i-1] value from BEFORE it was overwritten
        CostType tmp;
        min_cost = SHRT_MAX;

        for(int i=0;i<D-1;i++)
        {
            tmp = rightBuf[i];
            rightBuf[i] = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(rightBuf_i_minus_1+P1,rightBuf[i+1]+P1),std::min((int)rightBuf[i],rightMinCost_P2))-rightMinCost_P2);
            rightBuf_i_minus_1 = tmp;
            rightMinCost_new = std::min(rightMinCost_new,rightBuf[i]);
            // accumulate the three-direction sum into leftBuf and track the minimum:
            leftBuf[i] = cv::saturate_cast<CostType>((int)leftBuf[i]+rightBuf[i]+topBuf[i]);
            if(leftBuf[i]<min_cost)
            {
                optimal_disp = i;
                min_cost = leftBuf[i];
            }
        }

        // the last disparity has no d+1 neighbour in the recurrence:
        rightBuf[D-1] = cv::saturate_cast<CostType>(costs[D-1] + std::min(rightBuf_i_minus_1+P1,std::min((int)rightBuf[D-1],rightMinCost_P2))-rightMinCost_P2);
        rightMinCost = std::min(rightMinCost_new,rightBuf[D-1]);
        leftBuf[D-1] = cv::saturate_cast<CostType>((int)leftBuf[D-1]+rightBuf[D-1]+topBuf[D-1]);
        if(leftBuf[D-1]<min_cost)
        {
            optimal_disp = D-1;
            min_cost = leftBuf[D-1];
        }
    }
}
1945
1946
// Processes one horizontal stripe of the image pair: computes raw matching costs
// line by line, runs the 3-way SGM aggregation (left+top forward pass, right
// backward pass), applies uniqueness and pseudo left-right consistency checks,
// and writes sub-pixel disparities into the stripe's dst_disp Mat.
void SGBM3WayMainLoop::operator () (const Range& range) const
{
    // force separate processing of stripes:
    // (parallel_for_ may hand us a multi-stripe range; recurse so each stripe
    //  uses its own buffers and bookkeeping)
    if(range.end>range.start+1)
    {
        for(int n=range.start;n<range.end;n++)
            (*this)(Range(n,n+1));
        return;
    }

    const int DISP_SCALE = (1 << StereoMatcher::DISP_SHIFT);
    int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;

    // setting up the ranges:
    // each stripe starts stripe_overlap rows early so SGM paths "warm up"
    // before the rows that are actually copied to the output
    int src_start_idx = std::max(std::min(range.start * stripe_sz - stripe_overlap, height),0);
    int src_end_idx = std::min(range.end * stripe_sz, height);

    int dst_offset;
    if(range.start==0)
        dst_offset=stripe_overlap;   // first stripe has no overlap rows above it
    else
        dst_offset=0;

    Mat cur_buffer = buffers [range.start];
    Mat cur_disp = dst_disp[range.start];
    cur_disp = Scalar(INVALID_DISP_SCALED);

    // prepare buffers:
    CostType *curCostVolumeLine, *hsumBuf, *pixDiff;
    PixType* tmpBuf;
    CostType *horPassCostVolume, *vertPassCostVolume, *vertPassMin, *rightPassBuf, *disp2CostBuf;
    short* disp2Buf;
    getBufferPointers(cur_buffer,width,width1,D,img1->channels(),SH2,P2,
                      curCostVolumeLine,hsumBuf,pixDiff,tmpBuf,horPassCostVolume,
                      vertPassCostVolume,vertPassMin,rightPassBuf,disp2CostBuf,disp2Buf);

    // start real processing:
    for(int y=src_start_idx;y<src_end_idx;y++)
    {
        getRawMatchingCost(curCostVolumeLine,hsumBuf,pixDiff,tmpBuf,y,src_start_idx);

        short* disp_row = (short*)cur_disp.ptr(dst_offset+(y-src_start_idx));

        // initialize the auxiliary buffers for the pseudo left-right consistency check:
        for(int x=0;x<width;x++)
        {
            disp2Buf[x] = (short)INVALID_DISP_SCALED;
            disp2CostBuf[x] = SHRT_MAX;
        }
        // C is indexed as C[x] with x a multiple of D starting at D, hence the -D bias:
        CostType* C = curCostVolumeLine - D;
        CostType prev_min, min_cost;
        int d, best_d;
        d = best_d = 0;

        // forward pass
        prev_min=0;
        for (int x=D;x<(1+width1)*D;x+=D)
            accumulateCostsLeftTop(horPassCostVolume+x,horPassCostVolume+x-D,vertPassCostVolume+x,C+x,prev_min,vertPassMin[x/D],D,P1,P2);

        //backward pass
        memset(rightPassBuf,0,D*sizeof(CostType));
        prev_min=0;
        for (int x=width1*D;x>=D;x-=D)
        {
            accumulateCostsRight(rightPassBuf,vertPassCostVolume+x,horPassCostVolume+x,C+x,prev_min,D,P1,P2,best_d,min_cost);

            // uniqueness check: reject best_d if some other disparity (farther than
            // +/-1 from it) has a summed cost within uniquenessRatio percent of min_cost
            if(uniquenessRatio>0)
            {
#if CV_SIMD128
                if(useSIMD)
                {
                    horPassCostVolume+=x;
                    int thresh = (100*min_cost)/(100-uniquenessRatio);
                    v_int16x8 thresh_reg = v_setall_s16((short)(thresh+1));
                    v_int16x8 d1 = v_setall_s16((short)(best_d-1));
                    v_int16x8 d2 = v_setall_s16((short)(best_d+1));
                    v_int16x8 eight_reg = v_setall_s16(8);
                    v_int16x8 cur_d(0,1,2,3,4,5,6,7);
                    v_int16x8 mask,cost1,cost2;

                    // scan two 8-lane blocks per iteration; break as soon as any
                    // lane violates uniqueness (d stays < D in that case)
                    for( d = 0; d < D; d+=16 )
                    {
                        cost1 = v_load_aligned(horPassCostVolume+d);
                        cost2 = v_load_aligned(horPassCostVolume+d+8);

                        mask = cost1 < thresh_reg;
                        mask = mask & ( (cur_d<d1) | (cur_d>d2) );
                        if( v_signmask(mask) )
                            break;

                        cur_d = cur_d+eight_reg;

                        mask = cost2 < thresh_reg;
                        mask = mask & ( (cur_d<d1) | (cur_d>d2) );
                        if( v_signmask(mask) )
                            break;

                        cur_d = cur_d+eight_reg;
                    }
                    horPassCostVolume-=x;
                }
                else
#endif
                {
                    for( d = 0; d < D; d++ )
                    {
                        if( horPassCostVolume[x+d]*(100 - uniquenessRatio) < min_cost*100 && std::abs(d - best_d) > 1 )
                            break;
                    }
                }
                if( d < D )
                    continue;   // ambiguous match: leave the pixel invalid
            }
            d = best_d;

            // record the best candidate for the corresponding right-image column
            // (used later by the pseudo left-right consistency check):
            int _x2 = x/D - 1 + minX1 - d - minD;
            if( _x2>=0 && _x2<width && disp2CostBuf[_x2] > min_cost )
            {
                disp2CostBuf[_x2] = min_cost;
                disp2Buf[_x2] = (short)(d + minD);
            }

            if( 0 < d && d < D-1 )
            {
                // do subpixel quadratic interpolation:
                //   fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
                //   then find minimum of the parabola.
                int denom2 = std::max(horPassCostVolume[x+d-1] + horPassCostVolume[x+d+1] - 2*horPassCostVolume[x+d], 1);
                d = d*DISP_SCALE + ((horPassCostVolume[x+d-1] - horPassCostVolume[x+d+1])*DISP_SCALE + denom2)/(denom2*2);
            }
            else
                d *= DISP_SCALE;

            disp_row[(x/D)-1 + minX1] = (DispType)(d + minD*DISP_SCALE);
        }

        for(int x = minX1; x < maxX1; x++ )
        {
            // pseudo LRC consistency check using only one disparity map;
            // pixels with difference more than disp12MaxDiff are invalidated
            int d1 = disp_row[x];
            if( d1 == INVALID_DISP_SCALED )
                continue;
            // round the sub-pixel disparity both down (_d) and up (d_) and accept
            // the pixel if either rounding is consistent with the right-view map:
            int _d = d1 >> StereoMatcher::DISP_SHIFT;
            int d_ = (d1 + DISP_SCALE-1) >> StereoMatcher::DISP_SHIFT;
            int _x = x - _d, x_ = x - d_;
            if( 0 <= _x && _x < width && disp2Buf[_x] >= minD && std::abs(disp2Buf[_x] - _d) > disp12MaxDiff &&
                0 <= x_ && x_ < width && disp2Buf[x_] >= minD && std::abs(disp2Buf[x_] - d_) > disp12MaxDiff )
                disp_row[x] = (short)INVALID_DISP_SCALED;
        }
    }
}
2098
2099
static void computeDisparity3WaySGBM( const Mat& img1, const Mat& img2,
2100
Mat& disp1, const StereoSGBMParams& params,
2101
Mat* buffers, int nstripes )
2102
{
2103
// precompute a lookup table for the raw matching cost computation:
2104
const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
2105
PixType* clipTab = new PixType[TAB_SIZE];
2106
int ftzero = std::max(params.preFilterCap, 15) | 1;
2107
for(int k = 0; k < TAB_SIZE; k++ )
2108
clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
2109
2110
// allocate separate dst_disp arrays to avoid conflicts due to stripe overlap:
2111
int stripe_sz = (int)ceil(img1.rows/(double)nstripes);
2112
int stripe_overlap = (params.SADWindowSize/2+1) + (int)ceil(0.1*stripe_sz);
2113
Mat* dst_disp = new Mat[nstripes];
2114
for(int i=0;i<nstripes;i++)
2115
dst_disp[i].create(stripe_sz+stripe_overlap,img1.cols,CV_16S);
2116
2117
parallel_for_(Range(0,nstripes),SGBM3WayMainLoop(buffers,img1,img2,dst_disp,params,clipTab,nstripes,stripe_overlap));
2118
2119
//assemble disp1 from dst_disp:
2120
short* dst_row;
2121
short* src_row;
2122
for(int i=0;i<disp1.rows;i++)
2123
{
2124
dst_row = (short*)disp1.ptr(i);
2125
src_row = (short*)dst_disp[i/stripe_sz].ptr(stripe_overlap+i%stripe_sz);
2126
memcpy(dst_row,src_row,disp1.cols*sizeof(short));
2127
}
2128
2129
delete[] clipTab;
2130
delete[] dst_disp;
2131
}
2132
2133
// Concrete StereoSGBM implementation: holds a StereoSGBMParams instance,
// dispatches compute() to the mode-specific SGM routine, and supports
// FileStorage (de)serialization of all parameters.
class StereoSGBMImpl CV_FINAL : public StereoSGBM
{
public:
    StereoSGBMImpl()
        : params()
    {
    }

    StereoSGBMImpl( int _minDisparity, int _numDisparities, int _SADWindowSize,
                    int _P1, int _P2, int _disp12MaxDiff, int _preFilterCap,
                    int _uniquenessRatio, int _speckleWindowSize, int _speckleRange,
                    int _mode )
        : params( _minDisparity, _numDisparities, _SADWindowSize,
                  _P1, _P2, _disp12MaxDiff, _preFilterCap,
                  _uniquenessRatio, _speckleWindowSize, _speckleRange,
                  _mode )
    {
    }

    // Computes a CV_16S fixed-point disparity map for a rectified 8-bit pair,
    // then median-filters it and (optionally) removes small speckle regions.
    void compute( InputArray leftarr, InputArray rightarr, OutputArray disparr ) CV_OVERRIDE
    {
        CV_INSTRUMENT_REGION();

        Mat left = leftarr.getMat(), right = rightarr.getMat();
        CV_Assert( left.size() == right.size() && left.type() == right.type() &&
                   left.depth() == CV_8U );

        disparr.create( left.size(), CV_16S );
        Mat disp = disparr.getMat();

        // dispatch to the algorithm variant selected by params.mode:
        switch( params.mode )
        {
        case MODE_SGBM_3WAY:
            computeDisparity3WaySGBM( left, right, disp, params, buffers, num_stripes );
            break;
        case MODE_HH4:
            computeDisparitySGBM_HH4( left, right, disp, params, buffer );
            break;
        default:
            computeDisparitySGBM( left, right, disp, params, buffer );
            break;
        }

        medianBlur(disp, disp, 3);

        if( params.speckleWindowSize > 0 )
            filterSpeckles(disp, (params.minDisparity - 1)*StereoMatcher::DISP_SCALE, params.speckleWindowSize,
                           StereoMatcher::DISP_SCALE*params.speckleRange, buffer);
    }

    // --- trivial parameter accessors ---
    int getMinDisparity() const CV_OVERRIDE { return params.minDisparity; }
    void setMinDisparity(int minDisparity) CV_OVERRIDE { params.minDisparity = minDisparity; }

    int getNumDisparities() const CV_OVERRIDE { return params.numDisparities; }
    void setNumDisparities(int numDisparities) CV_OVERRIDE { params.numDisparities = numDisparities; }

    int getBlockSize() const CV_OVERRIDE { return params.SADWindowSize; }
    void setBlockSize(int blockSize) CV_OVERRIDE { params.SADWindowSize = blockSize; }

    int getSpeckleWindowSize() const CV_OVERRIDE { return params.speckleWindowSize; }
    void setSpeckleWindowSize(int speckleWindowSize) CV_OVERRIDE { params.speckleWindowSize = speckleWindowSize; }

    int getSpeckleRange() const CV_OVERRIDE { return params.speckleRange; }
    void setSpeckleRange(int speckleRange) CV_OVERRIDE { params.speckleRange = speckleRange; }

    int getDisp12MaxDiff() const CV_OVERRIDE { return params.disp12MaxDiff; }
    void setDisp12MaxDiff(int disp12MaxDiff) CV_OVERRIDE { params.disp12MaxDiff = disp12MaxDiff; }

    int getPreFilterCap() const CV_OVERRIDE { return params.preFilterCap; }
    void setPreFilterCap(int preFilterCap) CV_OVERRIDE { params.preFilterCap = preFilterCap; }

    int getUniquenessRatio() const CV_OVERRIDE { return params.uniquenessRatio; }
    void setUniquenessRatio(int uniquenessRatio) CV_OVERRIDE { params.uniquenessRatio = uniquenessRatio; }

    int getP1() const CV_OVERRIDE { return params.P1; }
    void setP1(int P1) CV_OVERRIDE { params.P1 = P1; }

    int getP2() const CV_OVERRIDE { return params.P2; }
    void setP2(int P2) CV_OVERRIDE { params.P2 = P2; }

    int getMode() const CV_OVERRIDE { return params.mode; }
    void setMode(int mode) CV_OVERRIDE { params.mode = mode; }

    // Serializes all parameters under the algorithm's canonical name.
    void write(FileStorage& fs) const CV_OVERRIDE
    {
        writeFormat(fs);
        fs << "name" << name_
           << "minDisparity" << params.minDisparity
           << "numDisparities" << params.numDisparities
           << "blockSize" << params.SADWindowSize
           << "speckleWindowSize" << params.speckleWindowSize
           << "speckleRange" << params.speckleRange
           << "disp12MaxDiff" << params.disp12MaxDiff
           << "preFilterCap" << params.preFilterCap
           << "uniquenessRatio" << params.uniquenessRatio
           << "P1" << params.P1
           << "P2" << params.P2
           << "mode" << params.mode;
    }

    // Restores parameters written by write(); asserts on a name mismatch.
    void read(const FileNode& fn) CV_OVERRIDE
    {
        FileNode n = fn["name"];
        CV_Assert( n.isString() && String(n) == name_ );
        params.minDisparity = (int)fn["minDisparity"];
        params.numDisparities = (int)fn["numDisparities"];
        params.SADWindowSize = (int)fn["blockSize"];
        params.speckleWindowSize = (int)fn["speckleWindowSize"];
        params.speckleRange = (int)fn["speckleRange"];
        params.disp12MaxDiff = (int)fn["disp12MaxDiff"];
        params.preFilterCap = (int)fn["preFilterCap"];
        params.uniquenessRatio = (int)fn["uniquenessRatio"];
        params.P1 = (int)fn["P1"];
        params.P2 = (int)fn["P2"];
        params.mode = (int)fn["mode"];
    }

    StereoSGBMParams params;
    Mat buffer;

    // the number of stripes is fixed, disregarding the number of threads/processors
    // to make the results fully reproducible:
    static const int num_stripes = 4;
    Mat buffers[num_stripes];

    static const char* name_;
};
2254
2255
// canonical algorithm name used by write() and validated by read()
const char* StereoSGBMImpl::name_ = "StereoMatcher.SGBM";
2256
2257
2258
// Factory for the StereoSGBM matcher; forwards all tuning parameters to the
// implementation's constructor and returns it behind the public interface.
Ptr<StereoSGBM> StereoSGBM::create(int minDisparity, int numDisparities, int SADWindowSize,
                                   int P1, int P2, int disp12MaxDiff,
                                   int preFilterCap, int uniquenessRatio,
                                   int speckleWindowSize, int speckleRange,
                                   int mode)
{
    return makePtr<StereoSGBMImpl>(minDisparity, numDisparities, SADWindowSize,
                                   P1, P2, disp12MaxDiff,
                                   preFilterCap, uniquenessRatio,
                                   speckleWindowSize, speckleRange,
                                   mode);
}
2271
2272
// Computes the rectangle of pixels whose disparity can be validly estimated,
// given the rectification ROIs of both images, the disparity search range and
// the matching block size. Returns an empty Rect when no valid region exists.
Rect getValidDisparityROI( Rect roi1, Rect roi2,
                           int minDisparity,
                           int numberOfDisparities,
                           int SADWindowSize )
{
    const int border = SADWindowSize/2;                      // half the matching window
    const int maxD = minDisparity + numberOfDisparities - 1; // largest searched disparity

    Rect r;
    // left edge must leave room for the largest disparity shift into roi2:
    r.x = std::max(roi1.x, roi2.x + maxD) + border;
    r.y = std::max(roi1.y, roi2.y) + border;
    r.width  = std::min(roi1.x + roi1.width,  roi2.x + roi2.width)  - border - r.x;
    r.height = std::min(roi1.y + roi1.height, roi2.y + roi2.height) - border - r.y;

    return (r.width > 0 && r.height > 0) ? r : Rect();
}
2289
2290
typedef cv::Point_<short> Point2s; // compact 16-bit point used by the speckle-filter wavefront buffer
2291
2292
// Connected-component speckle filter. Pixels connected through neighbours whose
// disparity differs by at most maxDiff form a region; regions of area
// <= maxSpeckleSize are overwritten with newVal (the "invalid" marker).
// T is the pixel type (uchar or short). _buf is reused scratch memory holding
// per-pixel labels, the wavefront stack and a per-label small/large flag.
template <typename T>
void filterSpecklesImpl(cv::Mat& img, int newVal, int maxSpeckleSize, int maxDiff, cv::Mat& _buf)
{
    using namespace cv;

    int width = img.cols, height = img.rows, npixels = width*height;
    size_t bufSize = npixels*(int)(sizeof(Point2s) + sizeof(int) + sizeof(uchar));
    if( !_buf.isContinuous() || _buf.empty() || _buf.cols*_buf.rows*_buf.elemSize() < bufSize )
        _buf.reserveBuffer(bufSize);

    // carve the scratch buffer into labels / wavefront / region-type arrays:
    uchar* buf = _buf.ptr();
    int i, j, dstep = (int)(img.step/sizeof(T));
    int* labels = (int*)buf;
    buf += npixels*sizeof(labels[0]);
    Point2s* wbuf = (Point2s*)buf;
    buf += npixels*sizeof(wbuf[0]);
    uchar* rtype = (uchar*)buf;
    int curlabel = 0;

    // clear out label assignments
    memset(labels, 0, npixels*sizeof(labels[0]));

    for( i = 0; i < height; i++ )
    {
        T* ds = img.ptr<T>(i);
        int* ls = labels + width*i;

        for( j = 0; j < width; j++ )
        {
            if( ds[j] != newVal )   // not a bad disparity
            {
                if( ls[j] )     // has a label, check for bad label
                {
                    if( rtype[ls[j]] ) // small region, zero out disparity
                        ds[j] = (T)newVal;
                }
                // no label, assign and propagate
                else
                {
                    Point2s* ws = wbuf; // initialize wavefront
                    Point2s p((short)j, (short)i);  // current pixel
                    curlabel++; // next label
                    int count = 0;  // current region size
                    ls[j] = curlabel;

                    // wavefront propagation (4-connected flood fill using
                    // ws as a LIFO stack of pending pixels)
                    while( ws >= wbuf ) // wavefront not empty
                    {
                        count++;
                        // put neighbors onto wavefront
                        T* dpp = &img.at<T>(p.y, p.x);
                        T dp = *dpp;
                        int* lpp = labels + width*p.y + p.x;

                        // below
                        if( p.y < height-1 && !lpp[+width] && dpp[+dstep] != newVal && std::abs(dp - dpp[+dstep]) <= maxDiff )
                        {
                            lpp[+width] = curlabel;
                            *ws++ = Point2s(p.x, p.y+1);
                        }

                        // above
                        if( p.y > 0 && !lpp[-width] && dpp[-dstep] != newVal && std::abs(dp - dpp[-dstep]) <= maxDiff )
                        {
                            lpp[-width] = curlabel;
                            *ws++ = Point2s(p.x, p.y-1);
                        }

                        // right
                        if( p.x < width-1 && !lpp[+1] && dpp[+1] != newVal && std::abs(dp - dpp[+1]) <= maxDiff )
                        {
                            lpp[+1] = curlabel;
                            *ws++ = Point2s(p.x+1, p.y);
                        }

                        // left
                        if( p.x > 0 && !lpp[-1] && dpp[-1] != newVal && std::abs(dp - dpp[-1]) <= maxDiff )
                        {
                            lpp[-1] = curlabel;
                            *ws++ = Point2s(p.x-1, p.y);
                        }

                        // pop most recent and propagate
                        // NB: could try least recent, maybe better convergence
                        p = *--ws;
                    }

                    // assign label type
                    if( count <= maxSpeckleSize )   // speckle region
                    {
                        rtype[ls[j]] = 1;   // small region label
                        ds[j] = (T)newVal;
                    }
                    else
                        rtype[ls[j]] = 0;   // large region label
                }
            }
        }
    }
}
2388
2389
#ifdef HAVE_IPP
2390
static bool ipp_filterSpeckles(Mat &img, int maxSpeckleSize, int newVal, int maxDiff, Mat &buffer)
2391
{
2392
#if IPP_VERSION_X100 >= 810
2393
CV_INSTRUMENT_REGION_IPP();
2394
2395
IppDataType dataType = ippiGetDataType(img.depth());
2396
IppiSize size = ippiSize(img.size());
2397
int bufferSize;
2398
2399
if(img.channels() != 1)
2400
return false;
2401
2402
if(dataType != ipp8u && dataType != ipp16s)
2403
return false;
2404
2405
if(ippiMarkSpecklesGetBufferSize(size, dataType, 1, &bufferSize) < 0)
2406
return false;
2407
2408
if(bufferSize && (buffer.empty() || (int)(buffer.step*buffer.rows) < bufferSize))
2409
buffer.create(1, (int)bufferSize, CV_8U);
2410
2411
switch(dataType)
2412
{
2413
case ipp8u: return CV_INSTRUMENT_FUN_IPP(ippiMarkSpeckles_8u_C1IR, img.ptr<Ipp8u>(), (int)img.step, size, (Ipp8u)newVal, maxSpeckleSize, (Ipp8u)maxDiff, ippiNormL1, buffer.ptr<Ipp8u>()) >= 0;
2414
case ipp16s: return CV_INSTRUMENT_FUN_IPP(ippiMarkSpeckles_16s_C1IR, img.ptr<Ipp16s>(), (int)img.step, size, (Ipp16s)newVal, maxSpeckleSize, (Ipp16s)maxDiff, ippiNormL1, buffer.ptr<Ipp8u>()) >= 0;
2415
default: return false;
2416
}
2417
#else
2418
CV_UNUSED(img); CV_UNUSED(maxSpeckleSize); CV_UNUSED(newVal); CV_UNUSED(maxDiff); CV_UNUSED(buffer);
2419
return false;
2420
#endif
2421
}
2422
#endif
2423
2424
}
2425
2426
// Public speckle-filter entry point: validates the input type, resolves the
// optional user-supplied scratch buffer, tries the IPP fast path, and finally
// dispatches to the generic implementation for the pixel type at hand.
void cv::filterSpeckles( InputOutputArray _img, double _newval, int maxSpeckleSize,
                         double _maxDiff, InputOutputArray __buf )
{
    CV_INSTRUMENT_REGION();

    Mat img = _img.getMat();
    int type = img.type();
    CV_Assert( type == CV_8UC1 || type == CV_16SC1 );

    // use the caller's buffer when provided so its allocation can be reused
    // across calls; otherwise fall back to a local temporary:
    Mat localBuf;
    Mat& scratch = __buf.needed() ? __buf.getMatRef() : localBuf;

    int fillVal = cvRound(_newval);
    int diffThresh = cvRound(_maxDiff);

    CV_IPP_RUN_FAST(ipp_filterSpeckles(img, maxSpeckleSize, fillVal, diffThresh, scratch));

    if (type == CV_16SC1)
        filterSpecklesImpl<short>(img, fillVal, maxSpeckleSize, diffThresh, scratch);
    else
        filterSpecklesImpl<uchar>(img, fillVal, maxSpeckleSize, diffThresh, scratch);
}
2445
2446
// Left-right consistency validation of a CV_16S fixed-point disparity map.
// For each row, left-view disparities are projected into the right view keeping
// the lowest-cost candidate per column; left pixels whose disparity disagrees
// with BOTH roundings of that projection by more than disp12MaxDiff are
// invalidated (set to minDisparity-1, scaled).
void cv::validateDisparity( InputOutputArray _disp, InputArray _cost, int minDisparity,
                            int numberOfDisparities, int disp12MaxDiff )
{
    CV_INSTRUMENT_REGION();

    Mat disp = _disp.getMat(), cost = _cost.getMat();
    int cols = disp.cols, rows = disp.rows;
    int minD = minDisparity, maxD = minDisparity + numberOfDisparities;
    // minX1..maxX1 is the column range where a full disparity search was possible:
    int x, minX1 = std::max(maxD, 0), maxX1 = cols + std::min(minD, 0);
    AutoBuffer<int> _disp2buf(cols*2);
    int* disp2buf = _disp2buf.data();
    int* disp2cost = disp2buf + cols;
    const int DISP_SHIFT = 4, DISP_SCALE = 1 << DISP_SHIFT;
    int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
    int costType = cost.type();

    // compare thresholds in the same fixed-point units as the disparities:
    disp12MaxDiff *= DISP_SCALE;

    CV_Assert( numberOfDisparities > 0 && disp.type() == CV_16S &&
               (costType == CV_16S || costType == CV_32S) &&
               disp.size() == cost.size() );

    for( int y = 0; y < rows; y++ )
    {
        short* dptr = disp.ptr<short>(y);

        // reset the per-row right-view accumulators:
        for( x = 0; x < cols; x++ )
        {
            disp2buf[x] = INVALID_DISP_SCALED;
            disp2cost[x] = INT_MAX;
        }

        // project left-view disparities into the right view, keeping the
        // lowest-cost candidate per right-view column (two branches differ
        // only in the cost element type):
        if( costType == CV_16S )
        {
            const short* cptr = cost.ptr<short>(y);

            for( x = minX1; x < maxX1; x++ )
            {
                int d = dptr[x], c = cptr[x];

                if( d == INVALID_DISP_SCALED )
                    continue;

                int x2 = x - ((d + DISP_SCALE/2) >> DISP_SHIFT);

                if( disp2cost[x2] > c )
                {
                    disp2cost[x2] = c;
                    disp2buf[x2] = d;
                }
            }
        }
        else
        {
            const int* cptr = cost.ptr<int>(y);

            for( x = minX1; x < maxX1; x++ )
            {
                int d = dptr[x], c = cptr[x];

                if( d == INVALID_DISP_SCALED )
                    continue;

                int x2 = x - ((d + DISP_SCALE/2) >> DISP_SHIFT);

                if( disp2cost[x2] > c )
                {
                    disp2cost[x2] = c;
                    disp2buf[x2] = d;
                }
            }
        }

        for( x = minX1; x < maxX1; x++ )
        {
            // we round the computed disparity both towards -inf and +inf and check
            // if either of the corresponding disparities in disp2 is consistent.
            // This is to give the computed disparity a chance to look valid if it is.
            int d = dptr[x];
            if( d == INVALID_DISP_SCALED )
                continue;
            int d0 = d >> DISP_SHIFT;
            int d1 = (d + DISP_SCALE-1) >> DISP_SHIFT;
            int x0 = x - d0, x1 = x - d1;
            if( (0 <= x0 && x0 < cols && disp2buf[x0] > INVALID_DISP_SCALED && std::abs(disp2buf[x0] - d) > disp12MaxDiff) &&
                (0 <= x1 && x1 < cols && disp2buf[x1] > INVALID_DISP_SCALED && std::abs(disp2buf[x1] - d) > disp12MaxDiff) )
                dptr[x] = (short)INVALID_DISP_SCALED;
        }
    }
}
2536
2537