Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/modules/calib3d/src/stereosgbm.cpp
16339 views
1
/*M///////////////////////////////////////////////////////////////////////////////////////
2
//
3
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4
//
5
// By downloading, copying, installing or using the software you agree to this license.
6
// If you do not agree to this license, do not download, install,
7
// copy or use the software.
8
//
9
//
10
// License Agreement
11
// For Open Source Computer Vision Library
12
//
13
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16
// Third party copyrights are property of their respective owners.
17
//
18
// Redistribution and use in source and binary forms, with or without modification,
19
// are permitted provided that the following conditions are met:
20
//
21
// * Redistribution's of source code must retain the above copyright notice,
22
// this list of conditions and the following disclaimer.
23
//
24
// * Redistribution's in binary form must reproduce the above copyright notice,
25
// this list of conditions and the following disclaimer in the documentation
26
// and/or other materials provided with the distribution.
27
//
28
// * The name of the copyright holders may not be used to endorse or promote products
29
// derived from this software without specific prior written permission.
30
//
31
// This software is provided by the copyright holders and contributors "as is" and
32
// any express or implied warranties, including, but not limited to, the implied
33
// warranties of merchantability and fitness for a particular purpose are disclaimed.
34
// In no event shall the Intel Corporation or contributors be liable for any direct,
35
// indirect, incidental, special, exemplary, or consequential damages
36
// (including, but not limited to, procurement of substitute goods or services;
37
// loss of use, data, or profits; or business interruption) however caused
38
// and on any theory of liability, whether in contract, strict liability,
39
// or tort (including negligence or otherwise) arising in any way out of
40
// the use of this software, even if advised of the possibility of such damage.
41
//
42
//M*/
43
44
/*
45
This is a variation of
46
"Stereo Processing by Semiglobal Matching and Mutual Information"
47
by Heiko Hirschmuller.
48
49
We match blocks rather than individual pixels, thus the algorithm is called
50
SGBM (Semi-global block matching)
51
*/
52
53
#include "precomp.hpp"
54
#include <limits.h>
55
#include "opencv2/core/hal/intrin.hpp"
56
57
namespace cv
58
{
59
60
// Basic scalar types used throughout the SGBM implementation.
typedef uchar PixType;   // preprocessed pixel values (clipped derivatives / raw intensities)
typedef short CostType;  // matching costs are accumulated in 16-bit integers
typedef short DispType;  // output disparities, fixed point (scaled by 1 << StereoMatcher::DISP_SHIFT)

// NR is the nominal number of aggregation directions; NR2 = 8 directions are
// actually handled by the Lr loops below (4 per pass, up to 2 passes).
enum { NR = 16, NR2 = NR/2 };
65
66
67
struct StereoSGBMParams
68
{
69
StereoSGBMParams()
70
{
71
minDisparity = numDisparities = 0;
72
SADWindowSize = 0;
73
P1 = P2 = 0;
74
disp12MaxDiff = 0;
75
preFilterCap = 0;
76
uniquenessRatio = 0;
77
speckleWindowSize = 0;
78
speckleRange = 0;
79
mode = StereoSGBM::MODE_SGBM;
80
}
81
82
StereoSGBMParams( int _minDisparity, int _numDisparities, int _SADWindowSize,
83
int _P1, int _P2, int _disp12MaxDiff, int _preFilterCap,
84
int _uniquenessRatio, int _speckleWindowSize, int _speckleRange,
85
int _mode )
86
{
87
minDisparity = _minDisparity;
88
numDisparities = _numDisparities;
89
SADWindowSize = _SADWindowSize;
90
P1 = _P1;
91
P2 = _P2;
92
disp12MaxDiff = _disp12MaxDiff;
93
preFilterCap = _preFilterCap;
94
uniquenessRatio = _uniquenessRatio;
95
speckleWindowSize = _speckleWindowSize;
96
speckleRange = _speckleRange;
97
mode = _mode;
98
}
99
100
int minDisparity;
101
int numDisparities;
102
int SADWindowSize;
103
int preFilterCap;
104
int uniquenessRatio;
105
int P1;
106
int P2;
107
int speckleWindowSize;
108
int speckleRange;
109
int disp12MaxDiff;
110
int mode;
111
};
112
113
// Sentinel for calcPixelCostBT()'s xrange_max: "compute up to the full width1".
static const int DEFAULT_RIGHT_BORDER = -1;
114
/*
115
For each pixel row1[x], max(maxD, 0) <= minX <= x < maxX <= width - max(0, -minD),
116
and for each disparity minD<=d<maxD the function
117
computes the cost (cost[(x-minX)*(maxD - minD) + (d - minD)]), depending on the difference between
118
row1[x] and row2[x-d]. The subpixel algorithm from
119
"Depth Discontinuities by Pixel-to-Pixel Stereo" by Stan Birchfield and C. Tomasi
120
is used, hence the suffix BT.
121
122
the temporary buffer should contain width2*2 elements
123
*/
124
// Computes the Birchfield-Tomasi per-pixel matching cost for one image row
// (see the block comment above for the index/formula conventions).
//   img1, img2   - rectified input pair (1 or 3 channels; see the cn handling below)
//   y            - row index to process
//   minD, maxD   - disparity search range [minD, maxD)
//   cost         - output costs, cost[(x - minX)*D + (d - minD)]
//   buffer       - scratch space (width2*2 min/max values + preprocessed rows)
//   tab, tabOfs  - clipping lookup table and its zero offset (clipTab in the caller)
//   (unnamed int)- unused (kept for signature compatibility with callers passing ftzero)
//   xrange_min/..._max - sub-range of the computable span, used by the parallel version
static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
                            int minD, int maxD, CostType* cost,
                            PixType* buffer, const PixType* tab,
                            int tabOfs, int , int xrange_min = 0, int xrange_max = DEFAULT_RIGHT_BORDER )
{
    int x, c, width = img1.cols, cn = img1.channels();
    // x extent in img1 for which every disparity in [minD, maxD) stays inside img2
    int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
    int D = maxD - minD, width1 = maxX1 - minX1;
    // This minX1/maxX1 correction selects which part of the computable line is
    // actually calculated by this call; the parallel algorithm passes sub-ranges here.
    xrange_min = (xrange_min < 0) ? 0: xrange_min;
    xrange_max = (xrange_max == DEFAULT_RIGHT_BORDER) || (xrange_max > width1) ? width1 : xrange_max;
    maxX1 = minX1 + xrange_max;
    minX1 += xrange_min;
    width1 = maxX1 - minX1;
    // corresponding extent in img2, shifted by the disparity range
    int minX2 = std::max(minX1 - maxD, 0), maxX2 = std::min(maxX1 - minD, width);
    int width2 = maxX2 - minX2;
    const PixType *row1 = img1.ptr<PixType>(y), *row2 = img2.ptr<PixType>(y);
    // buffer layout: first width2*2 entries hold per-pixel min/max of img2 (filled in
    // the c-loop below); prow1/prow2 then hold cn planes of clipped x-derivatives
    // followed by cn planes of raw intensities. prow2 is stored mirrored in x.
    PixType *prow1 = buffer + width2*2, *prow2 = prow1 + width*cn*2;
#if CV_SIMD128
    bool useSIMD = hasSIMD128();
#endif

    tab += tabOfs;

    // initialize the row borders with the table's "zero" value
    for( c = 0; c < cn*2; c++ )
    {
        prow1[width*c] = prow1[width*c + width-1] =
        prow2[width*c] = prow2[width*c + width-1] = tab[0];
    }

    // byte offsets to the previous (n*) and next (s*) image rows, clamped at the borders
    int n1 = y > 0 ? -(int)img1.step : 0, s1 = y < img1.rows-1 ? (int)img1.step : 0;
    int n2 = y > 0 ? -(int)img2.step : 0, s2 = y < img2.rows-1 ? (int)img2.step : 0;

    // common x range that covers both images' needs, kept 1 px inside the borders
    // because the Sobel-like derivative below reads x-1 and x+1
    int minX_cmn = std::min(minX1,minX2)-1;
    int maxX_cmn = std::max(maxX1,maxX2)+1;
    minX_cmn = std::max(minX_cmn, 1);
    maxX_cmn = std::min(maxX_cmn, width - 1);
    if( cn == 1 )
    {
        // grayscale: one derivative plane + one intensity plane per image
        for( x = minX_cmn; x < maxX_cmn; x++ )
        {
            prow1[x] = tab[(row1[x+1] - row1[x-1])*2 + row1[x+n1+1] - row1[x+n1-1] + row1[x+s1+1] - row1[x+s1-1]];
            prow2[width-1-x] = tab[(row2[x+1] - row2[x-1])*2 + row2[x+n2+1] - row2[x+n2-1] + row2[x+s2+1] - row2[x+s2-1]];

            prow1[x+width] = row1[x];
            prow2[width-1-x+width] = row2[x];
        }
    }
    else
    {
        // 3-channel: derivative planes for B, G, R, then raw intensity planes
        for( x = minX_cmn; x < maxX_cmn; x++ )
        {
            prow1[x] = tab[(row1[x*3+3] - row1[x*3-3])*2 + row1[x*3+n1+3] - row1[x*3+n1-3] + row1[x*3+s1+3] - row1[x*3+s1-3]];
            prow1[x+width] = tab[(row1[x*3+4] - row1[x*3-2])*2 + row1[x*3+n1+4] - row1[x*3+n1-2] + row1[x*3+s1+4] - row1[x*3+s1-2]];
            prow1[x+width*2] = tab[(row1[x*3+5] - row1[x*3-1])*2 + row1[x*3+n1+5] - row1[x*3+n1-1] + row1[x*3+s1+5] - row1[x*3+s1-1]];

            prow2[width-1-x] = tab[(row2[x*3+3] - row2[x*3-3])*2 + row2[x*3+n2+3] - row2[x*3+n2-3] + row2[x*3+s2+3] - row2[x*3+s2-3]];
            prow2[width-1-x+width] = tab[(row2[x*3+4] - row2[x*3-2])*2 + row2[x*3+n2+4] - row2[x*3+n2-2] + row2[x*3+s2+4] - row2[x*3+s2-2]];
            prow2[width-1-x+width*2] = tab[(row2[x*3+5] - row2[x*3-1])*2 + row2[x*3+n2+5] - row2[x*3+n2-1] + row2[x*3+s2+5] - row2[x*3+s2-1]];

            prow1[x+width*3] = row1[x*3];
            prow1[x+width*4] = row1[x*3+1];
            prow1[x+width*5] = row1[x*3+2];

            prow2[width-1-x+width*3] = row2[x*3];
            prow2[width-1-x+width*4] = row2[x*3+1];
            prow2[width-1-x+width*5] = row2[x*3+2];
        }
    }

    memset( cost + xrange_min*D, 0, width1*D*sizeof(cost[0]) );

    buffer -= width-1-maxX2;
    cost -= (minX1-xrange_min)*D + minD; // simplify the cost indices inside the loop

    // accumulate the BT dissimilarity over all planes (derivatives first, then
    // raw intensities; the latter contribute with weight 1/4 via diff_scale)
    for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width )
    {
        int diff_scale = c < cn ? 0 : 2;

        // precompute
        //   v0 = min(row2[x-1/2], row2[x], row2[x+1/2]) and
        //   v1 = max(row2[x-1/2], row2[x], row2[x+1/2]) and
        for( x = width-1-maxX2; x < width-1- minX2; x++ )
        {
            int v = prow2[x];
            int vl = x > 0 ? (v + prow2[x-1])/2 : v;
            int vr = x < width-1 ? (v + prow2[x+1])/2 : v;
            int v0 = std::min(vl, vr); v0 = std::min(v0, v);
            int v1 = std::max(vl, vr); v1 = std::max(v1, v);
            buffer[x] = (PixType)v0;
            buffer[x + width2] = (PixType)v1;
        }

        for( x = minX1; x < maxX1; x++ )
        {
            // half-pixel interpolated neighborhood of img1 pixel x
            int u = prow1[x];
            int ul = x > 0 ? (u + prow1[x-1])/2 : u;
            int ur = x < width-1 ? (u + prow1[x+1])/2 : u;
            int u0 = std::min(ul, ur); u0 = std::min(u0, u);
            int u1 = std::max(ul, ur); u1 = std::max(u1, u);

#if CV_SIMD128
            if( useSIMD )
            {
                // vectorized BT cost: 16 disparities per iteration
                v_uint8x16 _u = v_setall_u8((uchar)u), _u0 = v_setall_u8((uchar)u0);
                v_uint8x16 _u1 = v_setall_u8((uchar)u1);

                for( int d = minD; d < maxD; d += 16 )
                {
                    v_uint8x16 _v = v_load(prow2 + width-x-1 + d);
                    v_uint8x16 _v0 = v_load(buffer + width-x-1 + d);
                    v_uint8x16 _v1 = v_load(buffer + width-x-1 + d + width2);
                    // unsigned saturating subtraction implements max(0, a-b)
                    v_uint8x16 c0 = v_max(_u - _v1, _v0 - _u);
                    v_uint8x16 c1 = v_max(_v - _u1, _u0 - _v);
                    v_uint8x16 diff = v_min(c0, c1);

                    v_int16x8 _c0 = v_load_aligned(cost + x*D + d);
                    v_int16x8 _c1 = v_load_aligned(cost + x*D + d + 8);

                    v_uint16x8 diff1,diff2;
                    v_expand(diff,diff1,diff2);
                    v_store_aligned(cost + x*D + d, _c0 + v_reinterpret_as_s16(diff1 >> diff_scale));
                    v_store_aligned(cost + x*D + d + 8, _c1 + v_reinterpret_as_s16(diff2 >> diff_scale));
                }
            }
            else
#endif
            {
                // scalar BT cost: symmetric min over the half-pixel intervals
                for( int d = minD; d < maxD; d++ )
                {
                    int v = prow2[width-x-1 + d];
                    int v0 = buffer[width-x-1 + d];
                    int v1 = buffer[width-x-1 + d + width2];
                    int c0 = std::max(0, u - v1); c0 = std::max(c0, v0 - u);
                    int c1 = std::max(0, v - u1); c1 = std::max(c1, u0 - v);

                    cost[x*D + d] = (CostType)(cost[x*D+d] + (std::min(c0, c1) >> diff_scale));
                }
            }
        }
    }
}
267
268
269
/*
270
computes disparity for "roi" in img1 w.r.t. img2 and write it to disp1buf.
271
that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y).
272
minD <= d < maxD.
273
disp2full is the reverse disparity map, that is:
274
disp2full(x+roi.x,y+roi.y)=d means that img2(x+roi.x, y+roi.y) ~ img1(x+roi.x+d, y+roi.y)
275
276
note that disp1buf will have the same size as the roi and
277
disp2full will have the same size as img1 (or img2).
278
On exit disp2buf is not the final disparity, it is an intermediate result that becomes
279
final after all the tiles are processed.
280
281
the disparity in disp1buf is written with sub-pixel accuracy
282
(4 fractional bits, see StereoSGBM::DISP_SCALE),
283
using quadratic interpolation, while the disparity in disp2buf
284
is written as is, without interpolation.
285
286
disp2cost also has the same size as img1 (or img2).
287
It contains the minimum current cost, used to find the best disparity, corresponding to the minimal cost.
288
*/
289
// Core single-threaded SGBM routine: block-matching cost + semi-global
// aggregation + winner selection with uniqueness/left-right checks.
// See the block comment above for the disp1/disp2 conventions.
//   buffer - reusable scratch Mat; (re)allocated here if too small.
static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                                 Mat& disp1, const StereoSGBMParams& params,
                                 Mat& buffer )
{
#if CV_SIMD128
    // maxDisparity is supposed to multiple of 16, so we can forget doing else
    // LSBTab[i] = index of the least significant set bit of i (0 for i == 0);
    // used below to pick the first lane that attained the minimum cost.
    static const uchar LSBTab[] =
    {
        0, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
    };
    static const v_uint16x8 v_LSB = v_uint16x8(0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);

    bool useSIMD = hasSIMD128();
#endif

    const int ALIGN = 16;
    const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
    const int DISP_SCALE = (1 << DISP_SHIFT);
    const CostType MAX_COST = SHRT_MAX;

    // sanitize parameters: substitute defaults for unset (non-positive) values
    int minD = params.minDisparity, maxD = minD + params.numDisparities;
    Size SADWindowSize;
    SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;
    int ftzero = std::max(params.preFilterCap, 15) | 1;
    int uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
    int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
    int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
    int k, width = disp1.cols, height = disp1.rows;
    int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
    int D = maxD - minD, width1 = maxX1 - minX1;
    int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
    int SW2 = SADWindowSize.width/2, SH2 = SADWindowSize.height/2;
    bool fullDP = params.mode == StereoSGBM::MODE_HH;
    int npasses = fullDP ? 2 : 1;
    const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
    PixType clipTab[TAB_SIZE];

    // clipping table: values clipped to [-ftzero, ftzero] and shifted to be non-negative
    for( k = 0; k < TAB_SIZE; k++ )
        clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);

    if( minX1 >= maxX1 )
    {
        // the disparity range does not fit into the image width at all
        disp1 = Scalar::all(INVALID_DISP_SCALED);
        return;
    }

    CV_Assert( D % 16 == 0 );

    // NR - the number of directions. the loop on x below that computes Lr assumes that NR == 8.
    // if you change NR, please, modify the loop as well.
    int D2 = D+16, NRD2 = NR2*D2;

    // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
    // for 8-way dynamic programming we need the current row and
    // the previous row, i.e. 2 rows in total
    const int NLR = 2;
    const int LrBorder = NLR - 1;

    // for each possible stereo match (img1(x,y) <=> img2(x-d,y))
    // we keep pixel difference cost (C) and the summary cost over NR directions (S).
    // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
    size_t costBufSize = width1*D;
    size_t CSBufSize = costBufSize*(fullDP ? height : 1);
    size_t minLrSize = (width1 + LrBorder*2)*NR2, LrSize = minLrSize*D2;
    int hsumBufNRows = SH2*2 + 2;
    size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[]
        costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff
        CSBufSize*2*sizeof(CostType) + // C, S
        width*16*img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost
        width*(sizeof(CostType) + sizeof(DispType)) + 1024; // disp2cost + disp2

    if( buffer.empty() || !buffer.isContinuous() ||
        buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
        buffer.reserveBuffer(totalBufSize);

    // summary cost over different (nDirs) directions
    CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), ALIGN);
    CostType* Sbuf = Cbuf + CSBufSize;
    CostType* hsumBuf = Sbuf + CSBufSize;
    CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows;

    CostType* disp2cost = pixDiff + costBufSize + (LrSize + minLrSize)*NLR;
    DispType* disp2ptr = (DispType*)(disp2cost + width);
    PixType* tempBuf = (PixType*)(disp2ptr + width);

    // add P2 to every C(x,y). it saves a few operations in the inner loops
    for(k = 0; k < (int)CSBufSize; k++ )
        Cbuf[k] = (CostType)P2;

    // pass 1: top-left to bottom-right; pass 2 (MODE_HH only): bottom-right to top-left
    for( int pass = 1; pass <= npasses; pass++ )
    {
        int x1, y1, x2, y2, dx, dy;

        if( pass == 1 )
        {
            y1 = 0; y2 = height; dy = 1;
            x1 = 0; x2 = width1; dx = 1;
        }
        else
        {
            y1 = height-1; y2 = -1; dy = -1;
            x1 = width1-1; x2 = -1; dx = -1;
        }

        CostType *Lr[NLR]={0}, *minLr[NLR]={0};

        for( k = 0; k < NLR; k++ )
        {
            // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
            // and will occasionally use negative indices with the arrays
            // we need to shift Lr[k] pointers by 1, to give the space for d=-1.
            // however, then the alignment will be imperfect, i.e. bad for SSE,
            // thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment)
            Lr[k] = pixDiff + costBufSize + LrSize*k + NRD2*LrBorder + 8;
            memset( Lr[k] - LrBorder*NRD2 - 8, 0, LrSize*sizeof(CostType) );
            minLr[k] = pixDiff + costBufSize + LrSize*NLR + minLrSize*k + NR2*LrBorder;
            memset( minLr[k] - LrBorder*NR2, 0, minLrSize*sizeof(CostType) );
        }

        for( int y = y1; y != y2; y += dy )
        {
            int x, d;
            DispType* disp1ptr = disp1.ptr<DispType>(y);
            // in fullDP mode C/S are kept for the whole image; otherwise one row is reused
            CostType* C = Cbuf + (!fullDP ? 0 : y*costBufSize);
            CostType* S = Sbuf + (!fullDP ? 0 : y*costBufSize);

            if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
            {
                int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;

                for( k = dy1; k <= dy2; k++ )
                {
                    // hsumBuf is a ring of horizontal block sums, one slot per source row
                    CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;

                    if( k < height )
                    {
                        calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero );

                        // seed the horizontal sum at x == 0 (left border replicated SW2+1 times)
                        memset(hsumAdd, 0, D*sizeof(CostType));
                        for( x = 0; x <= SW2*D; x += D )
                        {
                            int scale = x == 0 ? SW2 + 1 : 1;
                            for( d = 0; d < D; d++ )
                                hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale);
                        }

                        if( y > 0 )
                        {
                            // sliding-window update: C(y) = C(y-1) + hsum(new row) - hsum(old row)
                            const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
                            const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize;

                            for( x = D; x < width1*D; x += D )
                            {
                                const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
                                const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);

#if CV_SIMD128
                                if( useSIMD )
                                {
                                    for( d = 0; d < D; d += 8 )
                                    {
                                        v_int16x8 hv = v_load(hsumAdd + x - D + d);
                                        v_int16x8 Cx = v_load(Cprev + x + d);
                                        v_int16x8 psub = v_load(pixSub + d);
                                        v_int16x8 padd = v_load(pixAdd + d);
                                        hv = (hv - psub + padd);
                                        psub = v_load(hsumSub + x + d);
                                        Cx = Cx - psub + hv;
                                        v_store(hsumAdd + x + d, hv);
                                        v_store(C + x + d, Cx);
                                    }
                                }
                                else
#endif
                                {
                                    for( d = 0; d < D; d++ )
                                    {
                                        int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
                                        C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]);
                                    }
                                }
                            }
                        }
                        else
                        {
                            // first row: only build the horizontal sums, C is filled below
                            for( x = D; x < width1*D; x += D )
                            {
                                const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
                                const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);

                                for( d = 0; d < D; d++ )
                                    hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
                            }
                        }
                    }

                    if( y == 0 )
                    {
                        // top border: the first row's sums are replicated SH2+1 times
                        int scale = k == 0 ? SH2 + 1 : 1;
                        for( x = 0; x < width1*D; x++ )
                            C[x] = (CostType)(C[x] + hsumAdd[x]*scale);
                    }
                }

                // also, clear the S buffer
                for( k = 0; k < width1*D; k++ )
                    S[k] = 0;
            }

            // clear the left and the right borders
            memset( Lr[0] - NRD2*LrBorder - 8, 0, NRD2*LrBorder*sizeof(CostType) );
            memset( Lr[0] + width1*NRD2 - 8, 0, NRD2*LrBorder*sizeof(CostType) );
            memset( minLr[0] - NR2*LrBorder, 0, NR2*LrBorder*sizeof(CostType) );
            memset( minLr[0] + width1*NR2, 0, NR2*LrBorder*sizeof(CostType) );

            /*
            [formula 13 in the paper]
            compute L_r(p, d) = C(p, d) +
            min(L_r(p-r, d),
            L_r(p-r, d-1) + P1,
            L_r(p-r, d+1) + P1,
            min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
            where p = (x,y), r is one of the directions.
            we process all the directions at once:
            0: r=(-dx, 0)
            1: r=(-1, -dy)
            2: r=(0, -dy)
            3: r=(1, -dy)
            4: r=(-2, -dy)
            5: r=(-1, -dy*2)
            6: r=(1, -dy*2)
            7: r=(2, -dy)
            */

            for( x = x1; x != x2; x += dx )
            {
                int xm = x*NR2, xd = xm*D2;

                // delta* = min_k L_r(p-r, k) + P2 for each of the 4 directions of this pass
                int delta0 = minLr[0][xm - dx*NR2] + P2, delta1 = minLr[1][xm - NR2 + 1] + P2;
                int delta2 = minLr[1][xm + 2] + P2, delta3 = minLr[1][xm + NR2 + 3] + P2;

                // Lr_p0..Lr_p3 point to L_r(p-r, .) of the predecessor pixels
                CostType* Lr_p0 = Lr[0] + xd - dx*NRD2;
                CostType* Lr_p1 = Lr[1] + xd - NRD2 + D2;
                CostType* Lr_p2 = Lr[1] + xd + D2*2;
                CostType* Lr_p3 = Lr[1] + xd + NRD2 + D2*3;

                // sentinels at d = -1 and d = D so that d-1/d+1 accesses are safe
                Lr_p0[-1] = Lr_p0[D] = Lr_p1[-1] = Lr_p1[D] =
                Lr_p2[-1] = Lr_p2[D] = Lr_p3[-1] = Lr_p3[D] = MAX_COST;

                CostType* Lr_p = Lr[0] + xd;
                const CostType* Cp = C + x*D;
                CostType* Sp = S + x*D;

#if CV_SIMD128
                if( useSIMD )
                {
                    // vectorized formula 13 over 8 disparities x 4 directions
                    v_int16x8 _P1 = v_setall_s16((short)P1);

                    v_int16x8 _delta0 = v_setall_s16((short)delta0);
                    v_int16x8 _delta1 = v_setall_s16((short)delta1);
                    v_int16x8 _delta2 = v_setall_s16((short)delta2);
                    v_int16x8 _delta3 = v_setall_s16((short)delta3);
                    v_int16x8 _minL0 = v_setall_s16((short)MAX_COST);

                    for( d = 0; d < D; d += 8 )
                    {
                        v_int16x8 Cpd = v_load(Cp + d);
                        v_int16x8 L0, L1, L2, L3;

                        L0 = v_load(Lr_p0 + d);
                        L1 = v_load(Lr_p1 + d);
                        L2 = v_load(Lr_p2 + d);
                        L3 = v_load(Lr_p3 + d);

                        L0 = v_min(L0, (v_load(Lr_p0 + d - 1) + _P1));
                        L0 = v_min(L0, (v_load(Lr_p0 + d + 1) + _P1));

                        L1 = v_min(L1, (v_load(Lr_p1 + d - 1) + _P1));
                        L1 = v_min(L1, (v_load(Lr_p1 + d + 1) + _P1));

                        L2 = v_min(L2, (v_load(Lr_p2 + d - 1) + _P1));
                        L2 = v_min(L2, (v_load(Lr_p2 + d + 1) + _P1));

                        L3 = v_min(L3, (v_load(Lr_p3 + d - 1) + _P1));
                        L3 = v_min(L3, (v_load(Lr_p3 + d + 1) + _P1));

                        L0 = v_min(L0, _delta0);
                        L0 = ((L0 - _delta0) + Cpd);

                        L1 = v_min(L1, _delta1);
                        L1 = ((L1 - _delta1) + Cpd);

                        L2 = v_min(L2, _delta2);
                        L2 = ((L2 - _delta2) + Cpd);

                        L3 = v_min(L3, _delta3);
                        L3 = ((L3 - _delta3) + Cpd);

                        v_store(Lr_p + d, L0);
                        v_store(Lr_p + d + D2, L1);
                        v_store(Lr_p + d + D2*2, L2);
                        v_store(Lr_p + d + D2*3, L3);

                        // Get minimum from in L0-L3
                        v_int16x8 t02L, t02H, t13L, t13H, t0123L, t0123H;
                        v_zip(L0, L2, t02L, t02H); // L0[0] L2[0] L0[1] L2[1]...
                        v_zip(L1, L3, t13L, t13H); // L1[0] L3[0] L1[1] L3[1]...
                        v_int16x8 t02 = v_min(t02L, t02H); // L0[i] L2[i] L0[i] L2[i]...
                        v_int16x8 t13 = v_min(t13L, t13H); // L1[i] L3[i] L1[i] L3[i]...
                        v_zip(t02, t13, t0123L, t0123H); // L0[i] L1[i] L2[i] L3[i]...
                        v_int16x8 t0 = v_min(t0123L, t0123H);
                        _minL0 = v_min(_minL0, t0);

                        v_int16x8 Sval = v_load(Sp + d);

                        L0 = L0 + L1;
                        L2 = L2 + L3;
                        Sval = Sval + L0;
                        Sval = Sval + L2;

                        v_store(Sp + d, Sval);
                    }

                    // reduce the interleaved per-direction minima into minLr[0][xm..xm+3]
                    v_int32x4 minL, minH;
                    v_expand(_minL0, minL, minH);
                    v_pack_store(&minLr[0][xm], v_min(minL, minH));
                }
                else
#endif
                {
                    // scalar formula 13 for the same 4 directions
                    int minL0 = MAX_COST, minL1 = MAX_COST, minL2 = MAX_COST, minL3 = MAX_COST;

                    for( d = 0; d < D; d++ )
                    {
                        int Cpd = Cp[d], L0, L1, L2, L3;

                        L0 = Cpd + std::min((int)Lr_p0[d], std::min(Lr_p0[d-1] + P1, std::min(Lr_p0[d+1] + P1, delta0))) - delta0;
                        L1 = Cpd + std::min((int)Lr_p1[d], std::min(Lr_p1[d-1] + P1, std::min(Lr_p1[d+1] + P1, delta1))) - delta1;
                        L2 = Cpd + std::min((int)Lr_p2[d], std::min(Lr_p2[d-1] + P1, std::min(Lr_p2[d+1] + P1, delta2))) - delta2;
                        L3 = Cpd + std::min((int)Lr_p3[d], std::min(Lr_p3[d-1] + P1, std::min(Lr_p3[d+1] + P1, delta3))) - delta3;

                        Lr_p[d] = (CostType)L0;
                        minL0 = std::min(minL0, L0);

                        Lr_p[d + D2] = (CostType)L1;
                        minL1 = std::min(minL1, L1);

                        Lr_p[d + D2*2] = (CostType)L2;
                        minL2 = std::min(minL2, L2);

                        Lr_p[d + D2*3] = (CostType)L3;
                        minL3 = std::min(minL3, L3);

                        Sp[d] = saturate_cast<CostType>(Sp[d] + L0 + L1 + L2 + L3);
                    }
                    minLr[0][xm] = (CostType)minL0;
                    minLr[0][xm+1] = (CostType)minL1;
                    minLr[0][xm+2] = (CostType)minL2;
                    minLr[0][xm+3] = (CostType)minL3;
                }
            }

            if( pass == npasses )
            {
                // last pass: pick the winning disparity per pixel
                for( x = 0; x < width; x++ )
                {
                    disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED;
                    disp2cost[x] = MAX_COST;
                }

                for( x = width1 - 1; x >= 0; x-- )
                {
                    CostType* Sp = S + x*D;
                    int minS = MAX_COST, bestDisp = -1;

                    if( npasses == 1 )
                    {
                        // single-pass mode: fold the remaining right-to-left horizontal
                        // direction into S while scanning for the minimum
                        int xm = x*NR2, xd = xm*D2;

                        int minL0 = MAX_COST;
                        int delta0 = minLr[0][xm + NR2] + P2;
                        CostType* Lr_p0 = Lr[0] + xd + NRD2;
                        Lr_p0[-1] = Lr_p0[D] = MAX_COST;
                        CostType* Lr_p = Lr[0] + xd;

                        const CostType* Cp = C + x*D;

#if CV_SIMD128
                        if( useSIMD )
                        {
                            v_int16x8 _P1 = v_setall_s16((short)P1);
                            v_int16x8 _delta0 = v_setall_s16((short)delta0);

                            v_int16x8 _minL0 = v_setall_s16((short)minL0);
                            v_int16x8 _minS = v_setall_s16(MAX_COST), _bestDisp = v_setall_s16(-1);
                            v_int16x8 _d8 = v_int16x8(0, 1, 2, 3, 4, 5, 6, 7), _8 = v_setall_s16(8);

                            for( d = 0; d < D; d += 8 )
                            {
                                v_int16x8 Cpd = v_load(Cp + d);
                                v_int16x8 L0 = v_load(Lr_p0 + d);

                                L0 = v_min(L0, v_load(Lr_p0 + d - 1) + _P1);
                                L0 = v_min(L0, v_load(Lr_p0 + d + 1) + _P1);
                                L0 = v_min(L0, _delta0);
                                L0 = L0 - _delta0 + Cpd;

                                v_store(Lr_p + d, L0);
                                _minL0 = v_min(_minL0, L0);
                                L0 = L0 + v_load(Sp + d);
                                v_store(Sp + d, L0);

                                // track per-lane running minimum and its disparity index
                                v_int16x8 mask = _minS > L0;
                                _minS = v_min(_minS, L0);
                                _bestDisp = _bestDisp ^ ((_bestDisp ^ _d8) & mask);
                                _d8 += _8;
                            }
                            short bestDispBuf[8];
                            v_store(bestDispBuf, _bestDisp);

                            v_int32x4 min32L, min32H;
                            v_expand(_minL0, min32L, min32H);
                            minLr[0][xm] = (CostType)std::min(v_reduce_min(min32L), v_reduce_min(min32H));

                            v_expand(_minS, min32L, min32H);
                            minS = std::min(v_reduce_min(min32L), v_reduce_min(min32H));

                            // select the lowest lane index among those equal to minS
                            v_int16x8 ss = v_setall_s16((short)minS);
                            v_uint16x8 minMask = v_reinterpret_as_u16(ss == _minS);
                            v_uint16x8 minBit = minMask & v_LSB;

                            v_uint32x4 minBitL, minBitH;
                            v_expand(minBit, minBitL, minBitH);

                            int idx = v_reduce_sum(minBitL) + v_reduce_sum(minBitH);
                            bestDisp = bestDispBuf[LSBTab[idx]];
                        }
                        else
#endif
                        {
                            for( d = 0; d < D; d++ )
                            {
                                int L0 = Cp[d] + std::min((int)Lr_p0[d], std::min(Lr_p0[d-1] + P1, std::min(Lr_p0[d+1] + P1, delta0))) - delta0;

                                Lr_p[d] = (CostType)L0;
                                minL0 = std::min(minL0, L0);

                                int Sval = Sp[d] = saturate_cast<CostType>(Sp[d] + L0);
                                if( Sval < minS )
                                {
                                    minS = Sval;
                                    bestDisp = d;
                                }
                            }
                            minLr[0][xm] = (CostType)minL0;
                        }
                    }
                    else
                    {
                        // two-pass (MODE_HH): S is complete, just find its minimum
#if CV_SIMD128
                        if( useSIMD )
                        {
                            v_int16x8 _minS = v_setall_s16(MAX_COST), _bestDisp = v_setall_s16(-1);
                            v_int16x8 _d8 = v_int16x8(0, 1, 2, 3, 4, 5, 6, 7), _8 = v_setall_s16(8);

                            for( d = 0; d < D; d+= 8 )
                            {
                                v_int16x8 L0 = v_load(Sp + d);
                                v_int16x8 mask = L0 < _minS;
                                _minS = v_min( L0, _minS );
                                _bestDisp = _bestDisp ^ ((_bestDisp ^ _d8) & mask);
                                _d8 = _d8 + _8;
                            }
                            v_int32x4 _d0, _d1;
                            v_expand(_minS, _d0, _d1);
                            minS = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1));
                            v_int16x8 v_mask = v_setall_s16((short)minS) == _minS;

                            // non-minimal lanes are replaced by SHRT_MAX so the reduce
                            // picks the smallest disparity that attains minS
                            _bestDisp = (_bestDisp & v_mask) | (v_setall_s16(SHRT_MAX) & ~v_mask);
                            v_expand(_bestDisp, _d0, _d1);
                            bestDisp = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1));
                        }
                        else
#endif
                        {
                            for( d = 0; d < D; d++ )
                            {
                                int Sval = Sp[d];
                                if( Sval < minS )
                                {
                                    minS = Sval;
                                    bestDisp = d;
                                }
                            }
                        }
                    }

                    // uniqueness check: reject the pixel if some non-neighboring
                    // disparity comes within uniquenessRatio percent of the minimum
                    for( d = 0; d < D; d++ )
                    {
                        if( Sp[d]*(100 - uniquenessRatio) < minS*100 && std::abs(bestDisp - d) > 1 )
                            break;
                    }
                    if( d < D )
                        continue;
                    d = bestDisp;
                    // update the reverse (right-view) disparity map used for the LR check
                    int _x2 = x + minX1 - d - minD;
                    if( disp2cost[_x2] > minS )
                    {
                        disp2cost[_x2] = (CostType)minS;
                        disp2ptr[_x2] = (DispType)(d + minD);
                    }

                    if( 0 < d && d < D-1 )
                    {
                        // do subpixel quadratic interpolation:
                        // fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
                        // then find minimum of the parabola.
                        int denom2 = std::max(Sp[d-1] + Sp[d+1] - 2*Sp[d], 1);
                        d = d*DISP_SCALE + ((Sp[d-1] - Sp[d+1])*DISP_SCALE + denom2)/(denom2*2);
                    }
                    else
                        d *= DISP_SCALE;
                    disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE);
                }

                // left-right consistency check against disp2
                for( x = minX1; x < maxX1; x++ )
                {
                    // we round the computed disparity both towards -inf and +inf and check
                    // if either of the corresponding disparities in disp2 is consistent.
                    // This is to give the computed disparity a chance to look valid if it is.
                    int d1 = disp1ptr[x];
                    if( d1 == INVALID_DISP_SCALED )
                        continue;
                    int _d = d1 >> DISP_SHIFT;
                    int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT;
                    int _x = x - _d, x_ = x - d_;
                    if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&
                        0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )
                        disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
                }
            }

            // now shift the cyclic buffers
            std::swap( Lr[0], Lr[1] );
            std::swap( minLr[0], minLr[1] );
        }
    }
}
844
845
////////////////////////////////////////////////////////////////////////////////////////////
846
struct CalcVerticalSums: public ParallelLoopBody
847
{
848
// Precomputes all per-job constants for the parallel vertical-aggregation pass.
//   _img1, _img2 - rectified input pair (referenced, not copied)
//   params       - SGBM parameters; defaulted here the same way as in computeDisparitySGBM
//   alignedBuf   - shared, pre-aligned cost storage: C, then S, then hsumBuf
//   _clipTab     - precomputed clipping table passed on to calcPixelCostBT
// NOTE(review): member declarations are outside this view; layout claims below
// are based on the assignments visible here.
CalcVerticalSums(const Mat& _img1, const Mat& _img2, const StereoSGBMParams& params,
                 CostType* alignedBuf, PixType* _clipTab): img1(_img1), img2(_img2), clipTab(_clipTab)
{
    minD = params.minDisparity;
    maxD = minD + params.numDisparities;
    // half window sizes (square SAD window, default 5 when unset)
    SW2 = SH2 = (params.SADWindowSize > 0 ? params.SADWindowSize : 5)/2;
    ftzero = std::max(params.preFilterCap, 15) | 1;
    // smoothness penalties, with P2 forced strictly greater than P1
    P1 = params.P1 > 0 ? params.P1 : 2;
    P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
    height = img1.rows;
    width = img1.cols;
    int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
    D = maxD - minD;
    width1 = maxX1 - minX1;
    D2 = D + 16;        // padded disparity stride (room for d = -1 / d = D sentinels)
    costBufSize = width1*D;
    CSBufSize = costBufSize*height;  // C and S are kept for the whole image here
    minLrSize = width1;
    LrSize = minLrSize*D2;
    hsumBufNRows = SH2*2 + 2;        // ring-buffer depth for horizontal block sums
    // carve the shared buffer: C, S, then the hsum ring
    Cbuf = alignedBuf;
    Sbuf = Cbuf + CSBufSize;
    hsumBuf = Sbuf + CSBufSize;
    useSIMD = hasSIMD128();
}
873
874
// Vertical SGBM cost aggregation over the column stripe [range.start, range.end).
// Pass 1 walks rows top-to-bottom (direction r=(0,+1)) and also builds the raw
// matching-cost volume C (box-filtered Birchfield-Tomasi costs); pass 2 walks
// bottom-to-top (r=(0,-1)) and reuses C. Both passes accumulate the per-direction
// path costs L_r into the shared summary buffer S.
void operator()(const Range& range) const CV_OVERRIDE
{
    static const CostType MAX_COST = SHRT_MAX;
    static const int ALIGN = 16;
    static const int TAB_OFS = 256*4;
    static const int npasses = 2;
    int x1 = range.start, x2 = range.end, k;
    // Per-thread scratch: one row of pixel-wise costs covering the stripe plus an
    // SW2-column margin on each side, followed by the temporary buffer used by
    // calcPixelCostBT; +32 bytes absorbs the alignPtr shift.
    size_t pixDiffSize = ((x2 - x1) + 2*SW2)*D;
    size_t auxBufsSize = pixDiffSize*sizeof(CostType) + //pixdiff size
            width*16*img1.channels()*sizeof(PixType) + 32; //tempBuf
    Mat auxBuff;
    auxBuff.create(1, (int)auxBufsSize, CV_8U);
    CostType* pixDiff = (CostType*)alignPtr(auxBuff.ptr(), ALIGN);
    PixType* tempBuf = (PixType*)(pixDiff + pixDiffSize);

    // Simplification of index calculation: bias pixDiff so it can be indexed with
    // absolute (image) x coordinates of this stripe instead of stripe-local ones.
    pixDiff -= (x1>SW2 ? (x1 - SW2): 0)*D;

    for( int pass = 1; pass <= npasses; pass++ )
    {
        int y1, y2, dy;

        if( pass == 1 )
        {
            y1 = 0; y2 = height; dy = 1;
        }
        else
        {
            y1 = height-1; y2 = -1; dy = -1;
        }

        CostType *Lr[NLR]={0}, *minLr[NLR]={0};

        for( k = 0; k < NLR; k++ )
        {
            // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
            // and will occasionally use negative indices with the arrays
            // we need to shift Lr[k] pointers by 1, to give the space for d=-1.
            // however, then the alignment will be imperfect, i.e. bad for SSE,
            // thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment)
            Lr[k] = hsumBuf + costBufSize*hsumBufNRows + LrSize*k + 8;
            memset( Lr[k] + x1*D2 - 8, 0, (x2-x1)*D2*sizeof(CostType) );
            minLr[k] = hsumBuf + costBufSize*hsumBufNRows + LrSize*NLR + minLrSize*k;
            memset( minLr[k] + x1, 0, (x2-x1)*sizeof(CostType) );
        }

        for( int y = y1; y != y2; y += dy )
        {
            int x, d;
            CostType* C = Cbuf + y*costBufSize;
            CostType* S = Sbuf + y*costBufSize;

            if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
            {
                // On the first row prime the hsum ring buffer with rows 0..SH2;
                // afterwards only the new bottom row (y+SH2) of the window is added.
                int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;

                for( k = dy1; k <= dy2; k++ )
                {
                    // hsumBuf is a ring of hsumBufNRows horizontally box-filtered rows
                    CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;

                    if( k < height )
                    {
                        calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero, x1 - SW2, x2 + SW2);

                        // seed the horizontal window sum at column x1, replicating
                        // border columns (xbord clamps into [0, width1-1])
                        memset(hsumAdd + x1*D, 0, D*sizeof(CostType));
                        for( x = (x1 - SW2)*D; x <= (x1 + SW2)*D; x += D )
                        {
                            int xbord = x <= 0 ? 0 : (x > (width1 - 1)*D? (width1 - 1)*D : x);
                            for( d = 0; d < D; d++ )
                                hsumAdd[x1*D + d] = (CostType)(hsumAdd[x1*D + d] + pixDiff[xbord + d]);
                        }

                        if( y > 0 )
                        {
                            // sliding-window update: C(y) = C(y-1) + hsum(y+SH2) - hsum(y-SH2-1)
                            const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
                            const CostType* Cprev = C - costBufSize;

                            for( d = 0; d < D; d++ )
                                C[x1*D + d] = (CostType)(Cprev[x1*D + d] + hsumAdd[x1*D + d] - hsumSub[x1*D + d]);

                            for( x = (x1+1)*D; x < x2*D; x += D )
                            {
                                // horizontal sliding window over pixDiff
                                const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
                                const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);

#if CV_SIMD128
                                if( useSIMD )
                                {
                                    for( d = 0; d < D; d += 8 )
                                    {
                                        v_int16x8 hv = v_load(hsumAdd + x - D + d);
                                        v_int16x8 Cx = v_load(Cprev + x + d);
                                        v_int16x8 psub = v_load(pixSub + d);
                                        v_int16x8 padd = v_load(pixAdd + d);
                                        hv = (hv - psub + padd);
                                        psub = v_load(hsumSub + x + d);
                                        Cx = Cx - psub + hv;
                                        v_store(hsumAdd + x + d, hv);
                                        v_store(C + x + d, Cx);
                                    }
                                }
                                else
#endif
                                {
                                    for( d = 0; d < D; d++ )
                                    {
                                        int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
                                        C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]);
                                    }
                                }
                            }
                        }
                        else
                        {
                            // y == 0: only build the horizontal sums; C is filled by
                            // the scaled accumulation below
                            for( x = (x1+1)*D; x < x2*D; x += D )
                            {
                                const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
                                const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);

                                for( d = 0; d < D; d++ )
                                    hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
                            }
                        }
                    }

                    if( y == 0 )
                    {
                        // replicate the top border row SH2+1 times (k == 0) so the
                        // vertical window is fully populated at the image top
                        int scale = k == 0 ? SH2 + 1 : 1;
                        for( x = x1*D; x < x2*D; x++ )
                            C[x] = (CostType)(C[x] + hsumAdd[x]*scale);
                    }
                }

                // also, clear the S buffer
                for( k = x1*D; k < x2*D; k++ )
                    S[k] = 0;
            }

            // [formula 13 in the paper]
            // compute L_r(p, d) = C(p, d) +
            // min(L_r(p-r, d),
            // L_r(p-r, d-1) + P1,
            // L_r(p-r, d+1) + P1,
            // min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
            // where p = (x,y), r is one of the directions.
            // we process one directions on first pass and other on second:
            // r=(0, dy), where dy=1 on first pass and dy=-1 on second
            for( x = x1; x != x2; x++ )
            {
                int xd = x*D2;

                int delta = minLr[1][x] + P2;

                CostType* Lr_ppr = Lr[1] + xd;

                // sentinels so the d-1 / d+1 reads at the range ends never win the min
                Lr_ppr[-1] = Lr_ppr[D] = MAX_COST;

                CostType* Lr_p = Lr[0] + xd;
                const CostType* Cp = C + x*D;
                CostType* Sp = S + x*D;

#if CV_SIMD128
                if( useSIMD )
                {
                    v_int16x8 _P1 = v_setall_s16((short)P1);

                    v_int16x8 _delta = v_setall_s16((short)delta);
                    v_int16x8 _minL = v_setall_s16((short)MAX_COST);

                    for( d = 0; d < D; d += 8 )
                    {
                        v_int16x8 Cpd = v_load(Cp + d);
                        v_int16x8 L;

                        L = v_load(Lr_ppr + d);

                        L = v_min(L, (v_load(Lr_ppr + d - 1) + _P1));
                        L = v_min(L, (v_load(Lr_ppr + d + 1) + _P1));

                        L = v_min(L, _delta);
                        L = ((L - _delta) + Cpd);

                        v_store(Lr_p + d, L);

                        // Get minimum from in L-L3
                        _minL = v_min(_minL, L);

                        v_int16x8 Sval = v_load(Sp + d);

                        Sval = Sval + L;

                        v_store(Sp + d, Sval);
                    }

                    v_int32x4 min1, min2, min12;
                    v_expand(_minL, min1, min2);
                    min12 = v_min(min1,min2);
                    minLr[0][x] = (CostType)v_reduce_min(min12);
                }
                else
#endif
                {
                    int minL = MAX_COST;

                    for( d = 0; d < D; d++ )
                    {
                        int Cpd = Cp[d], L;

                        L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;

                        Lr_p[d] = (CostType)L;
                        minL = std::min(minL, L);

                        Sp[d] = saturate_cast<CostType>(Sp[d] + L);
                    }
                    minLr[0][x] = (CostType)minL;
                }
            }

            // now shift the cyclic buffers
            std::swap( Lr[0], Lr[1] );
            std::swap( minLr[0], minLr[1] );
        }
    }
}
1100
// number of L_r / minL_r line buffers kept: current row + previous row of the DP
static const int NLR = 2;
const Mat& img1;        // left input image
const Mat& img2;        // right input image
CostType* Cbuf;         // raw matching cost volume C(x,d), one row per image row (shared)
CostType* Sbuf;         // aggregated summary cost volume S(x,d) (shared)
CostType* hsumBuf;      // ring buffer of horizontally box-filtered cost rows
PixType* clipTab;       // clipping lookup table passed to calcPixelCostBT
int minD;               // minimum disparity
int maxD;               // maximum disparity (exclusive)
int D;                  // number of disparities tested (maxD - minD)
int D2;                 // D + 16: padded so reads at d-1/d+1 stay in bounds and aligned
int SH2;                // half of the matching-window height
int SW2;                // half of the matching-window width
int width;              // image width
int width1;             // number of columns with a full disparity search range
int height;             // image height
int P1;                 // penalty for |disparity change| == 1
int P2;                 // penalty for larger disparity jumps (>= P1+1)
size_t costBufSize;     // elements in one row of C/S: width1*D
size_t CSBufSize;       // elements in the whole C (or S) volume: costBufSize*height
size_t minLrSize;       // elements in one minLr line: width1
size_t LrSize;          // elements in one Lr line: minLrSize*D2
size_t hsumBufNRows;    // rows kept in the hsum ring buffer: SH2*2 + 2
int ftzero;             // prefilter cap used by the BT cost
bool useSIMD;           // runtime SIMD128 availability
};
1126
1127
// Parallel body that completes SGBM aggregation along the two horizontal
// directions (left-to-right, then right-to-left) for every row in its range,
// accumulating into the shared S volume. During the right-to-left pass it also
// performs the winner-takes-all disparity selection, uniqueness filtering,
// sub-pixel (quadratic) refinement, and a left-right consistency check, writing
// the final per-row result into disp1. Rows are independent, so the range can be
// split freely across threads.
struct CalcHorizontalSums: public ParallelLoopBody
{
    // Precomputes all per-run constants from the algorithm parameters (applying
    // the same defaulting rules as the other SGBM code paths) and adopts the
    // shared C/S storage laid out by computeDisparitySGBM_HH4 in alignedBuf.
    CalcHorizontalSums(const Mat& _img1, const Mat& _img2, Mat& _disp1, const StereoSGBMParams& params,
                       CostType* alignedBuf): img1(_img1), img2(_img2), disp1(_disp1)
    {
        minD = params.minDisparity;
        maxD = minD + params.numDisparities;
        P1 = params.P1 > 0 ? params.P1 : 2;
        P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
        uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
        disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
        height = img1.rows;
        width = img1.cols;
        minX1 = std::max(maxD, 0);
        maxX1 = width + std::min(minD, 0);
        INVALID_DISP = minD - 1;
        INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
        D = maxD - minD;
        width1 = maxX1 - minX1;
        costBufSize = width1*D;
        CSBufSize = costBufSize*height;
        D2 = D + 16;
        // only two Lr lines are needed: previous pixel and current pixel (ping-pong)
        LrSize = 2 * D2;
        Cbuf = alignedBuf;
        Sbuf = Cbuf + CSBufSize;
        useSIMD = hasSIMD128();
    }

    void operator()(const Range& range) const CV_OVERRIDE
    {
        int y1 = range.start, y2 = range.end;
        // per-thread scratch: two Lr lines + right-view cost/disparity rows for the
        // pseudo left-right check; +32 bytes absorbs alignment and the 8-short shift
        size_t auxBufsSize = LrSize * sizeof(CostType) + width*(sizeof(CostType) + sizeof(DispType)) + 32;

        Mat auxBuff;
        auxBuff.create(1, (int)auxBufsSize, CV_8U);
        // +8 shorts (16 bytes) keeps room for the d=-1 sentinel while preserving alignment
        CostType *Lr = ((CostType*)alignPtr(auxBuff.ptr(), ALIGN)) + 8;
        CostType* disp2cost = Lr + LrSize;
        DispType* disp2ptr = (DispType*)(disp2cost + width);

        CostType minLr;

        for( int y = y1; y != y2; y++)
        {
            int x, d;
            DispType* disp1ptr = disp1.ptr<DispType>(y);
            CostType* C = Cbuf + y*costBufSize;
            CostType* S = Sbuf + y*costBufSize;

            for( x = 0; x < width; x++ )
            {
                disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED;
                disp2cost[x] = MAX_COST;
            }

            // clear buffers; sentinels at d=-1 and d=D make the border reads lose the min
            memset( Lr - 8, 0, LrSize*sizeof(CostType) );
            Lr[-1] = Lr[D] = Lr[D2 - 1] = Lr[D2 + D] = MAX_COST;

            minLr = 0;
            // [formula 13 in the paper]
            // compute L_r(p, d) = C(p, d) +
            // min(L_r(p-r, d),
            // L_r(p-r, d-1) + P1,
            // L_r(p-r, d+1) + P1,
            // min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
            // where p = (x,y), r is one of the directions.
            // we process all the directions at once:
            // we process one directions on first pass and other on second:
            // r=(dx, 0), where dx=1 on first pass and dx=-1 on second
            for( x = 0; x != width1; x++)
            {
                int delta = minLr + P2;

                // ping-pong between the two Lr halves based on column parity
                CostType* Lr_ppr = Lr + ((x&1)? 0 : D2);

                CostType* Lr_p = Lr + ((x&1)? D2 :0);
                const CostType* Cp = C + x*D;
                CostType* Sp = S + x*D;

#if CV_SIMD128
                if( useSIMD )
                {
                    v_int16x8 _P1 = v_setall_s16((short)P1);

                    v_int16x8 _delta = v_setall_s16((short)delta);
                    v_int16x8 _minL = v_setall_s16((short)MAX_COST);

                    for( d = 0; d < D; d += 8 )
                    {
                        v_int16x8 Cpd = v_load(Cp + d);
                        v_int16x8 L;

                        L = v_load(Lr_ppr + d);

                        L = v_min(L, (v_load(Lr_ppr + d - 1) + _P1));
                        L = v_min(L, (v_load(Lr_ppr + d + 1) + _P1));

                        L = v_min(L, _delta);
                        L = ((L - _delta) + Cpd);

                        v_store(Lr_p + d, L);

                        // Get minimum from in L-L3
                        _minL = v_min(_minL, L);

                        v_int16x8 Sval = v_load(Sp + d);

                        Sval = Sval + L;

                        v_store(Sp + d, Sval);
                    }

                    v_int32x4 min1, min2, min12;
                    v_expand(_minL, min1, min2);
                    min12 = v_min(min1,min2);
                    minLr = (CostType)v_reduce_min(min12);
                }
                else
#endif
                {
                    minLr = MAX_COST;
                    for( d = 0; d < D; d++ )
                    {
                        int Cpd = Cp[d], L;

                        L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;

                        Lr_p[d] = (CostType)L;
                        minLr = (CostType)std::min((int)minLr, L);

                        Sp[d] = saturate_cast<CostType>(Sp[d] + L);
                    }
                }
            }

            // reset the Lr ping-pong buffers for the right-to-left pass
            memset( Lr - 8, 0, LrSize*sizeof(CostType) );
            Lr[-1] = Lr[D] = Lr[D2 - 1] = Lr[D2 + D] = MAX_COST;

            minLr = 0;

            // right-to-left aggregation; S is now complete per pixel, so the winning
            // disparity (minimum S) is selected in the same sweep
            for( x = width1-1; x != -1; x--)
            {
                int delta = minLr + P2;

                CostType* Lr_ppr = Lr + ((x&1)? 0 :D2);

                CostType* Lr_p = Lr + ((x&1)? D2 :0);
                const CostType* Cp = C + x*D;
                CostType* Sp = S + x*D;
                int minS = MAX_COST, bestDisp = -1;
                minLr = MAX_COST;

#if CV_SIMD128
                if( useSIMD )
                {
                    v_int16x8 _P1 = v_setall_s16((short)P1);

                    v_int16x8 _delta = v_setall_s16((short)delta);
                    v_int16x8 _minL = v_setall_s16((short)MAX_COST);

                    // per-lane running minimum of S and the disparity index that achieved it
                    v_int16x8 _minS = v_setall_s16(MAX_COST), _bestDisp = v_setall_s16(-1);
                    v_int16x8 _d8 = v_int16x8(0, 1, 2, 3, 4, 5, 6, 7), _8 = v_setall_s16(8);

                    for( d = 0; d < D; d+= 8 )
                    {
                        v_int16x8 Cpd = v_load(Cp + d);
                        v_int16x8 L;

                        L = v_load(Lr_ppr + d);

                        L = v_min(L, (v_load(Lr_ppr + d - 1) + _P1));
                        L = v_min(L, (v_load(Lr_ppr + d + 1) + _P1));

                        L = v_min(L, _delta);
                        L = ((L - _delta) + Cpd);

                        v_store(Lr_p + d, L);

                        // Get minimum from in L-L3
                        _minL = v_min(_minL, L);

                        v_int16x8 Sval = v_load(Sp + d);

                        Sval = Sval + L;

                        // blend in the new disparity index where Sval improved the minimum
                        v_int16x8 mask = Sval < _minS;
                        _minS = v_min( Sval, _minS );
                        _bestDisp = _bestDisp ^ ((_bestDisp ^ _d8) & mask);
                        _d8 = _d8 + _8;

                        v_store(Sp + d, Sval);
                    }
                    v_int32x4 min1, min2, min12;
                    v_expand(_minL, min1, min2);
                    min12 = v_min(min1,min2);
                    minLr = (CostType)v_reduce_min(min12);

                    v_int32x4 _d0, _d1;
                    v_expand(_minS, _d0, _d1);
                    minS = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1));
                    v_int16x8 v_mask = v_setall_s16((short)minS) == _minS;

                    // keep only the lanes that hold the global minimum, take the smallest index
                    _bestDisp = (_bestDisp & v_mask) | (v_setall_s16(SHRT_MAX) & ~v_mask);
                    v_expand(_bestDisp, _d0, _d1);
                    bestDisp = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1));
                }
                else
#endif
                {
                    for( d = 0; d < D; d++ )
                    {
                        int Cpd = Cp[d], L;

                        L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;

                        Lr_p[d] = (CostType)L;
                        minLr = (CostType)std::min((int)minLr, L);

                        Sp[d] = saturate_cast<CostType>(Sp[d] + L);
                        if( Sp[d] < minS )
                        {
                            minS = Sp[d];
                            bestDisp = d;
                        }
                    }
                }
                //Some postprocessing procedures and saving
                // uniqueness check: reject the pixel if a non-adjacent disparity is
                // almost as good as the winner (within uniquenessRatio percent)
                for( d = 0; d < D; d++ )
                {
                    if( Sp[d]*(100 - uniquenessRatio) < minS*100 && std::abs(bestDisp - d) > 1 )
                        break;
                }
                if( d < D )
                    continue;
                d = bestDisp;
                // record the best match seen so far for the corresponding right-image
                // column (used by the left-right check below)
                int _x2 = x + minX1 - d - minD;
                if( disp2cost[_x2] > minS )
                {
                    disp2cost[_x2] = (CostType)minS;
                    disp2ptr[_x2] = (DispType)(d + minD);
                }

                if( 0 < d && d < D-1 )
                {
                    // do subpixel quadratic interpolation:
                    // fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
                    // then find minimum of the parabola.
                    int denom2 = std::max(Sp[d-1] + Sp[d+1] - 2*Sp[d], 1);
                    d = d*DISP_SCALE + ((Sp[d-1] - Sp[d+1])*DISP_SCALE + denom2)/(denom2*2);
                }
                else
                    d *= DISP_SCALE;
                disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE);
            }
            //Left-right check sanity procedure
            for( x = minX1; x < maxX1; x++ )
            {
                // we round the computed disparity both towards -inf and +inf and check
                // if either of the corresponding disparities in disp2 is consistent.
                // This is to give the computed disparity a chance to look valid if it is.
                int d1 = disp1ptr[x];
                if( d1 == INVALID_DISP_SCALED )
                    continue;
                int _d = d1 >> DISP_SHIFT;
                int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT;
                int _x = x - _d, x_ = x - d_;
                if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&
                    0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )
                    disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
            }
        }
    }

    static const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
    static const int DISP_SCALE = (1 << DISP_SHIFT);
    static const CostType MAX_COST = SHRT_MAX;
    static const int ALIGN = 16;
    const Mat& img1;         // left input image
    const Mat& img2;         // right input image
    Mat& disp1;              // output disparity map (DispType, scaled by DISP_SCALE)
    CostType* Cbuf;          // shared raw cost volume (filled by CalcVerticalSums)
    CostType* Sbuf;          // shared aggregated cost volume
    int minD;                // minimum disparity
    int maxD;                // maximum disparity (exclusive)
    int D;                   // number of disparities tested
    int D2;                  // D + 16 (border/alignment padding)
    int width;               // image width
    int width1;              // columns with a full disparity search range
    int height;              // image height
    int P1;                  // small-jump smoothness penalty
    int P2;                  // large-jump smoothness penalty (>= P1+1)
    int minX1;               // first column with a full search range
    int maxX1;               // one past the last such column
    size_t costBufSize;      // elements per C/S row: width1*D
    size_t CSBufSize;        // elements in the whole C (or S) volume
    size_t LrSize;           // elements in the two ping-pong Lr lines: 2*D2
    int INVALID_DISP;        // sentinel disparity: minD - 1
    int INVALID_DISP_SCALED; // sentinel after DISP_SCALE scaling
    int uniquenessRatio;     // margin (percent) for the uniqueness check
    int disp12MaxDiff;       // max allowed left-right disparity difference
    bool useSIMD;            // runtime SIMD128 availability
};
1429
/*
 computes disparity for "roi" in img1 w.r.t. img2 and writes it to disp1buf.
 that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y).
 minD <= d < maxD.

 note that disp1buf will have the same size as the roi.
 On exit disp1buf is not the final disparity; it is an intermediate result that becomes
 final after all the tiles are processed.

 the disparity in disp1buf is written with sub-pixel accuracy
 (4 fractional bits, see StereoSGBM::DISP_SCALE),
 using quadratic interpolation, while the disparity in disp2buf
 is written as is, without interpolation.
*/
1443
// 4-direction ("HH4") SGBM driver: validates/defaults the parameters, lays out
// the shared C/S cost volumes plus per-direction scratch in `buffer`, then runs
// the vertical aggregation (CalcVerticalSums, parallel over columns) followed by
// the horizontal aggregation + disparity selection (CalcHorizontalSums, parallel
// over rows). disp1 receives the sub-pixel-scaled result.
static void computeDisparitySGBM_HH4( const Mat& img1, const Mat& img2,
                                      Mat& disp1, const StereoSGBMParams& params,
                                      Mat& buffer )
{
    const int ALIGN = 16;
    const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
    const int DISP_SCALE = (1 << DISP_SHIFT);
    int minD = params.minDisparity, maxD = minD + params.numDisparities;
    Size SADWindowSize;
    SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;
    // prefilter cap: at least 15 and forced odd
    int ftzero = std::max(params.preFilterCap, 15) | 1;
    int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
    int k, width = disp1.cols, height = disp1.rows;
    int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
    int D = maxD - minD, width1 = maxX1 - minX1;
    int SH2 = SADWindowSize.height/2;
    int INVALID_DISP = minD - 1;
    int INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
    const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
    PixType clipTab[TAB_SIZE];

    // lookup table mapping a (biased) derivative value to [0, 2*ftzero]
    for( k = 0; k < TAB_SIZE; k++ )
        clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);

    // no column has a full disparity search range -> nothing to match
    if( minX1 >= maxX1 )
    {
        disp1 = Scalar::all(INVALID_DISP_SCALED);
        return;
    }

    CV_Assert( D % 16 == 0 );

    int D2 = D+16;

    // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
    // for dynamic programming we need the current row and
    // the previous row, i.e. 2 rows in total
    const int NLR = 2;

    // for each possible stereo match (img1(x,y) <=> img2(x-d,y))
    // we keep pixel difference cost (C) and the summary cost over 4 directions (S).
    // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
    size_t costBufSize = width1*D;
    size_t CSBufSize = costBufSize*height;
    size_t minLrSize = width1 , LrSize = minLrSize*D2;
    int hsumBufNRows = SH2*2 + 2;
    size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[]
        costBufSize*hsumBufNRows*sizeof(CostType) + // hsumBuf
        CSBufSize*2*sizeof(CostType) + 1024; // C, S

    if( buffer.empty() || !buffer.isContinuous() ||
        buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
    {
        buffer.reserveBuffer(totalBufSize);
    }

    // summary cost over different (nDirs) directions
    CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), ALIGN);

    // add P2 to every C(x,y). it saves a few operations in the inner loops
    for(k = 0; k < (int)CSBufSize; k++ )
        Cbuf[k] = (CostType)P2;

    // both bodies share Cbuf: vertical pass fills C and seeds S,
    // horizontal pass finishes S and writes disp1
    parallel_for_(Range(0,width1),CalcVerticalSums(img1, img2, params, Cbuf, clipTab),8);
    parallel_for_(Range(0,height),CalcHorizontalSums(img1, img2, disp1, params, Cbuf),8);

}
1510
1511
//////////////////////////////////////////////////////////////////////////////////////////////////////
1512
1513
// Carves a single scratch Mat into all working buffers of the 3-way SGBM
// pipeline (cost-volume line, box-filter ring, aggregation and LRC buffers).
// Declared here so SGBM3WayMainLoop can use it; defined below.
void getBufferPointers(Mat& buffer, int width, int width1, int D, int num_ch, int SH2, int P2,
                       CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff,
                       PixType*& tmpBuf, CostType*& horPassCostVolume,
                       CostType*& vertPassCostVolume, CostType*& vertPassMin, CostType*& rightPassBuf,
                       CostType*& disp2CostBuf, short*& disp2Buf);
1518
1519
// Parallel body of the 3-way (MODE_SGBM_3WAY) matcher: each invocation processes
// one horizontal stripe of the image, aggregating costs along three path
// directions (left, top, right). Member definitions follow below.
struct SGBM3WayMainLoop : public ParallelLoopBody
{
    Mat* buffers;            // per-stripe scratch Mats (see getBufferPointers)
    const Mat *img1, *img2;  // left / right input images
    Mat* dst_disp;           // per-stripe output disparity Mats

    int nstripes, stripe_sz; // stripe count and rows per stripe
    int stripe_overlap;      // extra rows shared between neighbouring stripes

    int width,height;
    int minD, maxD, D;       // disparity search range; D = maxD - minD
    int minX1, maxX1, width1; // columns with a full search range

    int SW2, SH2;            // matching-window half width / half height
    int P1, P2;              // smoothness penalties (P2 >= P1+1)
    int uniquenessRatio, disp12MaxDiff; // post-filtering thresholds

    int costBufSize, hsumBufNRows; // width1*D and SH2*2+2
    int TAB_OFS, ftzero;     // clipTab offset and prefilter cap

#if CV_SIMD128
    bool useSIMD;            // runtime SIMD128 availability
#endif

    PixType* clipTab;        // clipping lookup table for the BT cost

    SGBM3WayMainLoop(Mat *_buffers, const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, PixType* _clipTab, int _nstripes, int _stripe_overlap);
    void getRawMatchingCost(CostType* C, CostType* hsumBuf, CostType* pixDiff, PixType* tmpBuf, int y, int src_start_idx) const;
    void operator () (const Range& range) const CV_OVERRIDE;
};
1549
1550
// Precomputes every per-run constant of the 3-way SGBM main loop from the
// algorithm parameters, applying the same validation/defaulting rules as the
// other SGBM code paths.
SGBM3WayMainLoop::SGBM3WayMainLoop(Mat *_buffers, const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, PixType* _clipTab, int _nstripes, int _stripe_overlap):
    buffers(_buffers), img1(&_img1), img2(&_img2), dst_disp(_dst_disp), clipTab(_clipTab)
{
    // stripe partitioning of the image rows (stripe_sz depends on nstripes)
    nstripes = _nstripes;
    stripe_overlap = _stripe_overlap;
    stripe_sz = (int)ceil(img1->rows/(double)nstripes);

    // image geometry and disparity search range
    width = img1->cols;
    height = img1->rows;
    minD = params.minDisparity;
    maxD = minD + params.numDisparities;
    D = maxD - minD;
    minX1 = std::max(maxD, 0);
    maxX1 = width + std::min(minD, 0);
    width1 = maxX1 - minX1;
    CV_Assert( D % 16 == 0 );

    // matching-window half sizes and smoothness penalties (with defaults)
    SW2 = SH2 = params.SADWindowSize > 0 ? params.SADWindowSize/2 : 1;
    P1 = params.P1 > 0 ? params.P1 : 2;
    P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);

    // post-filtering thresholds (with defaults)
    uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
    disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;

    // derived buffer geometry and the prefilter cap (at least 15, forced odd)
    costBufSize = width1*D;
    hsumBufNRows = SH2*2 + 2;
    TAB_OFS = 256*4;
    ftzero = std::max(params.preFilterCap, 15) | 1;

#if CV_SIMD128
    useSIMD = hasSIMD128();
#endif
}
1577
1578
void getBufferPointers(Mat& buffer, int width, int width1, int D, int num_ch, int SH2, int P2,
1579
CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff,
1580
PixType*& tmpBuf, CostType*& horPassCostVolume,
1581
CostType*& vertPassCostVolume, CostType*& vertPassMin, CostType*& rightPassBuf,
1582
CostType*& disp2CostBuf, short*& disp2Buf)
1583
{
1584
// allocating all the required memory:
1585
int costVolumeLineSize = width1*D;
1586
int width1_ext = width1+2;
1587
int costVolumeLineSize_ext = width1_ext*D;
1588
int hsumBufNRows = SH2*2 + 2;
1589
1590
// main buffer to store matching costs for the current line:
1591
int curCostVolumeLineSize = costVolumeLineSize*sizeof(CostType);
1592
1593
// auxiliary buffers for the raw matching cost computation:
1594
int hsumBufSize = costVolumeLineSize*hsumBufNRows*sizeof(CostType);
1595
int pixDiffSize = costVolumeLineSize*sizeof(CostType);
1596
int tmpBufSize = width*16*num_ch*sizeof(PixType);
1597
1598
// auxiliary buffers for the matching cost aggregation:
1599
int horPassCostVolumeSize = costVolumeLineSize_ext*sizeof(CostType); // buffer for the 2-pass horizontal cost aggregation
1600
int vertPassCostVolumeSize = costVolumeLineSize_ext*sizeof(CostType); // buffer for the vertical cost aggregation
1601
int vertPassMinSize = width1_ext*sizeof(CostType); // buffer for storing minimum costs from the previous line
1602
int rightPassBufSize = D*sizeof(CostType); // additional small buffer for the right-to-left pass
1603
1604
// buffers for the pseudo-LRC check:
1605
int disp2CostBufSize = width*sizeof(CostType);
1606
int disp2BufSize = width*sizeof(short);
1607
1608
// sum up the sizes of all the buffers:
1609
size_t totalBufSize = curCostVolumeLineSize +
1610
hsumBufSize +
1611
pixDiffSize +
1612
tmpBufSize +
1613
horPassCostVolumeSize +
1614
vertPassCostVolumeSize +
1615
vertPassMinSize +
1616
rightPassBufSize +
1617
disp2CostBufSize +
1618
disp2BufSize +
1619
16; //to compensate for the alignPtr shifts
1620
1621
if( buffer.empty() || !buffer.isContinuous() || buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
1622
buffer.reserveBuffer(totalBufSize);
1623
1624
// set up all the pointers:
1625
curCostVolumeLine = (CostType*)alignPtr(buffer.ptr(), 16);
1626
hsumBuf = curCostVolumeLine + costVolumeLineSize;
1627
pixDiff = hsumBuf + costVolumeLineSize*hsumBufNRows;
1628
tmpBuf = (PixType*)(pixDiff + costVolumeLineSize);
1629
horPassCostVolume = (CostType*)(tmpBuf + width*16*num_ch);
1630
vertPassCostVolume = horPassCostVolume + costVolumeLineSize_ext;
1631
rightPassBuf = vertPassCostVolume + costVolumeLineSize_ext;
1632
vertPassMin = rightPassBuf + D;
1633
disp2CostBuf = vertPassMin + width1_ext;
1634
disp2Buf = disp2CostBuf + width;
1635
1636
// initialize memory:
1637
memset(buffer.ptr(),0,totalBufSize);
1638
for(int i=0;i<costVolumeLineSize;i++)
1639
curCostVolumeLine[i] = (CostType)P2; //such initialization simplifies the cost aggregation loops a bit
1640
}
1641
1642
// performing block matching and building raw cost-volume for the current row
1643
// performing block matching and building raw cost-volume for the current row
// C receives SADWindowSize-box-filtered Birchfield-Tomasi costs for row y of the
// stripe that starts at src_start_idx. hsumBuf is a ring of hsumBufNRows
// horizontally pre-summed rows, so each new row only adds the window's incoming
// bottom row and subtracts its outgoing top row.
void SGBM3WayMainLoop::getRawMatchingCost(CostType* C, // target cost-volume row
                                          CostType* hsumBuf, CostType* pixDiff, PixType* tmpBuf, //buffers
                                          int y, int src_start_idx) const
{
    int x, d;
    // on the stripe's first row prime the ring with rows src_start_idx..+SH2;
    // afterwards only the new bottom row (y + SH2) of the vertical window is added
    int dy1 = (y == src_start_idx) ? src_start_idx : y + SH2, dy2 = (y == src_start_idx) ? src_start_idx+SH2 : dy1;

    for(int k = dy1; k <= dy2; k++ )
    {
        CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;
        if( k < height )
        {
            calcPixelCostBT( *img1, *img2, k, minD, maxD, pixDiff, tmpBuf, clipTab, TAB_OFS, ftzero );

            // seed the horizontal window sum at x == 0, replicating the left border
            // (the x == 0 term is weighted SW2+1 times)
            memset(hsumAdd, 0, D*sizeof(CostType));
            for(x = 0; x <= SW2*D; x += D )
            {
                int scale = x == 0 ? SW2 + 1 : 1;

                for( d = 0; d < D; d++ )
                    hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale);
            }

            if( y > src_start_idx )
            {
                // sliding-window update: C(y) += hsum(y+SH2) - hsum(y-SH2-1)
                const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, src_start_idx) % hsumBufNRows)*costBufSize;

                for( x = D; x < width1*D; x += D )
                {
                    // horizontal sliding window over pixDiff (clamped at the borders)
                    const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
                    const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);

#if CV_SIMD128
                    if(useSIMD)
                    {
                        v_int16x8 hv_reg;
                        for( d = 0; d < D; d+=8 )
                        {
                            hv_reg = v_load_aligned(hsumAdd+x-D+d) + (v_load_aligned(pixAdd+d) - v_load_aligned(pixSub+d));
                            v_store_aligned(hsumAdd+x+d,hv_reg);
                            v_store_aligned(C+x+d,v_load_aligned(C+x+d)+(hv_reg-v_load_aligned(hsumSub+x+d)));
                        }
                    }
                    else
#endif
                    {
                        for( d = 0; d < D; d++ )
                        {
                            int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
                            C[x + d] = (CostType)(C[x + d] + hv - hsumSub[x + d]);
                        }
                    }
                }
            }
            else
            {
                // first stripe row: only build the horizontal sums; C is filled by
                // the scaled accumulation below
                for( x = D; x < width1*D; x += D )
                {
                    const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
                    const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);

                    for( d = 0; d < D; d++ )
                        hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
                }
            }
        }

        if( y == src_start_idx )
        {
            // replicate the top border row SH2+1 times so the vertical window is
            // fully populated on the stripe's first row
            int scale = k == src_start_idx ? SH2 + 1 : 1;
            for( x = 0; x < width1*D; x++ )
                C[x] = (CostType)(C[x] + hsumAdd[x]*scale);
        }
    }
}
1718
1719
#if CV_SIMD128
// define some additional reduce operations:
// Returns the smallest index stored in `pos` among the lanes of `val` that equal
// `min_val`; losing lanes are replaced by SHRT_MAX before the reduction so they
// cannot win.
inline short min_pos(const v_int16x8& val, const v_int16x8& pos, const short min_val)
{
    const v_int16x8 is_minimum = (v_setall_s16(min_val) == val);
    const v_int16x8 candidates = v_select(is_minimum, pos, v_setall_s16(SHRT_MAX));
    return v_reduce_min(candidates);
}
#endif
1730
1731
// performing SGM cost accumulation from left to right (result is stored in leftBuf) and
1732
// in-place cost accumulation from top to bottom (result is stored in topBuf)
1733
// performing SGM cost accumulation from left to right (result is stored in leftBuf) and
// in-place cost accumulation from top to bottom (result is stored in topBuf)
//
// costs         raw matching costs C(p, d) for the current pixel (D entries)
// leftBuf_prev  L_left of the previous pixel in the row; leftBuf receives L_left(p)
// topBuf        in/out: L_top of the pixel above, overwritten with L_top(p)
// leftMinCost / topMinCost  in/out: min over d of the previous L, updated to the
//                           minima of the newly computed lines
// The SIMD path keeps a 3-register sliding window (src0|src1|src2) so the d-1 and
// d+1 neighbours can be formed with v_extract lane shifts; out-of-range
// neighbours are SHRT_MAX so they never win the min.
inline void accumulateCostsLeftTop(CostType* leftBuf, CostType* leftBuf_prev, CostType* topBuf, CostType* costs,
                                   CostType& leftMinCost, CostType& topMinCost, int D, int P1, int P2)
{
#if CV_SIMD128
    if(hasSIMD128())
    {
        v_int16x8 P1_reg = v_setall_s16(cv::saturate_cast<CostType>(P1));

        v_int16x8 leftMinCostP2_reg = v_setall_s16(cv::saturate_cast<CostType>(leftMinCost+P2));
        v_int16x8 leftMinCost_new_reg = v_setall_s16(SHRT_MAX);
        v_int16x8 src0_leftBuf = v_setall_s16(SHRT_MAX);
        v_int16x8 src1_leftBuf = v_load_aligned(leftBuf_prev);

        v_int16x8 topMinCostP2_reg = v_setall_s16(cv::saturate_cast<CostType>(topMinCost+P2));
        v_int16x8 topMinCost_new_reg = v_setall_s16(SHRT_MAX);
        v_int16x8 src0_topBuf = v_setall_s16(SHRT_MAX);
        v_int16x8 src1_topBuf = v_load_aligned(topBuf);

        v_int16x8 src2;
        v_int16x8 src_shifted_left,src_shifted_right;
        v_int16x8 res;

        for(int i=0;i<D-8;i+=8)
        {
            //process leftBuf:
            //lookahead load:
            src2 = v_load_aligned(leftBuf_prev+i+8);

            //get shifted versions of the current block and add P1:
            src_shifted_left  = v_extract<7> (src0_leftBuf,src1_leftBuf) + P1_reg;
            src_shifted_right = v_extract<1> (src1_leftBuf,src2        ) + P1_reg;

            // process and save current block:
            res = v_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg);
            leftMinCost_new_reg = v_min(leftMinCost_new_reg,res);
            v_store_aligned(leftBuf+i, res);

            //update src buffers:
            src0_leftBuf = src1_leftBuf;
            src1_leftBuf = src2;

            //process topBuf:
            //lookahead load:
            src2 = v_load_aligned(topBuf+i+8);

            //get shifted versions of the current block and add P1:
            src_shifted_left  = v_extract<7> (src0_topBuf,src1_topBuf) + P1_reg;
            src_shifted_right = v_extract<1> (src1_topBuf,src2       ) + P1_reg;

            // process and save current block:
            res = v_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg);
            topMinCost_new_reg = v_min(topMinCost_new_reg,res);
            v_store_aligned(topBuf+i, res);

            //update src buffers:
            src0_topBuf = src1_topBuf;
            src1_topBuf = src2;
        }

        // a bit different processing for the last cycle of the loop
        // (no lookahead block exists, so the d+1 neighbour is SHRT_MAX):
        //process leftBuf:
        src2 = v_setall_s16(SHRT_MAX);
        src_shifted_left  = v_extract<7> (src0_leftBuf,src1_leftBuf) + P1_reg;
        src_shifted_right = v_extract<1> (src1_leftBuf,src2        ) + P1_reg;

        res = v_load_aligned(costs+D-8) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg);
        leftMinCost = v_reduce_min(v_min(leftMinCost_new_reg,res));
        v_store_aligned(leftBuf+D-8, res);

        //process topBuf:
        src2 = v_setall_s16(SHRT_MAX);
        src_shifted_left  = v_extract<7> (src0_topBuf,src1_topBuf) + P1_reg;
        src_shifted_right = v_extract<1> (src1_topBuf,src2       ) + P1_reg;

        res = v_load_aligned(costs+D-8) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg);
        topMinCost = v_reduce_min(v_min(topMinCost_new_reg,res));
        v_store_aligned(topBuf+D-8, res);
    }
    else
#endif
    {
        CostType leftMinCost_new = SHRT_MAX;
        CostType topMinCost_new = SHRT_MAX;
        int leftMinCost_P2 = leftMinCost + P2;
        int topMinCost_P2 = topMinCost + P2;
        // previous-d values, kept in locals because topBuf is updated in place
        CostType leftBuf_prev_i_minus_1 = SHRT_MAX;
        CostType topBuf_i_minus_1 = SHRT_MAX;
        CostType tmp;

        for(int i=0;i<D-1;i++)
        {
            leftBuf[i] = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(leftBuf_prev_i_minus_1+P1,leftBuf_prev[i+1]+P1),std::min((int)leftBuf_prev[i],leftMinCost_P2))-leftMinCost_P2);
            leftBuf_prev_i_minus_1 = leftBuf_prev[i];
            leftMinCost_new = std::min(leftMinCost_new,leftBuf[i]);

            tmp = topBuf[i];
            topBuf[i] = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(topBuf_i_minus_1+P1,topBuf[i+1]+P1),std::min((int)topBuf[i],topMinCost_P2))-topMinCost_P2);
            topBuf_i_minus_1 = tmp;
            topMinCost_new = std::min(topMinCost_new,topBuf[i]);
        }

        // last disparity has no d+1 neighbour
        leftBuf[D-1] = cv::saturate_cast<CostType>(costs[D-1] + std::min(leftBuf_prev_i_minus_1+P1,std::min((int)leftBuf_prev[D-1],leftMinCost_P2))-leftMinCost_P2);
        leftMinCost = std::min(leftMinCost_new,leftBuf[D-1]);

        topBuf[D-1] = cv::saturate_cast<CostType>(costs[D-1] + std::min(topBuf_i_minus_1+P1,std::min((int)topBuf[D-1],topMinCost_P2))-topMinCost_P2);
        topMinCost = std::min(topMinCost_new,topBuf[D-1]);
    }
}
1841
1842
// performing in-place SGM cost accumulation from right to left (the result is stored in rightBuf) and
// summing rightBuf, topBuf, leftBuf together (the result is stored in leftBuf), as well as finding the
// optimal disparity value with minimum accumulated cost
// Parameters:
//   rightBuf     - accumulated costs from the right-pass direction (updated in place)
//   topBuf       - accumulated costs from the top-pass direction (read only)
//   leftBuf      - accumulated costs from the left-pass direction; on exit holds the
//                  three-direction sum rightBuf+topBuf+leftBuf
//   costs        - raw matching costs for the current pixel, D entries
//   rightMinCost - in: minimum right-pass cost of the previous pixel; out: minimum of this pixel
//   D            - number of disparities (the SIMD path assumes D is a multiple of 8, D >= 16)
//   P1, P2       - SGM smoothness penalties (P1 for +/-1 disparity change, P2 for larger jumps)
//   optimal_disp - out: disparity index with the minimum summed cost
//   min_cost     - out: that minimum summed cost
inline void accumulateCostsRight(CostType* rightBuf, CostType* topBuf, CostType* leftBuf, CostType* costs,
                                 CostType& rightMinCost, int D, int P1, int P2, int& optimal_disp, CostType& min_cost)
{
#if CV_SIMD128
    if(hasSIMD128())
    {
        v_int16x8 P1_reg = v_setall_s16(cv::saturate_cast<CostType>(P1));

        // rightMinCost+P2 caps the transition cost for disparity jumps larger than 1:
        v_int16x8 rightMinCostP2_reg = v_setall_s16(cv::saturate_cast<CostType>(rightMinCost+P2));
        v_int16x8 rightMinCost_new_reg = v_setall_s16(SHRT_MAX);
        // src0/src1/src2 form a sliding three-block window over rightBuf so the
        // d-1 and d+1 neighbours can be produced with v_extract:
        v_int16x8 src0_rightBuf = v_setall_s16(SHRT_MAX);
        v_int16x8 src1_rightBuf = v_load(rightBuf);

        v_int16x8 src2;
        v_int16x8 src_shifted_left,src_shifted_right;
        v_int16x8 res;

        // per-lane running minimum of the summed cost and the disparity index it occurred at:
        v_int16x8 min_sum_cost_reg = v_setall_s16(SHRT_MAX);
        v_int16x8 min_sum_pos_reg = v_setall_s16(0);
        v_int16x8 loop_idx(0,1,2,3,4,5,6,7);
        v_int16x8 eight_reg = v_setall_s16(8);

        for(int i=0;i<D-8;i+=8)
        {
            //lookahead load:
            src2 = v_load_aligned(rightBuf+i+8);

            //get shifted versions of the current block and add P1:
            src_shifted_left = v_extract<7> (src0_rightBuf,src1_rightBuf) + P1_reg;
            src_shifted_right = v_extract<1> (src1_rightBuf,src2 ) + P1_reg;

            // process and save current block (standard SGM recurrence, normalized by rightMinCost+P2):
            res = v_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg);
            rightMinCost_new_reg = v_min(rightMinCost_new_reg,res);
            v_store_aligned(rightBuf+i, res);

            // compute and save total cost:
            res = res + v_load_aligned(leftBuf+i) + v_load_aligned(topBuf+i);
            v_store_aligned(leftBuf+i, res);

            // track disparity value with the minimum cost:
            // (when a lane's minimum is updated, blend the current index into min_sum_pos_reg)
            min_sum_cost_reg = v_min(min_sum_cost_reg,res);
            min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (loop_idx - min_sum_pos_reg));
            loop_idx = loop_idx+eight_reg;

            //update src:
            src0_rightBuf = src1_rightBuf;
            src1_rightBuf = src2;
        }

        // a bit different processing for the last cycle of the loop:
        // (there is no block after the last one, so pad the lookahead with SHRT_MAX)
        src2 = v_setall_s16(SHRT_MAX);
        src_shifted_left = v_extract<7> (src0_rightBuf,src1_rightBuf) + P1_reg;
        src_shifted_right = v_extract<1> (src1_rightBuf,src2 ) + P1_reg;

        res = v_load_aligned(costs+D-8) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg);
        rightMinCost = v_reduce_min(v_min(rightMinCost_new_reg,res));
        v_store_aligned(rightBuf+D-8, res);

        res = res + v_load_aligned(leftBuf+D-8) + v_load_aligned(topBuf+D-8);
        v_store_aligned(leftBuf+D-8, res);

        min_sum_cost_reg = v_min(min_sum_cost_reg,res);
        min_cost = v_reduce_min(min_sum_cost_reg);
        min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (loop_idx - min_sum_pos_reg));
        // reduce the per-lane minima/positions to the single best disparity:
        optimal_disp = min_pos(min_sum_cost_reg,min_sum_pos_reg, min_cost);
    }
    else
#endif
    {
        // scalar fallback: identical recurrence, one disparity at a time
        CostType rightMinCost_new = SHRT_MAX;
        int rightMinCost_P2 = rightMinCost + P2;
        CostType rightBuf_i_minus_1 = SHRT_MAX;  // rightBuf[i-1] value from BEFORE it was overwritten
        CostType tmp;
        min_cost = SHRT_MAX;

        for(int i=0;i<D-1;i++)
        {
            tmp = rightBuf[i];
            rightBuf[i] = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(rightBuf_i_minus_1+P1,rightBuf[i+1]+P1),std::min((int)rightBuf[i],rightMinCost_P2))-rightMinCost_P2);
            rightBuf_i_minus_1 = tmp;
            rightMinCost_new = std::min(rightMinCost_new,rightBuf[i]);
            // accumulate the three-direction sum into leftBuf and track the minimum:
            leftBuf[i] = cv::saturate_cast<CostType>((int)leftBuf[i]+rightBuf[i]+topBuf[i]);
            if(leftBuf[i]<min_cost)
            {
                optimal_disp = i;
                min_cost = leftBuf[i];
            }
        }

        // the last disparity has no d+1 neighbour in the recurrence:
        rightBuf[D-1] = cv::saturate_cast<CostType>(costs[D-1] + std::min(rightBuf_i_minus_1+P1,std::min((int)rightBuf[D-1],rightMinCost_P2))-rightMinCost_P2);
        rightMinCost = std::min(rightMinCost_new,rightBuf[D-1]);
        leftBuf[D-1] = cv::saturate_cast<CostType>((int)leftBuf[D-1]+rightBuf[D-1]+topBuf[D-1]);
        if(leftBuf[D-1]<min_cost)
        {
            optimal_disp = D-1;
            min_cost = leftBuf[D-1];
        }
    }
}
1945
1946
// Processes one horizontal stripe of the image pair: computes raw matching costs
// line by line, runs the 3-way SGM aggregation (left+top forward pass, right
// backward pass), applies uniqueness and pseudo left-right consistency checks,
// and writes sub-pixel disparities into the stripe's dst_disp Mat.
void SGBM3WayMainLoop::operator () (const Range& range) const
{
    // force separate processing of stripes:
    // (parallel_for_ may hand us a multi-stripe range; recurse so each stripe
    //  uses its own buffers and bookkeeping)
    if(range.end>range.start+1)
    {
        for(int n=range.start;n<range.end;n++)
            (*this)(Range(n,n+1));
        return;
    }

    const int DISP_SCALE = (1 << StereoMatcher::DISP_SHIFT);
    int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;

    // setting up the ranges:
    // each stripe starts stripe_overlap rows early so SGM paths "warm up"
    // before the rows that are actually copied to the output
    int src_start_idx = std::max(std::min(range.start * stripe_sz - stripe_overlap, height),0);
    int src_end_idx = std::min(range.end * stripe_sz, height);

    int dst_offset;
    if(range.start==0)
        dst_offset=stripe_overlap;   // first stripe has no overlap rows above it
    else
        dst_offset=0;

    Mat cur_buffer = buffers [range.start];
    Mat cur_disp = dst_disp[range.start];
    cur_disp = Scalar(INVALID_DISP_SCALED);

    // prepare buffers:
    CostType *curCostVolumeLine, *hsumBuf, *pixDiff;
    PixType* tmpBuf;
    CostType *horPassCostVolume, *vertPassCostVolume, *vertPassMin, *rightPassBuf, *disp2CostBuf;
    short* disp2Buf;
    getBufferPointers(cur_buffer,width,width1,D,img1->channels(),SH2,P2,
                      curCostVolumeLine,hsumBuf,pixDiff,tmpBuf,horPassCostVolume,
                      vertPassCostVolume,vertPassMin,rightPassBuf,disp2CostBuf,disp2Buf);

    // start real processing:
    for(int y=src_start_idx;y<src_end_idx;y++)
    {
        getRawMatchingCost(curCostVolumeLine,hsumBuf,pixDiff,tmpBuf,y,src_start_idx);

        short* disp_row = (short*)cur_disp.ptr(dst_offset+(y-src_start_idx));

        // initialize the auxiliary buffers for the pseudo left-right consistency check:
        for(int x=0;x<width;x++)
        {
            disp2Buf[x] = (short)INVALID_DISP_SCALED;
            disp2CostBuf[x] = SHRT_MAX;
        }
        // C is indexed as C[x] with x a multiple of D starting at D, hence the -D bias:
        CostType* C = curCostVolumeLine - D;
        CostType prev_min, min_cost;
        int d, best_d;
        d = best_d = 0;

        // forward pass
        prev_min=0;
        for (int x=D;x<(1+width1)*D;x+=D)
            accumulateCostsLeftTop(horPassCostVolume+x,horPassCostVolume+x-D,vertPassCostVolume+x,C+x,prev_min,vertPassMin[x/D],D,P1,P2);

        //backward pass
        memset(rightPassBuf,0,D*sizeof(CostType));
        prev_min=0;
        for (int x=width1*D;x>=D;x-=D)
        {
            accumulateCostsRight(rightPassBuf,vertPassCostVolume+x,horPassCostVolume+x,C+x,prev_min,D,P1,P2,best_d,min_cost);

            // uniqueness check: reject best_d if some other disparity (farther than
            // +/-1 from it) has a summed cost within uniquenessRatio percent of min_cost
            if(uniquenessRatio>0)
            {
#if CV_SIMD128
                if(useSIMD)
                {
                    horPassCostVolume+=x;
                    int thresh = (100*min_cost)/(100-uniquenessRatio);
                    v_int16x8 thresh_reg = v_setall_s16((short)(thresh+1));
                    v_int16x8 d1 = v_setall_s16((short)(best_d-1));
                    v_int16x8 d2 = v_setall_s16((short)(best_d+1));
                    v_int16x8 eight_reg = v_setall_s16(8);
                    v_int16x8 cur_d(0,1,2,3,4,5,6,7);
                    v_int16x8 mask,cost1,cost2;

                    // scan two 8-lane blocks per iteration; break as soon as any
                    // lane violates uniqueness (d stays < D in that case)
                    for( d = 0; d < D; d+=16 )
                    {
                        cost1 = v_load_aligned(horPassCostVolume+d);
                        cost2 = v_load_aligned(horPassCostVolume+d+8);

                        mask = cost1 < thresh_reg;
                        mask = mask & ( (cur_d<d1) | (cur_d>d2) );
                        if( v_signmask(mask) )
                            break;

                        cur_d = cur_d+eight_reg;

                        mask = cost2 < thresh_reg;
                        mask = mask & ( (cur_d<d1) | (cur_d>d2) );
                        if( v_signmask(mask) )
                            break;

                        cur_d = cur_d+eight_reg;
                    }
                    horPassCostVolume-=x;
                }
                else
#endif
                {
                    for( d = 0; d < D; d++ )
                    {
                        if( horPassCostVolume[x+d]*(100 - uniquenessRatio) < min_cost*100 && std::abs(d - best_d) > 1 )
                            break;
                    }
                }
                if( d < D )
                    continue;   // ambiguous match: leave the pixel invalid
            }
            d = best_d;

            // record the best candidate for the corresponding right-image column
            // (used later by the pseudo left-right consistency check):
            int _x2 = x/D - 1 + minX1 - d - minD;
            if( _x2>=0 && _x2<width && disp2CostBuf[_x2] > min_cost )
            {
                disp2CostBuf[_x2] = min_cost;
                disp2Buf[_x2] = (short)(d + minD);
            }

            if( 0 < d && d < D-1 )
            {
                // do subpixel quadratic interpolation:
                //   fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
                //   then find minimum of the parabola.
                int denom2 = std::max(horPassCostVolume[x+d-1] + horPassCostVolume[x+d+1] - 2*horPassCostVolume[x+d], 1);
                d = d*DISP_SCALE + ((horPassCostVolume[x+d-1] - horPassCostVolume[x+d+1])*DISP_SCALE + denom2)/(denom2*2);
            }
            else
                d *= DISP_SCALE;

            disp_row[(x/D)-1 + minX1] = (DispType)(d + minD*DISP_SCALE);
        }

        for(int x = minX1; x < maxX1; x++ )
        {
            // pseudo LRC consistency check using only one disparity map;
            // pixels with difference more than disp12MaxDiff are invalidated
            int d1 = disp_row[x];
            if( d1 == INVALID_DISP_SCALED )
                continue;
            // round the sub-pixel disparity both down (_d) and up (d_) and accept
            // the pixel if either rounding is consistent with the right-view map:
            int _d = d1 >> StereoMatcher::DISP_SHIFT;
            int d_ = (d1 + DISP_SCALE-1) >> StereoMatcher::DISP_SHIFT;
            int _x = x - _d, x_ = x - d_;
            if( 0 <= _x && _x < width && disp2Buf[_x] >= minD && std::abs(disp2Buf[_x] - _d) > disp12MaxDiff &&
                0 <= x_ && x_ < width && disp2Buf[x_] >= minD && std::abs(disp2Buf[x_] - d_) > disp12MaxDiff )
                disp_row[x] = (short)INVALID_DISP_SCALED;
        }
    }
}
2098
2099
static void computeDisparity3WaySGBM( const Mat& img1, const Mat& img2,
2100
Mat& disp1, const StereoSGBMParams& params,
2101
Mat* buffers, int nstripes )
2102
{
2103
// precompute a lookup table for the raw matching cost computation:
2104
const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
2105
PixType* clipTab = new PixType[TAB_SIZE];
2106
int ftzero = std::max(params.preFilterCap, 15) | 1;
2107
for(int k = 0; k < TAB_SIZE; k++ )
2108
clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
2109
2110
// allocate separate dst_disp arrays to avoid conflicts due to stripe overlap:
2111
int stripe_sz = (int)ceil(img1.rows/(double)nstripes);
2112
int stripe_overlap = (params.SADWindowSize/2+1) + (int)ceil(0.1*stripe_sz);
2113
Mat* dst_disp = new Mat[nstripes];
2114
for(int i=0;i<nstripes;i++)
2115
dst_disp[i].create(stripe_sz+stripe_overlap,img1.cols,CV_16S);
2116
2117
parallel_for_(Range(0,nstripes),SGBM3WayMainLoop(buffers,img1,img2,dst_disp,params,clipTab,nstripes,stripe_overlap));
2118
2119
//assemble disp1 from dst_disp:
2120
short* dst_row;
2121
short* src_row;
2122
for(int i=0;i<disp1.rows;i++)
2123
{
2124
dst_row = (short*)disp1.ptr(i);
2125
src_row = (short*)dst_disp[i/stripe_sz].ptr(stripe_overlap+i%stripe_sz);
2126
memcpy(dst_row,src_row,disp1.cols*sizeof(short));
2127
}
2128
2129
delete[] clipTab;
2130
delete[] dst_disp;
2131
}
2132
2133
// Concrete StereoSGBM implementation: holds a StereoSGBMParams instance,
// dispatches compute() to the mode-specific SGM routine, and supports
// FileStorage (de)serialization of all parameters.
class StereoSGBMImpl CV_FINAL : public StereoSGBM
{
public:
    StereoSGBMImpl()
        : params()
    {
    }

    StereoSGBMImpl( int _minDisparity, int _numDisparities, int _SADWindowSize,
                    int _P1, int _P2, int _disp12MaxDiff, int _preFilterCap,
                    int _uniquenessRatio, int _speckleWindowSize, int _speckleRange,
                    int _mode )
        : params( _minDisparity, _numDisparities, _SADWindowSize,
                  _P1, _P2, _disp12MaxDiff, _preFilterCap,
                  _uniquenessRatio, _speckleWindowSize, _speckleRange,
                  _mode )
    {
    }

    // Computes a CV_16S fixed-point disparity map for a rectified 8-bit pair,
    // then median-filters it and (optionally) removes small speckle regions.
    void compute( InputArray leftarr, InputArray rightarr, OutputArray disparr ) CV_OVERRIDE
    {
        CV_INSTRUMENT_REGION();

        Mat left = leftarr.getMat(), right = rightarr.getMat();
        CV_Assert( left.size() == right.size() && left.type() == right.type() &&
                   left.depth() == CV_8U );

        disparr.create( left.size(), CV_16S );
        Mat disp = disparr.getMat();

        // dispatch to the algorithm variant selected by params.mode:
        switch( params.mode )
        {
        case MODE_SGBM_3WAY:
            computeDisparity3WaySGBM( left, right, disp, params, buffers, num_stripes );
            break;
        case MODE_HH4:
            computeDisparitySGBM_HH4( left, right, disp, params, buffer );
            break;
        default:
            computeDisparitySGBM( left, right, disp, params, buffer );
            break;
        }

        medianBlur(disp, disp, 3);

        if( params.speckleWindowSize > 0 )
            filterSpeckles(disp, (params.minDisparity - 1)*StereoMatcher::DISP_SCALE, params.speckleWindowSize,
                           StereoMatcher::DISP_SCALE*params.speckleRange, buffer);
    }

    // --- trivial parameter accessors ---
    int getMinDisparity() const CV_OVERRIDE { return params.minDisparity; }
    void setMinDisparity(int minDisparity) CV_OVERRIDE { params.minDisparity = minDisparity; }

    int getNumDisparities() const CV_OVERRIDE { return params.numDisparities; }
    void setNumDisparities(int numDisparities) CV_OVERRIDE { params.numDisparities = numDisparities; }

    int getBlockSize() const CV_OVERRIDE { return params.SADWindowSize; }
    void setBlockSize(int blockSize) CV_OVERRIDE { params.SADWindowSize = blockSize; }

    int getSpeckleWindowSize() const CV_OVERRIDE { return params.speckleWindowSize; }
    void setSpeckleWindowSize(int speckleWindowSize) CV_OVERRIDE { params.speckleWindowSize = speckleWindowSize; }

    int getSpeckleRange() const CV_OVERRIDE { return params.speckleRange; }
    void setSpeckleRange(int speckleRange) CV_OVERRIDE { params.speckleRange = speckleRange; }

    int getDisp12MaxDiff() const CV_OVERRIDE { return params.disp12MaxDiff; }
    void setDisp12MaxDiff(int disp12MaxDiff) CV_OVERRIDE { params.disp12MaxDiff = disp12MaxDiff; }

    int getPreFilterCap() const CV_OVERRIDE { return params.preFilterCap; }
    void setPreFilterCap(int preFilterCap) CV_OVERRIDE { params.preFilterCap = preFilterCap; }

    int getUniquenessRatio() const CV_OVERRIDE { return params.uniquenessRatio; }
    void setUniquenessRatio(int uniquenessRatio) CV_OVERRIDE { params.uniquenessRatio = uniquenessRatio; }

    int getP1() const CV_OVERRIDE { return params.P1; }
    void setP1(int P1) CV_OVERRIDE { params.P1 = P1; }

    int getP2() const CV_OVERRIDE { return params.P2; }
    void setP2(int P2) CV_OVERRIDE { params.P2 = P2; }

    int getMode() const CV_OVERRIDE { return params.mode; }
    void setMode(int mode) CV_OVERRIDE { params.mode = mode; }

    // Serializes all parameters under the algorithm's canonical name.
    void write(FileStorage& fs) const CV_OVERRIDE
    {
        writeFormat(fs);
        fs << "name" << name_
           << "minDisparity" << params.minDisparity
           << "numDisparities" << params.numDisparities
           << "blockSize" << params.SADWindowSize
           << "speckleWindowSize" << params.speckleWindowSize
           << "speckleRange" << params.speckleRange
           << "disp12MaxDiff" << params.disp12MaxDiff
           << "preFilterCap" << params.preFilterCap
           << "uniquenessRatio" << params.uniquenessRatio
           << "P1" << params.P1
           << "P2" << params.P2
           << "mode" << params.mode;
    }

    // Restores parameters written by write(); asserts on a name mismatch.
    void read(const FileNode& fn) CV_OVERRIDE
    {
        FileNode n = fn["name"];
        CV_Assert( n.isString() && String(n) == name_ );
        params.minDisparity = (int)fn["minDisparity"];
        params.numDisparities = (int)fn["numDisparities"];
        params.SADWindowSize = (int)fn["blockSize"];
        params.speckleWindowSize = (int)fn["speckleWindowSize"];
        params.speckleRange = (int)fn["speckleRange"];
        params.disp12MaxDiff = (int)fn["disp12MaxDiff"];
        params.preFilterCap = (int)fn["preFilterCap"];
        params.uniquenessRatio = (int)fn["uniquenessRatio"];
        params.P1 = (int)fn["P1"];
        params.P2 = (int)fn["P2"];
        params.mode = (int)fn["mode"];
    }

    StereoSGBMParams params;
    Mat buffer;

    // the number of stripes is fixed, disregarding the number of threads/processors
    // to make the results fully reproducible:
    static const int num_stripes = 4;
    Mat buffers[num_stripes];

    static const char* name_;
};
2254
2255
// canonical algorithm name used by write() and validated by read()
const char* StereoSGBMImpl::name_ = "StereoMatcher.SGBM";
2256
2257
2258
// Factory for the StereoSGBM matcher; forwards all tuning parameters to the
// implementation's constructor and returns it behind the public interface.
Ptr<StereoSGBM> StereoSGBM::create(int minDisparity, int numDisparities, int SADWindowSize,
                                   int P1, int P2, int disp12MaxDiff,
                                   int preFilterCap, int uniquenessRatio,
                                   int speckleWindowSize, int speckleRange,
                                   int mode)
{
    return makePtr<StereoSGBMImpl>(minDisparity, numDisparities, SADWindowSize,
                                   P1, P2, disp12MaxDiff,
                                   preFilterCap, uniquenessRatio,
                                   speckleWindowSize, speckleRange,
                                   mode);
}
2271
2272
// Computes the rectangle of pixels whose disparity can be validly estimated,
// given the rectification ROIs of both images, the disparity search range and
// the matching block size. Returns an empty Rect when no valid region exists.
Rect getValidDisparityROI( Rect roi1, Rect roi2,
                           int minDisparity,
                           int numberOfDisparities,
                           int SADWindowSize )
{
    const int border = SADWindowSize/2;                      // half the matching window
    const int maxD = minDisparity + numberOfDisparities - 1; // largest searched disparity

    Rect r;
    // left edge must leave room for the largest disparity shift into roi2:
    r.x = std::max(roi1.x, roi2.x + maxD) + border;
    r.y = std::max(roi1.y, roi2.y) + border;
    r.width  = std::min(roi1.x + roi1.width,  roi2.x + roi2.width)  - border - r.x;
    r.height = std::min(roi1.y + roi1.height, roi2.y + roi2.height) - border - r.y;

    return (r.width > 0 && r.height > 0) ? r : Rect();
}
2289
2290
typedef cv::Point_<short> Point2s; // compact 16-bit point used by the speckle-filter wavefront buffer
2291
2292
// Connected-component speckle filter. Pixels connected through neighbours whose
// disparity differs by at most maxDiff form a region; regions of area
// <= maxSpeckleSize are overwritten with newVal (the "invalid" marker).
// T is the pixel type (uchar or short). _buf is reused scratch memory holding
// per-pixel labels, the wavefront stack and a per-label small/large flag.
template <typename T>
void filterSpecklesImpl(cv::Mat& img, int newVal, int maxSpeckleSize, int maxDiff, cv::Mat& _buf)
{
    using namespace cv;

    int width = img.cols, height = img.rows, npixels = width*height;
    size_t bufSize = npixels*(int)(sizeof(Point2s) + sizeof(int) + sizeof(uchar));
    if( !_buf.isContinuous() || _buf.empty() || _buf.cols*_buf.rows*_buf.elemSize() < bufSize )
        _buf.reserveBuffer(bufSize);

    // carve the scratch buffer into labels / wavefront / region-type arrays:
    uchar* buf = _buf.ptr();
    int i, j, dstep = (int)(img.step/sizeof(T));
    int* labels = (int*)buf;
    buf += npixels*sizeof(labels[0]);
    Point2s* wbuf = (Point2s*)buf;
    buf += npixels*sizeof(wbuf[0]);
    uchar* rtype = (uchar*)buf;
    int curlabel = 0;

    // clear out label assignments
    memset(labels, 0, npixels*sizeof(labels[0]));

    for( i = 0; i < height; i++ )
    {
        T* ds = img.ptr<T>(i);
        int* ls = labels + width*i;

        for( j = 0; j < width; j++ )
        {
            if( ds[j] != newVal )   // not a bad disparity
            {
                if( ls[j] )     // has a label, check for bad label
                {
                    if( rtype[ls[j]] ) // small region, zero out disparity
                        ds[j] = (T)newVal;
                }
                // no label, assign and propagate
                else
                {
                    Point2s* ws = wbuf; // initialize wavefront
                    Point2s p((short)j, (short)i);  // current pixel
                    curlabel++; // next label
                    int count = 0;  // current region size
                    ls[j] = curlabel;

                    // wavefront propagation (4-connected flood fill using
                    // ws as a LIFO stack of pending pixels)
                    while( ws >= wbuf ) // wavefront not empty
                    {
                        count++;
                        // put neighbors onto wavefront
                        T* dpp = &img.at<T>(p.y, p.x);
                        T dp = *dpp;
                        int* lpp = labels + width*p.y + p.x;

                        // below
                        if( p.y < height-1 && !lpp[+width] && dpp[+dstep] != newVal && std::abs(dp - dpp[+dstep]) <= maxDiff )
                        {
                            lpp[+width] = curlabel;
                            *ws++ = Point2s(p.x, p.y+1);
                        }

                        // above
                        if( p.y > 0 && !lpp[-width] && dpp[-dstep] != newVal && std::abs(dp - dpp[-dstep]) <= maxDiff )
                        {
                            lpp[-width] = curlabel;
                            *ws++ = Point2s(p.x, p.y-1);
                        }

                        // right
                        if( p.x < width-1 && !lpp[+1] && dpp[+1] != newVal && std::abs(dp - dpp[+1]) <= maxDiff )
                        {
                            lpp[+1] = curlabel;
                            *ws++ = Point2s(p.x+1, p.y);
                        }

                        // left
                        if( p.x > 0 && !lpp[-1] && dpp[-1] != newVal && std::abs(dp - dpp[-1]) <= maxDiff )
                        {
                            lpp[-1] = curlabel;
                            *ws++ = Point2s(p.x-1, p.y);
                        }

                        // pop most recent and propagate
                        // NB: could try least recent, maybe better convergence
                        p = *--ws;
                    }

                    // assign label type
                    if( count <= maxSpeckleSize )   // speckle region
                    {
                        rtype[ls[j]] = 1;   // small region label
                        ds[j] = (T)newVal;
                    }
                    else
                        rtype[ls[j]] = 0;   // large region label
                }
            }
        }
    }
}
2388
2389
#ifdef HAVE_IPP
2390
static bool ipp_filterSpeckles(Mat &img, int maxSpeckleSize, int newVal, int maxDiff, Mat &buffer)
2391
{
2392
#if IPP_VERSION_X100 >= 810
2393
CV_INSTRUMENT_REGION_IPP();
2394
2395
IppDataType dataType = ippiGetDataType(img.depth());
2396
IppiSize size = ippiSize(img.size());
2397
int bufferSize;
2398
2399
if(img.channels() != 1)
2400
return false;
2401
2402
if(dataType != ipp8u && dataType != ipp16s)
2403
return false;
2404
2405
if(ippiMarkSpecklesGetBufferSize(size, dataType, 1, &bufferSize) < 0)
2406
return false;
2407
2408
if(bufferSize && (buffer.empty() || (int)(buffer.step*buffer.rows) < bufferSize))
2409
buffer.create(1, (int)bufferSize, CV_8U);
2410
2411
switch(dataType)
2412
{
2413
case ipp8u: return CV_INSTRUMENT_FUN_IPP(ippiMarkSpeckles_8u_C1IR, img.ptr<Ipp8u>(), (int)img.step, size, (Ipp8u)newVal, maxSpeckleSize, (Ipp8u)maxDiff, ippiNormL1, buffer.ptr<Ipp8u>()) >= 0;
2414
case ipp16s: return CV_INSTRUMENT_FUN_IPP(ippiMarkSpeckles_16s_C1IR, img.ptr<Ipp16s>(), (int)img.step, size, (Ipp16s)newVal, maxSpeckleSize, (Ipp16s)maxDiff, ippiNormL1, buffer.ptr<Ipp8u>()) >= 0;
2415
default: return false;
2416
}
2417
#else
2418
CV_UNUSED(img); CV_UNUSED(maxSpeckleSize); CV_UNUSED(newVal); CV_UNUSED(maxDiff); CV_UNUSED(buffer);
2419
return false;
2420
#endif
2421
}
2422
#endif
2423
2424
}
2425
2426
// Public speckle-filter entry point: validates the input type, resolves the
// optional user-supplied scratch buffer, tries the IPP fast path, and finally
// dispatches to the generic implementation for the pixel type at hand.
void cv::filterSpeckles( InputOutputArray _img, double _newval, int maxSpeckleSize,
                         double _maxDiff, InputOutputArray __buf )
{
    CV_INSTRUMENT_REGION();

    Mat img = _img.getMat();
    int type = img.type();
    CV_Assert( type == CV_8UC1 || type == CV_16SC1 );

    // use the caller's buffer when provided so its allocation can be reused
    // across calls; otherwise fall back to a local temporary:
    Mat localBuf;
    Mat& scratch = __buf.needed() ? __buf.getMatRef() : localBuf;

    int fillVal = cvRound(_newval);
    int diffThresh = cvRound(_maxDiff);

    CV_IPP_RUN_FAST(ipp_filterSpeckles(img, maxSpeckleSize, fillVal, diffThresh, scratch));

    if (type == CV_16SC1)
        filterSpecklesImpl<short>(img, fillVal, maxSpeckleSize, diffThresh, scratch);
    else
        filterSpecklesImpl<uchar>(img, fillVal, maxSpeckleSize, diffThresh, scratch);
}
2445
2446
// Left-right consistency validation of a CV_16S fixed-point disparity map.
// For each row, left-view disparities are projected into the right view keeping
// the lowest-cost candidate per column; left pixels whose disparity disagrees
// with BOTH roundings of that projection by more than disp12MaxDiff are
// invalidated (set to minDisparity-1, scaled).
void cv::validateDisparity( InputOutputArray _disp, InputArray _cost, int minDisparity,
                            int numberOfDisparities, int disp12MaxDiff )
{
    CV_INSTRUMENT_REGION();

    Mat disp = _disp.getMat(), cost = _cost.getMat();
    int cols = disp.cols, rows = disp.rows;
    int minD = minDisparity, maxD = minDisparity + numberOfDisparities;
    // minX1..maxX1 is the column range where a full disparity search was possible:
    int x, minX1 = std::max(maxD, 0), maxX1 = cols + std::min(minD, 0);
    AutoBuffer<int> _disp2buf(cols*2);
    int* disp2buf = _disp2buf.data();
    int* disp2cost = disp2buf + cols;
    const int DISP_SHIFT = 4, DISP_SCALE = 1 << DISP_SHIFT;
    int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
    int costType = cost.type();

    // compare thresholds in the same fixed-point units as the disparities:
    disp12MaxDiff *= DISP_SCALE;

    CV_Assert( numberOfDisparities > 0 && disp.type() == CV_16S &&
               (costType == CV_16S || costType == CV_32S) &&
               disp.size() == cost.size() );

    for( int y = 0; y < rows; y++ )
    {
        short* dptr = disp.ptr<short>(y);

        // reset the per-row right-view accumulators:
        for( x = 0; x < cols; x++ )
        {
            disp2buf[x] = INVALID_DISP_SCALED;
            disp2cost[x] = INT_MAX;
        }

        // project left-view disparities into the right view, keeping the
        // lowest-cost candidate per right-view column (two branches differ
        // only in the cost element type):
        if( costType == CV_16S )
        {
            const short* cptr = cost.ptr<short>(y);

            for( x = minX1; x < maxX1; x++ )
            {
                int d = dptr[x], c = cptr[x];

                if( d == INVALID_DISP_SCALED )
                    continue;

                int x2 = x - ((d + DISP_SCALE/2) >> DISP_SHIFT);

                if( disp2cost[x2] > c )
                {
                    disp2cost[x2] = c;
                    disp2buf[x2] = d;
                }
            }
        }
        else
        {
            const int* cptr = cost.ptr<int>(y);

            for( x = minX1; x < maxX1; x++ )
            {
                int d = dptr[x], c = cptr[x];

                if( d == INVALID_DISP_SCALED )
                    continue;

                int x2 = x - ((d + DISP_SCALE/2) >> DISP_SHIFT);

                if( disp2cost[x2] > c )
                {
                    disp2cost[x2] = c;
                    disp2buf[x2] = d;
                }
            }
        }

        for( x = minX1; x < maxX1; x++ )
        {
            // we round the computed disparity both towards -inf and +inf and check
            // if either of the corresponding disparities in disp2 is consistent.
            // This is to give the computed disparity a chance to look valid if it is.
            int d = dptr[x];
            if( d == INVALID_DISP_SCALED )
                continue;
            int d0 = d >> DISP_SHIFT;
            int d1 = (d + DISP_SCALE-1) >> DISP_SHIFT;
            int x0 = x - d0, x1 = x - d1;
            if( (0 <= x0 && x0 < cols && disp2buf[x0] > INVALID_DISP_SCALED && std::abs(disp2buf[x0] - d) > disp12MaxDiff) &&
                (0 <= x1 && x1 < cols && disp2buf[x1] > INVALID_DISP_SCALED && std::abs(disp2buf[x1] - d) > disp12MaxDiff) )
                dptr[x] = (short)INVALID_DISP_SCALED;
        }
    }
}
2536
2537