Path: blob/master/modules/calib3d/src/stereosgbm.cpp
16339 views
/*M///////////////////////////////////////////////////////////////////////////////////////1//2// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.3//4// By downloading, copying, installing or using the software you agree to this license.5// If you do not agree to this license, do not download, install,6// copy or use the software.7//8//9// License Agreement10// For Open Source Computer Vision Library11//12// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.13// Copyright (C) 2009, Willow Garage Inc., all rights reserved.14// Copyright (C) 2013, OpenCV Foundation, all rights reserved.15// Third party copyrights are property of their respective owners.16//17// Redistribution and use in source and binary forms, with or without modification,18// are permitted provided that the following conditions are met:19//20// * Redistribution's of source code must retain the above copyright notice,21// this list of conditions and the following disclaimer.22//23// * Redistribution's in binary form must reproduce the above copyright notice,24// this list of conditions and the following disclaimer in the documentation25// and/or other materials provided with the distribution.26//27// * The name of the copyright holders may not be used to endorse or promote products28// derived from this software without specific prior written permission.29//30// This software is provided by the copyright holders and contributors "as is" and31// any express or implied warranties, including, but not limited to, the implied32// warranties of merchantability and fitness for a particular purpose are disclaimed.33// In no event shall the Intel Corporation or contributors be liable for any direct,34// indirect, incidental, special, exemplary, or consequential damages35// (including, but not limited to, procurement of substitute goods or services;36// loss of use, data, or profits; or business interruption) however caused37// and on any theory of liability, whether in contract, strict liability,38// or tort (including negligence or otherwise) arising in any way out of39// the use of this software, even if advised of the possibility of such damage.40//41//M*/4243/*44This is a variation of45"Stereo Processing by Semiglobal Matching and Mutual Information"46by Heiko Hirschmuller.4748We match blocks rather than individual pixels, thus the algorithm is called49SGBM (Semi-global block matching)50*/5152#include "precomp.hpp"53#include <limits.h>54#include "opencv2/core/hal/intrin.hpp"5556namespace cv57{5859typedef uchar PixType;60typedef short CostType;61typedef short DispType;6263enum { NR = 16, NR2 = NR/2 };646566struct StereoSGBMParams67{68StereoSGBMParams()69{70minDisparity = numDisparities = 0;71SADWindowSize = 0;72P1 = P2 = 0;73disp12MaxDiff = 0;74preFilterCap = 0;75uniquenessRatio = 0;76speckleWindowSize = 0;77speckleRange = 0;78mode = StereoSGBM::MODE_SGBM;79}8081StereoSGBMParams( int _minDisparity, int _numDisparities, int _SADWindowSize,82int _P1, int _P2, int _disp12MaxDiff, int _preFilterCap,83int _uniquenessRatio, int _speckleWindowSize, int _speckleRange,84int _mode )85{86minDisparity = _minDisparity;87numDisparities = _numDisparities;88SADWindowSize = _SADWindowSize;89P1 = _P1;90P2 = _P2;91disp12MaxDiff = _disp12MaxDiff;92preFilterCap = _preFilterCap;93uniquenessRatio = _uniquenessRatio;94speckleWindowSize = _speckleWindowSize;95speckleRange = _speckleRange;96mode = _mode;97}9899int minDisparity;100int numDisparities;101int SADWindowSize;102int preFilterCap;103int uniquenessRatio;104int P1;105int P2;106int speckleWindowSize;107int speckleRange;108int disp12MaxDiff;109int mode;110};111112static const int DEFAULT_RIGHT_BORDER = -1;113/*114For each pixel row1[x], max(maxD, 0) <= minX <= x < maxX <= width - max(0, -minD),115and for each disparity minD<=d<maxD the function116computes the cost (cost[(x-minX)*(maxD - minD) + (d - minD)]), depending on the difference between117row1[x] and row2[x-d]. The subpixel algorithm from118"Depth Discontinuities by Pixel-to-Pixel Stereo" by Stan Birchfield and C. Tomasi119is used, hence the suffix BT.120121the temporary buffer should contain width2*2 elements122*/123static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,124int minD, int maxD, CostType* cost,125PixType* buffer, const PixType* tab,126int tabOfs, int , int xrange_min = 0, int xrange_max = DEFAULT_RIGHT_BORDER )127{128int x, c, width = img1.cols, cn = img1.channels();129int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);130int D = maxD - minD, width1 = maxX1 - minX1;131//This minX1 & maxX2 correction is defining which part of calculatable line must be calculated132//That is needs of parallel algorithm133xrange_min = (xrange_min < 0) ? 0: xrange_min;134xrange_max = (xrange_max == DEFAULT_RIGHT_BORDER) || (xrange_max > width1) ? width1 : xrange_max;135maxX1 = minX1 + xrange_max;136minX1 += xrange_min;137width1 = maxX1 - minX1;138int minX2 = std::max(minX1 - maxD, 0), maxX2 = std::min(maxX1 - minD, width);139int width2 = maxX2 - minX2;140const PixType *row1 = img1.ptr<PixType>(y), *row2 = img2.ptr<PixType>(y);141PixType *prow1 = buffer + width2*2, *prow2 = prow1 + width*cn*2;142#if CV_SIMD128143bool useSIMD = hasSIMD128();144#endif145146tab += tabOfs;147148for( c = 0; c < cn*2; c++ )149{150prow1[width*c] = prow1[width*c + width-1] =151prow2[width*c] = prow2[width*c + width-1] = tab[0];152}153154int n1 = y > 0 ? -(int)img1.step : 0, s1 = y < img1.rows-1 ? (int)img1.step : 0;155int n2 = y > 0 ? -(int)img2.step : 0, s2 = y < img2.rows-1 ? (int)img2.step : 0;156157int minX_cmn = std::min(minX1,minX2)-1;158int maxX_cmn = std::max(maxX1,maxX2)+1;159minX_cmn = std::max(minX_cmn, 1);160maxX_cmn = std::min(maxX_cmn, width - 1);161if( cn == 1 )162{163for( x = minX_cmn; x < maxX_cmn; x++ )164{165prow1[x] = tab[(row1[x+1] - row1[x-1])*2 + row1[x+n1+1] - row1[x+n1-1] + row1[x+s1+1] - row1[x+s1-1]];166prow2[width-1-x] = tab[(row2[x+1] - row2[x-1])*2 + row2[x+n2+1] - row2[x+n2-1] + row2[x+s2+1] - row2[x+s2-1]];167168prow1[x+width] = row1[x];169prow2[width-1-x+width] = row2[x];170}171}172else173{174for( x = minX_cmn; x < maxX_cmn; x++ )175{176prow1[x] = tab[(row1[x*3+3] - row1[x*3-3])*2 + row1[x*3+n1+3] - row1[x*3+n1-3] + row1[x*3+s1+3] - row1[x*3+s1-3]];177prow1[x+width] = tab[(row1[x*3+4] - row1[x*3-2])*2 + row1[x*3+n1+4] - row1[x*3+n1-2] + row1[x*3+s1+4] - row1[x*3+s1-2]];178prow1[x+width*2] = tab[(row1[x*3+5] - row1[x*3-1])*2 + row1[x*3+n1+5] - row1[x*3+n1-1] + row1[x*3+s1+5] - row1[x*3+s1-1]];179180prow2[width-1-x] = tab[(row2[x*3+3] - row2[x*3-3])*2 + row2[x*3+n2+3] - row2[x*3+n2-3] + row2[x*3+s2+3] - row2[x*3+s2-3]];181prow2[width-1-x+width] = tab[(row2[x*3+4] - row2[x*3-2])*2 + row2[x*3+n2+4] - row2[x*3+n2-2] + row2[x*3+s2+4] - row2[x*3+s2-2]];182prow2[width-1-x+width*2] = tab[(row2[x*3+5] - row2[x*3-1])*2 + row2[x*3+n2+5] - row2[x*3+n2-1] + row2[x*3+s2+5] - row2[x*3+s2-1]];183184prow1[x+width*3] = row1[x*3];185prow1[x+width*4] = row1[x*3+1];186prow1[x+width*5] = row1[x*3+2];187188prow2[width-1-x+width*3] = row2[x*3];189prow2[width-1-x+width*4] = row2[x*3+1];190prow2[width-1-x+width*5] = row2[x*3+2];191}192}193194memset( cost + xrange_min*D, 0, width1*D*sizeof(cost[0]) );195196buffer -= width-1-maxX2;197cost -= (minX1-xrange_min)*D + minD; // simplify the cost indices inside the loop198199for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width )200{201int diff_scale = c < cn ? 0 : 2;202203// precompute204// v0 = min(row2[x-1/2], row2[x], row2[x+1/2]) and205// v1 = max(row2[x-1/2], row2[x], row2[x+1/2]) and206for( x = width-1-maxX2; x < width-1- minX2; x++ )207{208int v = prow2[x];209int vl = x > 0 ? (v + prow2[x-1])/2 : v;210int vr = x < width-1 ? (v + prow2[x+1])/2 : v;211int v0 = std::min(vl, vr); v0 = std::min(v0, v);212int v1 = std::max(vl, vr); v1 = std::max(v1, v);213buffer[x] = (PixType)v0;214buffer[x + width2] = (PixType)v1;215}216217for( x = minX1; x < maxX1; x++ )218{219int u = prow1[x];220int ul = x > 0 ? (u + prow1[x-1])/2 : u;221int ur = x < width-1 ? (u + prow1[x+1])/2 : u;222int u0 = std::min(ul, ur); u0 = std::min(u0, u);223int u1 = std::max(ul, ur); u1 = std::max(u1, u);224225#if CV_SIMD128226if( useSIMD )227{228v_uint8x16 _u = v_setall_u8((uchar)u), _u0 = v_setall_u8((uchar)u0);229v_uint8x16 _u1 = v_setall_u8((uchar)u1);230231for( int d = minD; d < maxD; d += 16 )232{233v_uint8x16 _v = v_load(prow2 + width-x-1 + d);234v_uint8x16 _v0 = v_load(buffer + width-x-1 + d);235v_uint8x16 _v1 = v_load(buffer + width-x-1 + d + width2);236v_uint8x16 c0 = v_max(_u - _v1, _v0 - _u);237v_uint8x16 c1 = v_max(_v - _u1, _u0 - _v);238v_uint8x16 diff = v_min(c0, c1);239240v_int16x8 _c0 = v_load_aligned(cost + x*D + d);241v_int16x8 _c1 = v_load_aligned(cost + x*D + d + 8);242243v_uint16x8 diff1,diff2;244v_expand(diff,diff1,diff2);245v_store_aligned(cost + x*D + d, _c0 + v_reinterpret_as_s16(diff1 >> diff_scale));246v_store_aligned(cost + x*D + d + 8, _c1 + v_reinterpret_as_s16(diff2 >> diff_scale));247}248}249else250#endif251{252for( int d = minD; d < maxD; d++ )253{254int v = prow2[width-x-1 + d];255int v0 = buffer[width-x-1 + d];256int v1 = buffer[width-x-1 + d + width2];257int c0 = std::max(0, u - v1); c0 = std::max(c0, v0 - u);258int c1 = std::max(0, v - u1); c1 = std::max(c1, u0 - v);259260cost[x*D + d] = (CostType)(cost[x*D+d] + (std::min(c0, c1) >> diff_scale));261}262}263}264}265}266267268/*269computes disparity for "roi" in img1 w.r.t. img2 and write it to disp1buf.270that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y).271minD <= d < maxD.272disp2full is the reverse disparity map, that is:273disp2full(x+roi.x,y+roi.y)=d means that img2(x+roi.x, y+roi.y) ~ img1(x+roi.x+d, y+roi.y)274275note that disp1buf will have the same size as the roi and276disp2full will have the same size as img1 (or img2).277On exit disp2buf is not the final disparity, it is an intermediate result that becomes278final after all the tiles are processed.279280the disparity in disp1buf is written with sub-pixel accuracy281(4 fractional bits, see StereoSGBM::DISP_SCALE),282using quadratic interpolation, while the disparity in disp2buf283is written as is, without interpolation.284285disp2cost also has the same size as img1 (or img2).286It contains the minimum current cost, used to find the best disparity, corresponding to the minimal cost.287*/288static void computeDisparitySGBM( const Mat& img1, const Mat& img2,289Mat& disp1, const StereoSGBMParams& params,290Mat& buffer )291{292#if CV_SIMD128293// maxDisparity is supposed to multiple of 16, so we can forget doing else294static const uchar LSBTab[] =295{2960, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,2975, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,2986, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,2995, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,3007, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,3015, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,3026, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,3035, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0304};305static const v_uint16x8 v_LSB = v_uint16x8(0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);306307bool useSIMD = hasSIMD128();308#endif309310const int ALIGN = 16;311const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;312const int DISP_SCALE = (1 << DISP_SHIFT);313const CostType MAX_COST = SHRT_MAX;314315int minD = params.minDisparity, maxD = minD + params.numDisparities;316Size SADWindowSize;317SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;318int ftzero = std::max(params.preFilterCap, 15) | 1;319int uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;320int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;321int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);322int k, width = disp1.cols, height = disp1.rows;323int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);324int D = maxD - minD, width1 = maxX1 - minX1;325int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;326int SW2 = SADWindowSize.width/2, SH2 = SADWindowSize.height/2;327bool fullDP = params.mode == StereoSGBM::MODE_HH;328int npasses = fullDP ? 2 : 1;329const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;330PixType clipTab[TAB_SIZE];331332for( k = 0; k < TAB_SIZE; k++ )333clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);334335if( minX1 >= maxX1 )336{337disp1 = Scalar::all(INVALID_DISP_SCALED);338return;339}340341CV_Assert( D % 16 == 0 );342343// NR - the number of directions. the loop on x below that computes Lr assumes that NR == 8.344// if you change NR, please, modify the loop as well.345int D2 = D+16, NRD2 = NR2*D2;346347// the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:348// for 8-way dynamic programming we need the current row and349// the previous row, i.e. 2 rows in total350const int NLR = 2;351const int LrBorder = NLR - 1;352353// for each possible stereo match (img1(x,y) <=> img2(x-d,y))354// we keep pixel difference cost (C) and the summary cost over NR directions (S).355// we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)356size_t costBufSize = width1*D;357size_t CSBufSize = costBufSize*(fullDP ? height : 1);358size_t minLrSize = (width1 + LrBorder*2)*NR2, LrSize = minLrSize*D2;359int hsumBufNRows = SH2*2 + 2;360size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[]361costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff362CSBufSize*2*sizeof(CostType) + // C, S363width*16*img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost364width*(sizeof(CostType) + sizeof(DispType)) + 1024; // disp2cost + disp2365366if( buffer.empty() || !buffer.isContinuous() ||367buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )368buffer.reserveBuffer(totalBufSize);369370// summary cost over different (nDirs) directions371CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), ALIGN);372CostType* Sbuf = Cbuf + CSBufSize;373CostType* hsumBuf = Sbuf + CSBufSize;374CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows;375376CostType* disp2cost = pixDiff + costBufSize + (LrSize + minLrSize)*NLR;377DispType* disp2ptr = (DispType*)(disp2cost + width);378PixType* tempBuf = (PixType*)(disp2ptr + width);379380// add P2 to every C(x,y). it saves a few operations in the inner loops381for(k = 0; k < (int)CSBufSize; k++ )382Cbuf[k] = (CostType)P2;383384for( int pass = 1; pass <= npasses; pass++ )385{386int x1, y1, x2, y2, dx, dy;387388if( pass == 1 )389{390y1 = 0; y2 = height; dy = 1;391x1 = 0; x2 = width1; dx = 1;392}393else394{395y1 = height-1; y2 = -1; dy = -1;396x1 = width1-1; x2 = -1; dx = -1;397}398399CostType *Lr[NLR]={0}, *minLr[NLR]={0};400401for( k = 0; k < NLR; k++ )402{403// shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,404// and will occasionally use negative indices with the arrays405// we need to shift Lr[k] pointers by 1, to give the space for d=-1.406// however, then the alignment will be imperfect, i.e. bad for SSE,407// thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment)408Lr[k] = pixDiff + costBufSize + LrSize*k + NRD2*LrBorder + 8;409memset( Lr[k] - LrBorder*NRD2 - 8, 0, LrSize*sizeof(CostType) );410minLr[k] = pixDiff + costBufSize + LrSize*NLR + minLrSize*k + NR2*LrBorder;411memset( minLr[k] - LrBorder*NR2, 0, minLrSize*sizeof(CostType) );412}413414for( int y = y1; y != y2; y += dy )415{416int x, d;417DispType* disp1ptr = disp1.ptr<DispType>(y);418CostType* C = Cbuf + (!fullDP ? 0 : y*costBufSize);419CostType* S = Sbuf + (!fullDP ? 0 : y*costBufSize);420421if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.422{423int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;424425for( k = dy1; k <= dy2; k++ )426{427CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;428429if( k < height )430{431calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero );432433memset(hsumAdd, 0, D*sizeof(CostType));434for( x = 0; x <= SW2*D; x += D )435{436int scale = x == 0 ? SW2 + 1 : 1;437for( d = 0; d < D; d++ )438hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale);439}440441if( y > 0 )442{443const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;444const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize;445446for( x = D; x < width1*D; x += D )447{448const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);449const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);450451#if CV_SIMD128452if( useSIMD )453{454for( d = 0; d < D; d += 8 )455{456v_int16x8 hv = v_load(hsumAdd + x - D + d);457v_int16x8 Cx = v_load(Cprev + x + d);458v_int16x8 psub = v_load(pixSub + d);459v_int16x8 padd = v_load(pixAdd + d);460hv = (hv - psub + padd);461psub = v_load(hsumSub + x + d);462Cx = Cx - psub + hv;463v_store(hsumAdd + x + d, hv);464v_store(C + x + d, Cx);465}466}467else468#endif469{470for( d = 0; d < D; d++ )471{472int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);473C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]);474}475}476}477}478else479{480for( x = D; x < width1*D; x += D )481{482const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);483const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);484485for( d = 0; d < D; d++ )486hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);487}488}489}490491if( y == 0 )492{493int scale = k == 0 ? SH2 + 1 : 1;494for( x = 0; x < width1*D; x++ )495C[x] = (CostType)(C[x] + hsumAdd[x]*scale);496}497}498499// also, clear the S buffer500for( k = 0; k < width1*D; k++ )501S[k] = 0;502}503504// clear the left and the right borders505memset( Lr[0] - NRD2*LrBorder - 8, 0, NRD2*LrBorder*sizeof(CostType) );506memset( Lr[0] + width1*NRD2 - 8, 0, NRD2*LrBorder*sizeof(CostType) );507memset( minLr[0] - NR2*LrBorder, 0, NR2*LrBorder*sizeof(CostType) );508memset( minLr[0] + width1*NR2, 0, NR2*LrBorder*sizeof(CostType) );509510/*511[formula 13 in the paper]512compute L_r(p, d) = C(p, d) +513min(L_r(p-r, d),514L_r(p-r, d-1) + P1,515L_r(p-r, d+1) + P1,516min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)517where p = (x,y), r is one of the directions.518we process all the directions at once:5190: r=(-dx, 0)5201: r=(-1, -dy)5212: r=(0, -dy)5223: r=(1, -dy)5234: r=(-2, -dy)5245: r=(-1, -dy*2)5256: r=(1, -dy*2)5267: r=(2, -dy)527*/528529for( x = x1; x != x2; x += dx )530{531int xm = x*NR2, xd = xm*D2;532533int delta0 = minLr[0][xm - dx*NR2] + P2, delta1 = minLr[1][xm - NR2 + 1] + P2;534int delta2 = minLr[1][xm + 2] + P2, delta3 = minLr[1][xm + NR2 + 3] + P2;535536CostType* Lr_p0 = Lr[0] + xd - dx*NRD2;537CostType* Lr_p1 = Lr[1] + xd - NRD2 + D2;538CostType* Lr_p2 = Lr[1] + xd + D2*2;539CostType* Lr_p3 = Lr[1] + xd + NRD2 + D2*3;540541Lr_p0[-1] = Lr_p0[D] = Lr_p1[-1] = Lr_p1[D] =542Lr_p2[-1] = Lr_p2[D] = Lr_p3[-1] = Lr_p3[D] = MAX_COST;543544CostType* Lr_p = Lr[0] + xd;545const CostType* Cp = C + x*D;546CostType* Sp = S + x*D;547548#if CV_SIMD128549if( useSIMD )550{551v_int16x8 _P1 = v_setall_s16((short)P1);552553v_int16x8 _delta0 = v_setall_s16((short)delta0);554v_int16x8 _delta1 = v_setall_s16((short)delta1);555v_int16x8 _delta2 = v_setall_s16((short)delta2);556v_int16x8 _delta3 = v_setall_s16((short)delta3);557v_int16x8 _minL0 = v_setall_s16((short)MAX_COST);558559for( d = 0; d < D; d += 8 )560{561v_int16x8 Cpd = v_load(Cp + d);562v_int16x8 L0, L1, L2, L3;563564L0 = v_load(Lr_p0 + d);565L1 = v_load(Lr_p1 + d);566L2 = v_load(Lr_p2 + d);567L3 = v_load(Lr_p3 + d);568569L0 = v_min(L0, (v_load(Lr_p0 + d - 1) + _P1));570L0 = v_min(L0, (v_load(Lr_p0 + d + 1) + _P1));571572L1 = v_min(L1, (v_load(Lr_p1 + d - 1) + _P1));573L1 = v_min(L1, (v_load(Lr_p1 + d + 1) + _P1));574575L2 = v_min(L2, (v_load(Lr_p2 + d - 1) + _P1));576L2 = v_min(L2, (v_load(Lr_p2 + d + 1) + _P1));577578L3 = v_min(L3, (v_load(Lr_p3 + d - 1) + _P1));579L3 = v_min(L3, (v_load(Lr_p3 + d + 1) + _P1));580581L0 = v_min(L0, _delta0);582L0 = ((L0 - _delta0) + Cpd);583584L1 = v_min(L1, _delta1);585L1 = ((L1 - _delta1) + Cpd);586587L2 = v_min(L2, _delta2);588L2 = ((L2 - _delta2) + Cpd);589590L3 = v_min(L3, _delta3);591L3 = ((L3 - _delta3) + Cpd);592593v_store(Lr_p + d, L0);594v_store(Lr_p + d + D2, L1);595v_store(Lr_p + d + D2*2, L2);596v_store(Lr_p + d + D2*3, L3);597598// Get minimum from in L0-L3599v_int16x8 t02L, t02H, t13L, t13H, t0123L, t0123H;600v_zip(L0, L2, t02L, t02H); // L0[0] L2[0] L0[1] L2[1]...601v_zip(L1, L3, t13L, t13H); // L1[0] L3[0] L1[1] L3[1]...602v_int16x8 t02 = v_min(t02L, t02H); // L0[i] L2[i] L0[i] L2[i]...603v_int16x8 t13 = v_min(t13L, t13H); // L1[i] L3[i] L1[i] L3[i]...604v_zip(t02, t13, t0123L, t0123H); // L0[i] L1[i] L2[i] L3[i]...605v_int16x8 t0 = v_min(t0123L, t0123H);606_minL0 = v_min(_minL0, t0);607608v_int16x8 Sval = v_load(Sp + d);609610L0 = L0 + L1;611L2 = L2 + L3;612Sval = Sval + L0;613Sval = Sval + L2;614615v_store(Sp + d, Sval);616}617618v_int32x4 minL, minH;619v_expand(_minL0, minL, minH);620v_pack_store(&minLr[0][xm], v_min(minL, minH));621}622else623#endif624{625int minL0 = MAX_COST, minL1 = MAX_COST, minL2 = MAX_COST, minL3 = MAX_COST;626627for( d = 0; d < D; d++ )628{629int Cpd = Cp[d], L0, L1, L2, L3;630631L0 = Cpd + std::min((int)Lr_p0[d], std::min(Lr_p0[d-1] + P1, std::min(Lr_p0[d+1] + P1, delta0))) - delta0;632L1 = Cpd + std::min((int)Lr_p1[d], std::min(Lr_p1[d-1] + P1, std::min(Lr_p1[d+1] + P1, delta1))) - delta1;633L2 = Cpd + std::min((int)Lr_p2[d], std::min(Lr_p2[d-1] + P1, std::min(Lr_p2[d+1] + P1, delta2))) - delta2;634L3 = Cpd + std::min((int)Lr_p3[d], std::min(Lr_p3[d-1] + P1, std::min(Lr_p3[d+1] + P1, delta3))) - delta3;635636Lr_p[d] = (CostType)L0;637minL0 = std::min(minL0, L0);638639Lr_p[d + D2] = (CostType)L1;640minL1 = std::min(minL1, L1);641642Lr_p[d + D2*2] = (CostType)L2;643minL2 = std::min(minL2, L2);644645Lr_p[d + D2*3] = (CostType)L3;646minL3 = std::min(minL3, L3);647648Sp[d] = saturate_cast<CostType>(Sp[d] + L0 + L1 + L2 + L3);649}650minLr[0][xm] = (CostType)minL0;651minLr[0][xm+1] = (CostType)minL1;652minLr[0][xm+2] = (CostType)minL2;653minLr[0][xm+3] = (CostType)minL3;654}655}656657if( pass == npasses )658{659for( x = 0; x < width; x++ )660{661disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED;662disp2cost[x] = MAX_COST;663}664665for( x = width1 - 1; x >= 0; x-- )666{667CostType* Sp = S + x*D;668int minS = MAX_COST, bestDisp = -1;669670if( npasses == 1 )671{672int xm = x*NR2, xd = xm*D2;673674int minL0 = MAX_COST;675int delta0 = minLr[0][xm + NR2] + P2;676CostType* Lr_p0 = Lr[0] + xd + NRD2;677Lr_p0[-1] = Lr_p0[D] = MAX_COST;678CostType* Lr_p = Lr[0] + xd;679680const CostType* Cp = C + x*D;681682#if CV_SIMD128683if( useSIMD )684{685v_int16x8 _P1 = v_setall_s16((short)P1);686v_int16x8 _delta0 = v_setall_s16((short)delta0);687688v_int16x8 _minL0 = v_setall_s16((short)minL0);689v_int16x8 _minS = v_setall_s16(MAX_COST), _bestDisp = v_setall_s16(-1);690v_int16x8 _d8 = v_int16x8(0, 1, 2, 3, 4, 5, 6, 7), _8 = v_setall_s16(8);691692for( d = 0; d < D; d += 8 )693{694v_int16x8 Cpd = v_load(Cp + d);695v_int16x8 L0 = v_load(Lr_p0 + d);696697L0 = v_min(L0, v_load(Lr_p0 + d - 1) + _P1);698L0 = v_min(L0, v_load(Lr_p0 + d + 1) + _P1);699L0 = v_min(L0, _delta0);700L0 = L0 - _delta0 + Cpd;701702v_store(Lr_p + d, L0);703_minL0 = v_min(_minL0, L0);704L0 = L0 + v_load(Sp + d);705v_store(Sp + d, L0);706707v_int16x8 mask = _minS > L0;708_minS = v_min(_minS, L0);709_bestDisp = _bestDisp ^ ((_bestDisp ^ _d8) & mask);710_d8 += _8;711}712short bestDispBuf[8];713v_store(bestDispBuf, _bestDisp);714715v_int32x4 min32L, min32H;716v_expand(_minL0, min32L, min32H);717minLr[0][xm] = (CostType)std::min(v_reduce_min(min32L), v_reduce_min(min32H));718719v_expand(_minS, min32L, min32H);720minS = std::min(v_reduce_min(min32L), v_reduce_min(min32H));721722v_int16x8 ss = v_setall_s16((short)minS);723v_uint16x8 minMask = v_reinterpret_as_u16(ss == _minS);724v_uint16x8 minBit = minMask & v_LSB;725726v_uint32x4 minBitL, minBitH;727v_expand(minBit, minBitL, minBitH);728729int idx = v_reduce_sum(minBitL) + v_reduce_sum(minBitH);730bestDisp = bestDispBuf[LSBTab[idx]];731}732else733#endif734{735for( d = 0; d < D; d++ )736{737int L0 = Cp[d] + std::min((int)Lr_p0[d], std::min(Lr_p0[d-1] + P1, std::min(Lr_p0[d+1] + P1, delta0))) - delta0;738739Lr_p[d] = (CostType)L0;740minL0 = std::min(minL0, L0);741742int Sval = Sp[d] = saturate_cast<CostType>(Sp[d] + L0);743if( Sval < minS )744{745minS = Sval;746bestDisp = d;747}748}749minLr[0][xm] = (CostType)minL0;750}751}752else753{754#if CV_SIMD128755if( useSIMD )756{757v_int16x8 _minS = v_setall_s16(MAX_COST), _bestDisp = v_setall_s16(-1);758v_int16x8 _d8 = v_int16x8(0, 1, 2, 3, 4, 5, 6, 7), _8 = v_setall_s16(8);759760for( d = 0; d < D; d+= 8 )761{762v_int16x8 L0 = v_load(Sp + d);763v_int16x8 mask = L0 < _minS;764_minS = v_min( L0, _minS );765_bestDisp = _bestDisp ^ ((_bestDisp ^ _d8) & mask);766_d8 = _d8 + _8;767}768v_int32x4 _d0, _d1;769v_expand(_minS, _d0, _d1);770minS = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1));771v_int16x8 v_mask = v_setall_s16((short)minS) == _minS;772773_bestDisp = (_bestDisp & v_mask) | (v_setall_s16(SHRT_MAX) & ~v_mask);774v_expand(_bestDisp, _d0, _d1);775bestDisp = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1));776}777else778#endif779{780for( d = 0; d < D; d++ )781{782int Sval = Sp[d];783if( Sval < minS )784{785minS = Sval;786bestDisp = d;787}788}789}790}791792for( d = 0; d < D; d++ )793{794if( Sp[d]*(100 - uniquenessRatio) < minS*100 && std::abs(bestDisp - d) > 1 )795break;796}797if( d < D )798continue;799d = bestDisp;800int _x2 = x + minX1 - d - minD;801if( disp2cost[_x2] > minS )802{803disp2cost[_x2] = (CostType)minS;804disp2ptr[_x2] = (DispType)(d + minD);805}806807if( 0 < d && d < D-1 )808{809// do subpixel quadratic interpolation:810// fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])811// then find minimum of the parabola.812int denom2 = std::max(Sp[d-1] + Sp[d+1] - 2*Sp[d], 1);813d = d*DISP_SCALE + ((Sp[d-1] - Sp[d+1])*DISP_SCALE + denom2)/(denom2*2);814}815else816d *= DISP_SCALE;817disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE);818}819820for( x = minX1; x < maxX1; x++ )821{822// we round the computed disparity both towards -inf and +inf and check823// if either of the corresponding disparities in disp2 is consistent.824// This is to give the computed disparity a chance to look valid if it is.825int d1 = disp1ptr[x];826if( d1 == INVALID_DISP_SCALED )827continue;828int _d = d1 >> DISP_SHIFT;829int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT;830int _x = x - _d, x_ = x - d_;831if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&8320 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )833disp1ptr[x] = (DispType)INVALID_DISP_SCALED;834}835}836837// now shift the cyclic buffers838std::swap( Lr[0], Lr[1] );839std::swap( minLr[0], minLr[1] );840}841}842}843844////////////////////////////////////////////////////////////////////////////////////////////845struct CalcVerticalSums: public ParallelLoopBody846{847CalcVerticalSums(const Mat& _img1, const Mat& _img2, const StereoSGBMParams& params,848CostType* alignedBuf, PixType* _clipTab): img1(_img1), img2(_img2), clipTab(_clipTab)849{850minD = params.minDisparity;851maxD = minD + params.numDisparities;852SW2 = SH2 = (params.SADWindowSize > 0 ? params.SADWindowSize : 5)/2;853ftzero = std::max(params.preFilterCap, 15) | 1;854P1 = params.P1 > 0 ? params.P1 : 2;855P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);856height = img1.rows;857width = img1.cols;858int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);859D = maxD - minD;860width1 = maxX1 - minX1;861D2 = D + 16;862costBufSize = width1*D;863CSBufSize = costBufSize*height;864minLrSize = width1;865LrSize = minLrSize*D2;866hsumBufNRows = SH2*2 + 2;867Cbuf = alignedBuf;868Sbuf = Cbuf + CSBufSize;869hsumBuf = Sbuf + CSBufSize;870useSIMD = hasSIMD128();871}872873void operator()(const Range& range) const CV_OVERRIDE874{875static const CostType MAX_COST = SHRT_MAX;876static const int ALIGN = 16;877static const int TAB_OFS = 256*4;878static const int npasses = 2;879int x1 = range.start, x2 = range.end, k;880size_t pixDiffSize = ((x2 - x1) + 2*SW2)*D;881size_t auxBufsSize = pixDiffSize*sizeof(CostType) + //pixdiff size882width*16*img1.channels()*sizeof(PixType) + 32; //tempBuf883Mat auxBuff;884auxBuff.create(1, (int)auxBufsSize, CV_8U);885CostType* pixDiff = (CostType*)alignPtr(auxBuff.ptr(), ALIGN);886PixType* tempBuf = (PixType*)(pixDiff + pixDiffSize);887888// Simplification of index calculation889pixDiff -= (x1>SW2 ? (x1 - SW2): 0)*D;890891for( int pass = 1; pass <= npasses; pass++ )892{893int y1, y2, dy;894895if( pass == 1 )896{897y1 = 0; y2 = height; dy = 1;898}899else900{901y1 = height-1; y2 = -1; dy = -1;902}903904CostType *Lr[NLR]={0}, *minLr[NLR]={0};905906for( k = 0; k < NLR; k++ )907{908// shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,909// and will occasionally use negative indices with the arrays910// we need to shift Lr[k] pointers by 1, to give the space for d=-1.911// however, then the alignment will be imperfect, i.e. bad for SSE,912// thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment)913Lr[k] = hsumBuf + costBufSize*hsumBufNRows + LrSize*k + 8;914memset( Lr[k] + x1*D2 - 8, 0, (x2-x1)*D2*sizeof(CostType) );915minLr[k] = hsumBuf + costBufSize*hsumBufNRows + LrSize*NLR + minLrSize*k;916memset( minLr[k] + x1, 0, (x2-x1)*sizeof(CostType) );917}918919for( int y = y1; y != y2; y += dy )920{921int x, d;922CostType* C = Cbuf + y*costBufSize;923CostType* S = Sbuf + y*costBufSize;924925if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.926{927int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;928929for( k = dy1; k <= dy2; k++ )930{931CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;932933if( k < height )934{935calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero, x1 - SW2, x2 + SW2);936937memset(hsumAdd + x1*D, 0, D*sizeof(CostType));938for( x = (x1 - SW2)*D; x <= (x1 + SW2)*D; x += D )939{940int xbord = x <= 0 ? 0 : (x > (width1 - 1)*D? (width1 - 1)*D : x);941for( d = 0; d < D; d++ )942hsumAdd[x1*D + d] = (CostType)(hsumAdd[x1*D + d] + pixDiff[xbord + d]);943}944945if( y > 0 )946{947const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;948const CostType* Cprev = C - costBufSize;949950for( d = 0; d < D; d++ )951C[x1*D + d] = (CostType)(Cprev[x1*D + d] + hsumAdd[x1*D + d] - hsumSub[x1*D + d]);952953for( x = (x1+1)*D; x < x2*D; x += D )954{955const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);956const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);957958#if CV_SIMD128959if( useSIMD )960{961for( d = 0; d < D; d += 8 )962{963v_int16x8 hv = v_load(hsumAdd + x - D + d);964v_int16x8 Cx = v_load(Cprev + x + d);965v_int16x8 psub = v_load(pixSub + d);966v_int16x8 padd = v_load(pixAdd + d);967hv = (hv - psub + padd);968psub = v_load(hsumSub + x + d);969Cx = Cx - psub + hv;970v_store(hsumAdd + x + d, hv);971v_store(C + x + d, Cx);972}973}974else975#endif976{977for( d = 0; d < D; d++ )978{979int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);980C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]);981}982}983}984}985else986{987for( x = (x1+1)*D; x < x2*D; x += D )988{989const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);990const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);991992for( d = 0; d < D; d++ )993hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);994}995}996}997998if( y == 0 )999{1000int scale = k == 0 ? SH2 + 1 : 1;1001for( x = x1*D; x < x2*D; x++ )1002C[x] = (CostType)(C[x] + hsumAdd[x]*scale);1003}1004}10051006// also, clear the S buffer1007for( k = x1*D; k < x2*D; k++ )1008S[k] = 0;1009}10101011// [formula 13 in the paper]1012// compute L_r(p, d) = C(p, d) +1013// min(L_r(p-r, d),1014// L_r(p-r, d-1) + P1,1015// L_r(p-r, d+1) + P1,1016// min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)1017// where p = (x,y), r is one of the directions.1018// we process one directions on first pass and other on second:1019// r=(0, dy), where dy=1 on first pass and dy=-1 on second10201021for( x = x1; x != x2; x++ )1022{1023int xd = x*D2;10241025int delta = minLr[1][x] + P2;10261027CostType* Lr_ppr = Lr[1] + xd;10281029Lr_ppr[-1] = Lr_ppr[D] = MAX_COST;10301031CostType* Lr_p = Lr[0] + xd;1032const CostType* Cp = C + x*D;1033CostType* Sp = S + x*D;10341035#if CV_SIMD1281036if( useSIMD )1037{1038v_int16x8 _P1 = v_setall_s16((short)P1);10391040v_int16x8 _delta = v_setall_s16((short)delta);1041v_int16x8 _minL = v_setall_s16((short)MAX_COST);10421043for( d = 0; d < D; d += 8 )1044{1045v_int16x8 Cpd = v_load(Cp + d);1046v_int16x8 L;10471048L = v_load(Lr_ppr + d);10491050L = v_min(L, (v_load(Lr_ppr + d - 1) + _P1));1051L = v_min(L, (v_load(Lr_ppr + d + 1) + _P1));10521053L = v_min(L, _delta);1054L = ((L - _delta) + Cpd);10551056v_store(Lr_p + d, L);10571058// Get minimum from in L-L31059_minL = v_min(_minL, L);10601061v_int16x8 Sval = v_load(Sp + d);10621063Sval = Sval + L;10641065v_store(Sp + d, Sval);1066}10671068v_int32x4 min1, min2, min12;1069v_expand(_minL, min1, min2);1070min12 = v_min(min1,min2);1071minLr[0][x] = (CostType)v_reduce_min(min12);1072}1073else1074#endif1075{1076int minL = MAX_COST;10771078for( d = 0; d < D; d++ )1079{1080int Cpd = Cp[d], L;10811082L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;10831084Lr_p[d] = (CostType)L;1085minL = std::min(minL, L);10861087Sp[d] = saturate_cast<CostType>(Sp[d] + L);1088}1089minLr[0][x] = (CostType)minL;1090}1091}10921093// now shift the cyclic buffers1094std::swap( Lr[0], Lr[1] );1095std::swap( minLr[0], minLr[1] );1096}1097}1098}1099static const int NLR = 2;1100const Mat& img1;1101const Mat& img2;1102CostType* Cbuf;1103CostType* Sbuf;1104CostType* hsumBuf;1105PixType* clipTab;1106int minD;1107int maxD;1108int D;1109int D2;1110int SH2;1111int SW2;1112int width;1113int width1;1114int height;1115int P1;1116int P2;1117size_t costBufSize;1118size_t CSBufSize;1119size_t minLrSize;1120size_t LrSize;1121size_t hsumBufNRows;1122int ftzero;1123bool useSIMD;1124};11251126struct CalcHorizontalSums: public ParallelLoopBody1127{1128CalcHorizontalSums(const Mat& _img1, const Mat& _img2, Mat& _disp1, const StereoSGBMParams& params,1129CostType* alignedBuf): img1(_img1), img2(_img2), disp1(_disp1)1130{1131minD = params.minDisparity;1132maxD = minD + params.numDisparities;1133P1 = params.P1 > 0 ? params.P1 : 2;1134P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);1135uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;1136disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;1137height = img1.rows;1138width = img1.cols;1139minX1 = std::max(maxD, 0);1140maxX1 = width + std::min(minD, 0);1141INVALID_DISP = minD - 1;1142INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;1143D = maxD - minD;1144width1 = maxX1 - minX1;1145costBufSize = width1*D;1146CSBufSize = costBufSize*height;1147D2 = D + 16;1148LrSize = 2 * D2;1149Cbuf = alignedBuf;1150Sbuf = Cbuf + CSBufSize;1151useSIMD = hasSIMD128();1152}11531154void operator()(const Range& range) const CV_OVERRIDE1155{1156int y1 = range.start, y2 = range.end;1157size_t auxBufsSize = LrSize * sizeof(CostType) + width*(sizeof(CostType) + sizeof(DispType)) + 32;11581159Mat auxBuff;1160auxBuff.create(1, (int)auxBufsSize, CV_8U);1161CostType *Lr = ((CostType*)alignPtr(auxBuff.ptr(), ALIGN)) + 8;1162CostType* disp2cost = Lr + LrSize;1163DispType* disp2ptr = (DispType*)(disp2cost + width);11641165CostType minLr;11661167for( int y = y1; y != y2; y++)1168{1169int x, d;1170DispType* disp1ptr = disp1.ptr<DispType>(y);1171CostType* C = Cbuf + y*costBufSize;1172CostType* S = Sbuf + y*costBufSize;11731174for( x = 0; x < width; x++ )1175{1176disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED;1177disp2cost[x] = MAX_COST;1178}11791180// clear buffers1181memset( Lr - 8, 0, LrSize*sizeof(CostType) );1182Lr[-1] = Lr[D] = Lr[D2 - 1] = Lr[D2 + D] = MAX_COST;11831184minLr = 0;1185// [formula 13 in the paper]1186// compute L_r(p, d) = C(p, d) +1187// min(L_r(p-r, d),1188// L_r(p-r, d-1) + P1,1189// L_r(p-r, d+1) + P1,1190// min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)1191// where p = (x,y), r is one of the directions.1192// we process all the directions at once:1193// we process one directions on first pass and other on second:1194// r=(dx, 0), where dx=1 on first pass and dx=-1 on second1195for( x = 0; x != width1; x++)1196{1197int delta = minLr + P2;11981199CostType* Lr_ppr = Lr + ((x&1)? 0 : D2);12001201CostType* Lr_p = Lr + ((x&1)? D2 :0);1202const CostType* Cp = C + x*D;1203CostType* Sp = S + x*D;12041205#if CV_SIMD1281206if( useSIMD )1207{1208v_int16x8 _P1 = v_setall_s16((short)P1);12091210v_int16x8 _delta = v_setall_s16((short)delta);1211v_int16x8 _minL = v_setall_s16((short)MAX_COST);12121213for( d = 0; d < D; d += 8 )1214{1215v_int16x8 Cpd = v_load(Cp + d);1216v_int16x8 L;12171218L = v_load(Lr_ppr + d);12191220L = v_min(L, (v_load(Lr_ppr + d - 1) + _P1));1221L = v_min(L, (v_load(Lr_ppr + d + 1) + _P1));12221223L = v_min(L, _delta);1224L = ((L - _delta) + Cpd);12251226v_store(Lr_p + d, L);12271228// Get minimum from in L-L31229_minL = v_min(_minL, L);12301231v_int16x8 Sval = v_load(Sp + d);12321233Sval = Sval + L;12341235v_store(Sp + d, Sval);1236}12371238v_int32x4 min1, min2, min12;1239v_expand(_minL, min1, min2);1240min12 = v_min(min1,min2);1241minLr = (CostType)v_reduce_min(min12);1242}1243else1244#endif1245{1246minLr = MAX_COST;1247for( d = 0; d < D; d++ )1248{1249int Cpd = Cp[d], L;12501251L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;12521253Lr_p[d] = (CostType)L;1254minLr = (CostType)std::min((int)minLr, L);12551256Sp[d] = saturate_cast<CostType>(Sp[d] + L);1257}1258}1259}12601261memset( Lr - 8, 0, LrSize*sizeof(CostType) );1262Lr[-1] = Lr[D] = Lr[D2 - 1] = Lr[D2 + D] = MAX_COST;12631264minLr = 0;12651266for( x = width1-1; x != -1; x--)1267{1268int delta = minLr + P2;12691270CostType* Lr_ppr = Lr + ((x&1)? 0 :D2);12711272CostType* Lr_p = Lr + ((x&1)? D2 :0);1273const CostType* Cp = C + x*D;1274CostType* Sp = S + x*D;1275int minS = MAX_COST, bestDisp = -1;1276minLr = MAX_COST;12771278#if CV_SIMD1281279if( useSIMD )1280{1281v_int16x8 _P1 = v_setall_s16((short)P1);12821283v_int16x8 _delta = v_setall_s16((short)delta);1284v_int16x8 _minL = v_setall_s16((short)MAX_COST);12851286v_int16x8 _minS = v_setall_s16(MAX_COST), _bestDisp = v_setall_s16(-1);1287v_int16x8 _d8 = v_int16x8(0, 1, 2, 3, 4, 5, 6, 7), _8 = v_setall_s16(8);12881289for( d = 0; d < D; d+= 8 )1290{1291v_int16x8 Cpd = v_load(Cp + d);1292v_int16x8 L;12931294L = v_load(Lr_ppr + d);12951296L = v_min(L, (v_load(Lr_ppr + d - 1) + _P1));1297L = v_min(L, (v_load(Lr_ppr + d + 1) + _P1));12981299L = v_min(L, _delta);1300L = ((L - _delta) + Cpd);13011302v_store(Lr_p + d, L);13031304// Get minimum from in L-L31305_minL = v_min(_minL, L);13061307v_int16x8 Sval = v_load(Sp + d);13081309Sval = Sval + L;13101311v_int16x8 mask = Sval < _minS;1312_minS = v_min( Sval, _minS );1313_bestDisp = _bestDisp ^ ((_bestDisp ^ _d8) & mask);1314_d8 = _d8 + _8;13151316v_store(Sp + d, Sval);1317}1318v_int32x4 min1, min2, min12;1319v_expand(_minL, min1, min2);1320min12 = v_min(min1,min2);1321minLr = (CostType)v_reduce_min(min12);13221323v_int32x4 _d0, _d1;1324v_expand(_minS, _d0, _d1);1325minS = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1));1326v_int16x8 v_mask = v_setall_s16((short)minS) == _minS;13271328_bestDisp = (_bestDisp & v_mask) | (v_setall_s16(SHRT_MAX) & ~v_mask);1329v_expand(_bestDisp, _d0, _d1);1330bestDisp = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1));1331}1332else1333#endif1334{1335for( d = 0; d < D; d++ )1336{1337int Cpd = Cp[d], L;13381339L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;13401341Lr_p[d] = (CostType)L;1342minLr = (CostType)std::min((int)minLr, L);13431344Sp[d] = saturate_cast<CostType>(Sp[d] + L);1345if( Sp[d] < minS )1346{1347minS = Sp[d];1348bestDisp = d;1349}1350}1351}1352//Some postprocessing procedures and saving1353for( d = 0; d < D; d++ )1354{1355if( Sp[d]*(100 - uniquenessRatio) < minS*100 && std::abs(bestDisp - d) > 1 )1356break;1357}1358if( d < D )1359continue;1360d = bestDisp;1361int _x2 = x + minX1 - d - minD;1362if( disp2cost[_x2] > minS )1363{1364disp2cost[_x2] = (CostType)minS;1365disp2ptr[_x2] = (DispType)(d + minD);1366}13671368if( 0 < d && d < D-1 )1369{1370// do subpixel quadratic interpolation:1371// fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])1372// then find minimum of the parabola.1373int denom2 = std::max(Sp[d-1] + Sp[d+1] - 2*Sp[d], 1);1374d = d*DISP_SCALE + ((Sp[d-1] - Sp[d+1])*DISP_SCALE + denom2)/(denom2*2);1375}1376else1377d *= DISP_SCALE;1378disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE);1379}1380//Left-right check sanity procedure1381for( x = minX1; x < maxX1; x++ )1382{1383// we round the computed disparity both towards -inf and +inf and check1384// if either of the corresponding disparities in disp2 is consistent.1385// This is to give the computed disparity a chance to look valid if it is.1386int d1 = disp1ptr[x];1387if( d1 == INVALID_DISP_SCALED )1388continue;1389int _d = d1 >> DISP_SHIFT;1390int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT;1391int _x = x - _d, x_ = x - d_;1392if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&13930 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )1394disp1ptr[x] = (DispType)INVALID_DISP_SCALED;1395}1396}1397}13981399static const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;1400static const int DISP_SCALE = (1 << DISP_SHIFT);1401static const CostType MAX_COST = SHRT_MAX;1402static const int ALIGN = 16;1403const Mat& img1;1404const Mat& img2;1405Mat& disp1;1406CostType* Cbuf;1407CostType* Sbuf;1408int minD;1409int maxD;1410int D;1411int D2;1412int width;1413int width1;1414int height;1415int P1;1416int P2;1417int minX1;1418int maxX1;1419size_t costBufSize;1420size_t CSBufSize;1421size_t LrSize;1422int INVALID_DISP;1423int INVALID_DISP_SCALED;1424int uniquenessRatio;1425int disp12MaxDiff;1426bool useSIMD;1427};1428/*1429computes disparity for "roi" in img1 w.r.t. img2 and write it to disp1buf.1430that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y).1431minD <= d < maxD.14321433note that disp1buf will have the same size as the roi and1434On exit disp1buf is not the final disparity, it is an intermediate result that becomes1435final after all the tiles are processed.14361437the disparity in disp1buf is written with sub-pixel accuracy1438(4 fractional bits, see StereoSGBM::DISP_SCALE),1439using quadratic interpolation, while the disparity in disp2buf1440is written as is, without interpolation.1441*/1442static void computeDisparitySGBM_HH4( const Mat& img1, const Mat& img2,1443Mat& disp1, const StereoSGBMParams& params,1444Mat& buffer )1445{1446const int ALIGN = 16;1447const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;1448const int DISP_SCALE = (1 << DISP_SHIFT);1449int minD = params.minDisparity, maxD = minD + params.numDisparities;1450Size SADWindowSize;1451SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;1452int ftzero = std::max(params.preFilterCap, 15) | 1;1453int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);1454int k, width = disp1.cols, height = disp1.rows;1455int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);1456int D = maxD - minD, width1 = maxX1 - minX1;1457int SH2 = SADWindowSize.height/2;1458int INVALID_DISP = minD - 1;1459int INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;1460const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;1461PixType clipTab[TAB_SIZE];14621463for( k = 0; k < TAB_SIZE; k++ )1464clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);14651466if( minX1 >= maxX1 )1467{1468disp1 = Scalar::all(INVALID_DISP_SCALED);1469return;1470}14711472CV_Assert( D % 16 == 0 );14731474int D2 = D+16;14751476// the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:1477// for dynamic programming we need the current row and1478// the previous row, i.e. 2 rows in total1479const int NLR = 2;14801481// for each possible stereo match (img1(x,y) <=> img2(x-d,y))1482// we keep pixel difference cost (C) and the summary cost over 4 directions (S).1483// we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)1484size_t costBufSize = width1*D;1485size_t CSBufSize = costBufSize*height;1486size_t minLrSize = width1 , LrSize = minLrSize*D2;1487int hsumBufNRows = SH2*2 + 2;1488size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[]1489costBufSize*hsumBufNRows*sizeof(CostType) + // hsumBuf1490CSBufSize*2*sizeof(CostType) + 1024; // C, S14911492if( buffer.empty() || !buffer.isContinuous() ||1493buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )1494{1495buffer.reserveBuffer(totalBufSize);1496}14971498// summary cost over different (nDirs) directions1499CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), ALIGN);15001501// add P2 to every C(x,y). it saves a few operations in the inner loops1502for(k = 0; k < (int)CSBufSize; k++ )1503Cbuf[k] = (CostType)P2;15041505parallel_for_(Range(0,width1),CalcVerticalSums(img1, img2, params, Cbuf, clipTab),8);1506parallel_for_(Range(0,height),CalcHorizontalSums(img1, img2, disp1, params, Cbuf),8);15071508}15091510//////////////////////////////////////////////////////////////////////////////////////////////////////15111512void getBufferPointers(Mat& buffer, int width, int width1, int D, int num_ch, int SH2, int P2,1513CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff,1514PixType*& tmpBuf, CostType*& horPassCostVolume,1515CostType*& vertPassCostVolume, CostType*& vertPassMin, CostType*& rightPassBuf,1516CostType*& disp2CostBuf, short*& disp2Buf);15171518struct SGBM3WayMainLoop : public ParallelLoopBody1519{1520Mat* buffers;1521const Mat *img1, *img2;1522Mat* dst_disp;15231524int nstripes, stripe_sz;1525int stripe_overlap;15261527int width,height;1528int minD, maxD, D;1529int minX1, maxX1, width1;15301531int SW2, SH2;1532int P1, P2;1533int uniquenessRatio, disp12MaxDiff;15341535int costBufSize, hsumBufNRows;1536int TAB_OFS, ftzero;15371538#if CV_SIMD1281539bool useSIMD;1540#endif15411542PixType* clipTab;15431544SGBM3WayMainLoop(Mat *_buffers, const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, PixType* _clipTab, int _nstripes, int _stripe_overlap);1545void getRawMatchingCost(CostType* C, CostType* hsumBuf, CostType* pixDiff, PixType* tmpBuf, int y, int src_start_idx) const;1546void operator () (const Range& range) const CV_OVERRIDE;1547};15481549SGBM3WayMainLoop::SGBM3WayMainLoop(Mat *_buffers, const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, PixType* _clipTab, int _nstripes, int _stripe_overlap):1550buffers(_buffers), img1(&_img1), img2(&_img2), dst_disp(_dst_disp), clipTab(_clipTab)1551{1552nstripes = _nstripes;1553stripe_overlap = _stripe_overlap;1554stripe_sz = (int)ceil(img1->rows/(double)nstripes);15551556width = img1->cols; height = img1->rows;1557minD = params.minDisparity; maxD = minD + params.numDisparities; D = maxD - minD;1558minX1 = std::max(maxD, 0); maxX1 = width + std::min(minD, 0); width1 = maxX1 - minX1;1559CV_Assert( D % 16 == 0 );15601561SW2 = SH2 = params.SADWindowSize > 0 ? params.SADWindowSize/2 : 1;15621563P1 = params.P1 > 0 ? params.P1 : 2; P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);1564uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;1565disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;15661567costBufSize = width1*D;1568hsumBufNRows = SH2*2 + 2;1569TAB_OFS = 256*4;1570ftzero = std::max(params.preFilterCap, 15) | 1;15711572#if CV_SIMD1281573useSIMD = hasSIMD128();1574#endif1575}15761577void getBufferPointers(Mat& buffer, int width, int width1, int D, int num_ch, int SH2, int P2,1578CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff,1579PixType*& tmpBuf, CostType*& horPassCostVolume,1580CostType*& vertPassCostVolume, CostType*& vertPassMin, CostType*& rightPassBuf,1581CostType*& disp2CostBuf, short*& disp2Buf)1582{1583// allocating all the required memory:1584int costVolumeLineSize = width1*D;1585int width1_ext = width1+2;1586int costVolumeLineSize_ext = width1_ext*D;1587int hsumBufNRows = SH2*2 + 2;15881589// main buffer to store matching costs for the current line:1590int curCostVolumeLineSize = costVolumeLineSize*sizeof(CostType);15911592// auxiliary buffers for the raw matching cost computation:1593int hsumBufSize = costVolumeLineSize*hsumBufNRows*sizeof(CostType);1594int pixDiffSize = costVolumeLineSize*sizeof(CostType);1595int tmpBufSize = width*16*num_ch*sizeof(PixType);15961597// auxiliary buffers for the matching cost aggregation:1598int horPassCostVolumeSize = costVolumeLineSize_ext*sizeof(CostType); // buffer for the 2-pass horizontal cost aggregation1599int vertPassCostVolumeSize = costVolumeLineSize_ext*sizeof(CostType); // buffer for the vertical cost aggregation1600int vertPassMinSize = width1_ext*sizeof(CostType); // buffer for storing minimum costs from the previous line1601int rightPassBufSize = D*sizeof(CostType); // additional small buffer for the right-to-left pass16021603// buffers for the pseudo-LRC check:1604int disp2CostBufSize = width*sizeof(CostType);1605int disp2BufSize = width*sizeof(short);16061607// sum up the sizes of all the buffers:1608size_t totalBufSize = curCostVolumeLineSize +1609hsumBufSize +1610pixDiffSize +1611tmpBufSize +1612horPassCostVolumeSize +1613vertPassCostVolumeSize +1614vertPassMinSize +1615rightPassBufSize +1616disp2CostBufSize +1617disp2BufSize +161816; //to compensate for the alignPtr shifts16191620if( buffer.empty() || !buffer.isContinuous() || buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )1621buffer.reserveBuffer(totalBufSize);16221623// set up all the pointers:1624curCostVolumeLine = (CostType*)alignPtr(buffer.ptr(), 16);1625hsumBuf = curCostVolumeLine + costVolumeLineSize;1626pixDiff = hsumBuf + costVolumeLineSize*hsumBufNRows;1627tmpBuf = (PixType*)(pixDiff + costVolumeLineSize);1628horPassCostVolume = (CostType*)(tmpBuf + width*16*num_ch);1629vertPassCostVolume = horPassCostVolume + costVolumeLineSize_ext;1630rightPassBuf = vertPassCostVolume + costVolumeLineSize_ext;1631vertPassMin = rightPassBuf + D;1632disp2CostBuf = vertPassMin + width1_ext;1633disp2Buf = disp2CostBuf + width;16341635// initialize memory:1636memset(buffer.ptr(),0,totalBufSize);1637for(int i=0;i<costVolumeLineSize;i++)1638curCostVolumeLine[i] = (CostType)P2; //such initialization simplifies the cost aggregation loops a bit1639}16401641// performing block matching and building raw cost-volume for the current row1642void SGBM3WayMainLoop::getRawMatchingCost(CostType* C, // target cost-volume row1643CostType* hsumBuf, CostType* pixDiff, PixType* tmpBuf, //buffers1644int y, int src_start_idx) const1645{1646int x, d;1647int dy1 = (y == src_start_idx) ? src_start_idx : y + SH2, dy2 = (y == src_start_idx) ? src_start_idx+SH2 : dy1;16481649for(int k = dy1; k <= dy2; k++ )1650{1651CostType* hsumAdd = hsumBuf + (std::min(k, height-1) % hsumBufNRows)*costBufSize;1652if( k < height )1653{1654calcPixelCostBT( *img1, *img2, k, minD, maxD, pixDiff, tmpBuf, clipTab, TAB_OFS, ftzero );16551656memset(hsumAdd, 0, D*sizeof(CostType));1657for(x = 0; x <= SW2*D; x += D )1658{1659int scale = x == 0 ? SW2 + 1 : 1;16601661for( d = 0; d < D; d++ )1662hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale);1663}16641665if( y > src_start_idx )1666{1667const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, src_start_idx) % hsumBufNRows)*costBufSize;16681669for( x = D; x < width1*D; x += D )1670{1671const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);1672const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);16731674#if CV_SIMD1281675if(useSIMD)1676{1677v_int16x8 hv_reg;1678for( d = 0; d < D; d+=8 )1679{1680hv_reg = v_load_aligned(hsumAdd+x-D+d) + (v_load_aligned(pixAdd+d) - v_load_aligned(pixSub+d));1681v_store_aligned(hsumAdd+x+d,hv_reg);1682v_store_aligned(C+x+d,v_load_aligned(C+x+d)+(hv_reg-v_load_aligned(hsumSub+x+d)));1683}1684}1685else1686#endif1687{1688for( d = 0; d < D; d++ )1689{1690int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);1691C[x + d] = (CostType)(C[x + d] + hv - hsumSub[x + d]);1692}1693}1694}1695}1696else1697{1698for( x = D; x < width1*D; x += D )1699{1700const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);1701const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);17021703for( d = 0; d < D; d++ )1704hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);1705}1706}1707}17081709if( y == src_start_idx )1710{1711int scale = k == src_start_idx ? SH2 + 1 : 1;1712for( x = 0; x < width1*D; x++ )1713C[x] = (CostType)(C[x] + hsumAdd[x]*scale);1714}1715}1716}17171718#if CV_SIMD1281719// define some additional reduce operations:1720inline short min_pos(const v_int16x8& val, const v_int16x8& pos, const short min_val)1721{1722v_int16x8 v_min = v_setall_s16(min_val);1723v_int16x8 v_mask = v_min == val;1724v_int16x8 v_pos = (pos & v_mask) | (v_setall_s16(SHRT_MAX) & ~v_mask);17251726return v_reduce_min(v_pos);1727}1728#endif17291730// performing SGM cost accumulation from left to right (result is stored in leftBuf) and1731// in-place cost accumulation from top to bottom (result is stored in topBuf)1732inline void accumulateCostsLeftTop(CostType* leftBuf, CostType* leftBuf_prev, CostType* topBuf, CostType* costs,1733CostType& leftMinCost, CostType& topMinCost, int D, int P1, int P2)1734{1735#if CV_SIMD1281736if(hasSIMD128())1737{1738v_int16x8 P1_reg = v_setall_s16(cv::saturate_cast<CostType>(P1));17391740v_int16x8 leftMinCostP2_reg = v_setall_s16(cv::saturate_cast<CostType>(leftMinCost+P2));1741v_int16x8 leftMinCost_new_reg = v_setall_s16(SHRT_MAX);1742v_int16x8 src0_leftBuf = v_setall_s16(SHRT_MAX);1743v_int16x8 src1_leftBuf = v_load_aligned(leftBuf_prev);17441745v_int16x8 topMinCostP2_reg = v_setall_s16(cv::saturate_cast<CostType>(topMinCost+P2));1746v_int16x8 topMinCost_new_reg = v_setall_s16(SHRT_MAX);1747v_int16x8 src0_topBuf = v_setall_s16(SHRT_MAX);1748v_int16x8 src1_topBuf = v_load_aligned(topBuf);17491750v_int16x8 src2;1751v_int16x8 src_shifted_left,src_shifted_right;1752v_int16x8 res;17531754for(int i=0;i<D-8;i+=8)1755{1756//process leftBuf:1757//lookahead load:1758src2 = v_load_aligned(leftBuf_prev+i+8);17591760//get shifted versions of the current block and add P1:1761src_shifted_left = v_extract<7> (src0_leftBuf,src1_leftBuf) + P1_reg;1762src_shifted_right = v_extract<1> (src1_leftBuf,src2 ) + P1_reg;17631764// process and save current block:1765res = v_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg);1766leftMinCost_new_reg = v_min(leftMinCost_new_reg,res);1767v_store_aligned(leftBuf+i, res);17681769//update src buffers:1770src0_leftBuf = src1_leftBuf;1771src1_leftBuf = src2;17721773//process topBuf:1774//lookahead load:1775src2 = v_load_aligned(topBuf+i+8);17761777//get shifted versions of the current block and add P1:1778src_shifted_left = v_extract<7> (src0_topBuf,src1_topBuf) + P1_reg;1779src_shifted_right = v_extract<1> (src1_topBuf,src2 ) + P1_reg;17801781// process and save current block:1782res = v_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg);1783topMinCost_new_reg = v_min(topMinCost_new_reg,res);1784v_store_aligned(topBuf+i, res);17851786//update src buffers:1787src0_topBuf = src1_topBuf;1788src1_topBuf = src2;1789}17901791// a bit different processing for the last cycle of the loop:1792//process leftBuf:1793src2 = v_setall_s16(SHRT_MAX);1794src_shifted_left = v_extract<7> (src0_leftBuf,src1_leftBuf) + P1_reg;1795src_shifted_right = v_extract<1> (src1_leftBuf,src2 ) + P1_reg;17961797res = v_load_aligned(costs+D-8) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg);1798leftMinCost = v_reduce_min(v_min(leftMinCost_new_reg,res));1799v_store_aligned(leftBuf+D-8, res);18001801//process topBuf:1802src2 = v_setall_s16(SHRT_MAX);1803src_shifted_left = v_extract<7> (src0_topBuf,src1_topBuf) + P1_reg;1804src_shifted_right = v_extract<1> (src1_topBuf,src2 ) + P1_reg;18051806res = v_load_aligned(costs+D-8) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg);1807topMinCost = v_reduce_min(v_min(topMinCost_new_reg,res));1808v_store_aligned(topBuf+D-8, res);1809}1810else1811#endif1812{1813CostType leftMinCost_new = SHRT_MAX;1814CostType topMinCost_new = SHRT_MAX;1815int leftMinCost_P2 = leftMinCost + P2;1816int topMinCost_P2 = topMinCost + P2;1817CostType leftBuf_prev_i_minus_1 = SHRT_MAX;1818CostType topBuf_i_minus_1 = SHRT_MAX;1819CostType tmp;18201821for(int i=0;i<D-1;i++)1822{1823leftBuf[i] = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(leftBuf_prev_i_minus_1+P1,leftBuf_prev[i+1]+P1),std::min((int)leftBuf_prev[i],leftMinCost_P2))-leftMinCost_P2);1824leftBuf_prev_i_minus_1 = leftBuf_prev[i];1825leftMinCost_new = std::min(leftMinCost_new,leftBuf[i]);18261827tmp = topBuf[i];1828topBuf[i] = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(topBuf_i_minus_1+P1,topBuf[i+1]+P1),std::min((int)topBuf[i],topMinCost_P2))-topMinCost_P2);1829topBuf_i_minus_1 = tmp;1830topMinCost_new = std::min(topMinCost_new,topBuf[i]);1831}18321833leftBuf[D-1] = cv::saturate_cast<CostType>(costs[D-1] + std::min(leftBuf_prev_i_minus_1+P1,std::min((int)leftBuf_prev[D-1],leftMinCost_P2))-leftMinCost_P2);1834leftMinCost = std::min(leftMinCost_new,leftBuf[D-1]);18351836topBuf[D-1] = cv::saturate_cast<CostType>(costs[D-1] + std::min(topBuf_i_minus_1+P1,std::min((int)topBuf[D-1],topMinCost_P2))-topMinCost_P2);1837topMinCost = std::min(topMinCost_new,topBuf[D-1]);1838}1839}18401841// performing in-place SGM cost accumulation from right to left (the result is stored in rightBuf) and1842// summing rightBuf, topBuf, leftBuf together (the result is stored in leftBuf), as well as finding the1843// optimal disparity value with minimum accumulated cost1844inline void accumulateCostsRight(CostType* rightBuf, CostType* topBuf, CostType* leftBuf, CostType* costs,1845CostType& rightMinCost, int D, int P1, int P2, int& optimal_disp, CostType& min_cost)1846{1847#if CV_SIMD1281848if(hasSIMD128())1849{1850v_int16x8 P1_reg = v_setall_s16(cv::saturate_cast<CostType>(P1));18511852v_int16x8 rightMinCostP2_reg = v_setall_s16(cv::saturate_cast<CostType>(rightMinCost+P2));1853v_int16x8 rightMinCost_new_reg = v_setall_s16(SHRT_MAX);1854v_int16x8 src0_rightBuf = v_setall_s16(SHRT_MAX);1855v_int16x8 src1_rightBuf = v_load(rightBuf);18561857v_int16x8 src2;1858v_int16x8 src_shifted_left,src_shifted_right;1859v_int16x8 res;18601861v_int16x8 min_sum_cost_reg = v_setall_s16(SHRT_MAX);1862v_int16x8 min_sum_pos_reg = v_setall_s16(0);1863v_int16x8 loop_idx(0,1,2,3,4,5,6,7);1864v_int16x8 eight_reg = v_setall_s16(8);18651866for(int i=0;i<D-8;i+=8)1867{1868//lookahead load:1869src2 = v_load_aligned(rightBuf+i+8);18701871//get shifted versions of the current block and add P1:1872src_shifted_left = v_extract<7> (src0_rightBuf,src1_rightBuf) + P1_reg;1873src_shifted_right = v_extract<1> (src1_rightBuf,src2 ) + P1_reg;18741875// process and save current block:1876res = v_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg);1877rightMinCost_new_reg = v_min(rightMinCost_new_reg,res);1878v_store_aligned(rightBuf+i, res);18791880// compute and save total cost:1881res = res + v_load_aligned(leftBuf+i) + v_load_aligned(topBuf+i);1882v_store_aligned(leftBuf+i, res);18831884// track disparity value with the minimum cost:1885min_sum_cost_reg = v_min(min_sum_cost_reg,res);1886min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (loop_idx - min_sum_pos_reg));1887loop_idx = loop_idx+eight_reg;18881889//update src:1890src0_rightBuf = src1_rightBuf;1891src1_rightBuf = src2;1892}18931894// a bit different processing for the last cycle of the loop:1895src2 = v_setall_s16(SHRT_MAX);1896src_shifted_left = v_extract<7> (src0_rightBuf,src1_rightBuf) + P1_reg;1897src_shifted_right = v_extract<1> (src1_rightBuf,src2 ) + P1_reg;18981899res = v_load_aligned(costs+D-8) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg);1900rightMinCost = v_reduce_min(v_min(rightMinCost_new_reg,res));1901v_store_aligned(rightBuf+D-8, res);19021903res = res + v_load_aligned(leftBuf+D-8) + v_load_aligned(topBuf+D-8);1904v_store_aligned(leftBuf+D-8, res);19051906min_sum_cost_reg = v_min(min_sum_cost_reg,res);1907min_cost = v_reduce_min(min_sum_cost_reg);1908min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (loop_idx - min_sum_pos_reg));1909optimal_disp = min_pos(min_sum_cost_reg,min_sum_pos_reg, min_cost);1910}1911else1912#endif1913{1914CostType rightMinCost_new = SHRT_MAX;1915int rightMinCost_P2 = rightMinCost + P2;1916CostType rightBuf_i_minus_1 = SHRT_MAX;1917CostType tmp;1918min_cost = SHRT_MAX;19191920for(int i=0;i<D-1;i++)1921{1922tmp = rightBuf[i];1923rightBuf[i] = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(rightBuf_i_minus_1+P1,rightBuf[i+1]+P1),std::min((int)rightBuf[i],rightMinCost_P2))-rightMinCost_P2);1924rightBuf_i_minus_1 = tmp;1925rightMinCost_new = std::min(rightMinCost_new,rightBuf[i]);1926leftBuf[i] = cv::saturate_cast<CostType>((int)leftBuf[i]+rightBuf[i]+topBuf[i]);1927if(leftBuf[i]<min_cost)1928{1929optimal_disp = i;1930min_cost = leftBuf[i];1931}1932}19331934rightBuf[D-1] = cv::saturate_cast<CostType>(costs[D-1] + std::min(rightBuf_i_minus_1+P1,std::min((int)rightBuf[D-1],rightMinCost_P2))-rightMinCost_P2);1935rightMinCost = std::min(rightMinCost_new,rightBuf[D-1]);1936leftBuf[D-1] = cv::saturate_cast<CostType>((int)leftBuf[D-1]+rightBuf[D-1]+topBuf[D-1]);1937if(leftBuf[D-1]<min_cost)1938{1939optimal_disp = D-1;1940min_cost = leftBuf[D-1];1941}1942}1943}19441945void SGBM3WayMainLoop::operator () (const Range& range) const1946{1947// force separate processing of stripes:1948if(range.end>range.start+1)1949{1950for(int n=range.start;n<range.end;n++)1951(*this)(Range(n,n+1));1952return;1953}19541955const int DISP_SCALE = (1 << StereoMatcher::DISP_SHIFT);1956int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;19571958// setting up the ranges:1959int src_start_idx = std::max(std::min(range.start * stripe_sz - stripe_overlap, height),0);1960int src_end_idx = std::min(range.end * stripe_sz, height);19611962int dst_offset;1963if(range.start==0)1964dst_offset=stripe_overlap;1965else1966dst_offset=0;19671968Mat cur_buffer = buffers [range.start];1969Mat cur_disp = dst_disp[range.start];1970cur_disp = Scalar(INVALID_DISP_SCALED);19711972// prepare buffers:1973CostType *curCostVolumeLine, *hsumBuf, *pixDiff;1974PixType* tmpBuf;1975CostType *horPassCostVolume, *vertPassCostVolume, *vertPassMin, *rightPassBuf, *disp2CostBuf;1976short* disp2Buf;1977getBufferPointers(cur_buffer,width,width1,D,img1->channels(),SH2,P2,1978curCostVolumeLine,hsumBuf,pixDiff,tmpBuf,horPassCostVolume,1979vertPassCostVolume,vertPassMin,rightPassBuf,disp2CostBuf,disp2Buf);19801981// start real processing:1982for(int y=src_start_idx;y<src_end_idx;y++)1983{1984getRawMatchingCost(curCostVolumeLine,hsumBuf,pixDiff,tmpBuf,y,src_start_idx);19851986short* disp_row = (short*)cur_disp.ptr(dst_offset+(y-src_start_idx));19871988// initialize the auxiliary buffers for the pseudo left-right consistency check:1989for(int x=0;x<width;x++)1990{1991disp2Buf[x] = (short)INVALID_DISP_SCALED;1992disp2CostBuf[x] = SHRT_MAX;1993}1994CostType* C = curCostVolumeLine - D;1995CostType prev_min, min_cost;1996int d, best_d;1997d = best_d = 0;19981999// forward pass2000prev_min=0;2001for (int x=D;x<(1+width1)*D;x+=D)2002accumulateCostsLeftTop(horPassCostVolume+x,horPassCostVolume+x-D,vertPassCostVolume+x,C+x,prev_min,vertPassMin[x/D],D,P1,P2);20032004//backward pass2005memset(rightPassBuf,0,D*sizeof(CostType));2006prev_min=0;2007for (int x=width1*D;x>=D;x-=D)2008{2009accumulateCostsRight(rightPassBuf,vertPassCostVolume+x,horPassCostVolume+x,C+x,prev_min,D,P1,P2,best_d,min_cost);20102011if(uniquenessRatio>0)2012{2013#if CV_SIMD1282014if(useSIMD)2015{2016horPassCostVolume+=x;2017int thresh = (100*min_cost)/(100-uniquenessRatio);2018v_int16x8 thresh_reg = v_setall_s16((short)(thresh+1));2019v_int16x8 d1 = v_setall_s16((short)(best_d-1));2020v_int16x8 d2 = v_setall_s16((short)(best_d+1));2021v_int16x8 eight_reg = v_setall_s16(8);2022v_int16x8 cur_d(0,1,2,3,4,5,6,7);2023v_int16x8 mask,cost1,cost2;20242025for( d = 0; d < D; d+=16 )2026{2027cost1 = v_load_aligned(horPassCostVolume+d);2028cost2 = v_load_aligned(horPassCostVolume+d+8);20292030mask = cost1 < thresh_reg;2031mask = mask & ( (cur_d<d1) | (cur_d>d2) );2032if( v_signmask(mask) )2033break;20342035cur_d = cur_d+eight_reg;20362037mask = cost2 < thresh_reg;2038mask = mask & ( (cur_d<d1) | (cur_d>d2) );2039if( v_signmask(mask) )2040break;20412042cur_d = cur_d+eight_reg;2043}2044horPassCostVolume-=x;2045}2046else2047#endif2048{2049for( d = 0; d < D; d++ )2050{2051if( horPassCostVolume[x+d]*(100 - uniquenessRatio) < min_cost*100 && std::abs(d - best_d) > 1 )2052break;2053}2054}2055if( d < D )2056continue;2057}2058d = best_d;20592060int _x2 = x/D - 1 + minX1 - d - minD;2061if( _x2>=0 && _x2<width && disp2CostBuf[_x2] > min_cost )2062{2063disp2CostBuf[_x2] = min_cost;2064disp2Buf[_x2] = (short)(d + minD);2065}20662067if( 0 < d && d < D-1 )2068{2069// do subpixel quadratic interpolation:2070// fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])2071// then find minimum of the parabola.2072int denom2 = std::max(horPassCostVolume[x+d-1] + horPassCostVolume[x+d+1] - 2*horPassCostVolume[x+d], 1);2073d = d*DISP_SCALE + ((horPassCostVolume[x+d-1] - horPassCostVolume[x+d+1])*DISP_SCALE + denom2)/(denom2*2);2074}2075else2076d *= DISP_SCALE;20772078disp_row[(x/D)-1 + minX1] = (DispType)(d + minD*DISP_SCALE);2079}20802081for(int x = minX1; x < maxX1; x++ )2082{2083// pseudo LRC consistency check using only one disparity map;2084// pixels with difference more than disp12MaxDiff are invalidated2085int d1 = disp_row[x];2086if( d1 == INVALID_DISP_SCALED )2087continue;2088int _d = d1 >> StereoMatcher::DISP_SHIFT;2089int d_ = (d1 + DISP_SCALE-1) >> StereoMatcher::DISP_SHIFT;2090int _x = x - _d, x_ = x - d_;2091if( 0 <= _x && _x < width && disp2Buf[_x] >= minD && std::abs(disp2Buf[_x] - _d) > disp12MaxDiff &&20920 <= x_ && x_ < width && disp2Buf[x_] >= minD && std::abs(disp2Buf[x_] - d_) > disp12MaxDiff )2093disp_row[x] = (short)INVALID_DISP_SCALED;2094}2095}2096}20972098static void computeDisparity3WaySGBM( const Mat& img1, const Mat& img2,2099Mat& disp1, const StereoSGBMParams& params,2100Mat* buffers, int nstripes )2101{2102// precompute a lookup table for the raw matching cost computation:2103const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;2104PixType* clipTab = new PixType[TAB_SIZE];2105int ftzero = std::max(params.preFilterCap, 15) | 1;2106for(int k = 0; k < TAB_SIZE; k++ )2107clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);21082109// allocate separate dst_disp arrays to avoid conflicts due to stripe overlap:2110int stripe_sz = (int)ceil(img1.rows/(double)nstripes);2111int stripe_overlap = (params.SADWindowSize/2+1) + (int)ceil(0.1*stripe_sz);2112Mat* dst_disp = new Mat[nstripes];2113for(int i=0;i<nstripes;i++)2114dst_disp[i].create(stripe_sz+stripe_overlap,img1.cols,CV_16S);21152116parallel_for_(Range(0,nstripes),SGBM3WayMainLoop(buffers,img1,img2,dst_disp,params,clipTab,nstripes,stripe_overlap));21172118//assemble disp1 from dst_disp:2119short* dst_row;2120short* src_row;2121for(int i=0;i<disp1.rows;i++)2122{2123dst_row = (short*)disp1.ptr(i);2124src_row = (short*)dst_disp[i/stripe_sz].ptr(stripe_overlap+i%stripe_sz);2125memcpy(dst_row,src_row,disp1.cols*sizeof(short));2126}21272128delete[] clipTab;2129delete[] dst_disp;2130}21312132class StereoSGBMImpl CV_FINAL : public StereoSGBM2133{2134public:2135StereoSGBMImpl()2136{2137params = StereoSGBMParams();2138}21392140StereoSGBMImpl( int _minDisparity, int _numDisparities, int _SADWindowSize,2141int _P1, int _P2, int _disp12MaxDiff, int _preFilterCap,2142int _uniquenessRatio, int _speckleWindowSize, int _speckleRange,2143int _mode )2144{2145params = StereoSGBMParams( _minDisparity, _numDisparities, _SADWindowSize,2146_P1, _P2, _disp12MaxDiff, _preFilterCap,2147_uniquenessRatio, _speckleWindowSize, _speckleRange,2148_mode );2149}21502151void compute( InputArray leftarr, InputArray rightarr, OutputArray disparr ) CV_OVERRIDE2152{2153CV_INSTRUMENT_REGION();21542155Mat left = leftarr.getMat(), right = rightarr.getMat();2156CV_Assert( left.size() == right.size() && left.type() == right.type() &&2157left.depth() == CV_8U );21582159disparr.create( left.size(), CV_16S );2160Mat disp = disparr.getMat();21612162if(params.mode==MODE_SGBM_3WAY)2163computeDisparity3WaySGBM( left, right, disp, params, buffers, num_stripes );2164else if(params.mode==MODE_HH4)2165computeDisparitySGBM_HH4( left, right, disp, params, buffer );2166else2167computeDisparitySGBM( left, right, disp, params, buffer );21682169medianBlur(disp, disp, 3);21702171if( params.speckleWindowSize > 0 )2172filterSpeckles(disp, (params.minDisparity - 1)*StereoMatcher::DISP_SCALE, params.speckleWindowSize,2173StereoMatcher::DISP_SCALE*params.speckleRange, buffer);2174}21752176int getMinDisparity() const CV_OVERRIDE { return params.minDisparity; }2177void setMinDisparity(int minDisparity) CV_OVERRIDE { params.minDisparity = minDisparity; }21782179int getNumDisparities() const CV_OVERRIDE { return params.numDisparities; }2180void setNumDisparities(int numDisparities) CV_OVERRIDE { params.numDisparities = numDisparities; }21812182int getBlockSize() const CV_OVERRIDE { return params.SADWindowSize; }2183void setBlockSize(int blockSize) CV_OVERRIDE { params.SADWindowSize = blockSize; }21842185int getSpeckleWindowSize() const CV_OVERRIDE { return params.speckleWindowSize; }2186void setSpeckleWindowSize(int speckleWindowSize) CV_OVERRIDE { params.speckleWindowSize = speckleWindowSize; }21872188int getSpeckleRange() const CV_OVERRIDE { return params.speckleRange; }2189void setSpeckleRange(int speckleRange) CV_OVERRIDE { params.speckleRange = speckleRange; }21902191int getDisp12MaxDiff() const CV_OVERRIDE { return params.disp12MaxDiff; }2192void setDisp12MaxDiff(int disp12MaxDiff) CV_OVERRIDE { params.disp12MaxDiff = disp12MaxDiff; }21932194int getPreFilterCap() const CV_OVERRIDE { return params.preFilterCap; }2195void setPreFilterCap(int preFilterCap) CV_OVERRIDE { params.preFilterCap = preFilterCap; }21962197int getUniquenessRatio() const CV_OVERRIDE { return params.uniquenessRatio; }2198void setUniquenessRatio(int uniquenessRatio) CV_OVERRIDE { params.uniquenessRatio = uniquenessRatio; }21992200int getP1() const CV_OVERRIDE { return params.P1; }2201void setP1(int P1) CV_OVERRIDE { params.P1 = P1; }22022203int getP2() const CV_OVERRIDE { return params.P2; }2204void setP2(int P2) CV_OVERRIDE { params.P2 = P2; }22052206int getMode() const CV_OVERRIDE { return params.mode; }2207void setMode(int mode) CV_OVERRIDE { params.mode = mode; }22082209void write(FileStorage& fs) const CV_OVERRIDE2210{2211writeFormat(fs);2212fs << "name" << name_2213<< "minDisparity" << params.minDisparity2214<< "numDisparities" << params.numDisparities2215<< "blockSize" << params.SADWindowSize2216<< "speckleWindowSize" << params.speckleWindowSize2217<< "speckleRange" << params.speckleRange2218<< "disp12MaxDiff" << params.disp12MaxDiff2219<< "preFilterCap" << params.preFilterCap2220<< "uniquenessRatio" << params.uniquenessRatio2221<< "P1" << params.P12222<< "P2" << params.P22223<< "mode" << params.mode;2224}22252226void read(const FileNode& fn) CV_OVERRIDE2227{2228FileNode n = fn["name"];2229CV_Assert( n.isString() && String(n) == name_ );2230params.minDisparity = (int)fn["minDisparity"];2231params.numDisparities = (int)fn["numDisparities"];2232params.SADWindowSize = (int)fn["blockSize"];2233params.speckleWindowSize = (int)fn["speckleWindowSize"];2234params.speckleRange = (int)fn["speckleRange"];2235params.disp12MaxDiff = (int)fn["disp12MaxDiff"];2236params.preFilterCap = (int)fn["preFilterCap"];2237params.uniquenessRatio = (int)fn["uniquenessRatio"];2238params.P1 = (int)fn["P1"];2239params.P2 = (int)fn["P2"];2240params.mode = (int)fn["mode"];2241}22422243StereoSGBMParams params;2244Mat buffer;22452246// the number of stripes is fixed, disregarding the number of threads/processors2247// to make the results fully reproducible:2248static const int num_stripes = 4;2249Mat buffers[num_stripes];22502251static const char* name_;2252};22532254const char* StereoSGBMImpl::name_ = "StereoMatcher.SGBM";225522562257Ptr<StereoSGBM> StereoSGBM::create(int minDisparity, int numDisparities, int SADWindowSize,2258int P1, int P2, int disp12MaxDiff,2259int preFilterCap, int uniquenessRatio,2260int speckleWindowSize, int speckleRange,2261int mode)2262{2263return Ptr<StereoSGBM>(2264new StereoSGBMImpl(minDisparity, numDisparities, SADWindowSize,2265P1, P2, disp12MaxDiff,2266preFilterCap, uniquenessRatio,2267speckleWindowSize, speckleRange,2268mode));2269}22702271Rect getValidDisparityROI( Rect roi1, Rect roi2,2272int minDisparity,2273int numberOfDisparities,2274int SADWindowSize )2275{2276int SW2 = SADWindowSize/2;2277int maxD = minDisparity + numberOfDisparities - 1;22782279int xmin = std::max(roi1.x, roi2.x + maxD) + SW2;2280int xmax = std::min(roi1.x + roi1.width, roi2.x + roi2.width) - SW2;2281int ymin = std::max(roi1.y, roi2.y) + SW2;2282int ymax = std::min(roi1.y + roi1.height, roi2.y + roi2.height) - SW2;22832284Rect r(xmin, ymin, xmax - xmin, ymax - ymin);22852286return r.width > 0 && r.height > 0 ? r : Rect();2287}22882289typedef cv::Point_<short> Point2s;22902291template <typename T>2292void filterSpecklesImpl(cv::Mat& img, int newVal, int maxSpeckleSize, int maxDiff, cv::Mat& _buf)2293{2294using namespace cv;22952296int width = img.cols, height = img.rows, npixels = width*height;2297size_t bufSize = npixels*(int)(sizeof(Point2s) + sizeof(int) + sizeof(uchar));2298if( !_buf.isContinuous() || _buf.empty() || _buf.cols*_buf.rows*_buf.elemSize() < bufSize )2299_buf.reserveBuffer(bufSize);23002301uchar* buf = _buf.ptr();2302int i, j, dstep = (int)(img.step/sizeof(T));2303int* labels = (int*)buf;2304buf += npixels*sizeof(labels[0]);2305Point2s* wbuf = (Point2s*)buf;2306buf += npixels*sizeof(wbuf[0]);2307uchar* rtype = (uchar*)buf;2308int curlabel = 0;23092310// clear out label assignments2311memset(labels, 0, npixels*sizeof(labels[0]));23122313for( i = 0; i < height; i++ )2314{2315T* ds = img.ptr<T>(i);2316int* ls = labels + width*i;23172318for( j = 0; j < width; j++ )2319{2320if( ds[j] != newVal ) // not a bad disparity2321{2322if( ls[j] ) // has a label, check for bad label2323{2324if( rtype[ls[j]] ) // small region, zero out disparity2325ds[j] = (T)newVal;2326}2327// no label, assign and propagate2328else2329{2330Point2s* ws = wbuf; // initialize wavefront2331Point2s p((short)j, (short)i); // current pixel2332curlabel++; // next label2333int count = 0; // current region size2334ls[j] = curlabel;23352336// wavefront propagation2337while( ws >= wbuf ) // wavefront not empty2338{2339count++;2340// put neighbors onto wavefront2341T* dpp = &img.at<T>(p.y, p.x);2342T dp = *dpp;2343int* lpp = labels + width*p.y + p.x;23442345if( p.y < height-1 && !lpp[+width] && dpp[+dstep] != newVal && std::abs(dp - dpp[+dstep]) <= maxDiff )2346{2347lpp[+width] = curlabel;2348*ws++ = Point2s(p.x, p.y+1);2349}23502351if( p.y > 0 && !lpp[-width] && dpp[-dstep] != newVal && std::abs(dp - dpp[-dstep]) <= maxDiff )2352{2353lpp[-width] = curlabel;2354*ws++ = Point2s(p.x, p.y-1);2355}23562357if( p.x < width-1 && !lpp[+1] && dpp[+1] != newVal && std::abs(dp - dpp[+1]) <= maxDiff )2358{2359lpp[+1] = curlabel;2360*ws++ = Point2s(p.x+1, p.y);2361}23622363if( p.x > 0 && !lpp[-1] && dpp[-1] != newVal && std::abs(dp - dpp[-1]) <= maxDiff )2364{2365lpp[-1] = curlabel;2366*ws++ = Point2s(p.x-1, p.y);2367}23682369// pop most recent and propagate2370// NB: could try least recent, maybe better convergence2371p = *--ws;2372}23732374// assign label type2375if( count <= maxSpeckleSize ) // speckle region2376{2377rtype[ls[j]] = 1; // small region label2378ds[j] = (T)newVal;2379}2380else2381rtype[ls[j]] = 0; // large region label2382}2383}2384}2385}2386}23872388#ifdef HAVE_IPP2389static bool ipp_filterSpeckles(Mat &img, int maxSpeckleSize, int newVal, int maxDiff, Mat &buffer)2390{2391#if IPP_VERSION_X100 >= 8102392CV_INSTRUMENT_REGION_IPP();23932394IppDataType dataType = ippiGetDataType(img.depth());2395IppiSize size = ippiSize(img.size());2396int bufferSize;23972398if(img.channels() != 1)2399return false;24002401if(dataType != ipp8u && dataType != ipp16s)2402return false;24032404if(ippiMarkSpecklesGetBufferSize(size, dataType, 1, &bufferSize) < 0)2405return false;24062407if(bufferSize && (buffer.empty() || (int)(buffer.step*buffer.rows) < bufferSize))2408buffer.create(1, (int)bufferSize, CV_8U);24092410switch(dataType)2411{2412case ipp8u: return CV_INSTRUMENT_FUN_IPP(ippiMarkSpeckles_8u_C1IR, img.ptr<Ipp8u>(), (int)img.step, size, (Ipp8u)newVal, maxSpeckleSize, (Ipp8u)maxDiff, ippiNormL1, buffer.ptr<Ipp8u>()) >= 0;2413case ipp16s: return CV_INSTRUMENT_FUN_IPP(ippiMarkSpeckles_16s_C1IR, img.ptr<Ipp16s>(), (int)img.step, size, (Ipp16s)newVal, maxSpeckleSize, (Ipp16s)maxDiff, ippiNormL1, buffer.ptr<Ipp8u>()) >= 0;2414default: return false;2415}2416#else2417CV_UNUSED(img); CV_UNUSED(maxSpeckleSize); CV_UNUSED(newVal); CV_UNUSED(maxDiff); CV_UNUSED(buffer);2418return false;2419#endif2420}2421#endif24222423}24242425void cv::filterSpeckles( InputOutputArray _img, double _newval, int maxSpeckleSize,2426double _maxDiff, InputOutputArray __buf )2427{2428CV_INSTRUMENT_REGION();24292430Mat img = _img.getMat();2431int type = img.type();2432Mat temp, &_buf = __buf.needed() ? __buf.getMatRef() : temp;2433CV_Assert( type == CV_8UC1 || type == CV_16SC1 );24342435int newVal = cvRound(_newval), maxDiff = cvRound(_maxDiff);24362437CV_IPP_RUN_FAST(ipp_filterSpeckles(img, maxSpeckleSize, newVal, maxDiff, _buf));24382439if (type == CV_8UC1)2440filterSpecklesImpl<uchar>(img, newVal, maxSpeckleSize, maxDiff, _buf);2441else2442filterSpecklesImpl<short>(img, newVal, maxSpeckleSize, maxDiff, _buf);2443}24442445void cv::validateDisparity( InputOutputArray _disp, InputArray _cost, int minDisparity,2446int numberOfDisparities, int disp12MaxDiff )2447{2448CV_INSTRUMENT_REGION();24492450Mat disp = _disp.getMat(), cost = _cost.getMat();2451int cols = disp.cols, rows = disp.rows;2452int minD = minDisparity, maxD = minDisparity + numberOfDisparities;2453int x, minX1 = std::max(maxD, 0), maxX1 = cols + std::min(minD, 0);2454AutoBuffer<int> _disp2buf(cols*2);2455int* disp2buf = _disp2buf.data();2456int* disp2cost = disp2buf + cols;2457const int DISP_SHIFT = 4, DISP_SCALE = 1 << DISP_SHIFT;2458int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;2459int costType = cost.type();24602461disp12MaxDiff *= DISP_SCALE;24622463CV_Assert( numberOfDisparities > 0 && disp.type() == CV_16S &&2464(costType == CV_16S || costType == CV_32S) &&2465disp.size() == cost.size() );24662467for( int y = 0; y < rows; y++ )2468{2469short* dptr = disp.ptr<short>(y);24702471for( x = 0; x < cols; x++ )2472{2473disp2buf[x] = INVALID_DISP_SCALED;2474disp2cost[x] = INT_MAX;2475}24762477if( costType == CV_16S )2478{2479const short* cptr = cost.ptr<short>(y);24802481for( x = minX1; x < maxX1; x++ )2482{2483int d = dptr[x], c = cptr[x];24842485if( d == INVALID_DISP_SCALED )2486continue;24872488int x2 = x - ((d + DISP_SCALE/2) >> DISP_SHIFT);24892490if( disp2cost[x2] > c )2491{2492disp2cost[x2] = c;2493disp2buf[x2] = d;2494}2495}2496}2497else2498{2499const int* cptr = cost.ptr<int>(y);25002501for( x = minX1; x < maxX1; x++ )2502{2503int d = dptr[x], c = cptr[x];25042505if( d == INVALID_DISP_SCALED )2506continue;25072508int x2 = x - ((d + DISP_SCALE/2) >> DISP_SHIFT);25092510if( disp2cost[x2] > c )2511{2512disp2cost[x2] = c;2513disp2buf[x2] = d;2514}2515}2516}25172518for( x = minX1; x < maxX1; x++ )2519{2520// we round the computed disparity both towards -inf and +inf and check2521// if either of the corresponding disparities in disp2 is consistent.2522// This is to give the computed disparity a chance to look valid if it is.2523int d = dptr[x];2524if( d == INVALID_DISP_SCALED )2525continue;2526int d0 = d >> DISP_SHIFT;2527int d1 = (d + DISP_SCALE-1) >> DISP_SHIFT;2528int x0 = x - d0, x1 = x - d1;2529if( (0 <= x0 && x0 < cols && disp2buf[x0] > INVALID_DISP_SCALED && std::abs(disp2buf[x0] - d) > disp12MaxDiff) &&2530(0 <= x1 && x1 < cols && disp2buf[x1] > INVALID_DISP_SCALED && std::abs(disp2buf[x1] - d) > disp12MaxDiff) )2531dptr[x] = (short)INVALID_DISP_SCALED;2532}2533}2534}253525362537