Path: blob/master/3rdparty/carotene/src/opticalflow.cpp
16337 views
/*1* By downloading, copying, installing or using the software you agree to this license.2* If you do not agree to this license, do not download, install,3* copy or use the software.4*5*6* License Agreement7* For Open Source Computer Vision Library8* (3-clause BSD License)9*10* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.11* Third party copyrights are property of their respective owners.12*13* Redistribution and use in source and binary forms, with or without modification,14* are permitted provided that the following conditions are met:15*16* * Redistributions of source code must retain the above copyright notice,17* this list of conditions and the following disclaimer.18*19* * Redistributions in binary form must reproduce the above copyright notice,20* this list of conditions and the following disclaimer in the documentation21* and/or other materials provided with the distribution.22*23* * Neither the names of the copyright holders nor the names of the contributors24* may be used to endorse or promote products derived from this software25* without specific prior written permission.26*27* This software is provided by the copyright holders and contributors "as is" and28* any express or implied warranties, including, but not limited to, the implied29* warranties of merchantability and fitness for a particular purpose are disclaimed.30* In no event shall copyright holders or contributors be liable for any direct,31* indirect, incidental, special, exemplary, or consequential damages32* (including, but not limited to, procurement of substitute goods or services;33* loss of use, data, or profits; or business interruption) however caused34* and on any theory of liability, whether in contract, strict liability,35* or tort (including negligence or otherwise) arising in any way out of36* the use of this software, even if advised of the possibility of such damage.37*/3839#include "common.hpp"40#include "saturate_cast.hpp"41#include <vector>42#include <float.h> // For FLT_EPSILON4344namespace CAROTENE_NS {4546#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))4748/*49* Pyramidal Lucas-Kanade Optical Flow level processing50*/51void pyrLKOptFlowLevel(const Size2D &size, s32 cn,52const u8 *prevData, ptrdiff_t prevStride,53const s16 *prevDerivData, ptrdiff_t prevDerivStride,54const u8 *nextData, ptrdiff_t nextStride,55u32 ptCount,56const f32 *prevPts, f32 *nextPts,57u8 *status, f32 *err,58const Size2D &winSize,59u32 terminationCount, f64 terminationEpsilon,60u32 level, u32 maxLevel, bool useInitialFlow, bool getMinEigenVals,61f32 minEigThreshold)62{63internal::assertSupportedConfiguration();64#ifdef CAROTENE_NEON65f32 halfWinX = (winSize.width-1)*0.5f, halfWinY = (winSize.height-1)*0.5f;66s32 cn2 = cn*2;6768std::vector<s16> _buf(winSize.total()*(cn + cn2));69s16* IWinBuf = &_buf[0];70s32 IWinBufStride = winSize.width*cn;71s16* derivIWinBuf = &_buf[winSize.total()*cn];72s32 derivIWinBufStride = winSize.width*cn2;7374for( u32 ptidx = 0; ptidx < ptCount; ptidx++ )75{76f32 levscale = (1./(1 << level));77u32 ptref = ptidx << 1;78f32 prevPtX = prevPts[ptref+0]*levscale;79f32 prevPtY = prevPts[ptref+1]*levscale;80f32 nextPtX;81f32 nextPtY;82if( level == maxLevel )83{84if( useInitialFlow )85{86nextPtX = nextPts[ptref+0]*levscale;87nextPtY = nextPts[ptref+1]*levscale;88}89else90{91nextPtX = prevPtX;92nextPtY = prevPtY;93}94}95else96{97nextPtX = nextPts[ptref+0]*2.f;98nextPtY = nextPts[ptref+1]*2.f;99}100nextPts[ptref+0] = nextPtX;101nextPts[ptref+1] = nextPtY;102103s32 iprevPtX, iprevPtY;104s32 inextPtX, inextPtY;105prevPtX -= halfWinX;106prevPtY -= halfWinY;107iprevPtX = floor(prevPtX);108iprevPtY = floor(prevPtY);109110if( iprevPtX < -(s32)winSize.width || iprevPtX >= (s32)size.width ||111iprevPtY < -(s32)winSize.height || iprevPtY >= (s32)size.height )112{113if( level == 0 )114{115if( status )116status[ptidx] = false;117if( err )118err[ptidx] = 0;119}120continue;121}122123f32 a = prevPtX - iprevPtX;124f32 b = prevPtY - iprevPtY;125const s32 W_BITS = 14, W_BITS1 = 14;126const f32 FLT_SCALE = 1.f/(1 << 20);127s32 iw00 = round((1.f - a)*(1.f - b)*(1 << W_BITS));128s32 iw01 = round(a*(1.f - b)*(1 << W_BITS));129s32 iw10 = round((1.f - a)*b*(1 << W_BITS));130s32 iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;131132s32 dstep = prevDerivStride/sizeof(s16);133f32 A11 = 0, A12 = 0, A22 = 0;134135int16x4_t viw00 = vmov_n_s16((s16)iw00);136int16x4_t viw01 = vmov_n_s16((s16)iw01);137int16x4_t viw10 = vmov_n_s16((s16)iw10);138int16x4_t viw11 = vmov_n_s16((s16)iw11);139140float32x4_t vA11 = vmovq_n_f32(0);141float32x4_t vA12 = vmovq_n_f32(0);142float32x4_t vA22 = vmovq_n_f32(0);143144s32 wwcn = winSize.width*cn;145146// extract the patch from the first image, compute covariation matrix of derivatives147s32 x = 0;148for(s32 y = 0; y < (s32)winSize.height; y++ )149{150const u8* src = prevData + prevStride*(y + iprevPtY) + iprevPtX*cn;151const s16* dsrc = prevDerivData + dstep*(y + iprevPtY) + iprevPtX*cn2;152153s16* Iptr = IWinBuf + y*IWinBufStride;154s16* dIptr = derivIWinBuf + y*derivIWinBufStride;155156internal::prefetch(src + x + prevStride * 2, 0);157for(x = 0; x <= wwcn - 8; x += 8)158{159uint8x8_t vsrc00 = vld1_u8(src + x);160uint8x8_t vsrc10 = vld1_u8(src + x + prevStride);161uint8x8_t vsrc01 = vld1_u8(src + x + cn);162uint8x8_t vsrc11 = vld1_u8(src + x + prevStride + cn);163164int16x8_t vs00 = vreinterpretq_s16_u16(vmovl_u8(vsrc00));165int16x8_t vs10 = vreinterpretq_s16_u16(vmovl_u8(vsrc10));166int16x8_t vs01 = vreinterpretq_s16_u16(vmovl_u8(vsrc01));167int16x8_t vs11 = vreinterpretq_s16_u16(vmovl_u8(vsrc11));168169int32x4_t vsuml = vmull_s16(vget_low_s16(vs00), viw00);170int32x4_t vsumh = vmull_s16(vget_high_s16(vs10), viw10);171172vsuml = vmlal_s16(vsuml, vget_low_s16(vs01), viw01);173vsumh = vmlal_s16(vsumh, vget_high_s16(vs11), viw11);174175vsuml = vmlal_s16(vsuml, vget_low_s16(vs10), viw10);176vsumh = vmlal_s16(vsumh, vget_high_s16(vs00), viw00);177178vsuml = vmlal_s16(vsuml, vget_low_s16(vs11), viw11);179vsumh = vmlal_s16(vsumh, vget_high_s16(vs01), viw01);180181int16x4_t vsumnl = vrshrn_n_s32(vsuml, W_BITS1-5);182int16x4_t vsumnh = vrshrn_n_s32(vsumh, W_BITS1-5);183184vst1q_s16(Iptr + x, vcombine_s16(vsumnl, vsumnh));185}186for(; x <= wwcn - 4; x += 4)187{188uint8x8_t vsrc00 = vld1_u8(src + x);189uint8x8_t vsrc10 = vld1_u8(src + x + prevStride);190uint8x8_t vsrc01 = vld1_u8(src + x + cn);191uint8x8_t vsrc11 = vld1_u8(src + x + prevStride + cn);192193int16x4_t vs00 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc00)));194int16x4_t vs10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc10)));195int16x4_t vs01 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc01)));196int16x4_t vs11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc11)));197198int32x4_t vsuml1 = vmull_s16(vs00, viw00);199int32x4_t vsuml2 = vmull_s16(vs01, viw01);200vsuml1 = vmlal_s16(vsuml1, vs10, viw10);201vsuml2 = vmlal_s16(vsuml2, vs11, viw11);202int32x4_t vsuml = vaddq_s32(vsuml1, vsuml2);203204int16x4_t vsumnl = vrshrn_n_s32(vsuml, W_BITS1-5);205206vst1_s16(Iptr + x, vsumnl);207}208209internal::prefetch(dsrc + dstep * 2, 0);210for(x = 0; x <= wwcn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 )211{212#if 0213__asm__ (214"vld2.16 {d0-d1}, [%[dsrc00]] \n\t"215"vld2.16 {d2-d3}, [%[dsrc10]] \n\t"216"vld2.16 {d4-d5}, [%[dsrc01]] \n\t"217"vld2.16 {d6-d7}, [%[dsrc11]] \n\t"218"vmull.s16 q4, d3, %P[viw10] \n\t"219"vmull.s16 q5, d0, %P[viw00] \n\t"220"vmlal.s16 q4, d7, %P[viw11] \n\t"221"vmlal.s16 q5, d4, %P[viw01] \n\t"222"vmlal.s16 q4, d1, %P[viw00] \n\t"223"vmlal.s16 q5, d2, %P[viw10] \n\t"224"vmlal.s16 q4, d5, %P[viw01] \n\t"225"vmlal.s16 q5, d6, %P[viw11] \n\t"226"vrshrn.s32 d13, q4, %[W_BITS1] \n\t"227"vrshrn.s32 d12, q5, %[W_BITS1] \n\t"228"vmull.s16 q3, d13, d13 \n\t"229"vmull.s16 q4, d12, d12 \n\t"230"vmull.s16 q5, d13, d12 \n\t"231"vcvt.f32.s32 q3, q3 \n\t"232"vcvt.f32.s32 q4, q4 \n\t"233"vcvt.f32.s32 q5, q5 \n\t"234"vadd.f32 %q[vA22], q3 \n\t"235"vadd.f32 %q[vA11], q4 \n\t"236"vadd.f32 %q[vA12], q5 \n\t"237"vst2.16 {d12-d13}, [%[out]] \n\t"238: [vA22] "=w" (vA22),239[vA11] "=w" (vA11),240[vA12] "=w" (vA12)241: "0" (vA22),242"1" (vA11),243"2" (vA12),244[out] "r" (dIptr),245[dsrc00] "r" (dsrc),246[dsrc10] "r" (dsrc + dstep),247[dsrc01] "r" (dsrc + cn2),248[dsrc11] "r" (dsrc + dstep + cn2),249[viw00] "w" (viw00),250[viw10] "w" (viw10),251[viw01] "w" (viw01),252[viw11] "w" (viw11),253[W_BITS1] "I" (W_BITS1)254: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13"255);256#else257int16x4x2_t vdsrc00 = vld2_s16(dsrc);258int16x4x2_t vdsrc10 = vld2_s16(dsrc + dstep);259int16x4x2_t vdsrc01 = vld2_s16(dsrc + cn2);260int16x4x2_t vdsrc11 = vld2_s16(dsrc + dstep + cn2);261262int32x4_t vsumy = vmull_s16(vdsrc10.val[1], viw10);263int32x4_t vsumx = vmull_s16(vdsrc00.val[0], viw00);264265vsumy = vmlal_s16(vsumy, vdsrc11.val[1], viw11);266vsumx = vmlal_s16(vsumx, vdsrc01.val[0], viw01);267268vsumy = vmlal_s16(vsumy, vdsrc00.val[1], viw00);269vsumx = vmlal_s16(vsumx, vdsrc10.val[0], viw10);270271vsumy = vmlal_s16(vsumy, vdsrc01.val[1], viw01);272vsumx = vmlal_s16(vsumx, vdsrc11.val[0], viw11);273274int16x4_t vsumny = vrshrn_n_s32(vsumy, W_BITS1);275int16x4_t vsumnx = vrshrn_n_s32(vsumx, W_BITS1);276277int32x4_t va22i = vmull_s16(vsumny, vsumny);278int32x4_t va11i = vmull_s16(vsumnx, vsumnx);279int32x4_t va12i = vmull_s16(vsumnx, vsumny);280281float32x4_t va22f = vcvtq_f32_s32(va22i);282float32x4_t va11f = vcvtq_f32_s32(va11i);283float32x4_t va12f = vcvtq_f32_s32(va12i);284285vA22 = vaddq_f32(vA22, va22f);286vA11 = vaddq_f32(vA11, va11f);287vA12 = vaddq_f32(vA12, va12f);288289int16x4x2_t vsum;290vsum.val[0] = vsumnx;291vsum.val[1] = vsumny;292vst2_s16(dIptr, vsum);293#endif294}295296for( ; x < wwcn; x++, dsrc += 2, dIptr += 2 )297{298s32 ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 +299src[x+prevStride]*iw10 + src[x+prevStride+cn]*iw11, W_BITS1-5);300s32 ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 +301dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1);302s32 iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 +303dsrc[dstep+cn2+1]*iw11, W_BITS1);304Iptr[x] = (s16)ival;305dIptr[0] = (s16)ixval;306dIptr[1] = (s16)iyval;307308A11 += (f32)(ixval*ixval);309A12 += (f32)(ixval*iyval);310A22 += (f32)(iyval*iyval);311}312}313314f32 A11buf[2], A12buf[2], A22buf[2];315vst1_f32(A11buf, vadd_f32(vget_low_f32(vA11), vget_high_f32(vA11)));316vst1_f32(A12buf, vadd_f32(vget_low_f32(vA12), vget_high_f32(vA12)));317vst1_f32(A22buf, vadd_f32(vget_low_f32(vA22), vget_high_f32(vA22)));318A11 += A11buf[0] + A11buf[1];319A12 += A12buf[0] + A12buf[1];320A22 += A22buf[0] + A22buf[1];321322A11 *= FLT_SCALE;323A12 *= FLT_SCALE;324A22 *= FLT_SCALE;325326f32 D = A11*A22 - A12*A12;327f32 minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) +3284.f*A12*A12))/(2*winSize.width*winSize.height);329330if( err && getMinEigenVals )331err[ptidx] = (f32)minEig;332333if( minEig < minEigThreshold || D < FLT_EPSILON )334{335if( level == 0 && status )336status[ptidx] = false;337continue;338}339340D = 1.f/D;341342nextPtX -= halfWinX;343nextPtY -= halfWinY;344f32 prevDeltaX = 0;345f32 prevDeltaY = 0;346347for(u32 j = 0; j < terminationCount; j++ )348{349inextPtX = floor(nextPtX);350inextPtY = floor(nextPtY);351352if( inextPtX < -(s32)winSize.width || inextPtX >= (s32)size.width ||353inextPtY < -(s32)winSize.height || inextPtY >= (s32)size.height )354{355if( level == 0 && status )356status[ptidx] = false;357break;358}359360a = nextPtX - inextPtX;361b = nextPtY - inextPtY;362iw00 = round((1.f - a)*(1.f - b)*(1 << W_BITS));363iw01 = round(a*(1.f - b)*(1 << W_BITS));364iw10 = round((1.f - a)*b*(1 << W_BITS));365iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;366f32 b1 = 0, b2 = 0;367368viw00 = vmov_n_s16((s16)iw00);369viw01 = vmov_n_s16((s16)iw01);370viw10 = vmov_n_s16((s16)iw10);371viw11 = vmov_n_s16((s16)iw11);372373float32x4_t vb1 = vmovq_n_f32(0);374float32x4_t vb2 = vmovq_n_f32(0);375376for(s32 y = 0; y < (s32)winSize.height; y++ )377{378const u8* Jptr = nextData + nextStride*(y + inextPtY) + inextPtX*cn;379const s16* Iptr = IWinBuf + y*IWinBufStride;380const s16* dIptr = derivIWinBuf + y*derivIWinBufStride;381382x = 0;383384internal::prefetch(Jptr, nextStride * 2);385internal::prefetch(Iptr, IWinBufStride/2);386internal::prefetch(dIptr, derivIWinBufStride/2);387388for( ; x <= wwcn - 8; x += 8, dIptr += 8*2 )389{390uint8x8_t vj00 = vld1_u8(Jptr + x);391uint8x8_t vj10 = vld1_u8(Jptr + x + nextStride);392uint8x8_t vj01 = vld1_u8(Jptr + x + cn);393uint8x8_t vj11 = vld1_u8(Jptr + x + nextStride + cn);394int16x8_t vI = vld1q_s16(Iptr + x);395int16x8x2_t vDerivI = vld2q_s16(dIptr);396397int16x8_t vs00 = vreinterpretq_s16_u16(vmovl_u8(vj00));398int16x8_t vs10 = vreinterpretq_s16_u16(vmovl_u8(vj10));399int16x8_t vs01 = vreinterpretq_s16_u16(vmovl_u8(vj01));400int16x8_t vs11 = vreinterpretq_s16_u16(vmovl_u8(vj11));401402int32x4_t vsuml = vmull_s16(vget_low_s16(vs00), viw00);403int32x4_t vsumh = vmull_s16(vget_high_s16(vs10), viw10);404405vsuml = vmlal_s16(vsuml, vget_low_s16(vs01), viw01);406vsumh = vmlal_s16(vsumh, vget_high_s16(vs11), viw11);407408vsuml = vmlal_s16(vsuml, vget_low_s16(vs10), viw10);409vsumh = vmlal_s16(vsumh, vget_high_s16(vs00), viw00);410411vsuml = vmlal_s16(vsuml, vget_low_s16(vs11), viw11);412vsumh = vmlal_s16(vsumh, vget_high_s16(vs01), viw01);413414int16x4_t vsumnl = vrshrn_n_s32(vsuml, W_BITS1-5);415int16x4_t vsumnh = vrshrn_n_s32(vsumh, W_BITS1-5);416417int16x8_t diff = vqsubq_s16(vcombine_s16(vsumnl, vsumnh), vI);418419int32x4_t vb1l = vmull_s16(vget_low_s16(diff), vget_low_s16(vDerivI.val[0]));420int32x4_t vb2h = vmull_s16(vget_high_s16(diff), vget_high_s16(vDerivI.val[1]));421int32x4_t vb1i = vmlal_s16(vb1l, vget_high_s16(diff), vget_high_s16(vDerivI.val[0]));422int32x4_t vb2i = vmlal_s16(vb2h, vget_low_s16(diff), vget_low_s16(vDerivI.val[1]));423424float32x4_t vb1f = vcvtq_f32_s32(vb1i);425float32x4_t vb2f = vcvtq_f32_s32(vb2i);426427vb1 = vaddq_f32(vb1, vb1f);428vb2 = vaddq_f32(vb2, vb2f);429}430431for( ; x < wwcn; x++, dIptr += 2 )432{433s32 diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +434Jptr[x+nextStride]*iw10 + Jptr[x+nextStride+cn]*iw11,435W_BITS1-5) - Iptr[x];436b1 += (f32)(diff*dIptr[0]);437b2 += (f32)(diff*dIptr[1]);438}439}440441f32 bbuf[2];442float32x2_t vb = vpadd_f32(vadd_f32(vget_low_f32(vb1), vget_high_f32(vb1)), vadd_f32(vget_low_f32(vb2), vget_high_f32(vb2)));443vst1_f32(bbuf, vb);444b1 += bbuf[0];445b2 += bbuf[1];446447b1 *= FLT_SCALE;448b2 *= FLT_SCALE;449450f32 deltaX = (f32)((A12*b2 - A22*b1) * D);451f32 deltaY = (f32)((A12*b1 - A11*b2) * D);452453nextPtX += deltaX;454nextPtY += deltaY;455nextPts[ptref+0] = nextPtX + halfWinX;456nextPts[ptref+1] = nextPtY + halfWinY;457458if( ((double)deltaX*deltaX + (double)deltaY*deltaY) <= terminationEpsilon )459break;460461if( j > 0 && std::abs(deltaX + prevDeltaX) < 0.01 &&462std::abs(deltaY + prevDeltaY) < 0.01 )463{464nextPts[ptref+0] -= deltaX*0.5f;465nextPts[ptref+1] -= deltaY*0.5f;466break;467}468prevDeltaX = deltaX;469prevDeltaY = deltaY;470}471472if( status && status[ptidx] && err && level == 0 && !getMinEigenVals )473{474f32 nextPointX = nextPts[ptref+0] - halfWinX;475f32 nextPointY = nextPts[ptref+1] - halfWinY;476477s32 inextPointX = floor(nextPointX);478s32 inextPointY = floor(nextPointY);479480if( inextPointX < -(s32)winSize.width || inextPointX >= (s32)size.width ||481inextPointY < -(s32)winSize.height || inextPointY >= (s32)size.height )482{483if( status )484status[ptidx] = false;485continue;486}487488f32 aa = nextPointX - inextPointX;489f32 bb = nextPointY - inextPointY;490iw00 = round((1.f - aa)*(1.f - bb)*(1 << W_BITS));491iw01 = round(aa*(1.f - bb)*(1 << W_BITS));492iw10 = round((1.f - aa)*bb*(1 << W_BITS));493iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;494f32 errval = 0.f;495496for(s32 y = 0; y < (s32)winSize.height; y++ )497{498const u8* Jptr = nextData + nextStride*(y + inextPointY) + inextPointX*cn;499const s16* Iptr = IWinBuf + y*IWinBufStride;500501for( x = 0; x < wwcn; x++ )502{503s32 diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +504Jptr[x+nextStride]*iw10 + Jptr[x+nextStride+cn]*iw11,505W_BITS1-5) - Iptr[x];506errval += std::abs((f32)diff);507}508}509err[ptidx] = errval / (32*wwcn*winSize.height);510}511}512#else513(void)size;514(void)cn;515(void)prevData;516(void)prevStride;517(void)prevDerivData;518(void)prevDerivStride;519(void)nextData;520(void)nextStride;521(void)prevPts;522(void)nextPts;523(void)status;524(void)err;525(void)winSize;526(void)terminationCount;527(void)terminationEpsilon;528(void)level;529(void)maxLevel;530(void)useInitialFlow;531(void)getMinEigenVals;532(void)minEigThreshold;533(void)ptCount;534#endif535}536537}//CAROTENE_NS538539540541