// Path: modules/calib3d/src/undistort.avx2.cpp
/*M///////////////////////////////////////////////////////////////////////////////////////1//2// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.3//4// By downloading, copying, installing or using the software you agree to this license.5// If you do not agree to this license, do not download, install,6// copy or use the software.7//8//9// License Agreement10// For Open Source Computer Vision Library11//12// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.13// Copyright (C) 2009, Willow Garage Inc., all rights reserved.14// Third party copyrights are property of their respective owners.15//16// Redistribution and use in source and binary forms, with or without modification,17// are permitted provided that the following conditions are met:18//19// * Redistribution's of source code must retain the above copyright notice,20// this list of conditions and the following disclaimer.21//22// * Redistribution's in binary form must reproduce the above copyright notice,23// this list of conditions and the following disclaimer in the documentation24// and/or other materials provided with the distribution.25//26// * The name of the copyright holders may not be used to endorse or promote products27// derived from this software without specific prior written permission.28//29// This software is provided by the copyright holders and contributors "as is" and30// any express or implied warranties, including, but not limited to, the implied31// warranties of merchantability and fitness for a particular purpose are disclaimed.32// In no event shall the Intel Corporation or contributors be liable for any direct,33// indirect, incidental, special, exemplary, or consequential damages34// (including, but not limited to, procurement of substitute goods or services;35// loss of use, data, or profits; or business interruption) however caused36// and on any theory of liability, whether in contract, strict liability,37// or tort (including negligence or 
otherwise) arising in any way out of38// the use of this software, even if advised of the possibility of such damage.39//40//M*/4142#include "precomp.hpp"43#include "undistort.hpp"4445namespace cv46{4748int initUndistortRectifyMapLine_AVX(float* m1f, float* m2f, short* m1, ushort* m2, double* matTilt, const double* ir,49double& _x, double& _y, double& _w, int width, int m1type,50double k1, double k2, double k3, double k4, double k5, double k6,51double p1, double p2, double s1, double s2, double s3, double s4,52double u0, double v0, double fx, double fy)53{54int j = 0;5556static const __m256d __one = _mm256_set1_pd(1.0);57static const __m256d __two = _mm256_set1_pd(2.0);5859const __m256d __matTilt_00 = _mm256_set1_pd(matTilt[0]);60const __m256d __matTilt_10 = _mm256_set1_pd(matTilt[3]);61const __m256d __matTilt_20 = _mm256_set1_pd(matTilt[6]);6263const __m256d __matTilt_01 = _mm256_set1_pd(matTilt[1]);64const __m256d __matTilt_11 = _mm256_set1_pd(matTilt[4]);65const __m256d __matTilt_21 = _mm256_set1_pd(matTilt[7]);6667const __m256d __matTilt_02 = _mm256_set1_pd(matTilt[2]);68const __m256d __matTilt_12 = _mm256_set1_pd(matTilt[5]);69const __m256d __matTilt_22 = _mm256_set1_pd(matTilt[8]);7071for (; j <= width - 4; j += 4, _x += 4 * ir[0], _y += 4 * ir[3], _w += 4 * ir[6])72{73// Question: Should we load the constants first?74__m256d __w = _mm256_div_pd(__one, _mm256_set_pd(_w + 3 * ir[6], _w + 2 * ir[6], _w + ir[6], _w));75__m256d __x = _mm256_mul_pd(_mm256_set_pd(_x + 3 * ir[0], _x + 2 * ir[0], _x + ir[0], _x), __w);76__m256d __y = _mm256_mul_pd(_mm256_set_pd(_y + 3 * ir[3], _y + 2 * ir[3], _y + ir[3], _y), __w);77__m256d __x2 = _mm256_mul_pd(__x, __x);78__m256d __y2 = _mm256_mul_pd(__y, __y);79__m256d __r2 = _mm256_add_pd(__x2, __y2);80__m256d __2xy = _mm256_mul_pd(__two, _mm256_mul_pd(__x, __y));81__m256d __kr = _mm256_div_pd(82#if CV_FMA383_mm256_fmadd_pd(_mm256_fmadd_pd(_mm256_fmadd_pd(_mm256_set1_pd(k3), __r2, _mm256_set1_pd(k2)), __r2, _mm256_set1_pd(k1)), 
__r2, __one),84_mm256_fmadd_pd(_mm256_fmadd_pd(_mm256_fmadd_pd(_mm256_set1_pd(k6), __r2, _mm256_set1_pd(k5)), __r2, _mm256_set1_pd(k4)), __r2, __one)85#else86_mm256_add_pd(__one, _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set1_pd(k3), __r2), _mm256_set1_pd(k2)), __r2), _mm256_set1_pd(k1)), __r2)),87_mm256_add_pd(__one, _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_set1_pd(k6), __r2), _mm256_set1_pd(k5)), __r2), _mm256_set1_pd(k4)), __r2))88#endif89);90__m256d __r22 = _mm256_mul_pd(__r2, __r2);91#if CV_FMA392__m256d __xd = _mm256_fmadd_pd(__x, __kr,93_mm256_add_pd(94_mm256_fmadd_pd(_mm256_set1_pd(p1), __2xy, _mm256_mul_pd(_mm256_set1_pd(p2), _mm256_fmadd_pd(__two, __x2, __r2))),95_mm256_fmadd_pd(_mm256_set1_pd(s1), __r2, _mm256_mul_pd(_mm256_set1_pd(s2), __r22))));96__m256d __yd = _mm256_fmadd_pd(__y, __kr,97_mm256_add_pd(98_mm256_fmadd_pd(_mm256_set1_pd(p1), _mm256_fmadd_pd(__two, __y2, __r2), _mm256_mul_pd(_mm256_set1_pd(p2), __2xy)),99_mm256_fmadd_pd(_mm256_set1_pd(s3), __r2, _mm256_mul_pd(_mm256_set1_pd(s4), __r22))));100101__m256d __vecTilt2 = _mm256_fmadd_pd(__matTilt_20, __xd, _mm256_fmadd_pd(__matTilt_21, __yd, __matTilt_22));102#else103__m256d __xd = _mm256_add_pd(104_mm256_mul_pd(__x, __kr),105_mm256_add_pd(106_mm256_add_pd(107_mm256_mul_pd(_mm256_set1_pd(p1), __2xy),108_mm256_mul_pd(_mm256_set1_pd(p2), _mm256_add_pd(__r2, _mm256_mul_pd(__two, __x2)))),109_mm256_add_pd(110_mm256_mul_pd(_mm256_set1_pd(s1), __r2),111_mm256_mul_pd(_mm256_set1_pd(s2), __r22))));112__m256d __yd = _mm256_add_pd(113_mm256_mul_pd(__y, __kr),114_mm256_add_pd(115_mm256_add_pd(116_mm256_mul_pd(_mm256_set1_pd(p1), _mm256_add_pd(__r2, _mm256_mul_pd(__two, __y2))),117_mm256_mul_pd(_mm256_set1_pd(p2), __2xy)),118_mm256_add_pd(119_mm256_mul_pd(_mm256_set1_pd(s3), __r2),120_mm256_mul_pd(_mm256_set1_pd(s4), __r22))));121122__m256d __vecTilt2 = _mm256_add_pd(_mm256_add_pd(123_mm256_mul_pd(__matTilt_20, __xd), 
_mm256_mul_pd(__matTilt_21, __yd)), __matTilt_22);124#endif125__m256d __invProj = _mm256_blendv_pd(126__one, _mm256_div_pd(__one, __vecTilt2),127_mm256_cmp_pd(__vecTilt2, _mm256_setzero_pd(), _CMP_EQ_OQ));128129#if CV_FMA3130__m256d __u = _mm256_fmadd_pd(__matTilt_00, __xd, _mm256_fmadd_pd(__matTilt_01, __yd, __matTilt_02));131__u = _mm256_fmadd_pd(_mm256_mul_pd(_mm256_set1_pd(fx), __invProj), __u, _mm256_set1_pd(u0));132133__m256d __v = _mm256_fmadd_pd(__matTilt_10, __xd, _mm256_fmadd_pd(__matTilt_11, __yd, __matTilt_12));134__v = _mm256_fmadd_pd(_mm256_mul_pd(_mm256_set1_pd(fy), __invProj), __v, _mm256_set1_pd(v0));135#else136__m256d __u = _mm256_add_pd(_mm256_add_pd(137_mm256_mul_pd(__matTilt_00, __xd), _mm256_mul_pd(__matTilt_01, __yd)), __matTilt_02);138__u = _mm256_add_pd(_mm256_mul_pd(_mm256_mul_pd(_mm256_set1_pd(fx), __invProj), __u), _mm256_set1_pd(u0));139140__m256d __v = _mm256_add_pd(_mm256_add_pd(141_mm256_mul_pd(__matTilt_10, __xd), _mm256_mul_pd(__matTilt_11, __yd)), __matTilt_12);142__v = _mm256_add_pd(_mm256_mul_pd(_mm256_mul_pd(_mm256_set1_pd(fy), __invProj), __v), _mm256_set1_pd(v0));143#endif144145if (m1type == CV_32FC1)146{147_mm_storeu_ps(&m1f[j], _mm256_cvtpd_ps(__u));148_mm_storeu_ps(&m2f[j], _mm256_cvtpd_ps(__v));149}150else if (m1type == CV_32FC2)151{152__m128 __u_float = _mm256_cvtpd_ps(__u);153__m128 __v_float = _mm256_cvtpd_ps(__v);154155_mm_storeu_ps(&m1f[j * 2], _mm_unpacklo_ps(__u_float, __v_float));156_mm_storeu_ps(&m1f[j * 2 + 4], _mm_unpackhi_ps(__u_float, __v_float));157}158else // m1type == CV_16SC2159{160__u = _mm256_mul_pd(__u, _mm256_set1_pd(INTER_TAB_SIZE));161__v = _mm256_mul_pd(__v, _mm256_set1_pd(INTER_TAB_SIZE));162163__m128i __iu = _mm256_cvtpd_epi32(__u);164__m128i __iv = _mm256_cvtpd_epi32(__v);165166static const __m128i __INTER_TAB_SIZE_m1 = _mm_set1_epi32(INTER_TAB_SIZE - 1);167__m128i __m2 = _mm_add_epi32(168_mm_mullo_epi32(_mm_and_si128(__iv, __INTER_TAB_SIZE_m1), 
_mm_set1_epi32(INTER_TAB_SIZE)),169_mm_and_si128(__iu, __INTER_TAB_SIZE_m1));170__m2 = _mm_packus_epi32(__m2, __m2);171_mm_maskstore_epi64((long long int*) &m2[j], _mm_set_epi32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF), __m2);172173// gcc4.9 does not support _mm256_set_m128174// __m256i __m1 = _mm256_set_m128i(__iv, __iu);175__m256i __m1 = _mm256_setzero_si256();176__m1 = _mm256_inserti128_si256(__m1, __iu, 0);177__m1 = _mm256_inserti128_si256(__m1, __iv, 1);178__m1 = _mm256_srai_epi32(__m1, INTER_BITS); // v3 v2 v1 v0 u3 u2 u1 u0 (int32_t)179static const __m256i __permute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);180__m1 = _mm256_permutevar8x32_epi32(__m1, __permute_mask); // v3 u3 v2 u2 v1 u1 v0 u0 (int32_t)181__m1 = _mm256_packs_epi32(__m1, __m1); // x x x x v3 u3 v2 u2 x x x x v1 u1 v0 u0 (int16_t)182_mm_storeu_si128((__m128i*) &m1[j * 2], _mm256_extracti128_si256(_mm256_permute4x64_epi64(__m1, (2 << 2) + 0), 0));183}184}185186_mm256_zeroupper();187188return j;189}190191}192193/* End of file */194195196