// Path: blob/master/modules/imgproc/src/imgwarp.sse4_1.cpp
// 16354 views
/*M///////////////////////////////////////////////////////////////////////////////////////1//2// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.3//4// By downloading, copying, installing or using the software you agree to this license.5// If you do not agree to this license, do not download, install,6// copy or use the software.7//8//9// License Agreement10// For Open Source Computer Vision Library11//12// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.13// Copyright (C) 2009, Willow Garage Inc., all rights reserved.14// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.15// Third party copyrights are property of their respective owners.16//17// Redistribution and use in source and binary forms, with or without modification,18// are permitted provided that the following conditions are met:19//20// * Redistribution's of source code must retain the above copyright notice,21// this list of conditions and the following disclaimer.22//23// * Redistribution's in binary form must reproduce the above copyright notice,24// this list of conditions and the following disclaimer in the documentation25// and/or other materials provided with the distribution.26//27// * The name of the copyright holders may not be used to endorse or promote products28// derived from this software without specific prior written permission.29//30// This software is provided by the copyright holders and contributors "as is" and31// any express or implied warranties, including, but not limited to, the implied32// warranties of merchantability and fitness for a particular purpose are disclaimed.33// In no event shall the Intel Corporation or contributors be liable for any direct,34// indirect, incidental, special, exemplary, or consequential damages35// (including, but not limited to, procurement of substitute goods or services;36// loss of use, data, or profits; or business interruption) however caused37// and on any theory of liability, whether in contract, 
strict liability,38// or tort (including negligence or otherwise) arising in any way out of39// the use of this software, even if advised of the possibility of such damage.40//41//M*/4243/* ////////////////////////////////////////////////////////////////////44//45// Geometrical transforms on images and matrices: rotation, zoom etc.46//47// */4849#include "precomp.hpp"50#include "imgwarp.hpp"5152namespace cv53{54namespace opt_SSE4_155{5657void convertMaps_nninterpolate32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, int width)58{59int x = 0;60for (; x <= width - 16; x += 16)61{62__m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)),63_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4)));64__m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 8)),65_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 12)));6667__m128i v_dst2 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x)),68_mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 4)));69__m128i v_dst3 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 8)),70_mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 12)));7172_mm_interleave_epi16(v_dst0, v_dst1, v_dst2, v_dst3);7374_mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst0);75_mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst1);76_mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst2);77_mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst3);78}7980for (; x < width; x++)81{82dst1[x * 2] = saturate_cast<short>(src1f[x]);83dst1[x * 2 + 1] = saturate_cast<short>(src2f[x]);84}85}8687void convertMaps_32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, ushort* dst2, int width)88{89int x = 0;90__m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);91__m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE - 1);9293for (; x <= width - 16; x += 16)94{95__m128i v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x), v_its));96__m128i v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 4), v_its));97__m128i v_iy0 = 
_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x), v_its));98__m128i v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 4), v_its));99100__m128i v_dst10 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS),101_mm_srai_epi32(v_ix1, INTER_BITS));102__m128i v_dst12 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS),103_mm_srai_epi32(v_iy1, INTER_BITS));104__m128i v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS),105_mm_and_si128(v_ix0, v_its1));106__m128i v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS),107_mm_and_si128(v_ix1, v_its1));108_mm_storeu_si128((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst20, v_dst21));109110v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 8), v_its));111v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 12), v_its));112v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 8), v_its));113v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 12), v_its));114115__m128i v_dst11 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS),116_mm_srai_epi32(v_ix1, INTER_BITS));117__m128i v_dst13 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS),118_mm_srai_epi32(v_iy1, INTER_BITS));119v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS),120_mm_and_si128(v_ix0, v_its1));121v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS),122_mm_and_si128(v_ix1, v_its1));123_mm_storeu_si128((__m128i *)(dst2 + x + 8), _mm_packus_epi32(v_dst20, v_dst21));124125_mm_interleave_epi16(v_dst10, v_dst11, v_dst12, v_dst13);126127_mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst10);128_mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst11);129_mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst12);130_mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst13);131}132for (; x < width; x++)133{134int ix = saturate_cast<int>(src1f[x] * INTER_TAB_SIZE);135int iy = saturate_cast<int>(src2f[x] * INTER_TAB_SIZE);136dst1[x * 
2] = saturate_cast<short>(ix >> INTER_BITS);137dst1[x * 2 + 1] = saturate_cast<short>(iy >> INTER_BITS);138dst2[x] = (ushort)((iy & (INTER_TAB_SIZE - 1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE - 1)));139}140}141142void convertMaps_32f2c16s_SSE41(const float* src1f, short* dst1, ushort* dst2, int width)143{144int x = 0;145__m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);146__m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE - 1);147__m128i v_y_mask = _mm_set1_epi32((INTER_TAB_SIZE - 1) << 16);148149for (; x <= width - 4; x += 4)150{151__m128i v_src0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2), v_its));152__m128i v_src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2 + 4), v_its));153154__m128i v_dst1 = _mm_packs_epi32(_mm_srai_epi32(v_src0, INTER_BITS),155_mm_srai_epi32(v_src1, INTER_BITS));156_mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst1);157158// x0 y0 x1 y1 . . .159v_src0 = _mm_packs_epi32(_mm_and_si128(v_src0, v_its1),160_mm_and_si128(v_src1, v_its1));161__m128i v_dst2 = _mm_or_si128(_mm_srli_epi32(_mm_and_si128(v_src0, v_y_mask), 16 - INTER_BITS), // y0 0 y1 0 . . .162_mm_and_si128(v_src0, v_its1)); // 0 x0 0 x1 . . 
.163_mm_storel_epi64((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst2, v_dst2));164}165for (; x < width; x++)166{167int ix = saturate_cast<int>(src1f[x * 2] * INTER_TAB_SIZE);168int iy = saturate_cast<int>(src1f[x * 2 + 1] * INTER_TAB_SIZE);169dst1[x * 2] = saturate_cast<short>(ix >> INTER_BITS);170dst1[x * 2 + 1] = saturate_cast<short>(iy >> INTER_BITS);171dst2[x] = (ushort)((iy & (INTER_TAB_SIZE - 1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE - 1)));172}173}174175void WarpAffineInvoker_Blockline_SSE41(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw)176{177const int AB_BITS = MAX(10, (int)INTER_BITS);178int x1 = 0;179180__m128i v_X0 = _mm_set1_epi32(X0);181__m128i v_Y0 = _mm_set1_epi32(Y0);182for (; x1 <= bw - 16; x1 += 16)183{184__m128i v_x0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1))), AB_BITS),185_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1 + 4))), AB_BITS));186__m128i v_x1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1 + 8))), AB_BITS),187_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1 + 12))), AB_BITS));188189__m128i v_y0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1))), AB_BITS),190_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1 + 4))), AB_BITS));191__m128i v_y1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1 + 8))), AB_BITS),192_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1 + 12))), AB_BITS));193194_mm_interleave_epi16(v_x0, v_x1, v_y0, v_y1);195196_mm_storeu_si128((__m128i *)(xy + x1 * 2), v_x0);197_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_x1);198_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_y0);199_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_y1);200}201for (; x1 < bw; x1++)202{203int X = 
(X0 + adelta[x1]) >> AB_BITS;204int Y = (Y0 + bdelta[x1]) >> AB_BITS;205xy[x1 * 2] = saturate_cast<short>(X);206xy[x1 * 2 + 1] = saturate_cast<short>(Y);207}208}209210211class WarpPerspectiveLine_SSE4_Impl CV_FINAL : public WarpPerspectiveLine_SSE4212{213public:214WarpPerspectiveLine_SSE4_Impl(const double *M)215{216CV_UNUSED(M);217}218virtual void processNN(const double *M, short* xy, double X0, double Y0, double W0, int bw) CV_OVERRIDE219{220const __m128d v_M0 = _mm_set1_pd(M[0]);221const __m128d v_M3 = _mm_set1_pd(M[3]);222const __m128d v_M6 = _mm_set1_pd(M[6]);223const __m128d v_intmax = _mm_set1_pd((double)INT_MAX);224const __m128d v_intmin = _mm_set1_pd((double)INT_MIN);225const __m128d v_2 = _mm_set1_pd(2);226const __m128d v_zero = _mm_setzero_pd();227const __m128d v_1 = _mm_set1_pd(1);228229int x1 = 0;230__m128d v_X0d = _mm_set1_pd(X0);231__m128d v_Y0d = _mm_set1_pd(Y0);232__m128d v_W0 = _mm_set1_pd(W0);233__m128d v_x1 = _mm_set_pd(1, 0);234235for (; x1 <= bw - 16; x1 += 16)236{237// 0-3238__m128i v_X0, v_Y0;239{240__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);241v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));242__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));243__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));244v_x1 = _mm_add_pd(v_x1, v_2);245246v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);247v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));248__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));249__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));250v_x1 = _mm_add_pd(v_x1, v_2);251252v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),253_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));254v_Y0 = 
_mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),255_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));256}257258// 4-8259__m128i v_X1, v_Y1;260{261__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);262v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));263__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));264__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));265v_x1 = _mm_add_pd(v_x1, v_2);266267v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);268v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));269__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));270__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));271v_x1 = _mm_add_pd(v_x1, v_2);272273v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),274_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));275v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),276_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));277}278279// 8-11280__m128i v_X2, v_Y2;281{282__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);283v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));284__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));285__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));286v_x1 = _mm_add_pd(v_x1, v_2);287288v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);289v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));290__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));291__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, 
_mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));292v_x1 = _mm_add_pd(v_x1, v_2);293294v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),295_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));296v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),297_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));298}299300// 12-15301__m128i v_X3, v_Y3;302{303__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);304v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));305__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));306__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));307v_x1 = _mm_add_pd(v_x1, v_2);308309v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);310v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));311__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));312__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));313v_x1 = _mm_add_pd(v_x1, v_2);314315v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),316_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));317v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),318_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));319}320321// convert to 16s322v_X0 = _mm_packs_epi32(v_X0, v_X1);323v_X1 = _mm_packs_epi32(v_X2, v_X3);324v_Y0 = _mm_packs_epi32(v_Y0, v_Y1);325v_Y1 = _mm_packs_epi32(v_Y2, v_Y3);326327_mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1);328329_mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0);330_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1);331_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0);332_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1);333}334335for (; x1 < bw; x1++)336{337double W = W0 + M[6] * x1;338W = W ? 1. 
/ W : 0;339double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1)*W));340double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1)*W));341int X = saturate_cast<int>(fX);342int Y = saturate_cast<int>(fY);343344xy[x1 * 2] = saturate_cast<short>(X);345xy[x1 * 2 + 1] = saturate_cast<short>(Y);346}347}348virtual void process(const double *M, short* xy, short* alpha, double X0, double Y0, double W0, int bw) CV_OVERRIDE349{350const __m128d v_M0 = _mm_set1_pd(M[0]);351const __m128d v_M3 = _mm_set1_pd(M[3]);352const __m128d v_M6 = _mm_set1_pd(M[6]);353const __m128d v_intmax = _mm_set1_pd((double)INT_MAX);354const __m128d v_intmin = _mm_set1_pd((double)INT_MIN);355const __m128d v_2 = _mm_set1_pd(2);356const __m128d v_zero = _mm_setzero_pd();357const __m128d v_its = _mm_set1_pd(INTER_TAB_SIZE);358const __m128i v_itsi1 = _mm_set1_epi32(INTER_TAB_SIZE - 1);359360int x1 = 0;361362__m128d v_X0d = _mm_set1_pd(X0);363__m128d v_Y0d = _mm_set1_pd(Y0);364__m128d v_W0 = _mm_set1_pd(W0);365__m128d v_x1 = _mm_set_pd(1, 0);366367for (; x1 <= bw - 16; x1 += 16)368{369// 0-3370__m128i v_X0, v_Y0;371{372__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);373v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));374__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));375__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));376v_x1 = _mm_add_pd(v_x1, v_2);377378v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);379v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));380__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));381__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));382v_x1 = _mm_add_pd(v_x1, v_2);383384v_X0 = 
_mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),385_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));386v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),387_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));388}389390// 4-8391__m128i v_X1, v_Y1;392{393__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);394v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));395__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));396__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));397v_x1 = _mm_add_pd(v_x1, v_2);398399v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);400v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));401__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));402__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));403v_x1 = _mm_add_pd(v_x1, v_2);404405v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),406_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));407v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),408_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));409}410411// 8-11412__m128i v_X2, v_Y2;413{414__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);415v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));416__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));417__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));418v_x1 = _mm_add_pd(v_x1, v_2);419420v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);421v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));422__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, 
_mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));423__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));424v_x1 = _mm_add_pd(v_x1, v_2);425426v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),427_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));428v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),429_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));430}431432// 12-15433__m128i v_X3, v_Y3;434{435__m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);436v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));437__m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));438__m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));439v_x1 = _mm_add_pd(v_x1, v_2);440441v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);442v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));443__m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));444__m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));445v_x1 = _mm_add_pd(v_x1, v_2);446447v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),448_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));449v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),450_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));451}452453// store alpha454__m128i v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y0, v_itsi1), INTER_BITS),455_mm_and_si128(v_X0, v_itsi1));456__m128i v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y1, v_itsi1), INTER_BITS),457_mm_and_si128(v_X1, v_itsi1));458_mm_storeu_si128((__m128i *)(alpha + x1), _mm_packs_epi32(v_alpha0, v_alpha1));459460v_alpha0 = 
_mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y2, v_itsi1), INTER_BITS),461_mm_and_si128(v_X2, v_itsi1));462v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y3, v_itsi1), INTER_BITS),463_mm_and_si128(v_X3, v_itsi1));464_mm_storeu_si128((__m128i *)(alpha + x1 + 8), _mm_packs_epi32(v_alpha0, v_alpha1));465466// convert to 16s467v_X0 = _mm_packs_epi32(_mm_srai_epi32(v_X0, INTER_BITS), _mm_srai_epi32(v_X1, INTER_BITS));468v_X1 = _mm_packs_epi32(_mm_srai_epi32(v_X2, INTER_BITS), _mm_srai_epi32(v_X3, INTER_BITS));469v_Y0 = _mm_packs_epi32(_mm_srai_epi32(v_Y0, INTER_BITS), _mm_srai_epi32(v_Y1, INTER_BITS));470v_Y1 = _mm_packs_epi32(_mm_srai_epi32(v_Y2, INTER_BITS), _mm_srai_epi32(v_Y3, INTER_BITS));471472_mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1);473474_mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0);475_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1);476_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0);477_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1);478}479for (; x1 < bw; x1++)480{481double W = W0 + M[6] * x1;482W = W ? INTER_TAB_SIZE / W : 0;483double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1)*W));484double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1)*W));485int X = saturate_cast<int>(fX);486int Y = saturate_cast<int>(fY);487488xy[x1 * 2] = saturate_cast<short>(X >> INTER_BITS);489xy[x1 * 2 + 1] = saturate_cast<short>(Y >> INTER_BITS);490alpha[x1] = (short)((Y & (INTER_TAB_SIZE - 1))*INTER_TAB_SIZE +491(X & (INTER_TAB_SIZE - 1)));492}493}494virtual ~WarpPerspectiveLine_SSE4_Impl() CV_OVERRIDE {};495};496497Ptr<WarpPerspectiveLine_SSE4> WarpPerspectiveLine_SSE4::getImpl(const double *M)498{499return Ptr<WarpPerspectiveLine_SSE4>(new WarpPerspectiveLine_SSE4_Impl(M));500}501502}503}504/* End of file. */505506507