// Path: blob/master/3rdparty/carotene/src/channels_combine.cpp
// 16337 views
/*1* By downloading, copying, installing or using the software you agree to this license.2* If you do not agree to this license, do not download, install,3* copy or use the software.4*5*6* License Agreement7* For Open Source Computer Vision Library8* (3-clause BSD License)9*10* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.11* Third party copyrights are property of their respective owners.12*13* Redistribution and use in source and binary forms, with or without modification,14* are permitted provided that the following conditions are met:15*16* * Redistributions of source code must retain the above copyright notice,17* this list of conditions and the following disclaimer.18*19* * Redistributions in binary form must reproduce the above copyright notice,20* this list of conditions and the following disclaimer in the documentation21* and/or other materials provided with the distribution.22*23* * Neither the names of the copyright holders nor the names of the contributors24* may be used to endorse or promote products derived from this software25* without specific prior written permission.26*27* This software is provided by the copyright holders and contributors "as is" and28* any express or implied warranties, including, but not limited to, the implied29* warranties of merchantability and fitness for a particular purpose are disclaimed.30* In no event shall copyright holders or contributors be liable for any direct,31* indirect, incidental, special, exemplary, or consequential damages32* (including, but not limited to, procurement of substitute goods or services;33* loss of use, data, or profits; or business interruption) however caused34* and on any theory of liability, whether in contract, strict liability,35* or tort (including negligence or otherwise) arising in any way out of36* the use of this software, even if advised of the possibility of such damage.37*/3839#include "common.hpp"40#include "vtransform.hpp"4142namespace CAROTENE_NS 
{4344#define FILL_LINES2(macro,type) \45macro##_LINE(type,0) \46macro##_LINE(type,1)47#define FILL_LINES3(macro,type) \48FILL_LINES2(macro,type) \49macro##_LINE(type,2)50#define FILL_LINES4(macro,type) \51FILL_LINES3(macro,type) \52macro##_LINE(type,3)5354#define FARG_LINE(type, n) , const type * src##n##Base, ptrdiff_t src##n##Stride5556#ifdef CAROTENE_NEON5758#define VROW_LINE(type, n) const type * src##n = internal::getRowPtr(src##n##Base, src##n##Stride, i);59#define PREF_LINE(type, n) internal::prefetch(src##n + sj);60#define VLD1Q_LINE(type, n) v_dst.val[n] = vld1q_##type(src##n + sj);61#define PRLD_LINE(type, n) internal::prefetch(src##n + sj); v_dst.val[n] = vld1q_##type(src##n + sj);62#define VLD1_LINE(type, n) v_dst.val[n] = vld1_##type(src##n + sj);63#define SLD_LINE(type, n) dst[dj + n] = src##n[sj];6465#define MUL2(val) (val << 1)66#define MUL3(val) (MUL2(val) + val)67#define MUL4(val) (val << 2)6869#define CONTSRC2 dstStride == src0Stride && \70dstStride == src1Stride &&71#define CONTSRC3 dstStride == src0Stride && \72dstStride == src1Stride && \73dstStride == src2Stride &&74#define CONTSRC4 dstStride == src0Stride && \75dstStride == src1Stride && \76dstStride == src2Stride && \77dstStride == src3Stride &&7879#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)8081#define MERGE_ASM2(sgn, bits) __asm__ ( \82"vld1." #bits " {d0-d1}, [%[in0]] \n\t" \83"vld1." #bits " {d2-d3}, [%[in1]] \n\t" \84"vst2." #bits " {d0, d2}, [%[out0]] \n\t" \85"vst2." #bits " {d1, d3}, [%[out1]] \n\t" \86: \87: [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), \88[out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL2(8)/sizeof(sgn##bits)) \89: "d0","d1","d2","d3" \90);91#define MERGE_ASM3(sgn, bits) __asm__ ( \92"vld1." #bits " {d0-d1}, [%[in0]] \n\t" \93"vld1." #bits " {d2-d3}, [%[in1]] \n\t" \94"vld1." #bits " {d4-d5}, [%[in2]] \n\t" \95"vst3." #bits " {d0, d2, d4}, [%[out0]] \n\t" \96"vst3." 
#bits " {d1, d3, d5}, [%[out1]] \n\t" \97: \98: [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), [in2] "r" (src2 + sj), \99[out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL3(8)/sizeof(sgn##bits)) \100: "d0","d1","d2","d3","d4","d5" \101);102#define MERGE_ASM4(sgn, bits) __asm__ ( \103"vld1." #bits " {d0-d1}, [%[in0]] \n\t" \104"vld1." #bits " {d2-d3}, [%[in1]] \n\t" \105"vld1." #bits " {d4-d5}, [%[in2]] \n\t" \106"vld1." #bits " {d6-d7}, [%[in3]] \n\t" \107"vst4." #bits " {d0, d2, d4, d6}, [%[out0]] \n\t" \108"vst4." #bits " {d1, d3, d5, d7}, [%[out1]] \n\t" \109: \110: [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), [in2] "r" (src2 + sj), [in3] "r" (src3 + sj), \111[out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL4(8)/sizeof(sgn##bits)) \112: "d0","d1","d2","d3","d4","d5","d6","d7" \113);114115#define MERGE_QUAD(sgn, bits, n) { \116FILL_LINES##n(PREF, sgn##bits) \117MERGE_ASM##n(sgn, bits) \118}119120#else121122#define MERGE_QUAD(sgn, bits, n) { \123vec128 v_dst; \124/*FILL_LINES##n(PREF, sgn##bits) \125FILL_LINES##n(VLD1Q, sgn##bits)*/ \126FILL_LINES##n(PRLD, sgn##bits) \127vst##n##q_##sgn##bits(dst + dj, v_dst); \128}129130#endif131132#define COMBINE(sgn,bits,n) void combine##n(const Size2D &_size \133FILL_LINES##n(FARG, sgn##bits), \134sgn##bits * dstBase, ptrdiff_t dstStride) \135{ \136internal::assertSupportedConfiguration(); \137Size2D size(_size); \138if (CONTSRC##n \139dstStride == (ptrdiff_t)(size.width)) \140{ \141size.width *= size.height; \142size.height = 1; \143} \144typedef internal::VecTraits<sgn##bits, n>::vec128 vec128; \145size_t roiw16 = size.width >= (16/sizeof(sgn##bits) - 1) ? size.width - (16/sizeof(sgn##bits) - 1) : 0; \146typedef internal::VecTraits<sgn##bits, n>::vec64 vec64; \147size_t roiw8 = size.width >= (8/sizeof(sgn##bits) - 1) ? 
size.width - (8/sizeof(sgn##bits) - 1) : 0; \148\149for (size_t i = 0u; i < size.height; ++i) \150{ \151FILL_LINES##n(VROW, sgn##bits) \152sgn##bits * dst = internal::getRowPtr(dstBase, dstStride, i); \153size_t sj = 0u, dj = 0u; \154\155for (; sj < roiw16; sj += 16/sizeof(sgn##bits), dj += MUL##n(16)/sizeof(sgn##bits)) \156MERGE_QUAD(sgn, bits, n) \157\158if ( sj < roiw8 ) \159{ \160vec64 v_dst; \161FILL_LINES##n(VLD1, sgn##bits) \162vst##n##_##sgn##bits(dst + dj, v_dst); \163sj += 8/sizeof(sgn##bits); dj += MUL##n(8)/sizeof(sgn##bits); \164} \165\166for (; sj < size.width; ++sj, dj += n) \167{ \168FILL_LINES##n(SLD, sgn##bits) \169} \170} \171}172173#define COMBINE64(sgn,n) void combine##n(const Size2D &_size \174FILL_LINES##n(FARG, sgn##64), \175sgn##64 * dstBase, ptrdiff_t dstStride) \176{ \177internal::assertSupportedConfiguration(); \178Size2D size(_size); \179if (CONTSRC##n \180dstStride == (ptrdiff_t)(size.width)) \181{ \182size.width *= size.height; \183size.height = 1; \184} \185typedef internal::VecTraits<sgn##64, n>::vec64 vec64; \186\187for (size_t i = 0u; i < size.height; ++i) \188{ \189FILL_LINES##n(VROW, sgn##64) \190sgn##64 * dst = internal::getRowPtr(dstBase, dstStride, i); \191size_t sj = 0u, dj = 0u; \192\193for (; sj < size.width; ++sj, dj += n) \194{ \195vec64 v_dst; \196FILL_LINES##n(VLD1, sgn##64) \197vst##n##_##sgn##64(dst + dj, v_dst); \198/*FILL_LINES##n(SLD, sgn##64)*/ \199} \200} \201}202203#else204205#define VOID_LINE(type, n) (void)src##n##Base; (void)src##n##Stride;206207#define COMBINE(sgn,bits,n) void combine##n(const Size2D &size \208FILL_LINES##n(FARG, sgn##bits), \209sgn##bits * dstBase, ptrdiff_t dstStride) \210{ \211internal::assertSupportedConfiguration(); \212(void)size; \213FILL_LINES##n(VOID, sgn##bits) \214(void)dstBase; \215(void)dstStride; \216}217#define COMBINE64(sgn,n) COMBINE(sgn,64,n)218219#endif //CAROTENE_NEON220221COMBINE(u, 8,2)222COMBINE(u, 8,3)223COMBINE(u, 
8,4)224COMBINE(u,16,2)225COMBINE(u,16,3)226COMBINE(u,16,4)227COMBINE(s,32,2)228COMBINE(s,32,3)229COMBINE(s,32,4)230COMBINE64(s, 2)231COMBINE64(s, 3)232COMBINE64(s, 4)233234void combineYUYV(const Size2D &size,235const u8 * srcyBase, ptrdiff_t srcyStride,236const u8 * srcuBase, ptrdiff_t srcuStride,237const u8 * srcvBase, ptrdiff_t srcvStride,238u8 * dstBase, ptrdiff_t dstStride)239{240internal::assertSupportedConfiguration();241#ifdef CAROTENE_NEON242#ifndef __ANDROID__243size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;244#endif245size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;246247for (size_t i = 0u; i < size.height; i += 1)248{249const u8 * srcy = internal::getRowPtr(srcyBase, srcyStride, i);250const u8 * srcu = internal::getRowPtr(srcuBase, srcuStride, i);251const u8 * srcv = internal::getRowPtr(srcvBase, srcvStride, i);252u8 * dst = internal::getRowPtr(dstBase, dstStride, i);253size_t syj = 0u, sj = 0u, dj = 0u;254255#ifndef __ANDROID__256for (; sj < roiw32; sj += 32, syj += 64, dj += 128)257{258internal::prefetch(srcy + syj);259internal::prefetch(srcu + sj);260internal::prefetch(srcv + sj);261262uint8x16x2_t v_y = vld2q_u8(srcy + syj);263uint8x16x4_t v_dst;264v_dst.val[0] = v_y.val[0];265v_dst.val[1] = vld1q_u8(srcu + sj);266v_dst.val[2] = v_y.val[1];267v_dst.val[3] = vld1q_u8(srcv + sj);268vst4q_u8(dst + dj, v_dst);269270v_y = vld2q_u8(srcy + syj + 32);271v_dst.val[0] = v_y.val[0];272v_dst.val[1] = vld1q_u8(srcu + sj + 16);273v_dst.val[2] = v_y.val[1];274v_dst.val[3] = vld1q_u8(srcv + sj + 16);275vst4q_u8(dst + dj + 64, v_dst);276}277#endif278279for (; sj < roiw8; sj += 8, syj += 16, dj += 32)280{281uint8x8x2_t v_y = vld2_u8(srcy + syj);282uint8x8x4_t v_dst;283v_dst.val[0] = v_y.val[0];284v_dst.val[1] = vld1_u8(srcu + sj);285v_dst.val[2] = v_y.val[1];286v_dst.val[3] = vld1_u8(srcv + sj);287vst4_u8(dst + dj, v_dst);288}289290for (; sj < size.width; ++sj, syj += 2, dj += 4)291{292dst[dj] = srcy[syj];293dst[dj + 1] = srcu[sj];294dst[dj + 2] = 
srcy[syj + 1];295dst[dj + 3] = srcv[sj];296}297}298#else299(void)size;300(void)srcyBase;301(void)srcyStride;302(void)srcuBase;303(void)srcuStride;304(void)srcvBase;305(void)srcvStride;306(void)dstBase;307(void)dstStride;308#endif309}310311void combineUYVY(const Size2D &size,312const u8 * srcyBase, ptrdiff_t srcyStride,313const u8 * srcuBase, ptrdiff_t srcuStride,314const u8 * srcvBase, ptrdiff_t srcvStride,315u8 * dstBase, ptrdiff_t dstStride)316{317internal::assertSupportedConfiguration();318#ifdef CAROTENE_NEON319#ifndef __ANDROID__320size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;321#endif322size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;323324for (size_t i = 0u; i < size.height; ++i)325{326const u8 * srcy = internal::getRowPtr(srcyBase, srcyStride, i);327const u8 * srcu = internal::getRowPtr(srcuBase, srcuStride, i);328const u8 * srcv = internal::getRowPtr(srcvBase, srcvStride, i);329u8 * dst = internal::getRowPtr(dstBase, dstStride, i);330size_t syj = 0u, sj = 0u, dj = 0u;331332#ifndef __ANDROID__333for (; sj < roiw32; sj += 32, syj += 64, dj += 128)334{335internal::prefetch(srcy + syj);336internal::prefetch(srcu + sj);337internal::prefetch(srcv + sj);338339uint8x16x2_t v_y = vld2q_u8(srcy + syj);340uint8x16x4_t v_dst;341v_dst.val[0] = vld1q_u8(srcu + sj);342v_dst.val[1] = v_y.val[0];343v_dst.val[2] = vld1q_u8(srcv + sj);344v_dst.val[3] = v_y.val[1];345vst4q_u8(dst + dj, v_dst);346347v_y = vld2q_u8(srcy + syj + 32);348v_dst.val[0] = vld1q_u8(srcu + sj + 16);349v_dst.val[1] = v_y.val[0];350v_dst.val[2] = vld1q_u8(srcv + sj + 16);351v_dst.val[3] = v_y.val[1];352vst4q_u8(dst + dj + 64, v_dst);353}354#endif355356for (; sj < roiw8; sj += 8, syj += 16, dj += 32)357{358uint8x8x2_t v_y = vld2_u8(srcy + syj);359uint8x8x4_t v_dst;360v_dst.val[0] = vld1_u8(srcu + sj);361v_dst.val[1] = v_y.val[0];362v_dst.val[2] = vld1_u8(srcv + sj);363v_dst.val[3] = v_y.val[1];364vst4_u8(dst + dj, v_dst);365}366367for (; sj < size.width; ++sj, syj += 2, dj += 
4)368{369dst[dj] = srcu[sj];370dst[dj + 1] = srcy[syj];371dst[dj + 2] = srcv[sj];372dst[dj + 3] = srcy[syj + 1];373}374}375#else376(void)size;377(void)srcyBase;378(void)srcyStride;379(void)srcuBase;380(void)srcuStride;381(void)srcvBase;382(void)srcvStride;383(void)dstBase;384(void)dstStride;385#endif386}387388} // namespace CAROTENE_NS389390391