// Path: blob/master/3rdparty/carotene/src/channel_extract.cpp
// 16337 views
/*1* By downloading, copying, installing or using the software you agree to this license.2* If you do not agree to this license, do not download, install,3* copy or use the software.4*5*6* License Agreement7* For Open Source Computer Vision Library8* (3-clause BSD License)9*10* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.11* Third party copyrights are property of their respective owners.12*13* Redistribution and use in source and binary forms, with or without modification,14* are permitted provided that the following conditions are met:15*16* * Redistributions of source code must retain the above copyright notice,17* this list of conditions and the following disclaimer.18*19* * Redistributions in binary form must reproduce the above copyright notice,20* this list of conditions and the following disclaimer in the documentation21* and/or other materials provided with the distribution.22*23* * Neither the names of the copyright holders nor the names of the contributors24* may be used to endorse or promote products derived from this software25* without specific prior written permission.26*27* This software is provided by the copyright holders and contributors "as is" and28* any express or implied warranties, including, but not limited to, the implied29* warranties of merchantability and fitness for a particular purpose are disclaimed.30* In no event shall copyright holders or contributors be liable for any direct,31* indirect, incidental, special, exemplary, or consequential damages32* (including, but not limited to, procurement of substitute goods or services;33* loss of use, data, or profits; or business interruption) however caused34* and on any theory of liability, whether in contract, strict liability,35* or tort (including negligence or otherwise) arising in any way out of36* the use of this software, even if advised of the possibility of such damage.37*/3839#include "common.hpp"40#include "vtransform.hpp"4142namespace CAROTENE_NS {4344void 
extract2(const Size2D &size,45const u8 * srcBase, ptrdiff_t srcStride,46u8 * dstBase, ptrdiff_t dstStride,47u32 coi)48{49internal::assertSupportedConfiguration();50#ifdef CAROTENE_NEON51#ifndef __ANDROID__52size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;53#endif54size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;5556for (size_t i = 0u; i < size.height; ++i)57{58const u8 * src = internal::getRowPtr(srcBase, srcStride, i);59u8 * dst = internal::getRowPtr(dstBase, dstStride, i);60size_t sj = 0u, dj = 0u;6162#ifndef __ANDROID__63for (; dj < roiw32; sj += 64, dj += 32)64{65internal::prefetch(src + sj);6667uint8x16x2_t v_src = vld2q_u8(src + sj);68vst1q_u8(dst + dj, v_src.val[coi]);6970v_src = vld2q_u8(src + sj + 32);71vst1q_u8(dst + dj + 16, v_src.val[coi]);72}73#endif7475for (; dj < roiw8; sj += 16, dj += 8)76{77uint8x8x2_t v_src = vld2_u8(src + sj);78vst1_u8(dst + dj, v_src.val[coi]);79}8081for (; dj < size.width; sj += 2, ++dj)82{83dst[dj] = src[sj + coi];84}85}86#else87(void)size;88(void)srcBase;89(void)srcStride;90(void)dstBase;91(void)dstStride;92(void)coi;93#endif94}9596void extract3(const Size2D &size,97const u8 * srcBase, ptrdiff_t srcStride,98u8 * dstBase, ptrdiff_t dstStride,99u32 coi)100{101internal::assertSupportedConfiguration();102#ifdef CAROTENE_NEON103#ifndef __ANDROID__104size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;105#endif106size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0;107108for (size_t i = 0u; i < size.height; ++i)109{110const u8 * src = internal::getRowPtr(srcBase, srcStride, i);111u8 * dst = internal::getRowPtr(dstBase, dstStride, i);112size_t sj = 0u, dj = 0u;113114#ifndef __ANDROID__115for (; dj < roiw32; sj += 96, dj += 32)116{117internal::prefetch(src + sj);118119uint8x16x3_t v_src = vld3q_u8(src + sj);120vst1q_u8(dst + dj, v_src.val[coi]);121122v_src = vld3q_u8(src + sj + 48);123vst1q_u8(dst + dj + 16, v_src.val[coi]);124}125#endif126127for (; dj < roiw8; sj += 24, dj += 8)128{129uint8x8x3_t v_src = vld3_u8(src + sj);130vst1_u8(dst + dj, v_src.val[coi]);131}132133for (; dj < size.width; sj += 3, ++dj)134{135dst[dj] = src[sj + coi];136}137}138#else139(void)size;140(void)srcBase;141(void)srcStride;142(void)dstBase;143(void)dstStride;144(void)coi;145#endif146}147148void extract4(const Size2D &size,149const u8 * srcBase, ptrdiff_t srcStride,150u8 * dstBase, ptrdiff_t dstStride,151u32 coi)152{153internal::assertSupportedConfiguration();154#ifdef CAROTENE_NEON155#ifndef __ANDROID__156size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;157#endif158size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0;159160for (size_t i = 0u; i < size.height; ++i)161{162const u8 * src = internal::getRowPtr(srcBase, srcStride, i);163u8 * dst = internal::getRowPtr(dstBase, dstStride, i);164size_t sj = 0u, dj = 0u;165166#ifndef __ANDROID__167for (; dj < roiw32; sj += 128, dj += 32)168{169internal::prefetch(src + sj);170171uint8x16x4_t v_src = vld4q_u8(src + sj);172vst1q_u8(dst + dj, v_src.val[coi]);173174v_src = vld4q_u8(src + sj + 64);175vst1q_u8(dst + dj + 16, v_src.val[coi]);176}177#endif178179for (; dj < roiw8; sj += 32, dj += 8)180{181uint8x8x4_t v_src = vld4_u8(src + sj);182vst1_u8(dst + dj, v_src.val[coi]);183}184185for (; dj < size.width; sj += 4, ++dj)186{187dst[dj] = src[sj + coi];188}189}190#else191(void)size;192(void)srcBase;193(void)srcStride;194(void)dstBase;195(void)dstStride;196(void)coi;197#endif198}199200#define FILL_LINES2(macro,type) \201macro##_LINE(type,0) \202macro##_LINE(type,1)203#define FILL_LINES3(macro,type) \204FILL_LINES2(macro,type) \205macro##_LINE(type,2)206#define FILL_LINES4(macro,type) \207FILL_LINES3(macro,type) \208macro##_LINE(type,3)209210#define FARG_LINE(type, n) , type * dst##n##Base, ptrdiff_t dst##n##Stride211212#ifdef CAROTENE_NEON213214#define VROW_LINE(type, n) type * dst##n = internal::getRowPtr(dst##n##Base, dst##n##Stride, i);215#define VST1Q_LINE(type, n) vst1q_##type(dst##n + dj, v_src.val[n]);216#define VST1_LINE(type, n) vst1_##type(dst##n + dj, v_src.val[n]);217#define SST_LINE(type, n) dst##n[dj] = src[sj + n];218219#define MUL2(val) (val << 1)220#define MUL3(val) (MUL2(val) + val)221#define MUL4(val) (val << 2)222223#define CONTDST2 srcStride == dst0Stride && \224srcStride == dst1Stride &&225#define CONTDST3 srcStride == dst0Stride && \226srcStride == dst1Stride && \227srcStride == dst2Stride &&228#define CONTDST4 srcStride == dst0Stride && \229srcStride == dst1Stride && \230srcStride == dst2Stride && \231srcStride == dst3Stride &&232233#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && 
__GNUC_MINOR__ < 7 && !defined(__clang__)234235#define SPLIT_ASM2(sgn, bits) __asm__ ( \236"vld2." #bits " {d0, d2}, [%[in0]] \n\t" \237"vld2." #bits " {d1, d3}, [%[in1]] \n\t" \238"vst1." #bits " {d0-d1}, [%[out0]] \n\t" \239"vst1." #bits " {d2-d3}, [%[out1]] \n\t" \240: \241: [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), \242[in0] "r" (src + sj), [in1] "r" (src + sj + MUL2(8)/sizeof(sgn##bits)) \243: "d0","d1","d2","d3" \244);245#define SPLIT_ASM3(sgn, bits) __asm__ ( \246"vld3." #bits " {d0, d2, d4}, [%[in0]] \n\t" \247"vld3." #bits " {d1, d3, d5}, [%[in1]] \n\t" \248"vst1." #bits " {d0-d1}, [%[out0]] \n\t" \249"vst1." #bits " {d2-d3}, [%[out1]] \n\t" \250"vst1." #bits " {d4-d5}, [%[out2]] \n\t" \251: \252: [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), [out2] "r" (dst2 + dj), \253[in0] "r" (src + sj), [in1] "r" (src + sj + MUL3(8)/sizeof(sgn##bits)) \254: "d0","d1","d2","d3","d4","d5" \255);256#define SPLIT_ASM4(sgn, bits) __asm__ ( \257"vld4." #bits " {d0, d2, d4, d6}, [%[in0]] \n\t" \258"vld4." #bits " {d1, d3, d5, d7}, [%[in1]] \n\t" \259"vst1." #bits " {d0-d1}, [%[out0]] \n\t" \260"vst1." #bits " {d2-d3}, [%[out1]] \n\t" \261"vst1." #bits " {d4-d5}, [%[out2]] \n\t" \262"vst1." 
#bits " {d6-d7}, [%[out3]] \n\t" \263: \264: [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), [out2] "r" (dst2 + dj), [out3] "r" (dst3 + dj), \265[in0] "r" (src + sj), [in1] "r" (src + sj + MUL4(8)/sizeof(sgn##bits)) \266: "d0","d1","d2","d3","d4","d5","d6","d7" \267);268269#define SPLIT_QUAD(sgn, bits, n) { \270internal::prefetch(src + sj); \271SPLIT_ASM##n(sgn, bits) \272}273274#else275276#define SPLIT_QUAD(sgn, bits, n) { \277internal::prefetch(src + sj); \278vec128 v_src = vld##n##q_##sgn##bits(src + sj); \279FILL_LINES##n(VST1Q, sgn##bits) \280}281282#endif283284#define SPLIT(sgn,bits,n) void split##n(const Size2D &_size, \285const sgn##bits * srcBase, ptrdiff_t srcStride \286FILL_LINES##n(FARG, sgn##bits) ) \287{ \288internal::assertSupportedConfiguration(); \289Size2D size(_size); \290if (CONTDST##n \291dst0Stride == (ptrdiff_t)(size.width)) \292{ \293size.width *= size.height; \294size.height = 1; \295} \296typedef internal::VecTraits<sgn##bits, n>::vec128 vec128; \297size_t roiw16 = size.width >= (16/sizeof(sgn##bits)-1) ? size.width - (16/sizeof(sgn##bits)-1) : 0; \298typedef internal::VecTraits<sgn##bits, n>::vec64 vec64; \299size_t roiw8 = size.width >= (8/sizeof(sgn##bits)-1) ? 
size.width - (8/sizeof(sgn##bits)-1) : 0; \300\301for (size_t i = 0u; i < size.height; ++i) \302{ \303const sgn##bits * src = internal::getRowPtr(srcBase, srcStride, i); \304FILL_LINES##n(VROW, sgn##bits) \305size_t sj = 0u, dj = 0u; \306\307for (; dj < roiw16; sj += MUL##n(16)/sizeof(sgn##bits), dj += 16/sizeof(sgn##bits)) \308SPLIT_QUAD(sgn, bits, n) \309\310if (dj < roiw8) \311{ \312vec64 v_src = vld##n##_##sgn##bits(src + sj); \313FILL_LINES##n(VST1, sgn##bits) \314sj += MUL##n(8)/sizeof(sgn##bits); \315dj += 8/sizeof(sgn##bits); \316} \317\318for (; dj < size.width; sj += n, ++dj) \319{ \320FILL_LINES##n(SST, sgn##bits) \321} \322} \323}324325#define SPLIT64(sgn,n) void split##n(const Size2D &_size, \326const sgn##64 * srcBase, ptrdiff_t srcStride \327FILL_LINES##n(FARG, sgn##64) ) \328{ \329internal::assertSupportedConfiguration(); \330Size2D size(_size); \331if (CONTDST##n \332dst0Stride == (ptrdiff_t)(size.width)) \333{ \334size.width *= size.height; \335size.height = 1; \336} \337typedef internal::VecTraits<sgn##64, n>::vec64 vec64; \338\339for (size_t i = 0u; i < size.height; ++i) \340{ \341const sgn##64 * src = internal::getRowPtr(srcBase, srcStride, i); \342FILL_LINES##n(VROW, sgn##64) \343size_t sj = 0u, dj = 0u; \344\345for (; dj < size.width; sj += n, ++dj) \346{ \347vec64 v_src = vld##n##_##sgn##64(src + sj); \348FILL_LINES##n(VST1, sgn##64) \349} \350} \351}352353#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)354355#define ALPHA_QUAD(sgn, bits) { \356internal::prefetch(src + sj); \357__asm__ ( \358"vld4." #bits " {d0, d2, d4, d6}, [%[in0]] \n\t" \359"vld4." #bits " {d1, d3, d5, d7}, [%[in1]] \n\t" \360"vst3." #bits " {d0, d2, d4}, [%[out3_1]] \n\t" \361"vst3." #bits " {d1, d3, d5}, [%[out3_2]] \n\t" \362"vst1." 
#bits " {d6-d7}, [%[out1]] \n\t" \363: \364: [out3_1] "r" (dst3 + d3j), [out3_2] "r" (dst3 + d3j + 24/sizeof(sgn##bits)), [out1] "r" (dst1 + d1j), \365[in0] "r" (src + sj), [in1] "r" (src + sj + 32/sizeof(sgn##bits)) \366: "d0","d1","d2","d3","d4","d5","d6","d7" \367); \368}369370#else371372#define ALPHA_QUAD(sgn, bits) { \373internal::prefetch(src + sj); \374union { vec128_4 v4; vec128_3 v3; } vals; \375vals.v4 = vld4q_##sgn##bits(src + sj); \376vst3q_##sgn##bits(dst3 + d3j, vals.v3); \377vst1q_##sgn##bits(dst1 + d1j, vals.v4.val[3]); \378}379380#endif381382#define SPLIT4ALPHA(sgn,bits) void split4(const Size2D &_size, \383const sgn##bits * srcBase, ptrdiff_t srcStride, \384sgn##bits * dst3Base, ptrdiff_t dst3Stride, \385sgn##bits * dst1Base, ptrdiff_t dst1Stride) \386{ \387internal::assertSupportedConfiguration(); \388Size2D size(_size); \389if (srcStride == dst3Stride && \390srcStride == dst1Stride && \391srcStride == (ptrdiff_t)(size.width)) \392{ \393size.width *= size.height; \394size.height = 1; \395} \396typedef internal::VecTraits<sgn##bits, 4>::vec128 vec128_4; \397typedef internal::VecTraits<sgn##bits, 3>::vec128 vec128_3; \398size_t roiw16 = size.width >= (16/sizeof(sgn##bits)-1) ? size.width - (16/sizeof(sgn##bits)-1) : 0; \399typedef internal::VecTraits<sgn##bits, 4>::vec64 vec64_4; \400typedef internal::VecTraits<sgn##bits, 3>::vec64 vec64_3; \401size_t roiw8 = size.width >= (8/sizeof(sgn##bits)-1) ? 
size.width - (8/sizeof(sgn##bits)-1) : 0; \402\403for (size_t i = 0u; i < size.height; ++i) \404{ \405const sgn##bits * src = internal::getRowPtr(srcBase, srcStride, i); \406sgn##bits * dst3 = internal::getRowPtr(dst3Base, dst3Stride, i); \407sgn##bits * dst1 = internal::getRowPtr(dst1Base, dst1Stride, i); \408size_t sj = 0u, d3j = 0u, d1j = 0u; \409\410for (; d1j < roiw16; sj += MUL4(16)/sizeof(sgn##bits), d3j += MUL3(16)/sizeof(sgn##bits), \411d1j += 16/sizeof(sgn##bits)) \412ALPHA_QUAD(sgn, bits) \413\414if (d1j < roiw8) \415{ \416union { vec64_4 v4; vec64_3 v3; } vals; \417vals.v4 = vld4_##sgn##bits(src + sj); \418vst3_u8(dst3 + d3j, vals.v3); \419vst1_u8(dst1 + d1j, vals.v4.val[3]); \420sj += MUL4(8)/sizeof(sgn##bits); \421d3j += MUL3(8)/sizeof(sgn##bits); \422d1j += 8/sizeof(sgn##bits); \423} \424\425for (; d1j < size.width; sj += 4, d3j += 3, ++d1j) \426{ \427dst3[d3j+0] = src[sj + 0]; \428dst3[d3j+1] = src[sj + 1]; \429dst3[d3j+2] = src[sj + 2]; \430dst1[d1j] = src[sj + 3]; \431} \432} \433}434435#else436437#define VOID_LINE(type, n) (void)dst##n##Base; (void)dst##n##Stride;438439#define SPLIT(sgn,bits,n) void split##n(const Size2D &size, \440const sgn##bits * srcBase, ptrdiff_t srcStride \441FILL_LINES##n(FARG, sgn##bits) ) \442{ \443internal::assertSupportedConfiguration(); \444(void)size; \445(void)srcBase; \446(void)srcStride; \447FILL_LINES##n(VOID, sgn##bits) \448}449450#define SPLIT64(sgn,n) SPLIT(sgn,64,n)451452#define SPLIT4ALPHA(sgn,bits) void split4(const Size2D &size, \453const sgn##bits * srcBase, ptrdiff_t srcStride, \454sgn##bits * dst3Base, ptrdiff_t dst3Stride, \455sgn##bits * dst1Base, ptrdiff_t dst1Stride) \456{ \457internal::assertSupportedConfiguration(); \458(void)size; \459(void)srcBase; \460(void)srcStride; \461(void)dst3Base; \462(void)dst3Stride; \463(void)dst1Base; \464(void)dst1Stride; \465}466467#endif //CAROTENE_NEON468469SPLIT(u, 8,2)470SPLIT(u, 8,3)471SPLIT(u, 
8,4)472SPLIT(u,16,2)473SPLIT(u,16,3)474SPLIT(u,16,4)475SPLIT(s,32,2)476SPLIT(s,32,3)477SPLIT(s,32,4)478479SPLIT64(s, 2)480SPLIT64(s, 3)481SPLIT64(s, 4)482483SPLIT4ALPHA(u,8)484485} // namespace CAROTENE_NS486487488