Path: blob/master/3rdparty/carotene/src/colorconvert.cpp
16337 views
/*1* By downloading, copying, installing or using the software you agree to this license.2* If you do not agree to this license, do not download, install,3* copy or use the software.4*5*6* License Agreement7* For Open Source Computer Vision Library8* (3-clause BSD License)9*10* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.11* Third party copyrights are property of their respective owners.12*13* Redistribution and use in source and binary forms, with or without modification,14* are permitted provided that the following conditions are met:15*16* * Redistributions of source code must retain the above copyright notice,17* this list of conditions and the following disclaimer.18*19* * Redistributions in binary form must reproduce the above copyright notice,20* this list of conditions and the following disclaimer in the documentation21* and/or other materials provided with the distribution.22*23* * Neither the names of the copyright holders nor the names of the contributors24* may be used to endorse or promote products derived from this software25* without specific prior written permission.26*27* This software is provided by the copyright holders and contributors "as is" and28* any express or implied warranties, including, but not limited to, the implied29* warranties of merchantability and fitness for a particular purpose are disclaimed.30* In no event shall copyright holders or contributors be liable for any direct,31* indirect, incidental, special, exemplary, or consequential damages32* (including, but not limited to, procurement of substitute goods or services;33* loss of use, data, or profits; or business interruption) however caused34* and on any theory of liability, whether in contract, strict liability,35* or tort (including negligence or otherwise) arising in any way out of36* the use of this software, even if advised of the possibility of such damage.37*/3839#include "common.hpp"4041#include "saturate_cast.hpp"4243namespace CAROTENE_NS 
{4445#ifdef CAROTENE_NEON4647namespace {4849enum50{51SHIFT = 14,52SHIFT_DELTA = 1 << (SHIFT - 1),5354R2Y_BT601 = 4899,55G2Y_BT601 = 9617,56B2Y_BT601 = 1868,5758R2Y_BT709 = 3483,59G2Y_BT709 = 11718,60B2Y_BT709 = 1183,61};6263inline uint8x8_t convertToGray(const uint16x8_t & v_r,64const uint16x8_t & v_g,65const uint16x8_t & v_b,66const uint16x4_t & v_r2y,67const uint16x4_t & v_g2y,68const uint16x4_t & v_b2y)69{70uint32x4_t v_dst0 = vmull_u16(vget_low_u16(v_g), v_g2y);71uint32x4_t v_dst1 = vmull_u16(vget_high_u16(v_g), v_g2y);7273v_dst0 = vmlal_u16(v_dst0, vget_low_u16(v_r), v_r2y);74v_dst1 = vmlal_u16(v_dst1, vget_high_u16(v_r), v_r2y);7576v_dst0 = vmlal_u16(v_dst0, vget_low_u16(v_b), v_b2y);77v_dst1 = vmlal_u16(v_dst1, vget_high_u16(v_b), v_b2y);7879uint8x8_t v_gray = vqmovn_u16(vcombine_u16(vrshrn_n_u32(v_dst0, SHIFT),80vrshrn_n_u32(v_dst1, SHIFT)));8182return v_gray;83}8485} // namespace8687#endif8889void rgb2gray(const Size2D &size, COLOR_SPACE color_space,90const u8 * srcBase, ptrdiff_t srcStride,91u8 * dstBase, ptrdiff_t dstStride)92{93internal::assertSupportedConfiguration();94#ifdef CAROTENE_NEON95const u32 R2Y = color_space == COLOR_SPACE_BT601 ? R2Y_BT601 : R2Y_BT709;96const u32 G2Y = color_space == COLOR_SPACE_BT601 ? G2Y_BT601 : G2Y_BT709;97const u32 B2Y = color_space == COLOR_SPACE_BT601 ? B2Y_BT601 : B2Y_BT709;9899#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)100register int16x4_t v_r2y asm ("d31") = vmov_n_s16(R2Y);101register int16x4_t v_g2y asm ("d30") = vmov_n_s16(G2Y);102register int16x4_t v_b2y asm ("d29") = vmov_n_s16(B2Y);103#else104uint16x4_t v_r2y = vdup_n_u16(R2Y),105v_g2y = vdup_n_u16(G2Y),106v_b2y = vdup_n_u16(B2Y);107108size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;109#endif110size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0;111112for (size_t i = 0u; i < size.height; ++i)113{114const u8 * src = internal::getRowPtr(srcBase, srcStride, i);115u8 * dst = internal::getRowPtr(dstBase, dstStride, i);116size_t sj = 0u, dj = 0u;117118#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)119for (; dj < roiw8; sj += 24, dj += 8)120{121internal::prefetch(src + sj);122__asm__ (123"vld3.8 {d0-d2}, [%[in]] @RGB \n\t"124"vmovl.u8 q2, d0 @R (d4,d5) \n\t"125"vmovl.u8 q3, d1 @G (d6,d7) \n\t"126"vmovl.u8 q4, d2 @B (d8,d9) \n\t"127"vmull.u16 q5, d6, d30 @Y (q5,q6): G \n\t"128"vmull.u16 q6, d7, d30 @Y (q5,q6): G \n\t"129"vmlal.s16 q5, d8, d29 @Y (q5,q6): GB \n\t"130"vmlal.s16 q6, d9, d29 @Y (q5,q6): GB \n\t"131"vmlal.s16 q5, d4, d31 @Y (q5,q6): GBR \n\t"132"vmlal.s16 q6, d5, d31 @Y (q5,q6): GBR \n\t"133"vrshrn.s32 d8, q5, #14 @Y -> q4 \n\t"134"vrshrn.s32 d9, q6, #14 @Y -> q4 \n\t"135"vqmovn.u16 d4, q4 \n\t"136"vst1.8 {d4}, [%[out]] \n\t"137: /*no output*/138: [out] "r" (dst + dj), [in] "r" (src + sj), "w" (v_r2y), "w" (v_g2y), "w" (v_b2y)139: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13"140);141}142#else143for (; dj < roiw16; sj += 48, dj += 16)144{145internal::prefetch(src + sj);146uint8x16x3_t v_src0 = vld3q_u8(src + sj);147// 0148uint16x8_t v_r = vmovl_u8(vget_low_u8(v_src0.val[0])),149v_g = vmovl_u8(vget_low_u8(v_src0.val[1])),150v_b = vmovl_u8(vget_low_u8(v_src0.val[2]));151uint8x8_t v_gray0 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);152153v_r = vmovl_u8(vget_high_u8(v_src0.val[0])),154v_g = vmovl_u8(vget_high_u8(v_src0.val[1])),155v_b = vmovl_u8(vget_high_u8(v_src0.val[2]));156uint8x8_t v_gray1 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);157158vst1q_u8(dst + dj, vcombine_u8(v_gray0, v_gray1));159}160161if (dj < roiw8)162{163uint8x8x3_t v_src = vld3_u8(src + sj);164uint16x8_t v_r = vmovl_u8(v_src.val[0]),165v_g = vmovl_u8(v_src.val[1]),166v_b = vmovl_u8(v_src.val[2]);167uint8x8_t v_gray 
= convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);168169vst1_u8(dst + dj, v_gray);170sj += 24; dj += 8;171}172#endif173174for (; dj < size.width; sj += 3, dj++)175{176u32 val = src[sj] * R2Y + src[sj + 1] * G2Y + src[sj + 2] * B2Y;177dst[dj] = internal::saturate_cast<u8>((val + SHIFT_DELTA) >> SHIFT);178}179}180#else181(void)size;182(void)color_space;183(void)srcBase;184(void)srcStride;185(void)dstBase;186(void)dstStride;187#endif188}189190void rgbx2gray(const Size2D &size, COLOR_SPACE color_space,191const u8 * srcBase, ptrdiff_t srcStride,192u8 * dstBase, ptrdiff_t dstStride)193{194internal::assertSupportedConfiguration();195#ifdef CAROTENE_NEON196const u32 R2Y = color_space == COLOR_SPACE_BT601 ? R2Y_BT601 : R2Y_BT709;197const u32 G2Y = color_space == COLOR_SPACE_BT601 ? G2Y_BT601 : G2Y_BT709;198const u32 B2Y = color_space == COLOR_SPACE_BT601 ? B2Y_BT601 : B2Y_BT709;199200#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)201register int16x4_t v_r2y asm ("d31") = vmov_n_s16(R2Y);202register int16x4_t v_g2y asm ("d30") = vmov_n_s16(G2Y);203register int16x4_t v_b2y asm ("d29") = vmov_n_s16(B2Y);204#else205uint16x4_t v_r2y = vdup_n_u16(R2Y),206v_g2y = vdup_n_u16(G2Y),207v_b2y = vdup_n_u16(B2Y);208209size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;210#endif211size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0;212213for (size_t i = 0u; i < size.height; ++i)214{215const u8 * src = internal::getRowPtr(srcBase, srcStride, i);216u8 * dst = internal::getRowPtr(dstBase, dstStride, i);217size_t sj = 0u, dj = 0u;218219#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)220for (; dj < roiw8; sj += 32, dj += 8)221{222internal::prefetch(src + sj);223__asm__ (224"vld4.8 {d0-d3}, [%[in]] @RGBA \n\t"225"vmovl.u8 q2, d0 @R (d4,d5) \n\t"226"vmovl.u8 q3, d1 @G (d6,d7) \n\t"227"vmovl.u8 q4, d2 @B (d8,d9) \n\t"228"vmull.u16 q5, d6, d30 @Y (q5,q6): G \n\t"229"vmull.u16 q6, d7, d30 @Y (q5,q6): G \n\t"230"vmlal.s16 q5, d8, d29 @Y (q5,q6): GB \n\t"231"vmlal.s16 q6, d9, d29 @Y (q5,q6): GB \n\t"232"vmlal.s16 q5, d4, d31 @Y (q5,q6): GBR \n\t"233"vmlal.s16 q6, d5, d31 @Y (q5,q6): GBR \n\t"234"vrshrn.s32 d8, q5, #14 @Y -> q4 \n\t"235"vrshrn.s32 d9, q6, #14 @Y -> q4 \n\t"236"vqmovn.u16 d4, q4 \n\t"237"vst1.8 {d4}, [%[out]] \n\t"238: /*no output*/239: [out] "r" (dst + dj), [in] "r" (src + sj), "w" (v_r2y), "w" (v_g2y), "w" (v_b2y)240: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13"241);242}243#else244for (; dj < roiw16; sj += 64, dj += 16)245{246internal::prefetch(src + sj);247uint8x16x4_t v_src0 = vld4q_u8(src + sj);248249// 0250uint16x8_t v_r = vmovl_u8(vget_low_u8(v_src0.val[0])),251v_g = vmovl_u8(vget_low_u8(v_src0.val[1])),252v_b = vmovl_u8(vget_low_u8(v_src0.val[2]));253uint8x8_t v_gray0 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);254255v_r = vmovl_u8(vget_high_u8(v_src0.val[0])),256v_g = vmovl_u8(vget_high_u8(v_src0.val[1])),257v_b = vmovl_u8(vget_high_u8(v_src0.val[2]));258uint8x8_t v_gray1 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);259260vst1q_u8(dst + dj, vcombine_u8(v_gray0, v_gray1));261}262263if (dj < roiw8)264{265uint8x8x4_t v_src = vld4_u8(src + sj);266uint16x8_t v_r = vmovl_u8(v_src.val[0]),267v_g = vmovl_u8(v_src.val[1]),268v_b = vmovl_u8(v_src.val[2]);269uint8x8_t 
v_gray = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);270271vst1_u8(dst + dj, v_gray);272sj += 32; dj += 8;273}274#endif275276for (; dj < size.width; sj += 4, dj++)277{278u32 val = src[sj] * R2Y + src[sj + 1] * G2Y + src[sj + 2] * B2Y;279dst[dj] = internal::saturate_cast<u8>((val + SHIFT_DELTA) >> SHIFT);280}281}282#else283(void)size;284(void)color_space;285(void)srcBase;286(void)srcStride;287(void)dstBase;288(void)dstStride;289#endif290}291292void bgr2gray(const Size2D &size, COLOR_SPACE color_space,293const u8 * srcBase, ptrdiff_t srcStride,294u8 * dstBase, ptrdiff_t dstStride)295{296internal::assertSupportedConfiguration();297#ifdef CAROTENE_NEON298const u32 R2Y = color_space == COLOR_SPACE_BT601 ? R2Y_BT601 : R2Y_BT709;299const u32 G2Y = color_space == COLOR_SPACE_BT601 ? G2Y_BT601 : G2Y_BT709;300const u32 B2Y = color_space == COLOR_SPACE_BT601 ? B2Y_BT601 : B2Y_BT709;301302#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)303register int16x4_t v_r2y asm ("d31") = vmov_n_s16(R2Y);304register int16x4_t v_g2y asm ("d30") = vmov_n_s16(G2Y);305register int16x4_t v_b2y asm ("d29") = vmov_n_s16(B2Y);306#else307uint16x4_t v_r2y = vdup_n_u16(R2Y),308v_g2y = vdup_n_u16(G2Y),309v_b2y = vdup_n_u16(B2Y);310311size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;312#endif313size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0;314315for (size_t i = 0u; i < size.height; ++i)316{317const u8 * src = internal::getRowPtr(srcBase, srcStride, i);318u8 * dst = internal::getRowPtr(dstBase, dstStride, i);319size_t sj = 0u, dj = 0u;320321#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)322for (; dj < roiw8; sj += 24, dj += 8)323{324internal::prefetch(src + sj);325__asm__ (326"vld3.8 {d0-d2}, [%[in]] @BGR \n\t"327"vmovl.u8 q2, d2 @R (d4,d5) \n\t"328"vmovl.u8 q3, d1 @G (d6,d7) \n\t"329"vmovl.u8 q4, d0 @B (d8,d9) \n\t"330"vmull.u16 q5, d6, d30 @Y (q5,q6): G \n\t"331"vmull.u16 q6, d7, d30 @Y (q5,q6): G \n\t"332"vmlal.s16 q5, d8, d29 @Y (q5,q6): GB \n\t"333"vmlal.s16 q6, d9, d29 @Y (q5,q6): GB \n\t"334"vmlal.s16 q5, d4, d31 @Y (q5,q6): GBR \n\t"335"vmlal.s16 q6, d5, d31 @Y (q5,q6): GBR \n\t"336"vrshrn.s32 d8, q5, #14 @Y -> q4 \n\t"337"vrshrn.s32 d9, q6, #14 @Y -> q4 \n\t"338"vqmovn.u16 d4, q4 \n\t"339"vst1.8 {d4}, [%[out]] \n\t"340: /*no output*/341: [out] "r" (dst + dj), [in] "r" (src + sj), "w" (v_r2y), "w" (v_g2y), "w" (v_b2y)342: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13"343);344}345#else346for (; dj < roiw16; sj += 48, dj += 16)347{348internal::prefetch(src + sj);349uint8x16x3_t v_src0 = vld3q_u8(src + sj);350351// 0352uint16x8_t v_b = vmovl_u8(vget_low_u8(v_src0.val[0])),353v_g = vmovl_u8(vget_low_u8(v_src0.val[1])),354v_r = vmovl_u8(vget_low_u8(v_src0.val[2]));355uint8x8_t v_gray0 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);356357v_b = vmovl_u8(vget_high_u8(v_src0.val[0])),358v_g = vmovl_u8(vget_high_u8(v_src0.val[1])),359v_r = vmovl_u8(vget_high_u8(v_src0.val[2]));360uint8x8_t v_gray1 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);361362vst1q_u8(dst + dj, vcombine_u8(v_gray0, v_gray1));363}364365if (dj < roiw8)366{367uint8x8x3_t v_src = vld3_u8(src + sj);368uint16x8_t v_b = vmovl_u8(v_src.val[0]),369v_g = vmovl_u8(v_src.val[1]),370v_r = vmovl_u8(v_src.val[2]);371uint8x8_t 
v_gray = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);372373vst1_u8(dst + dj, v_gray);374sj += 24; dj += 8;375}376#endif377378for (; dj < size.width; sj += 3, dj++)379{380u32 val = src[sj] * B2Y + src[sj + 1] * G2Y + src[sj + 2] * R2Y;381dst[dj] = internal::saturate_cast<u8>((val + SHIFT_DELTA) >> SHIFT);382}383}384#else385(void)size;386(void)color_space;387(void)srcBase;388(void)srcStride;389(void)dstBase;390(void)dstStride;391#endif392}393394void bgrx2gray(const Size2D &size, COLOR_SPACE color_space,395const u8 * srcBase, ptrdiff_t srcStride,396u8 * dstBase, ptrdiff_t dstStride)397{398internal::assertSupportedConfiguration();399#ifdef CAROTENE_NEON400const u32 R2Y = color_space == COLOR_SPACE_BT601 ? R2Y_BT601 : R2Y_BT709;401const u32 G2Y = color_space == COLOR_SPACE_BT601 ? G2Y_BT601 : G2Y_BT709;402const u32 B2Y = color_space == COLOR_SPACE_BT601 ? B2Y_BT601 : B2Y_BT709;403404#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)405register int16x4_t v_r2y asm ("d31") = vmov_n_s16(R2Y);406register int16x4_t v_g2y asm ("d30") = vmov_n_s16(G2Y);407register int16x4_t v_b2y asm ("d29") = vmov_n_s16(B2Y);408#else409uint16x4_t v_r2y = vdup_n_u16(R2Y),410v_g2y = vdup_n_u16(G2Y),411v_b2y = vdup_n_u16(B2Y);412413size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;414#endif415size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0;416417for (size_t i = 0u; i < size.height; ++i)418{419const u8 * src = internal::getRowPtr(srcBase, srcStride, i);420u8 * dst = internal::getRowPtr(dstBase, dstStride, i);421size_t sj = 0u, dj = 0u;422423#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)424for (; dj < roiw8; sj += 32, dj += 8)425{426internal::prefetch(src + sj);427__asm__ (428"vld4.8 {d0-d3}, [%[in]] @BGRA \n\t"429"vmovl.u8 q2, d2 @R (d4,d5) \n\t"430"vmovl.u8 q3, d1 @G (d6,d7) \n\t"431"vmovl.u8 q4, d0 @B (d8,d9) \n\t"432"vmull.u16 q5, d6, d30 @Y (q5,q6): G \n\t"433"vmull.u16 q6, d7, d30 @Y (q5,q6): G \n\t"434"vmlal.s16 q5, d8, d29 @Y (q5,q6): GB \n\t"435"vmlal.s16 q6, d9, d29 @Y (q5,q6): GB \n\t"436"vmlal.s16 q5, d4, d31 @Y (q5,q6): GBR \n\t"437"vmlal.s16 q6, d5, d31 @Y (q5,q6): GBR \n\t"438"vrshrn.s32 d8, q5, #14 @Y -> q4 \n\t"439"vrshrn.s32 d9, q6, #14 @Y -> q4 \n\t"440"vqmovn.u16 d4, q4 \n\t"441"vst1.8 {d4}, [%[out]] \n\t"442: /*no output*/443: [out] "r" (dst + dj), [in] "r" (src + sj), "w" (v_r2y), "w" (v_g2y), "w" (v_b2y)444: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13"445);446}447#else448for (; dj < roiw16; sj += 64, dj += 16)449{450internal::prefetch(src + sj);451uint8x16x4_t v_src0 = vld4q_u8(src + sj);452453// 0454uint16x8_t v_b = vmovl_u8(vget_low_u8(v_src0.val[0])),455v_g = vmovl_u8(vget_low_u8(v_src0.val[1])),456v_r = vmovl_u8(vget_low_u8(v_src0.val[2]));457uint8x8_t v_gray0 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);458459v_b = vmovl_u8(vget_high_u8(v_src0.val[0])),460v_g = vmovl_u8(vget_high_u8(v_src0.val[1])),461v_r = vmovl_u8(vget_high_u8(v_src0.val[2]));462uint8x8_t v_gray1 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);463464vst1q_u8(dst + dj, vcombine_u8(v_gray0, v_gray1));465}466467if (dj < roiw8)468{469uint8x8x4_t v_src = vld4_u8(src + sj);470uint16x8_t v_b = vmovl_u8(v_src.val[0]),471v_g = vmovl_u8(v_src.val[1]),472v_r = vmovl_u8(v_src.val[2]);473uint8x8_t 
v_gray = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);474475vst1_u8(dst + dj, v_gray);476sj += 32; dj += 8;477}478#endif479480for (; dj < size.width; sj += 4, dj++)481{482u32 val = src[sj] * B2Y + src[sj + 1] * G2Y + src[sj + 2] * R2Y;483dst[dj] = internal::saturate_cast<u8>((val + SHIFT_DELTA) >> SHIFT);484}485}486#else487(void)size;488(void)color_space;489(void)srcBase;490(void)srcStride;491(void)dstBase;492(void)dstStride;493#endif494}495496void gray2rgb(const Size2D &size,497const u8 * srcBase, ptrdiff_t srcStride,498u8 * dstBase, ptrdiff_t dstStride)499{500internal::assertSupportedConfiguration();501#ifdef CAROTENE_NEON502size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;503size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;504505for (size_t i = 0u; i < size.height; ++i)506{507const u8 * src = internal::getRowPtr(srcBase, srcStride, i);508u8 * dst = internal::getRowPtr(dstBase, dstStride, i);509size_t sj = 0u, dj = 0u;510511for (; sj < roiw16; sj += 16, dj += 48)512{513internal::prefetch(src + sj);514#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)515__asm__ (516"vld1.8 {d0-d1}, [%[in0]] \n\t"517"vmov.8 q1, q0 \n\t"518"vmov.8 q2, q0 \n\t"519"vmov.8 q3, q1 \n\t"520"vst3.8 {d2, d4, d6}, [%[out0]] \n\t"521"vst3.8 {d3, d5, d7}, [%[out1]] \n\t"522: /*no output*/523: [out0] "r" (dst + dj), [out1] "r" (dst + dj + 24),524[in0] "r" (src + sj)525: "d0","d1","d2","d3","d4","d5","d6","d7"526);527#else528uint8x16x3_t vRgb1;529vRgb1.val[0] = vld1q_u8(src + sj);530531vRgb1.val[1] = vRgb1.val[0];532vRgb1.val[2] = vRgb1.val[0];533534vst3q_u8(dst + dj, vRgb1);535#endif536}537538if (sj < roiw8)539{540#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)541__asm__ (542"vld1.8 {d0}, [%[in]] \n\t"543"vmov.8 d1, d0 \n\t"544"vmov.8 d2, d0 \n\t"545"vst3.8 {d0-d2}, [%[out]] \n\t"546: /*no output*/547: [out] "r" (dst + dj), [in] "r" (src + sj)548: 
"d0","d1","d2"549);550#else551uint8x8x3_t vRgb2;552vRgb2.val[0] = vld1_u8(src + sj);553vRgb2.val[1] = vRgb2.val[0];554vRgb2.val[2] = vRgb2.val[0];555556vst3_u8(dst + dj, vRgb2);557#endif558sj += 8; dj += 24;559}560561for (; sj < size.width; sj++, dj += 3)562{563dst[dj+0] = src[sj];564dst[dj+1] = src[sj];565dst[dj+2] = src[sj];566}567}568#else569(void)size;570(void)srcBase;571(void)srcStride;572(void)dstBase;573(void)dstStride;574#endif575}576577void gray2rgbx(const Size2D &size,578const u8 * srcBase, ptrdiff_t srcStride,579u8 * dstBase, ptrdiff_t dstStride)580{581internal::assertSupportedConfiguration();582#ifdef CAROTENE_NEON583size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;584size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;585586#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)587register uint8x16_t vc255 asm ("q4") = vmovq_n_u8(255);588#else589uint8x16x4_t vRgba;590uint8x8x4_t vRgba2;591vRgba.val[3] = vmovq_n_u8(255);592vRgba2.val[3] = vget_low_u8(vRgba.val[3]);593#endif594595for (size_t i = 0u; i < size.height; ++i)596{597const u8 * src = internal::getRowPtr(srcBase, srcStride, i);598u8 * dst = internal::getRowPtr(dstBase, dstStride, i);599size_t sj = 0u, dj = 0u;600601for (; sj < roiw16; sj += 16, dj += 64)602{603internal::prefetch(src + sj);604#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)605__asm__ (606"vld1.8 {d0-d1}, [%[in0]] \n\t"607"vmov.8 q1, q0 \n\t"608"vmov.8 q2, q0 \n\t"609"vmov.8 q3, q1 \n\t"610"vst4.8 {d2, d4, d6, d8}, [%[out0]] \n\t"611"vst4.8 {d3, d5, d7, d9}, [%[out1]] \n\t"612: /*no output*/613: [out0] "r" (dst + dj), [out1] "r" (dst + dj + 32),614[in0] "r" (src + sj),615"w" (vc255)616: "d0","d1","d2","d3","d4","d5","d6","d7"617);618#else619vRgba.val[0] = vld1q_u8(src + sj);620621vRgba.val[1] = vRgba.val[0];622vRgba.val[2] = vRgba.val[0];623624vst4q_u8(dst + dj, vRgba);625#endif626}627628if (sj < roiw8)629{630#if 
!defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)631__asm__ (632"vld1.8 {d5}, [%[in]] \n\t"633"vmov.8 d6, d5 \n\t"634"vmov.8 d7, d5 \n\t"635"vst4.8 {d5-d8}, [%[out]] \n\t"636: /*no output*/637: [out] "r" (dst + dj), [in] "r" (src + sj), "w" (vc255)638: "d5","d6","d7"639);640#else641vRgba2.val[0] = vld1_u8(src + sj);642vRgba2.val[1] = vRgba2.val[0];643vRgba2.val[2] = vRgba2.val[0];644645vst4_u8(dst + dj, vRgba2);646#endif647sj += 8; dj += 32;648}649650for (; sj < size.width; sj++, dj += 4)651{652dst[dj+0] = src[sj];653dst[dj+1] = src[sj];654dst[dj+2] = src[sj];655dst[dj+3] = 255;656}657}658#else659(void)size;660(void)srcBase;661(void)srcStride;662(void)dstBase;663(void)dstStride;664#endif665}666667void rgb2rgbx(const Size2D &size,668const u8 * srcBase, ptrdiff_t srcStride,669u8 * dstBase, ptrdiff_t dstStride)670{671internal::assertSupportedConfiguration();672#ifdef CAROTENE_NEON673size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;674#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)675register uint8x8_t vc255_0 asm ("d3") = vmov_n_u8(255);676#else677size_t roiw16 = size.width >= 15 ? 
size.width - 15 : 0;678union { uint8x16x4_t v4; uint8x16x3_t v3; } v_dst0;679v_dst0.v4.val[3] = vdupq_n_u8(255);680union { uint8x8x4_t v4; uint8x8x3_t v3; } v_dst;681v_dst.v4.val[3] = vdup_n_u8(255);682#endif683684for (size_t i = 0u; i < size.height; ++i)685{686const u8 * src = internal::getRowPtr(srcBase, srcStride, i);687u8 * dst = internal::getRowPtr(dstBase, dstStride, i);688size_t sj = 0u, dj = 0u, j = 0u;689690#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)691for (; j < roiw8; sj += 24, dj += 32, j += 8)692{693internal::prefetch(src + sj);694__asm__ (695"vld3.8 {d0, d1, d2}, [%[in0]] \n\t"696"vst4.8 {d0, d1, d2, d3}, [%[out0]] \n\t"697: /*no output*/698: [out0] "r" (dst + dj),699[in0] "r" (src + sj),700"w" (vc255_0)701: "d0","d1","d2"702);703}704#else705for (; j < roiw16; sj += 48, dj += 64, j += 16)706{707internal::prefetch(src + sj);708v_dst0.v3 = vld3q_u8(src + sj);709vst4q_u8(dst + dj, v_dst0.v4);710}711712if (j < roiw8)713{714v_dst.v3 = vld3_u8(src + sj);715vst4_u8(dst + dj, v_dst.v4);716sj += 24; dj += 32; j += 8;717}718#endif719720for (; j < size.width; ++j, sj += 3, dj += 4)721{722dst[dj] = src[sj];723dst[dj + 1] = src[sj + 1];724dst[dj + 2] = src[sj + 2];725dst[dj + 3] = 255;726}727}728#else729(void)size;730(void)srcBase;731(void)srcStride;732(void)dstBase;733(void)dstStride;734#endif735}736737void rgbx2rgb(const Size2D &size,738const u8 * srcBase, ptrdiff_t srcStride,739u8 * dstBase, ptrdiff_t dstStride)740{741internal::assertSupportedConfiguration();742#ifdef CAROTENE_NEON743size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;744#if !(!defined(__aarch64__) && defined(__GNUC__) && defined(__arm__))745size_t roiw16 = size.width >= 15 ? 
size.width - 15 : 0;746union { uint8x16x4_t v4; uint8x16x3_t v3; } v_dst0;747union { uint8x8x4_t v4; uint8x8x3_t v3; } v_dst;748#endif749750for (size_t i = 0u; i < size.height; ++i)751{752const u8 * src = internal::getRowPtr(srcBase, srcStride, i);753u8 * dst = internal::getRowPtr(dstBase, dstStride, i);754size_t sj = 0u, dj = 0u, j = 0u;755756#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)757for (; j < roiw8; sj += 32, dj += 24, j += 8)758{759internal::prefetch(src + sj);760__asm__ (761"vld4.8 {d0, d1, d2, d3}, [%[in0]] \n\t"762"vst3.8 {d0, d1, d2}, [%[out0]] \n\t"763: /*no output*/764: [out0] "r" (dst + dj),765[in0] "r" (src + sj)766: "d0","d1","d2","d3"767);768}769#else770for (; j < roiw16; sj += 64, dj += 48, j += 16)771{772internal::prefetch(src + sj);773v_dst0.v4 = vld4q_u8(src + sj);774vst3q_u8(dst + dj, v_dst0.v3);775}776777if (j < roiw8)778{779v_dst.v4 = vld4_u8(src + sj);780vst3_u8(dst + dj, v_dst.v3);781sj += 32; dj += 24; j += 8;782}783#endif784785for (; j < size.width; ++j, sj += 4, dj += 3)786{787dst[dj] = src[sj];788dst[dj + 1] = src[sj + 1];789dst[dj + 2] = src[sj + 2];790}791}792#else793(void)size;794(void)srcBase;795(void)srcStride;796(void)dstBase;797(void)dstStride;798#endif799}800801void rgb2bgr(const Size2D &size,802const u8 * srcBase, ptrdiff_t srcStride,803u8 * dstBase, ptrdiff_t dstStride)804{805internal::assertSupportedConfiguration();806#ifdef CAROTENE_NEON807#if !(!defined(__aarch64__) && defined(__GNUC__) && defined(__arm__))808size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;809#endif810size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0;811812for (size_t i = 0u; i < size.height; ++i)813{814const u8 * src = internal::getRowPtr(srcBase, srcStride, i);815u8 * dst = internal::getRowPtr(dstBase, dstStride, i);816size_t sj = 0u, dj = 0u, j = 0u;817818819#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)820for (; j < roiw8; sj += 24, dj += 24, j += 8)821{822internal::prefetch(src + sj);823__asm__ (824"vld3.8 {d0, d1, d2}, [%[in0]] \n\t"825"vswp d0, d2 \n\t"826"vst3.8 {d0, d1, d2}, [%[out0]] \n\t"827: /*no output*/828: [out0] "r" (dst + dj),829[in0] "r" (src + sj)830: "d0","d1","d2"831);832}833#else834for (; j < roiw16; sj += 48, dj += 48, j += 16)835{836internal::prefetch(src + sj);837uint8x16x3_t vals0 = vld3q_u8(src + sj);838839std::swap(vals0.val[0], vals0.val[2]);840841vst3q_u8(dst + dj, vals0);842}843844if (j < roiw8)845{846uint8x8x3_t vals = vld3_u8(src + sj);847std::swap(vals.val[0], vals.val[2]);848vst3_u8(dst + dj, vals);849sj += 24; dj += 24; j += 8;850}851#endif852853for (; j < size.width; ++j, sj += 3, dj += 3)854{855u8 b = src[sj + 2];//Handle src == dst case856dst[dj + 2] = src[sj ];857dst[dj + 1] = src[sj + 1];858dst[dj ] = b;859}860}861#else862(void)size;863(void)srcBase;864(void)srcStride;865(void)dstBase;866(void)dstStride;867#endif868}869870void rgbx2bgrx(const Size2D &size,871const u8 * srcBase, ptrdiff_t srcStride,872u8 * dstBase, ptrdiff_t dstStride)873{874internal::assertSupportedConfiguration();875#ifdef CAROTENE_NEON876#if !(!defined(__aarch64__) && defined(__GNUC__) && defined(__arm__))877size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;878#endif879size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0;880881for (size_t i = 0u; i < size.height; ++i)882{883const u8 * src = internal::getRowPtr(srcBase, srcStride, i);884u8 * dst = internal::getRowPtr(dstBase, dstStride, i);885size_t sj = 0u, dj = 0u, j = 0u;886887#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)888for (; j < roiw8; sj += 32, dj += 32, j += 8)889{890internal::prefetch(src + sj);891__asm__ (892"vld4.8 {d0, d1, d2, d3}, [%[in0]] \n\t"893"vswp d0, d2 \n\t"894"vst4.8 {d0, d1, d2, d3}, [%[out0]] \n\t"895: /*no output*/896: [out0] "r" (dst + dj),897[in0] "r" (src + sj)898: "d0","d1","d2","d3"899);900}901#else902for (; j < roiw16; sj += 64, dj += 64, j += 16)903{904internal::prefetch(src + sj);905uint8x16x4_t vals0 = vld4q_u8(src + sj);906907std::swap(vals0.val[0], vals0.val[2]);908909vst4q_u8(dst + dj, vals0);910}911912if (j < roiw8)913{914uint8x8x4_t vals = vld4_u8(src + sj);915std::swap(vals.val[0], vals.val[2]);916vst4_u8(dst + dj, vals);917sj += 32; dj += 32; j += 8;918}919#endif920921for (; j < size.width; ++j, sj += 4, dj += 4)922{923u8 b = src[sj + 2];//Handle src == dst case924dst[dj + 2] = src[sj ];925dst[dj + 1] = src[sj + 1];926dst[dj ] = b;927dst[dj + 3] = src[sj + 3];928}929}930#else931(void)size;932(void)srcBase;933(void)srcStride;934(void)dstBase;935(void)dstStride;936#endif937}938939void rgbx2bgr(const Size2D &size,940const u8 * srcBase, ptrdiff_t srcStride,941u8 * dstBase, ptrdiff_t dstStride)942{943internal::assertSupportedConfiguration();944#ifdef CAROTENE_NEON945#if !(!defined(__aarch64__) && defined(__GNUC__) && defined(__arm__))946size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;947#endif948size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0;949950for (size_t i = 0u; i < size.height; ++i)951{952const u8 * src = internal::getRowPtr(srcBase, srcStride, i);953u8 * dst = internal::getRowPtr(dstBase, dstStride, i);954size_t sj = 0u, dj = 0u, j = 0u;955956#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)957for (; j < roiw8; sj += 32, dj += 24, j += 8)958{959internal::prefetch(src + sj);960__asm__ (961"vld4.8 {d0, d1, d2, d3}, [%[in0]] \n\t"962"vswp d0, d2 \n\t"963"vst3.8 {d0, d1, d2}, [%[out0]] \n\t"964: /*no output*/965: [out0] "r" (dst + dj),966[in0] "r" (src + sj)967: "d0","d1","d2","d3"968);969}970#else971for (; j < roiw16; sj += 64, dj += 48, j += 16)972{973internal::prefetch(src + sj);974union { uint8x16x4_t v4; uint8x16x3_t v3; } vals0;975vals0.v4 = vld4q_u8(src + sj);976std::swap(vals0.v3.val[0], vals0.v3.val[2]);977vst3q_u8(dst + dj, vals0.v3);978}979980if (j < roiw8)981{982union { uint8x8x4_t v4; uint8x8x3_t v3; } vals;983vals.v4 = vld4_u8(src + sj);984std::swap(vals.v3.val[0], vals.v3.val[2]);985vst3_u8(dst + dj, vals.v3);986sj += 32; dj += 24; j += 8;987}988#endif989990for (; j < size.width; ++j, sj += 4, dj += 3)991{992dst[dj + 2] = src[sj ];993dst[dj + 1] = src[sj + 1];994dst[dj ] = src[sj + 2];995}996}997#else998(void)size;999(void)srcBase;1000(void)srcStride;1001(void)dstBase;1002(void)dstStride;1003#endif1004}10051006void rgb2bgrx(const Size2D &size,1007const u8 * srcBase, ptrdiff_t srcStride,1008u8 * dstBase, ptrdiff_t dstStride)1009{1010internal::assertSupportedConfiguration();1011#ifdef CAROTENE_NEON1012#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)1013register uint8x8_t vc255 asm ("d3") = vmov_n_u8(255);1014#else1015union { uint8x16x4_t v4; uint8x16x3_t v3; } vals0;1016vals0.v4.val[3] = vmovq_n_u8(255);1017union { uint8x8x4_t v4; uint8x8x3_t v3; } vals8;1018vals8.v4.val[3] = vmov_n_u8(255);1019#endif10201021#if !(!defined(__aarch64__) && defined(__GNUC__) && defined(__arm__))1022size_t roiw16 = size.width >= 15 ? 
size.width - 15 : 0;1023#endif1024size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;10251026for (size_t i = 0u; i < size.height; ++i)1027{1028const u8 * src = internal::getRowPtr(srcBase, srcStride, i);1029u8 * dst = internal::getRowPtr(dstBase, dstStride, i);1030size_t sj = 0u, dj = 0u, j = 0u;10311032#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)1033for (; j < roiw8; sj += 24, dj += 32, j += 8)1034{1035internal::prefetch(src + sj);1036__asm__ (1037"vld3.8 {d0, d1, d2}, [%[in0]] \n\t"1038"vswp d0, d2 \n\t"1039"vst4.8 {d0, d1, d2, d3}, [%[out0]] \n\t"1040: /*no output*/1041: [out0] "r" (dst + dj),1042[in0] "r" (src + sj),1043"w" (vc255)1044: "d0","d1","d2"1045);1046}1047#else1048for (; j < roiw16; sj += 48, dj += 64, j += 16)1049{1050internal::prefetch(src + sj);1051vals0.v3 = vld3q_u8(src + sj);1052std::swap(vals0.v4.val[0], vals0.v4.val[2]);1053vst4q_u8(dst + dj, vals0.v4);1054}10551056if (j < roiw8)1057{1058vals8.v3 = vld3_u8(src + sj);1059std::swap(vals8.v4.val[0], vals8.v4.val[2]);1060vst4_u8(dst + dj, vals8.v4);1061sj += 24; dj += 32; j += 8;1062}1063#endif10641065for (; j < size.width; ++j, sj += 3, dj += 4)1066{1067dst[dj + 3] = 255;1068dst[dj + 2] = src[sj ];1069dst[dj + 1] = src[sj + 1];1070dst[dj ] = src[sj + 2];1071}1072}1073#else1074(void)size;1075(void)srcBase;1076(void)srcStride;1077(void)dstBase;1078(void)dstStride;1079#endif1080}10811082namespace {10831084#ifdef CAROTENE_NEON1085inline uint8x8x3_t convertToHSV(const uint8x8_t vR, const uint8x8_t vG, const uint8x8_t vB,1086const s32 hrange )1087{1088const s32 hsv_shift = 12;1089const f32 vsdiv_table = f32(255 << hsv_shift);1090f32 vhdiv_table = f32(hrange << hsv_shift);1091const s32 vhrange = hrange;1092const s32 v0 = s32(0);1093const s32 vshift = s32(1 << (hsv_shift-1));1094const s32 v6 = s32(6);10951096uint8x8_t vMin = vmin_u8(vR, vG);1097uint8x8_t vMax = vmax_u8(vR, vG);10981099uint16x8_t vR_u16 = vmovl_u8(vR);1100uint16x8_t vG_u16 = vmovl_u8(vG);11011102vMax = vmax_u8(vMax, 
vB);1103vMin = vmin_u8(vMin, vB);1104uint16x8_t vB_u16 = vmovl_u8(vB);11051106uint16x8_t vDiff = vsubl_u8(vMax, vMin);11071108uint16x8_t vV = vmovl_u8(vMax);1109uint16x8_t vDiffx2 = vaddq_u16(vDiff, vDiff);1110uint32x4_t vDiffL = vmovl_u16(vget_low_u16(vDiff));1111uint32x4_t vDiffH = vmovl_u16(vget_high_u16(vDiff));11121113uint16x8_t vVEqR = vceqq_u16(vR_u16, vV);1114uint16x8_t vVEqG = vceqq_u16(vG_u16, vV);11151116int16x8_t vG_B = vsubq_s16(vreinterpretq_s16_u16(vG_u16), vreinterpretq_s16_u16(vB_u16));1117uint16x8_t vInvR = vmvnq_u16(vVEqR);1118int16x8_t vB_R = vsubq_s16(vreinterpretq_s16_u16(vB_u16), vreinterpretq_s16_u16(vR_u16));1119int16x8_t vR_G = vsubq_s16(vreinterpretq_s16_u16(vR_u16), vreinterpretq_s16_u16(vG_u16));11201121uint16x8_t vMask2 = vandq_u16(vVEqG, vInvR);1122vR_u16 = vandq_u16(vreinterpretq_u16_s16(vG_B), vVEqR);1123int16x8_t vH2 = vaddq_s16(vB_R, vreinterpretq_s16_u16(vDiffx2));11241125vVEqR = vmvnq_u16(vVEqG);1126vB_R = vaddq_s16(vreinterpretq_s16_u16(vDiffx2), vreinterpretq_s16_u16(vDiffx2));1127vG_B = vandq_s16(vreinterpretq_s16_u16(vInvR), vreinterpretq_s16_u16(vVEqR));1128vInvR = vandq_u16(vreinterpretq_u16_s16(vH2), vMask2);1129vR_G = vaddq_s16(vR_G, vB_R);1130int16x8_t vH = vaddq_s16(vreinterpretq_s16_u16(vR_u16), vreinterpretq_s16_u16(vInvR));11311132uint32x4_t vV_L = vmovl_u16(vget_low_u16(vV));1133vR_G = vandq_s16(vR_G, vG_B);1134uint32x4_t vV_H = vmovl_u16(vget_high_u16(vV));1135int16x8_t vDiff4 = vaddq_s16(vH, vR_G);11361137int32x4_t vc6 = vdupq_n_s32(v6);1138uint32x4_t vLine1 = vmulq_u32(vDiffL, vreinterpretq_u32_s32(vc6));1139uint32x4_t vLine2 = vmulq_u32(vDiffH, vreinterpretq_u32_s32(vc6));11401141float32x4_t vF1 = vcvtq_f32_u32(vV_L);1142float32x4_t vF2 = vcvtq_f32_u32(vV_H);1143float32x4_t vHF1 = vcvtq_f32_u32(vLine1);1144float32x4_t vHF2 = vcvtq_f32_u32(vLine2);11451146float32x4_t vXInv1 = vrecpeq_f32(vF1);1147float32x4_t vXInv2 = vrecpeq_f32(vF2);1148float32x4_t vXInv3 = vrecpeq_f32(vHF1);1149float32x4_t vXInv4 = 
vrecpeq_f32(vHF2);11501151float32x4_t vSt1 = vrecpsq_f32(vXInv1, vF1);1152float32x4_t vSt2 = vrecpsq_f32(vXInv2, vF2);1153float32x4_t vSt3 = vrecpsq_f32(vXInv3, vHF1);1154float32x4_t vSt4 = vrecpsq_f32(vXInv4, vHF2);11551156vF1 = vmulq_f32(vXInv1, vSt1);1157vF2 = vmulq_f32(vXInv2, vSt2);1158vHF1 = vmulq_f32(vXInv3, vSt3);1159vHF2 = vmulq_f32(vXInv4, vSt4);11601161float32x4_t vDivTab = vdupq_n_f32(vsdiv_table);1162vSt1 = vmulq_f32(vF1, vDivTab);1163vSt2 = vmulq_f32(vF2, vDivTab);1164vDivTab = vdupq_n_f32(vhdiv_table);1165vSt3 = vmulq_f32(vHF1, vDivTab);1166vSt4 = vmulq_f32(vHF2, vDivTab);11671168float32x4_t bias = vdupq_n_f32(0.5f);11691170vSt1 = vaddq_f32(vSt1, bias);1171vSt2 = vaddq_f32(vSt2, bias);1172vSt3 = vaddq_f32(vSt3, bias);1173vSt4 = vaddq_f32(vSt4, bias);11741175uint32x4_t vRes1 = vcvtq_u32_f32(vSt1);1176uint32x4_t vRes2 = vcvtq_u32_f32(vSt2);1177uint32x4_t vRes3 = vcvtq_u32_f32(vSt3);1178uint32x4_t vRes4 = vcvtq_u32_f32(vSt4);11791180int32x4_t vH_L = vmovl_s16(vget_low_s16(vDiff4));1181int32x4_t vH_H = vmovl_s16(vget_high_s16(vDiff4));11821183uint32x4_t vDiff_Res1 = vmulq_u32(vDiffL, vRes1);1184uint32x4_t vDiff_Res2 = vmulq_u32(vDiffH, vRes2);1185uint32x4_t vDiff_Res3 = vmulq_u32(vreinterpretq_u32_s32(vH_L), vRes3);1186uint32x4_t vDiff_Res4 = vmulq_u32(vreinterpretq_u32_s32(vH_H), vRes4);11871188int32x4_t vShift = vdupq_n_s32(vshift);1189uint32x4_t vAddRes1 = vaddq_u32(vDiff_Res1, vreinterpretq_u32_s32(vShift));1190uint32x4_t vAddRes2 = vaddq_u32(vDiff_Res2, vreinterpretq_u32_s32(vShift));1191uint32x4_t vAddRes3 = vaddq_u32(vDiff_Res3, vreinterpretq_u32_s32(vShift));1192uint32x4_t vAddRes4 = vaddq_u32(vDiff_Res4, vreinterpretq_u32_s32(vShift));1193int16x4_t vShrRes1 = vshrn_n_s32(vreinterpretq_s32_u32(vAddRes1), 8);1194int16x4_t vShrRes2 = vshrn_n_s32(vreinterpretq_s32_u32(vAddRes2), 8);1195int16x4_t vShrRes3 = vshrn_n_s32(vreinterpretq_s32_u32(vAddRes3), 8);1196int16x4_t vShrRes4 = vshrn_n_s32(vreinterpretq_s32_u32(vAddRes4), 8);11971198int16x8_t vc0 = 
vdupq_n_s16((s16)v0);1199int8x8_t vShrRes1_s8 = vshrn_n_s16(vcombine_s16(vShrRes1, vShrRes2), 4);1200uint16x8_t vCltRes_u16 = vcltq_s16(vcombine_s16(vShrRes3, vShrRes4), vc0);1201int8x8_t vShrRes2_s8 = vshrn_n_s16(vcombine_s16(vShrRes3, vShrRes4), 4);12021203int8x8_t vCltRes_s8 = vmovn_s16(vreinterpretq_s16_u16(vCltRes_u16));1204int8x8_t vcHRange = vdup_n_s8((s8)vhrange);1205uint8x8_t vHResAdd = vand_u8(vreinterpret_u8_s8(vCltRes_s8), vreinterpret_u8_s8(vcHRange));1206int8x8_t vHRes = vadd_s8(vShrRes2_s8, vreinterpret_s8_u8(vHResAdd));12071208uint8x8x3_t vHsv;1209vHsv.val[0] = vreinterpret_u8_s8(vHRes);1210vHsv.val[1] = vreinterpret_u8_s8(vShrRes1_s8);1211vHsv.val[2] = vMax;12121213return vHsv;1214}12151216const u8 fastSaturate8u[] =1217{12180, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,12190, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,12200, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,12210, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,12220, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,12230, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,12240, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,12250, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,12260, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,12270, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,12280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,12290, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,12300, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,12310, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,12320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,12330, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,12340, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,123516, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,123632, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,123748, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,123864, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,123980, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,124096, 97, 98, 99, 100, 101, 102, 103, 
104, 105, 106, 107, 108, 109, 110, 111,1241112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,1242128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,1243144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,1244160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,1245176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,1246192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,1247208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,1248224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,1249240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255,1250255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,1251255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,1252255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,1253255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,1254255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,1255255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,1256255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,1257255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,1258255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,1259255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,1260255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,1261255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,1262255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,1263255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,1264255, 255, 255, 255, 255, 255, 255, 255, 255, 
255, 255, 255, 255, 255, 255, 255,1265255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,12662551267};12681269inline void convertToHSV(const s32 r, const s32 g, const s32 b,1270const s32 &hrange, const s32 &hsv_shift,1271u8* dst)1272{1273s32 h, s, v = b;1274s32 vmin = b, diff;1275s32 vr, vg;12761277v += fastSaturate8u[g-v+256];1278v += fastSaturate8u[r-v+256];1279vmin -= fastSaturate8u[vmin-g+256];1280vmin -= fastSaturate8u[vmin-r+256];12811282diff = v - vmin;1283vr = v == r ? -1 : 0;1284vg = v == g ? -1 : 0;12851286s = (s32(diff * (255 << hsv_shift) * (1.0f/(f32)v)) + (1 << (hsv_shift-1))) >> hsv_shift;1287h = (vr & (g - b)) + (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff))));1288h = ((h * s32((hrange << hsv_shift)/(6.f*diff) + 0.5)) + (1 << (hsv_shift-1))) >> hsv_shift;1289h += h < 0 ? hrange : 0;12901291dst[0] = internal::saturate_cast<u8>(h);1292dst[1] = (u8)s;1293dst[2] = (u8)v;1294}12951296#define CONVERT_TO_HSV_ASM(loadop, rreg, breg) \1297__asm__ ( \1298#loadop ", [%[in]] @RGB \n\t" \1299"vmin.u8 d3, d0, d1 @VMin (d3) \n\t" \1300"vmax.u8 d6, d0, d1 @V (d6) \n\t" \1301"vmovl.u8 q2, " #rreg " @V16_R (d4,d5) \n\t" \1302"vmovl.u8 q4, d1 @V16_G (d8,d9) \n\t" \1303"vmax.u8 d6, d6, d2 \n\t" \1304"vmin.u8 d3, d3, d2 \n\t" \1305"vmovl.u8 q0, " #breg " @V16_B (d0,d1) \n\t" \1306"vsubl.u8 q8, d6, d3 @V16_Diff (d16,d17) \n\t" \1307\1308"vmovl.u8 q5, d6 @V16_V (d10,d11) \n\t" \1309"vadd.s16 q10, q8, q8 @V16_Diff_2 (d20,d21) \n\t" \1310"vmovl.u16 q9, d16 @V32_Diff_L (d18,d19) \n\t" \1311"vmovl.u16 q11, d17 @V32_Diff_H (d22,d23) \n\t" \1312"vceq.u16 q12, q2, q5 @V==R(d24,d25) \n\t" \1313"vceq.u16 q13, q4, q5 @V==G(d26,d27) \n\t" \1314\1315"vsub.s16 q8, q4, q0 @V16_G-B (d16,d17) \n\t" \1316"vmvn.u16 q15, q12 @V16~R \n\t" \1317"vsub.s16 q6, q0, q2 @V16_B-R (d12,d13) \n\t" \1318"vsub.s16 q7, q2, q4 @V16_R-G (d14,d15) \n\t" \1319"vand.u16 q1, q13, q15 @VMask2 \n\t" \1320"vand.u16 q2, q8, q12 @V16_H(d4,d5) \n\t" \1321"vadd.s16 q4, 
q6, q10 @V16_H2 \n\t" \1322"vmvn.u16 q12, q13 @V16~G \n\t" \1323"vadd.s16 q6, q10, q10 @VDiff16_4 (d12,d13) \n\t" \1324"vand.u16 q8, q15, q12 @VMask3 \n\t" \1325"vand.u16 q15, q4, q1 @vH2(d30,d31) \n\t" \1326"vadd.s16 q7, q7, q6 @V16_H3 (d14,d15) \n\t" \1327"vadd.s16 q14, q2, q15 @vH16 \n\t" \1328"vmovl.u16 q12, d10 @V32_V_L \n\t" \1329"vand.s16 q7, q7, q8 @vH16 \n\t" \1330"vmovl.u16 q13, d11 @V32_V_H \n\t" \1331"vadd.s16 q2, q14, q7 @V16_Diff_4 \n\t" \1332\1333"vdup.32 q4, %[v6] \n\t" \1334"vmul.u32 q14, q9, q4 \n\t" \1335"vmul.u32 q15, q11, q4 \n\t" \1336"vcvt.f32.u32 q4, q12 @VF1 (d8,d9) \n\t" \1337"vcvt.f32.u32 q8, q13 @VF2 \n\t" \1338"vcvt.f32.u32 q0, q14 @HF1 \n\t" \1339"vcvt.f32.u32 q1, q15 @HF2 \n\t" \1340"vrecpe.f32 q12, q4 @Vxinv \n\t" \1341"vrecpe.f32 q13, q8 @Vxinv \n\t" \1342"vrecpe.f32 q5, q0 @Vxinv \n\t" \1343"vrecpe.f32 q7, q1 @Vxinv \n\t" \1344"vrecps.f32 q14, q12, q4 @Vst1 \n\t" \1345"vrecps.f32 q15, q13, q8 @Vst1 \n\t" \1346"vrecps.f32 q10, q5, q0 @Vst1 \n\t" \1347"vrecps.f32 q6, q7, q1 @Vst1 \n\t" \1348"vmul.f32 q4, q12, q14 \n\t" \1349"vmul.f32 q8, q13, q15 \n\t" \1350"vmul.f32 q0, q5, q10 \n\t" \1351"vmul.f32 q1, q7, q6 \n\t" \1352"vdup.32 q12, %[vsdiv_table] \n\t" \1353"vmul.f32 q14, q4, q12 \n\t" \1354"vmul.f32 q15, q8, q12 \n\t" \1355"vdup.32 q12, %[vhdiv_table] \n\t" \1356"vmul.f32 q10, q0, q12 \n\t" \1357"vmul.f32 q6, q1, q12 \n\t" \1358\1359"vdup.32 q12, %[bias] \n\t" \1360\1361"vadd.f32 q7, q14, q12 \n\t" \1362"vadd.f32 q13, q15, q12 \n\t" \1363"vcvt.u32.f32 q4, q7 \n\t" \1364"vcvt.u32.f32 q8, q13 \n\t" \1365\1366"vadd.f32 q14, q10, q12 \n\t" \1367"vadd.f32 q7, q6, q12 \n\t" \1368"vcvt.u32.f32 q0, q14 \n\t" \1369"vcvt.u32.f32 q1, q7 @Vres \n\t" \1370\1371"vmovl.s16 q7, d4 @V32_H_L (d14,d15) \n\t" \1372"vmovl.s16 q5, d5 @V32_H_H (d10,d11) \n\t" \1373"vmul.u32 q14, q9, q4 \n\t" \1374"vmul.u32 q15, q11, q8 \n\t" \1375"vmul.u32 q10, q7, q0 \n\t" \1376"vmul.u32 q6, q5, q1 \n\t" \1377\1378"vdup.32 q12, %[vshift] \n\t" \1379"vadd.u32 q13, q14, 
q12 \n\t" \1380"vadd.u32 q8, q15, q12 \n\t" \1381"vadd.u32 q0, q10, q12 \n\t" \1382"vadd.u32 q1, q6, q12 \n\t" \1383"vshrn.s32 d8, q13, #8 \n\t" \1384"vshrn.s32 d9, q8, #8 \n\t" \1385"vshrn.s32 d10, q0, #8 \n\t" \1386"vshrn.s32 d11, q1, #8 \n\t" \1387\1388"vdup.16 q8, %[v0] \n\t" \1389"vshrn.s16 d5, q4, #4 \n\t" \1390"vclt.s16 q9, q5, q8 \n\t" \1391"vshrn.s16 d4, q5, #4 \n\t" \1392\1393"vmovn.s16 d9, q9 \n\t" \1394"vdup.8 d7, %[vhrange] \n\t" \1395"vand.u8 d10, d9, d7 \n\t" \1396"vadd.s8 d4, d4, d10 \n\t" \1397"vst3.8 {d4-d6}, [%[out]] @HSV \n\t" \1398: /*no output*/ \1399: [out] "r" (dst + dj), [in] "r" (src + sj), \1400[vsdiv_table] "r" (vsdiv_table), \1401[vshift] "r" (vshift), \1402[vhdiv_table] "r" (vhdiv_table), \1403[v6] "r" (v6), [vhrange] "r" (vhrange), \1404[v0] "r" (v0), [bias] "r" (bias) \1405: "d0","d1","d2","d3","d4","d5","d6","d7", \1406"d8","d9","d10","d11","d12","d13","d14","d15", \1407"d16","d17","d18","d19","d20","d21","d22","d23", \1408"d24","d25","d26","d27","d28","d29","d30","d31" \1409);14101411#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)14121413#define YCRCB_CONSTS \1414register int16x4_t vcYR asm ("d31") = vmov_n_s16(4899); \1415register int16x4_t vcYG asm ("d30") = vmov_n_s16(9617); \1416register int16x4_t vcYB asm ("d29") = vmov_n_s16(1868); \1417register int16x4_t vcCrG asm ("d28") = vmov_n_s16(6860); \1418register int16x4_t vcCrB asm ("d27") = vmov_n_s16(1332); \1419register int16x4_t vcCbR asm ("d26") = vmov_n_s16(2765); \1420register int16x4_t vcCbG asm ("d25") = vmov_n_s16(5427);14211422#else14231424#define YCRCB_CONSTS \1425const s16 convertCoeffs[] = { 4899, 4899, 4899, 4899, \14269617, 9617, 9617, 9617, \14271868, 1868, 1868, 1868, \14286860, 6860, 6860, 6860, \14291332, 1332, 1332, 1332, \14302765, 2765, 2765, 2765, \14315427, 5427, 5427, 5427 }; \1432const int16x8_t vcYRG = vld1q_s16(convertCoeffs); /*YR and YG*/ \1433const int16x4_t vcYB = vld1_s16(convertCoeffs + 
8); /*YB*/ \1434const int16x8_t vcCrGB = vld1q_s16(convertCoeffs + 12); /*CrG and CrB*/ \1435const int16x8_t vcCbRG = vld1q_s16(convertCoeffs + 20); /*CbR and CbG*/14361437#endif14381439#define CONVERTTOYCRCB(loadcmd, rreg, greg, breg) \1440__asm__ ( \1441#loadcmd ", [%[in]] @RGB \n\t" \1442"vmovl.u8 q2, " #rreg " @R (d4,d5) \n\t" \1443"vmovl.u8 q3, " #greg " @G (d6,d7) \n\t" \1444"vmovl.u8 q4, " #breg " @B (d8,d9) \n\t" \1445\1446"vshll.u16 q7, d4, #13 @Cr(q7,q8): R \n\t" \1447"vmull.u16 q5, d6, d30 @Y (q5,q6): G \n\t" \1448"vshll.u16 q9, d8, #13 @Cb(q9,q10): B \n\t" \1449"vshll.u16 q8, d5, #13 @Cr(q7,q8): R \n\t" \1450"vmull.u16 q6, d7, d30 @Y (q5,q6): G \n\t" \1451"vshll.u16 q10, d9, #13 @Cb(q9,q10): B \n\t" \1452\1453"vmlsl.s16 q7, d6, d28 @Cr(q7,q8): RG \n\t" \1454"vmlal.s16 q5, d8, d29 @Y (q5,q6): GB \n\t" \1455"vmlsl.s16 q9, d4, d26 @Cb(q9,q10): BR \n\t" \1456"vmlsl.s16 q8, d7, d28 @Cr(q7,q8): RG \n\t" \1457"vmlal.s16 q6, d9, d29 @Y (q5,q6): GB \n\t" \1458"vmlsl.s16 q10, d5, d26 @Cb(q9,q10): BR \n\t" \1459\1460"vmlsl.s16 q7, d8, d27 @Cr(q7,q8): RGB \n\t" \1461"vmlal.s16 q5, d4, d31 @Y (q5,q6): GBR \n\t" \1462"vmlsl.s16 q9, d6, d25 @Cb(q9,q10): BRG \n\t" \1463"vmlsl.s16 q8, d9, d27 @Cr(q7,q8): RGB \n\t" \1464"vmlal.s16 q6, d5, d31 @Y (q5,q6): GBR \n\t" \1465"vmlsl.s16 q10, d7, d25 @Cb(q9,q10): BRG \n\t" \1466\1467"vrshrn.s32 d4, q7, #14 @Cr -> q2 \n\t" \1468"vrshrn.s32 d8, q5, #14 @Y -> q4 \n\t" \1469"vrshrn.s32 d6, q9, #14 @Cb -> q3 \n\t" \1470"vrshrn.s32 d5, q8, #14 @Cr -> q2 \n\t" \1471"vrshrn.s32 d9, q6, #14 @Y -> q4 \n\t" \1472"vrshrn.s32 d7, q10, #14 @Cb -> q3 \n\t" \1473\1474"vmov.s16 q5, #128 \n\t" \1475"vmov.s16 q6, #128 \n\t" \1476"vadd.i16 q5, q2 @Cr -> q5 \n\t" \1477"vadd.i16 q6, q3 @Cb -> q6 \n\t" \1478\1479"vqmovn.u16 d4, q4 \n\t" \1480"vqmovun.s16 d5, q5 \n\t" \1481"vqmovun.s16 d6, q6 \n\t" \1482\1483"vst3.8 {d4-d6}, [%[out]] \n\t" \1484: /*no output*/ \1485: [out] "r" (dst + dj), [in] "r" (src + sj), \1486"w" (vcYR), "w" (vcYG), "w" (vcYB), 
\1487"w" (vcCrB), "w" (vcCrG), "w" (vcCbG), "w" (vcCbR) \1488: "d0","d1","d2","d3","d4","d5","d6","d7", \1489"d8","d9","d10","d11","d12","d13","d14","d15", \1490"d16","d17","d18","d19","d20","d21" \1491);149214931494inline uint8x8x3_t convertToYCrCb( const int16x8_t& vR, const int16x8_t& vG, const int16x8_t& vB,1495const int16x8_t& vcYRG, const int16x4_t& vcYB,1496const int16x8_t& vcCrGB, const int16x8_t& vcCbRG )1497{1498int32x4_t vCrL = vshll_n_s16(vget_low_s16(vR), 13); // R1499int32x4_t vCrH = vshll_n_s16(vget_high_s16(vR), 13); // R1500int32x4_t vYL = vmull_s16(vget_low_s16(vG), vget_high_s16(vcYRG)); // G1501int32x4_t vYH = vmull_s16(vget_high_s16(vG), vget_high_s16(vcYRG)); // G1502int32x4_t vCbL = vshll_n_s16(vget_low_s16(vB), 13); // B1503int32x4_t vCbH = vshll_n_s16(vget_high_s16(vB), 13); // B15041505vCrL = vmlsl_s16(vCrL, vget_low_s16(vG), vget_low_s16(vcCrGB)); // RG1506vCrH = vmlsl_s16(vCrH, vget_high_s16(vG), vget_low_s16(vcCrGB)); // RG1507vYL = vmlal_s16(vYL, vget_low_s16(vB), vcYB); // GB1508vYH = vmlal_s16(vYH, vget_high_s16(vB), vcYB); // GB1509vCbL = vmlsl_s16(vCbL, vget_low_s16(vR), vget_low_s16(vcCbRG)); // BR1510vCbH = vmlsl_s16(vCbH, vget_high_s16(vR), vget_low_s16(vcCbRG)); // BR15111512vCrL = vmlsl_s16(vCrL, vget_low_s16(vB), vget_high_s16(vcCrGB)); // RGB1513vCrH = vmlsl_s16(vCrH, vget_high_s16(vB), vget_high_s16(vcCrGB)); // RGB1514vYL = vmlal_s16(vYL, vget_low_s16(vR), vget_low_s16(vcYRG)); // GBR1515vYH = vmlal_s16(vYH, vget_high_s16(vR), vget_low_s16(vcYRG)); // GBR1516vCbL = vmlsl_s16(vCbL, vget_low_s16(vG), vget_high_s16(vcCbRG)); // BRG1517vCbH = vmlsl_s16(vCbH, vget_high_s16(vG), vget_high_s16(vcCbRG)); // BRG15181519int16x4_t vCrL_ = vrshrn_n_s32(vCrL, 14);1520int16x4_t vCrH_ = vrshrn_n_s32(vCrH, 14);1521int16x4_t vYL_ = vrshrn_n_s32(vYL, 14);1522int16x4_t vYH_ = vrshrn_n_s32(vYH, 14);1523int16x4_t vCbL_ = vrshrn_n_s32(vCbL, 14);1524int16x4_t vCbH_ = vrshrn_n_s32(vCbH, 14);15251526int16x8_t vCr = vmovq_n_s16(128);1527int16x8_t 
vCb = vmovq_n_s16(128);15281529vCr = vaddq_s16(vCr, vcombine_s16(vCrL_, vCrH_));1530vCb = vaddq_s16(vCb, vcombine_s16(vCbL_, vCbH_));15311532uint8x8x3_t vYCrCb;1533vYCrCb.val[0] = vqmovn_u16(vreinterpretq_u16_s16(vcombine_s16(vYL_, vYH_)));1534vYCrCb.val[1] = vqmovun_s16(vCr);1535vYCrCb.val[2] = vqmovun_s16(vCb);15361537return vYCrCb;1538}15391540#define S_CONVERTTOYCRCB(R, G, B) \1541s32 Y = (R * 4899 + G * 9617 + B * 1868 + (1 << 13)) >> 14; \1542s32 Cr = 128 + ((R * 8192 - G * 6860 - B * 1332 + (1 << 13)) >> 14); \1543s32 Cb = 128 + ((R * (-2765) - G * 5427 + B * 8192 + (1 << 13)) >> 14); \1544dst[dj + 0] = internal::saturate_cast<u8>(Y); \1545dst[dj + 1] = internal::saturate_cast<u8>(Cr); \1546dst[dj + 2] = internal::saturate_cast<u8>(Cb);15471548#define COEFF_Y ( 149)1549#define COEFF_BU ( 129)1550#define COEFF_RV ( 102)1551#define COEFF_GU ( 25)1552#define COEFF_GV ( 52)1553#define COEFF_R (-14248)1554#define COEFF_G ( 8663)1555#define COEFF_B (-17705)15561557#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)1558#define YUV420ALPHA3_CONST1559#define YUV420ALPHA4_CONST register uint8x16_t c255 asm ("q13") = vmovq_n_u8(255);1560#define YUV420ALPHA3_CONVERT1561#define YUV420ALPHA4_CONVERT , "w" (c255)1562#define YUV420STORE1CMD3 "vst3.8 {d20, d22, d24}"1563#define YUV420STORE2CMD3 "vst3.8 {d21, d23, d25}"1564#define YUV420STORE1CMD4 "vst4.8 {d20, d22, d24, d26}"1565#define YUV420STORE2CMD4 "vst4.8 {d21, d23, d25, d27}"15661567#define YUV420_CONSTS(cn, bIdx, vIdx) \1568register const s32 cR = s16(COEFF_R); \1569register const s32 cG = s16(COEFF_G); \1570register const s32 cB = s16(COEFF_B); \1571\1572register uint8x16_t vc16 asm ("q15") = vmovq_n_u8(16); \1573register uint8x8_t cGU asm ("d14") = vmov_n_u8(COEFF_GU); \1574register uint8x8_t cGV asm ("d15") = vmov_n_u8(COEFF_GV); \1575register uint8x8_t cRV asm ("d16") = vmov_n_u8(COEFF_RV); \1576register uint8x8_t cBU asm ("d17") = vmov_n_u8(COEFF_BU); 
\1577register uint8x16_t cRGBY asm ("q3") = vmovq_n_u8(COEFF_Y); \1578YUV420ALPHA##cn##_CONST15791580#define CONVERTYUV420TORGB(cn, ureg, vreg, rreg, breg) \1581__asm__ ( \1582"vld2.8 {d0-d1}, [%[inUV]] @UV \n\t" \1583"vdup.16 q4, %[cG] @cG \n\t" \1584"vld2.8 {d2-d3}, [%[inY1]] @YY \n\t" \1585"vdup.16 "#rreg", %[cR] @cR \n\t" \1586"vld2.8 {d4-d5}, [%[inY2]] @YY \n\t" \1587"vdup.16 "#breg", %[cB] @cB \n\t" \1588"vmlsl.u8 q4, "#ureg", d14 @cG-25u \n\t" \1589"vmax.u8 q1, q15 @max(Y,16) \n\t" \1590"vmlal.u8 "#rreg", "#vreg", d16 @cR+102*v \n\t" \1591"vmlal.u8 "#breg", "#ureg", d17 @cB+129*u \n\t" \1592"vmax.u8 q2, q15 @max(Y,16) \n\t" \1593"vmlsl.u8 q4, "#vreg", d15 @cG-25u-52v \n\t" \1594/*q10,q11,q12,q13 - for output*/ \1595"vmull.u8 q9, d3, d6 @h 149*y \n\t" \1596"vmull.u8 q10, d2, d7 @l 149*y \n\t" \1597"vshr.u16 q9, #1 @h (149*y)/2 \n\t" \1598"vshr.u16 q10, #1 @l (149*y)/2 \n\t" \1599\1600"vhadd.s16 q0, q9, q4 @hG ((149*y)/2 + cG - 25*u - 52*v)/2 \n\t" \1601"vhadd.s16 q12, q10, q6 @lB ((149*y)/2 + cB + 129*u)/2 \n\t" \1602"vhadd.s16 q1, q9, q5 @hR ((149*y)/2 + cR + 102*v)/2 \n\t" \1603"vhadd.s16 q11, q10, q4 @lG ((149*y)/2 + cG - 25*u - 52*v)/2 \n\t" \1604"vhadd.s16 q9, q6 @hB ((149*y)/2 + cB + 129*u)/2 \n\t" \1605"vhadd.s16 q10, q5 @lR ((149*y)/2 + cR + 102*v)/2 \n\t" \1606\1607"vqrshrun.s16 d24, q12, #5 @lB ((149*y)/2 + cB + 129*u)/2/32 \n\t" \1608"vqrshrun.s16 d22, q11, #5 @lG ((149*y)/2 + cG - 25*u - 52*v)/2/32 \n\t" \1609"vqrshrun.s16 d20, q10, #5 @lR ((149*y)/2 + cR + 102*v)/2/32 \n\t" \1610"vqrshrun.s16 d23, q0, #5 @hG ((149*y)/2 + cG - 25*u - 52*v)/2/32 \n\t" \1611"vqrshrun.s16 d21, q1, #5 @hR ((149*y)/2 + cR + 102*v)/2/32 \n\t" \1612"vqrshrun.s16 d25, q9, #5 @hB ((149*y)/2 + cB + 129*u)/2/32 \n\t" \1613\1614"vzip.8 d22, d23 @G \n\t" \1615"vzip.8 d20, d21 @R \n\t" \1616"vzip.8 d24, d25 @B \n\t" \1617\1618YUV420STORE1CMD##cn", [%[out1]] \n\t" \1619YUV420STORE2CMD##cn", [%[out1x]] \n\t" \1620\1621"vmull.u8 q9, d5, d6 @h 149*y \n\t" \1622"vmull.u8 q10, d4, d7 
@l 149*y \n\t" \1623"vshr.u16 q9, #1 @h (149*y)/2 \n\t" \1624"vshr.u16 q10, #1 @l (149*y)/2 \n\t" \1625\1626"vhadd.s16 q0, q9, q4 @hG ((149*y)/2 + cG - 25*u - 52*v)/2 \n\t" \1627"vhadd.s16 q12, q10, q6 @lB ((149*y)/2 + cB + 129*u)/2 \n\t" \1628"vhadd.s16 q1, q9, q5 @hR ((149*y)/2 + cR + 102*v)/2 \n\t" \1629"vhadd.s16 q11, q10, q4 @lG ((149*y)/2 + cG - 25*u - 52*v)/2 \n\t" \1630"vhadd.s16 q9, q6 @hB ((149*y)/2 + cB + 129*u)/2 \n\t" \1631"vhadd.s16 q10, q5 @lR ((149*y)/2 + cR + 102*v)/2 \n\t" \1632\1633"vqrshrun.s16 d24, q12, #5 @lB ((149*y)/2 + cB + 129*u)/2/32 \n\t" \1634"vqrshrun.s16 d22, q11, #5 @lG ((149*y)/2 + cG - 25*u - 52*v)/2/32 \n\t" \1635"vqrshrun.s16 d20, q10, #5 @lR ((149*y)/2 + cR + 102*v)/2/32 \n\t" \1636"vqrshrun.s16 d23, q0, #5 @hG ((149*y)/2 + cG - 25*u - 52*v)/2/32 \n\t" \1637"vqrshrun.s16 d21, q1, #5 @hR ((149*y)/2 + cR + 102*v)/2/32 \n\t" \1638"vqrshrun.s16 d25, q9, #5 @hB ((149*y)/2 + cB + 129*u)/2/32 \n\t" \1639\1640"vzip.8 d22, d23 @G \n\t" \1641"vzip.8 d20, d21 @R \n\t" \1642"vzip.8 d24, d25 @B \n\t" \1643\1644YUV420STORE1CMD##cn", [%[out2]] \n\t" \1645YUV420STORE2CMD##cn", [%[out2x]] \n\t" \1646\1647: /*no output*/ \1648: [out1] "r" (dst1 + dj), [out2] "r" (dst2 + dj), \1649[out1x] "r" (dst1 + dj+cn*8), [out2x] "r" (dst2 + dj+cn*8), \1650[inUV] "r" (uv+j), [inY1] "r" (y1+j), [inY2] "r" (y2+j), \1651[cR] "r" (cR), [cG] "r" (cG), [cB] "r" (cB), \1652"w" (vc16), "w" (cGU), "w" (cGV), "w" (cBU), "w" (cRV), "w" (cRGBY) YUV420ALPHA##cn##_CONVERT \1653: "d0","d1","d2","d3","d4","d5","d8","d9","d10","d11","d12", \1654"d13","d18","d19","d20","d21","d22","d23","d24","d25" \1655);16561657#else16581659template<int bIdx>1660struct _convertYUV420Internals1661{1662uint16x8_t vc14216;1663uint16x8_t vc17672;1664uint16x8_t vc8696;1665uint8x8_t vc102;1666uint8x8_t vc25;1667uint8x8_t vc129;1668uint8x8_t vc52;1669uint16x8_t vc_1;1670uint8x8_t vc149;1671uint8x8_t vc16;1672_convertYUV420Internals()1673{1674vc14216 = vdupq_n_u16(-COEFF_R);1675vc17672 = 
vdupq_n_u16(-COEFF_B);1676vc8696 = vdupq_n_u16(COEFF_G);1677vc102 = vdup_n_u8(COEFF_RV);1678vc25 = vdup_n_u8(COEFF_GU);1679vc129 = vdup_n_u8(COEFF_BU);1680vc52 = vdup_n_u8(COEFF_GV);1681vc_1 = vdupq_n_u16((uint16_t)-1);1682vc149 = vdup_n_u8(COEFF_Y);1683vc16 = vdup_n_u8(16);1684}16851686inline void UVrgbToRGB( const int16x8_t &ruv, const int16x8_t &guv, const int16x8_t &buv,1687const u8 *y, uint8x16x3_t &rgbl )1688{1689//y get line1690uint8x8x2_t yl = vld2_u8(y);1691yl.val[0] = vmax_u8(yl.val[0], vc16);1692yl.val[1] = vmax_u8(yl.val[1], vc16);16931694//y part line1695uint16x8_t yodd1 = vmlal_u8(vc_1, yl.val[0], vc149); //(-1+149*y)1696uint16x8_t yevn1 = vmlal_u8(vc_1, yl.val[1], vc149); //(-1+149*y)1697int16x8_t yodd1h = (int16x8_t)vshrq_n_u16(yodd1, 1); //(-1+149*y)/21698int16x8_t yevn1h = (int16x8_t)vshrq_n_u16(yevn1, 1); //(-1+149*y)/216991700//y line calc rgb1701int16x8_t rodd1w = vhsubq_s16(yodd1h, ruv); //((-1+149*y)/2 - (14216-102*v))/21702int16x8_t gevn1w = vhaddq_s16(yevn1h, guv); //((-1+149*y)/2 + ((8696-25*u)-52*v))/21703int16x8_t bodd1w = vhsubq_s16(yodd1h, buv); //((-1+149*y)/2 - (17672-129*u))/21704int16x8_t revn1w = vhsubq_s16(yevn1h, ruv); //((-1+149*y)/2 - (14216-102*v))/21705int16x8_t godd1w = vhaddq_s16(yodd1h, guv); //((-1+149*y)/2 + ((8696-25*u)-52*v))/21706int16x8_t bevn1w = vhsubq_s16(yevn1h, buv); //((-1+149*y)/2 - (17672-129*u))/217071708//y line clamp + narrow1709uint8x8_t rodd1n = vqshrun_n_s16(rodd1w, 5);1710uint8x8_t revn1n = vqshrun_n_s16(revn1w, 5);1711uint8x8_t godd1n = vqshrun_n_s16(godd1w, 5);1712uint8x8x2_t r1 = vzip_u8 (rodd1n, revn1n);1713uint8x8_t gevn1n = vqshrun_n_s16(gevn1w, 5);1714uint8x8_t bodd1n = vqshrun_n_s16(bodd1w, 5);1715uint8x8x2_t g1 = vzip_u8 (godd1n, gevn1n);1716uint8x8_t bevn1n = vqshrun_n_s16(bevn1w, 5);1717uint8x8x2_t b1 = vzip_u8 (bodd1n, bevn1n);1718rgbl.val[2 - bIdx] = vcombine_u8(r1.val[0], r1.val[1]);1719rgbl.val[1] = vcombine_u8(g1.val[0], g1.val[1]);1720rgbl.val[0 + bIdx] = vcombine_u8(b1.val[0], 
b1.val[1]);1721}1722};17231724template<int cn, int bIdx, int vIdx>1725struct _convertYUV4201726{1727_convertYUV420Internals<bIdx> convertYUV420Internals;17281729inline void ToRGB( const u8 *y1, const u8 *y2, const u8 *uv,1730u8 *dst1, u8 *dst2 )1731{1732uint8x8x2_t raw_uv = vld2_u8(uv);1733uint16x8_t gu = vmlsl_u8(convertYUV420Internals.vc8696, raw_uv.val[1-vIdx], convertYUV420Internals.vc25); //(8696-25*u)1734int16x8_t ruv = (int16x8_t)vmlsl_u8(convertYUV420Internals.vc14216, raw_uv.val[vIdx], convertYUV420Internals.vc102); //(14216-102*v)17351736int16x8_t buv = (int16x8_t)vmlsl_u8(convertYUV420Internals.vc17672, raw_uv.val[1-vIdx], convertYUV420Internals.vc129); //(17672-129*u)1737int16x8_t guv = (int16x8_t)vmlsl_u8(gu, raw_uv.val[vIdx], convertYUV420Internals.vc52); //((8696-25*u)-52*v))17381739uint8x16x3_t rgbl;1740//y line11741convertYUV420Internals.UVrgbToRGB(ruv, guv, buv, y1, rgbl);1742vst3q_u8(dst1, rgbl);1743//y line21744convertYUV420Internals.UVrgbToRGB(ruv, guv, buv, y2, rgbl);1745vst3q_u8(dst2, rgbl);1746}1747};17481749template<int bIdx, int vIdx>1750struct _convertYUV420<4, bIdx, vIdx>1751{1752_convertYUV420Internals<bIdx> convertYUV420Internals;17531754inline void ToRGB( const u8 *y1, const u8 *y2, const u8 *uv,1755u8 *dst1, u8 *dst2 )1756{1757uint8x8x2_t raw_uv = vld2_u8(uv);1758uint16x8_t gu = vmlsl_u8(convertYUV420Internals.vc8696, raw_uv.val[1-vIdx], convertYUV420Internals.vc25); //(8696-25*u)1759int16x8_t ruv = (int16x8_t)vmlsl_u8(convertYUV420Internals.vc14216, raw_uv.val[vIdx], convertYUV420Internals.vc102); //(14216-102*v)17601761int16x8_t buv = (int16x8_t)vmlsl_u8(convertYUV420Internals.vc17672, raw_uv.val[1-vIdx], convertYUV420Internals.vc129); //(17672-129*u)1762int16x8_t guv = (int16x8_t)vmlsl_u8(gu, raw_uv.val[vIdx], convertYUV420Internals.vc52); //((8696-25*u)-52*v))17631764union { uint8x16x4_t v4; uint8x16x3_t v3; } rgbl;1765rgbl.v4.val[3] = vdupq_n_u8(0xff);1766//y line11767convertYUV420Internals.UVrgbToRGB(ruv, guv, buv, y1, 
rgbl.v3);1768vst4q_u8(dst1, rgbl.v4);1769//y line21770convertYUV420Internals.UVrgbToRGB(ruv, guv, buv, y2, rgbl.v3);1771vst4q_u8(dst2, rgbl.v4);1772}1773};17741775#define YUV420_CONSTS(cn, bIdx, vIdx) _convertYUV420<cn, bIdx, vIdx> convertYUV420;17761777#endif17781779template <int cn> inline void fillAlpha(u8 *, u8 *){}1780template <> inline void fillAlpha<4>(u8 *dst1, u8 *dst2)1781{1782dst1[3] = 255;1783dst1[7] = 255;1784dst2[3] = 255;1785dst2[7] = 255;1786}1787template <int cn, int bIdx, int vIdx>1788inline void convertYUV420ToRGB(const u8 *y1, const u8 *y2, const u8 *uv, u8* dst1, u8 *dst2)1789{1790int Y11 = y1[0];1791int Y12 = y1[1];1792int Y21 = y2[0];1793int Y22 = y2[1];17941795int U = uv[1 - vIdx];1796int V = uv[vIdx];17971798int y11 = (COEFF_Y * std::max(16, Y11)) >> 1;1799int y12 = (COEFF_Y * std::max(16, Y12)) >> 1;1800int y21 = (COEFF_Y * std::max(16, Y21)) >> 1;1801int y22 = (COEFF_Y * std::max(16, Y22)) >> 1;18021803int uvR = COEFF_R + COEFF_RV * V;1804int uvG = COEFF_G - COEFF_GU * U - COEFF_GV * V;1805int uvB = COEFF_B + COEFF_BU * U;18061807dst1[2-bIdx] = internal::saturate_cast<u8>((((y11 + uvR) >> 1) + (1 << 4)) >> 5);1808dst1[1] = internal::saturate_cast<u8>((((y11 + uvG) >> 1) + (1 << 4)) >> 5);1809dst1[bIdx] = internal::saturate_cast<u8>((((y11 + uvB) >> 1) + (1 << 4)) >> 5);18101811dst1[cn+2-bIdx] = internal::saturate_cast<u8>((((y12 + uvR) >> 1) + (1 << 4)) >> 5);1812dst1[cn+1] = internal::saturate_cast<u8>((((y12 + uvG) >> 1) + (1 << 4)) >> 5);1813dst1[cn+bIdx] = internal::saturate_cast<u8>((((y12 + uvB) >> 1) + (1 << 4)) >> 5);18141815dst2[2-bIdx] = internal::saturate_cast<u8>((((y21 + uvR) >> 1) + (1 << 4)) >> 5);1816dst2[1] = internal::saturate_cast<u8>((((y21 + uvG) >> 1) + (1 << 4)) >> 5);1817dst2[bIdx] = internal::saturate_cast<u8>((((y21 + uvB) >> 1) + (1 << 4)) >> 5);18181819dst2[cn+2-bIdx] = internal::saturate_cast<u8>((((y22 + uvR) >> 1) + (1 << 4)) >> 5);1820dst2[cn+1] = internal::saturate_cast<u8>((((y22 + uvG) >> 1) + (1 << 4)) 
>> 5);1821dst2[cn+bIdx] = internal::saturate_cast<u8>((((y22 + uvB) >> 1) + (1 << 4)) >> 5);18221823fillAlpha<cn>(dst1, dst2);1824}18251826// converts R, G, B (B, G, R) pixels to RGB(BGR)565 format respectively1827inline uint8x16x2_t convertTo565( const uint8x16_t& vR, const uint8x16_t& vG, const uint8x16_t& vB )1828{1829uint8x16x2_t vRgb565; // rrrrRRRR ggggGGGG bbbbBBBB18301831vRgb565.val[1] = vsriq_n_u8(vB, vG, 5); // xxxxxxxx bbbbBggg1832vRgb565.val[0] = vshlq_n_u8(vG, 3); // gGGGG000 bbbbBggg1833vRgb565.val[0] = vsriq_n_u8(vRgb565.val[0], vR, 3); // gGGrrrrR bbbbBggg18341835return vRgb565;1836}1837inline void convertTo565( const u16 R, const u16 G, const u16 B, u8 * dst )1838{1839*((u16*)dst) = (R >> 3)|((G&~3) << 3)|((B&~7) << 8);1840}1841#endif18421843} //namespace18441845void rgb2hsv(const Size2D &size,1846const u8 * srcBase, ptrdiff_t srcStride,1847u8 * dstBase, ptrdiff_t dstStride,1848s32 hrange)1849{1850internal::assertSupportedConfiguration();1851#ifdef CAROTENE_NEON1852size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; // vector loop bound: j < roiw8 means at least 8 pixels remain
    // Fixed-point fraction bits handed to the scalar convertToHSV() for the row tail.
    const s32 hsv_shift = 12;
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
    // Scalar constants for the GCC 4.x (< 4.7) inline-asm path. CONVERT_TO_HSV_ASM
    // names these locals in its operand list, so they must keep these names.
    register const f32 vsdiv_table = f32(255 << hsv_shift);
    register f32 vhdiv_table = f32(hrange << hsv_shift);
    register const s32 vhrange = hrange;
    register const s32 v0 = s32(0);
    register const s32 vshift = s32(1 << (hsv_shift-1));
    register const s32 v6 = s32(6);
    register const f32 bias = 0.5f;
#endif

    for (size_t i = 0u; i < size.height; ++i)
    {
        const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
        u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
        // sj / dj are byte offsets into the source / destination row, j counts pixels.
        size_t sj = 0u, dj = 0u, j = 0u;

        // 8 pixels per iteration: 24 source bytes in, 24 HSV bytes out.
        for (; j < roiw8; sj += 24, dj += 24, j += 8)
        {
            internal::prefetch(src + sj);
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
            // The macro also uses src, sj, dst and dj by name; d0 is passed as R, d2 as B.
            CONVERT_TO_HSV_ASM(vld3.8 {d0-d2}, d0, d2)
#else
            // De-interleaving load: val[0], val[1], val[2] are passed as R, G, B.
            uint8x8x3_t vRgb = vld3_u8(src + sj);
            uint8x8x3_t vHsv = convertToHSV(vRgb.val[0], vRgb.val[1], vRgb.val[2], hrange);
            vst3_u8(dst + dj, vHsv);
#endif
        }

        // Remaining pixels of the row, one at a time through the scalar overload.
        for (; j < size.width; ++j, sj += 3, dj += 3)
        {
            convertToHSV(src[sj], src[sj+1], src[sj+2], hrange, hsv_shift, dst+dj);
        }
    }
#else
    // Built without NEON: the function does nothing; silence unused-parameter warnings.
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)hrange;
#endif
}

// 4-bytes-per-pixel variant of rgb2hsv; hrange is the hue scale forwarded
// to convertToHSV.
void rgbx2hsv(const Size2D &size,
              const u8 * srcBase, ptrdiff_t srcStride,
              u8 * dstBase, ptrdiff_t dstStride,
              s32 hrange)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; // vector loop bound: j < roiw8 means at least 8 pixels remain
    // Fixed-point fraction bits handed to the scalar convertToHSV() for the row tail.
    const s32 hsv_shift = 12;
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
    // Scalar constants for the GCC 4.x (< 4.7) inline-asm path. CONVERT_TO_HSV_ASM
    // names these locals in its operand list, so they must keep these names.
    register const f32 vsdiv_table = f32(255 << hsv_shift);
    register f32 vhdiv_table = f32(hrange << hsv_shift);
    register const s32 vhrange = hrange;
    register const s32 v0 = s32(0);
    register const s32 vshift = s32(1 << (hsv_shift-1));
    register const s32 v6 = s32(6);
    register const f32 bias = 0.5f;
#endif

    for (size_t i = 0u; i < size.height; ++i)
    {
        const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
        u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
        // sj / dj are byte offsets into the source / destination row, j counts pixels.
        size_t sj = 0u, dj = 0u, j = 0u;

        // 8 pixels per iteration: 32 source bytes in, 24 HSV bytes out.
        for (; j < roiw8; sj += 32, dj += 24, j += 8)
        {
            internal::prefetch(src + sj);
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
            // The macro also uses src, sj, dst and dj by name; d0 is passed as R, d2 as B.
            CONVERT_TO_HSV_ASM(vld4.8 {d0-d3}, d0, d2)
#else
            // De-interleaving load of four planes; the fourth (val[3]) is not used.
            uint8x8x4_t vRgb = vld4_u8(src + sj);
            uint8x8x3_t vHsv = convertToHSV(vRgb.val[0], vRgb.val[1], vRgb.val[2], hrange);
            vst3_u8(dst + dj, vHsv);
#endif
        }

        // Remaining pixels of the row; the byte at src[sj+3] is skipped.
        for (; j < size.width; ++j, sj += 4, dj += 3)
        {
            convertToHSV(src[sj], src[sj+1], src[sj+2], hrange, hsv_shift, dst+dj);
        }
    }
#else
    // Built without NEON: the function does nothing; silence unused-parameter warnings.
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)hrange;
#endif
}

// 3-bytes-per-pixel variant whose first source byte is treated as blue;
// hrange is the hue scale forwarded to convertToHSV.
void bgr2hsv(const Size2D &size,
             const u8 * srcBase, ptrdiff_t srcStride,
             u8 * dstBase, ptrdiff_t dstStride,
             s32 hrange)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0;1957const s32 hsv_shift = 12;1958#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)1959register const f32 vsdiv_table = f32(255 << hsv_shift);1960register f32 vhdiv_table = f32(hrange << hsv_shift);1961register const s32 vhrange = hrange;1962register const s32 v0 = s32(0);1963register const s32 vshift = s32(1 << (hsv_shift-1));1964register const s32 v6 = s32(6);1965register const f32 bias = 0.5f;1966#endif19671968for (size_t i = 0u; i < size.height; ++i)1969{1970const u8 * src = internal::getRowPtr(srcBase, srcStride, i);1971u8 * dst = internal::getRowPtr(dstBase, dstStride, i);1972size_t sj = 0u, dj = 0u, j = 0u;19731974for (; j < roiw8; sj += 24, dj += 24, j += 8)1975{1976internal::prefetch(src + sj);1977#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)1978CONVERT_TO_HSV_ASM(vld3.8 {d0-d2}, d2, d0)1979#else1980uint8x8x3_t vRgb = vld3_u8(src + sj);1981uint8x8x3_t vHsv = convertToHSV(vRgb.val[2], vRgb.val[1], vRgb.val[0], hrange);1982vst3_u8(dst + dj, vHsv);1983#endif1984}19851986for (; j < size.width; ++j, sj += 3, dj += 3)1987{1988convertToHSV(src[sj+2], src[sj+1], src[sj], hrange, hsv_shift, dst+dj);1989}1990}1991#else1992(void)size;1993(void)srcBase;1994(void)srcStride;1995(void)dstBase;1996(void)dstStride;1997(void)hrange;1998#endif1999}20002001void bgrx2hsv(const Size2D &size,2002const u8 * srcBase, ptrdiff_t srcStride,2003u8 * dstBase, ptrdiff_t dstStride,2004s32 hrange)2005{2006internal::assertSupportedConfiguration();2007#ifdef CAROTENE_NEON2008size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0;2009const s32 hsv_shift = 12;2010#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)2011register const f32 vsdiv_table = f32(255 << hsv_shift);2012register f32 vhdiv_table = f32(hrange << hsv_shift);2013register const s32 vhrange = hrange;2014register const s32 v0 = s32(0);2015register const s32 vshift = s32(1 << (hsv_shift-1));2016register const s32 v6 = s32(6);2017register const f32 bias = 0.5f;2018#endif20192020for (size_t i = 0u; i < size.height; ++i)2021{2022const u8 * src = internal::getRowPtr(srcBase, srcStride, i);2023u8 * dst = internal::getRowPtr(dstBase, dstStride, i);2024size_t sj = 0u, dj = 0u, j = 0u;20252026for (; j < roiw8; sj += 32, dj += 24, j += 8)2027{2028internal::prefetch(src + sj);2029#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)2030CONVERT_TO_HSV_ASM(vld4.8 {d0-d3}, d2, d0)2031#else2032uint8x8x4_t vRgb = vld4_u8(src + sj);2033uint8x8x3_t vHsv = convertToHSV(vRgb.val[2], vRgb.val[1], vRgb.val[0], hrange);2034vst3_u8(dst + dj, vHsv);2035#endif2036}20372038for (; j < size.width; ++j, sj += 4, dj += 3)2039{2040convertToHSV(src[sj+2], src[sj+1], src[sj], hrange, hsv_shift, dst+dj);2041}2042}2043#else2044(void)size;2045(void)srcBase;2046(void)srcStride;2047(void)dstBase;2048(void)dstStride;2049(void)hrange;2050#endif2051}20522053void rgbx2bgr565(const Size2D &size,2054const u8 * srcBase, ptrdiff_t srcStride,2055u8 * dstBase, ptrdiff_t dstStride)2056{2057internal::assertSupportedConfiguration();2058#ifdef CAROTENE_NEON2059size_t roiw16 = size.width >= 15 ? 
size.width - 15 : 0;20602061for (size_t i = 0u; i < size.height; ++i)2062{2063const u8 * src = internal::getRowPtr(srcBase, srcStride, i);2064u8 * dst = internal::getRowPtr(dstBase, dstStride, i);2065size_t sj = 0u, dj = 0u, j = 0u;20662067for (; j < roiw16; sj += 64, dj += 32, j += 16)2068{2069internal::prefetch(src + sj);2070#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)2071__asm__ (2072"vld4.8 {d2, d4, d6, d8}, [%[in0]] @ q0 q1 q2 q3 q4 \n\t"2073"vld4.8 {d3, d5, d7, d9}, [%[in1]] @ xxxxxxxx rrrrRRRR ggggGGGG bbbbBBBB xxxxxxxx \n\t"2074"vsri.8 q1, q2, #5 @ xxxxxxxx rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t"2075"vshl.u8 q0, q2, #3 @ gGGGG000 rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t"2076"vsri.8 q0, q3, #3 @ gGGbbbbB rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t"2077"vst2.8 {d0, d2}, [%[out0]] \n\t"2078"vst2.8 {d1, d3}, [%[out1]] \n\t"2079: /*no output*/2080: [out0] "r" (dst + dj),2081[out1] "r" (dst + dj + 16),2082[in0] "r" (src + sj),2083[in1] "r" (src + sj + 32)2084: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9"2085);2086#else2087uint8x16x4_t vRgba = vld4q_u8(src + sj);2088uint8x16x2_t vVal565 = convertTo565(vRgba.val[2], vRgba.val[1], vRgba.val[0]);2089vst2q_u8(dst + dj, vVal565);2090#endif2091}20922093for (; j < size.width; ++j, sj += 4, dj += 2)2094{2095convertTo565(src[sj + 2], src[sj + 1], src[sj], dst + dj);2096}2097}2098#else2099(void)size;2100(void)srcBase;2101(void)srcStride;2102(void)dstBase;2103(void)dstStride;2104#endif2105}21062107void rgb2bgr565(const Size2D &size,2108const u8 * srcBase, ptrdiff_t srcStride,2109u8 * dstBase, ptrdiff_t dstStride)2110{2111internal::assertSupportedConfiguration();2112#ifdef CAROTENE_NEON2113size_t roiw16 = size.width >= 15 ? 
size.width - 15 : 0;21142115for (size_t i = 0u; i < size.height; ++i)2116{2117const u8 * src = internal::getRowPtr(srcBase, srcStride, i);2118u8 * dst = internal::getRowPtr(dstBase, dstStride, i);2119size_t sj = 0u, dj = 0u, j = 0u;21202121for (; j < roiw16; sj += 48, dj += 32, j += 16)2122{2123internal::prefetch(src + sj);2124#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)2125__asm__ (2126"vld3.8 {d2, d4, d6}, [%[in0]] @ q0 q1 q2 q3 q4 \n\t"2127"vld3.8 {d3, d5, d7}, [%[in1]] @ xxxxxxxx rrrrRRRR ggggGGGG bbbbBBBB xxxxxxxx \n\t"2128"vsri.8 q1, q2, #5 @ xxxxxxxx rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t"2129"vshl.u8 q0, q2, #3 @ gGGGG000 rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t"2130"vsri.8 q0, q3, #3 @ gGGbbbbB rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t"2131"vst2.8 {d0, d2}, [%[out0]] \n\t"2132"vst2.8 {d1, d3}, [%[out1]] \n\t"2133: /*no output*/2134: [out0] "r" (dst + dj),2135[out1] "r" (dst + dj + 16),2136[in0] "r" (src + sj),2137[in1] "r" (src + sj + 24)2138: "d0","d1","d2","d3","d4","d5","d6","d7"2139);2140#else2141uint8x16x3_t vRgba = vld3q_u8(src + sj);2142uint8x16x2_t vVal565 = convertTo565(vRgba.val[2], vRgba.val[1], vRgba.val[0]);2143vst2q_u8(dst + dj, vVal565);2144#endif2145}21462147for (; j < size.width; ++j, sj += 3, dj += 2)2148{2149convertTo565(src[sj + 2], src[sj + 1], src[sj], dst + dj);2150}2151}2152#else2153(void)size;2154(void)srcBase;2155(void)srcStride;2156(void)dstBase;2157(void)dstStride;2158#endif2159}21602161void rgbx2rgb565(const Size2D &size,2162const u8 * srcBase, ptrdiff_t srcStride,2163u8 * dstBase, ptrdiff_t dstStride)2164{2165internal::assertSupportedConfiguration();2166#ifdef CAROTENE_NEON2167size_t roiw16 = size.width >= 15 ? 
size.width - 15 : 0;21682169for (size_t i = 0u; i < size.height; ++i)2170{2171const u8 * src = internal::getRowPtr(srcBase, srcStride, i);2172u8 * dst = internal::getRowPtr(dstBase, dstStride, i);2173size_t sj = 0u, dj = 0u, j = 0u;21742175for (; j < roiw16; sj += 64, dj += 32, j += 16)2176{2177internal::prefetch(src + sj);2178#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)2179__asm__ (2180"vld4.8 {d0, d2, d4, d6}, [%[in0]] @ q0 q1 q2 q3 \n\t"2181"vld4.8 {d1, d3, d5, d7}, [%[in1]] @ rrrrRRRR ggggGGGG bbbbBBBB aaaaAAAA \n\t"2182"vsri.8 q2, q1, #5 @ rrrrRRRR ggggGGGG bbbbBggg aaaaAAAA \n\t"2183"vshl.u8 q1, #3 @ rrrrRRRR gGGGG000 bbbbBggg aaaaAAAA \n\t"2184"vsri.8 q1, q0, #3 @ rrrrRRRR gGGrrrrR bbbbBggg aaaaAAAA \n\t"2185"vst2.8 {d2, d4}, [%[out0]] \n\t"2186"vst2.8 {d3, d5}, [%[out1]] \n\t"2187: /*no output*/2188: [out0] "r" (dst + dj),2189[out1] "r" (dst + dj + 16),2190[in0] "r" (src + sj),2191[in1] "r" (src + sj + 32)2192: "d0","d1","d2","d3","d4","d5","d6","d7"2193);2194#else2195uint8x16x4_t vRgba = vld4q_u8(src + sj);2196uint8x16x2_t vVal565 = convertTo565(vRgba.val[0], vRgba.val[1], vRgba.val[2]);2197vst2q_u8(dst + dj, vVal565);2198#endif2199}22002201for (; j < size.width; ++j, sj += 4, dj += 2)2202{2203convertTo565(src[sj], src[sj + 1], src[sj + 2], dst + dj);2204}2205}2206#else2207(void)size;2208(void)srcBase;2209(void)srcStride;2210(void)dstBase;2211(void)dstStride;2212#endif2213}22142215void rgb2rgb565(const Size2D &size,2216const u8 * srcBase, ptrdiff_t srcStride,2217u8 * dstBase, ptrdiff_t dstStride)2218{2219internal::assertSupportedConfiguration();2220#ifdef CAROTENE_NEON2221size_t roiw16 = size.width >= 15 ? 
size.width - 15 : 0;22222223for (size_t i = 0u; i < size.height; ++i)2224{2225const u8 * src = internal::getRowPtr(srcBase, srcStride, i);2226u8 * dst = internal::getRowPtr(dstBase, dstStride, i);2227size_t sj = 0u, dj = 0u, j = 0u;22282229for (; j < roiw16; sj += 48, dj += 32, j += 16)2230{2231internal::prefetch(src + sj);2232#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)2233__asm__ (2234"vld3.8 {d0, d2, d4}, [%[in0]] @ q0 q1 q2 q3 \n\t"2235"vld3.8 {d1, d3, d5}, [%[in1]] @ rrrrRRRR ggggGGGG bbbbBBBB xxxxxxxx \n\t"2236"vsri.8 q2, q1, #5 @ rrrrRRRR ggggGGGG bbbbBggg xxxxxxxx \n\t"2237"vshl.u8 q1, #3 @ rrrrRRRR gGGGG000 bbbbBggg xxxxxxxx \n\t"2238"vsri.8 q1, q0, #3 @ rrrrRRRR gGGrrrrR bbbbBggg xxxxxxxx \n\t"2239"vst2.8 {d2, d4}, [%[out0]] \n\t"2240"vst2.8 {d3, d5}, [%[out1]] \n\t"2241: /*no output*/2242: [out0] "r" (dst + dj),2243[out1] "r" (dst + dj + 16),2244[in0] "r" (src + sj),2245[in1] "r" (src + sj + 24)2246: "d0","d1","d2","d3","d4","d5"2247);2248#else2249uint8x16x3_t vRgba = vld3q_u8(src + sj);2250uint8x16x2_t vVal565 = convertTo565(vRgba.val[0], vRgba.val[1], vRgba.val[2]);2251vst2q_u8(dst + dj, vVal565);2252#endif2253}22542255for (; j < size.width; ++j, sj += 3, dj += 2)2256{2257convertTo565(src[sj], src[sj + 1], src[sj + 2], dst + dj);2258}2259}2260#else2261(void)size;2262(void)srcBase;2263(void)srcStride;2264(void)dstBase;2265(void)dstStride;2266#endif2267}22682269void rgb2ycrcb(const Size2D &size,2270const u8 * srcBase, ptrdiff_t srcStride,2271u8 * dstBase, ptrdiff_t dstStride)2272{2273internal::assertSupportedConfiguration();2274#ifdef CAROTENE_NEON2275YCRCB_CONSTS2276size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0;22772278for (size_t i = 0u; i < size.height; ++i)2279{2280const u8 * src = internal::getRowPtr(srcBase, srcStride, i);2281u8 * dst = internal::getRowPtr(dstBase, dstStride, i);2282size_t sj = 0u, dj = 0u, j = 0u;22832284for (; j < roiw8; sj += 24, dj += 24, j += 8)2285{2286internal::prefetch(src + sj);2287#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)2288CONVERTTOYCRCB(vld3.8 {d0-d2}, d0, d1, d2)2289#else2290uint8x8x3_t vRgb = vld3_u8(src + sj);2291int16x8_t vR = vreinterpretq_s16_u16(vmovl_u8(vRgb.val[0]));2292int16x8_t vG = vreinterpretq_s16_u16(vmovl_u8(vRgb.val[1]));2293int16x8_t vB = vreinterpretq_s16_u16(vmovl_u8(vRgb.val[2]));2294uint8x8x3_t vYCrCb = convertToYCrCb(vR, vG, vB, vcYRG, vcYB, vcCrGB, vcCbRG);2295vst3_u8(dst + dj, vYCrCb);2296#endif2297}22982299for (; j < size.width; ++j, sj += 3, dj += 3)2300{2301S_CONVERTTOYCRCB(src[sj], src[sj + 1], src[sj + 2]);2302}2303}2304#else2305(void)size;2306(void)srcBase;2307(void)srcStride;2308(void)dstBase;2309(void)dstStride;2310#endif2311}23122313void rgbx2ycrcb(const Size2D &size,2314const u8 * srcBase, ptrdiff_t srcStride,2315u8 * dstBase, ptrdiff_t dstStride)2316{2317internal::assertSupportedConfiguration();2318#ifdef CAROTENE_NEON2319YCRCB_CONSTS2320size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0;23212322for (size_t i = 0u; i < size.height; ++i)2323{2324const u8 * src = internal::getRowPtr(srcBase, srcStride, i);2325u8 * dst = internal::getRowPtr(dstBase, dstStride, i);2326size_t sj = 0u, dj = 0u, j = 0u;23272328for (; j < roiw8; sj += 32, dj += 24, j += 8)2329{2330internal::prefetch(src + sj);2331#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)2332CONVERTTOYCRCB(vld4.8 {d0-d3}, d0, d1, d2)2333#else2334uint8x8x4_t vRgba = vld4_u8(src + sj);2335int16x8_t vR = vreinterpretq_s16_u16(vmovl_u8(vRgba.val[0]));2336int16x8_t vG = vreinterpretq_s16_u16(vmovl_u8(vRgba.val[1]));2337int16x8_t vB = vreinterpretq_s16_u16(vmovl_u8(vRgba.val[2]));2338uint8x8x3_t vYCrCb = convertToYCrCb(vR, vG, vB, vcYRG, vcYB, vcCrGB, vcCbRG);2339vst3_u8(dst + dj, vYCrCb);2340#endif2341}23422343for (; j < size.width; ++j, sj += 4, dj += 3)2344{2345S_CONVERTTOYCRCB(src[sj], src[sj + 1], src[sj + 2]);2346}2347}2348#else2349(void)size;2350(void)srcBase;2351(void)srcStride;2352(void)dstBase;2353(void)dstStride;2354#endif2355}23562357void bgr2ycrcb(const Size2D &size,2358const u8 * srcBase, ptrdiff_t srcStride,2359u8 * dstBase, ptrdiff_t dstStride)2360{2361internal::assertSupportedConfiguration();2362#ifdef CAROTENE_NEON2363YCRCB_CONSTS2364size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0;23652366for (size_t i = 0u; i < size.height; ++i)2367{2368const u8 * src = internal::getRowPtr(srcBase, srcStride, i);2369u8 * dst = internal::getRowPtr(dstBase, dstStride, i);2370size_t sj = 0u, dj = 0u, j = 0u;23712372for (; j < roiw8; sj += 24, dj += 24, j += 8)2373{2374internal::prefetch(src + sj);2375#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)2376CONVERTTOYCRCB(vld3.8 {d0-d2}, d2, d1, d0)2377#else2378uint8x8x3_t vBgr = vld3_u8(src + sj);2379int16x8_t vB = vreinterpretq_s16_u16(vmovl_u8(vBgr.val[0]));2380int16x8_t vG = vreinterpretq_s16_u16(vmovl_u8(vBgr.val[1]));2381int16x8_t vR = vreinterpretq_s16_u16(vmovl_u8(vBgr.val[2]));2382uint8x8x3_t vYCrCb = convertToYCrCb(vR, vG, vB, vcYRG, vcYB, vcCrGB, vcCbRG);2383vst3_u8(dst + dj, vYCrCb);2384#endif2385}23862387for (; j < size.width; ++j, sj += 3, dj += 3)2388{2389S_CONVERTTOYCRCB(src[sj + 2], src[sj + 1], src[sj]);2390}2391}2392#else2393(void)size;2394(void)srcBase;2395(void)srcStride;2396(void)dstBase;2397(void)dstStride;2398#endif2399}24002401void bgrx2ycrcb(const Size2D &size,2402const u8 * srcBase, ptrdiff_t srcStride,2403u8 * dstBase, ptrdiff_t dstStride)2404{2405internal::assertSupportedConfiguration();2406#ifdef CAROTENE_NEON2407YCRCB_CONSTS2408size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0;24092410for (size_t i = 0u; i < size.height; ++i)2411{2412const u8 * src = internal::getRowPtr(srcBase, srcStride, i);2413u8 * dst = internal::getRowPtr(dstBase, dstStride, i);2414size_t sj = 0u, dj = 0u, j = 0u;24152416for (; j < roiw8; sj += 32, dj += 24, j += 8)2417{2418internal::prefetch(src + sj);2419#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)2420CONVERTTOYCRCB(vld4.8 {d0-d3}, d2, d1, d0)2421#else2422uint8x8x4_t vBgra = vld4_u8(src + sj);2423int16x8_t vB = vreinterpretq_s16_u16(vmovl_u8(vBgra.val[0]));2424int16x8_t vG = vreinterpretq_s16_u16(vmovl_u8(vBgra.val[1]));2425int16x8_t vR = vreinterpretq_s16_u16(vmovl_u8(vBgra.val[2]));2426uint8x8x3_t vYCrCb = convertToYCrCb(vR, vG, vB, vcYRG, vcYB, vcCrGB, vcCbRG);2427vst3_u8(dst + dj, vYCrCb);2428#endif2429}24302431for (; j < size.width; ++j, sj += 4, dj += 3)2432{2433S_CONVERTTOYCRCB(src[sj + 2], src[sj + 1], src[sj]);2434}2435}2436#else2437(void)size;2438(void)srcBase;2439(void)srcStride;2440(void)dstBase;2441(void)dstStride;2442#endif2443}24442445void yuv420sp2rgb(const Size2D &size,2446const u8 * yBase, ptrdiff_t yStride,2447const u8 * uvBase, ptrdiff_t uvStride,2448u8 * dstBase, ptrdiff_t dstStride)2449{2450// input data:2451////////////// Y matrix:2452// {y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15, y16}2453// {Y1, Y2, Y3, Y4, Y5, Y6, Y7, Y8, Y9, Y10, Y11, Y12, Y13, Y14, Y15, Y16}2454////////////// UV matrix:2455// {v12, u12, v34, u34, v56, u56, v78, u78, v90 u90, V12, U12, V34, U34, V56, U56}24562457// fp version2458// R = 1.164(Y - 16) + 1.596(V - 128)2459// G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128)2460// B = 1.164(Y - 16) + 2.018(U - 128)24612462// integer version2463// R = [((149*y)/2 + (-14248+102*v) )/2]/322464// G = [((149*y)/2 + ((8663- 25*u)-52*v))/2]/322465// B = [((149*y)/2 + (-17705+129*u) )/2]/3224662467// error estimation:2468//Rerr = 0.0000625 * y - 0.00225 * v - 0.2872469//Gerr = 
0.0000625 * y + 0.0005 * v + 0.000375 * u + 0.1286252470//Berr = 0.0000625 * y - 0.002375 * u - 0.28737524712472//real error test:2473//=================2474//R: 1 less: 520960 == 3.11% of full space2475//G: 1 less: 251425 == 1.50% of full space2476//B: 1 less: 455424 == 2.71% of full space2477//=================2478//R: 1 more: 642048 == 3.83% of full space2479//G: 1 more: 192458 == 1.15% of full space2480//B: 1 more: 445184 == 2.65% of full space24812482internal::assertSupportedConfiguration();2483#ifdef CAROTENE_NEON2484YUV420_CONSTS(3, 2, 0)2485size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;24862487for (size_t i = 0u; i < size.height; i+=2)2488{2489const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1);2490const u8 * y1 = internal::getRowPtr(yBase, yStride, i);2491const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1);2492u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i);2493u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1);24942495size_t dj = 0u, j = 0u;2496for (; j < roiw16; dj += 48, j += 16)2497{2498internal::prefetch(uv + j);2499internal::prefetch(y1 + j);2500internal::prefetch(y2 + j);2501#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)2502CONVERTYUV420TORGB(3, d1, d0, q5, q6)2503#else2504convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj);2505#endif2506}2507for (; j + 2 <= size.width; j+=2, dj += 6)2508{2509convertYUV420ToRGB<3, 2, 0>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj);2510}2511}2512#else2513(void)size;2514(void)yBase;2515(void)yStride;2516(void)uvBase;2517(void)uvStride;2518(void)dstBase;2519(void)dstStride;2520#endif2521}25222523void yuv420sp2rgbx(const Size2D &size,2524const u8 * yBase, ptrdiff_t yStride,2525const u8 * uvBase, ptrdiff_t uvStride,2526u8 * dstBase, ptrdiff_t dstStride)2527{2528internal::assertSupportedConfiguration();2529#ifdef CAROTENE_NEON2530YUV420_CONSTS(4, 2, 0)2531size_t roiw16 = size.width >= 15 ? 
size.width - 15 : 0;25322533for (size_t i = 0u; i < size.height; i+=2)2534{2535const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1);2536const u8 * y1 = internal::getRowPtr(yBase, yStride, i);2537const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1);2538u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i);2539u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1);25402541size_t dj = 0u, j = 0u;2542for (; j < roiw16; dj += 64, j += 16)2543{2544internal::prefetch(uv + j);2545internal::prefetch(y1 + j);2546internal::prefetch(y2 + j);2547#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)2548CONVERTYUV420TORGB(4, d1, d0, q5, q6)2549#else2550convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj);2551#endif2552}2553for (; j + 2 <= size.width; j+=2, dj += 8)2554{2555convertYUV420ToRGB<4, 2, 0>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj);2556}2557}2558#else2559(void)size;2560(void)yBase;2561(void)yStride;2562(void)uvBase;2563(void)uvStride;2564(void)dstBase;2565(void)dstStride;2566#endif2567}25682569void yuv420i2rgb(const Size2D &size,2570const u8 * yBase, ptrdiff_t yStride,2571const u8 * uvBase, ptrdiff_t uvStride,2572u8 * dstBase, ptrdiff_t dstStride)2573{2574internal::assertSupportedConfiguration();2575#ifdef CAROTENE_NEON2576YUV420_CONSTS(3, 2, 1)2577size_t roiw16 = size.width >= 15 ? 
size.width - 15 : 0;25782579for (size_t i = 0u; i < size.height; i+=2)2580{2581const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1);2582const u8 * y1 = internal::getRowPtr(yBase, yStride, i);2583const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1);2584u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i);2585u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1);25862587size_t dj = 0u, j = 0u;2588for (; j < roiw16; dj += 48, j += 16)2589{2590internal::prefetch(uv + j);2591internal::prefetch(y1 + j);2592internal::prefetch(y2 + j);2593#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)2594CONVERTYUV420TORGB(3, d0, d1, q5, q6)2595#else2596convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj);2597#endif2598}2599for (; j + 2 <= size.width; j+=2, dj += 6)2600{2601convertYUV420ToRGB<3, 2, 1>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj);2602}2603}2604#else2605(void)size;2606(void)yBase;2607(void)yStride;2608(void)uvBase;2609(void)uvStride;2610(void)dstBase;2611(void)dstStride;2612#endif2613}26142615void yuv420i2rgbx(const Size2D &size,2616const u8 * yBase, ptrdiff_t yStride,2617const u8 * uvBase, ptrdiff_t uvStride,2618u8 * dstBase, ptrdiff_t dstStride)2619{2620internal::assertSupportedConfiguration();2621#ifdef CAROTENE_NEON2622YUV420_CONSTS(4, 2, 1)2623size_t roiw16 = size.width >= 15 ? 
size.width - 15 : 0;26242625for (size_t i = 0u; i < size.height; i+=2)2626{2627const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1);2628const u8 * y1 = internal::getRowPtr(yBase, yStride, i);2629const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1);2630u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i);2631u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1);26322633size_t dj = 0u, j = 0u;2634for (; j < roiw16; dj += 64, j += 16)2635{2636internal::prefetch(uv + j);2637internal::prefetch(y1 + j);2638internal::prefetch(y2 + j);2639#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)2640CONVERTYUV420TORGB(4, d0, d1, q5, q6)2641#else2642convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj);2643#endif2644}2645for (; j + 2 <= size.width; j+=2, dj += 8)2646{2647convertYUV420ToRGB<4, 2, 1>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj);2648}2649}2650#else2651(void)size;2652(void)yBase;2653(void)yStride;2654(void)uvBase;2655(void)uvStride;2656(void)dstBase;2657(void)dstStride;2658#endif2659}26602661void yuv420sp2bgr(const Size2D &size,2662const u8 * yBase, ptrdiff_t yStride,2663const u8 * uvBase, ptrdiff_t uvStride,2664u8 * dstBase, ptrdiff_t dstStride)2665{2666internal::assertSupportedConfiguration();2667#ifdef CAROTENE_NEON2668YUV420_CONSTS(3, 0, 0)2669size_t roiw16 = size.width >= 15 ? 
size.width - 15 : 0;26702671for (size_t i = 0u; i < size.height; i+=2)2672{2673const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1);2674const u8 * y1 = internal::getRowPtr(yBase, yStride, i);2675const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1);2676u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i);2677u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1);26782679size_t dj = 0u, j = 0u;2680for (; j < roiw16; dj += 48, j += 16)2681{2682internal::prefetch(uv + j);2683internal::prefetch(y1 + j);2684internal::prefetch(y2 + j);2685#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)2686CONVERTYUV420TORGB(3, d1, d0, q6, q5)2687#else2688convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj);2689#endif2690}2691for (; j + 2 <= size.width; j+=2, dj += 6)2692{2693convertYUV420ToRGB<3, 0, 0>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj);2694}2695}2696#else2697(void)size;2698(void)yBase;2699(void)yStride;2700(void)uvBase;2701(void)uvStride;2702(void)dstBase;2703(void)dstStride;2704#endif2705}27062707void yuv420sp2bgrx(const Size2D &size,2708const u8 * yBase, ptrdiff_t yStride,2709const u8 * uvBase, ptrdiff_t uvStride,2710u8 * dstBase, ptrdiff_t dstStride)2711{2712internal::assertSupportedConfiguration();2713#ifdef CAROTENE_NEON2714YUV420_CONSTS(4, 0, 0)2715size_t roiw16 = size.width >= 15 ? 
size.width - 15 : 0;27162717for (size_t i = 0u; i < size.height; i+=2)2718{2719const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1);2720const u8 * y1 = internal::getRowPtr(yBase, yStride, i);2721const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1);2722u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i);2723u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1);27242725size_t dj = 0u, j = 0u;2726for (; j < roiw16; dj += 64, j += 16)2727{2728internal::prefetch(uv + j);2729internal::prefetch(y1 + j);2730internal::prefetch(y2 + j);2731#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)2732CONVERTYUV420TORGB(4, d1, d0, q6, q5)2733#else2734convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj);2735#endif2736}2737for (; j + 2 <= size.width; j+=2, dj += 8)2738{2739convertYUV420ToRGB<4, 0, 0>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj);2740}2741}2742#else2743(void)size;2744(void)yBase;2745(void)yStride;2746(void)uvBase;2747(void)uvStride;2748(void)dstBase;2749(void)dstStride;2750#endif2751}27522753void yuv420i2bgr(const Size2D &size,2754const u8 * yBase, ptrdiff_t yStride,2755const u8 * uvBase, ptrdiff_t uvStride,2756u8 * dstBase, ptrdiff_t dstStride)2757{2758internal::assertSupportedConfiguration();2759#ifdef CAROTENE_NEON2760YUV420_CONSTS(3, 0, 1)2761size_t roiw16 = size.width >= 15 ? 
size.width - 15 : 0;27622763for (size_t i = 0u; i < size.height; i+=2)2764{2765const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1);2766const u8 * y1 = internal::getRowPtr(yBase, yStride, i);2767const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1);2768u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i);2769u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1);27702771size_t dj = 0u, j = 0u;2772for (; j < roiw16; dj += 48, j += 16)2773{2774internal::prefetch(uv + j);2775internal::prefetch(y1 + j);2776internal::prefetch(y2 + j);2777#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)2778CONVERTYUV420TORGB(3, d0, d1, q6, q5)2779#else2780convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj);2781#endif2782}2783for (; j + 2 <= size.width; j+=2, dj += 6)2784{2785convertYUV420ToRGB<3, 0, 1>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj);2786}2787}2788#else2789(void)size;2790(void)yBase;2791(void)yStride;2792(void)uvBase;2793(void)uvStride;2794(void)dstBase;2795(void)dstStride;2796#endif2797}27982799void yuv420i2bgrx(const Size2D &size,2800const u8 * yBase, ptrdiff_t yStride,2801const u8 * uvBase, ptrdiff_t uvStride,2802u8 * dstBase, ptrdiff_t dstStride)2803{2804internal::assertSupportedConfiguration();2805#ifdef CAROTENE_NEON2806YUV420_CONSTS(4, 0, 1)2807size_t roiw16 = size.width >= 15 ? 
size.width - 15 : 0;28082809for (size_t i = 0u; i < size.height; i+=2)2810{2811const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1);2812const u8 * y1 = internal::getRowPtr(yBase, yStride, i);2813const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1);2814u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i);2815u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1);28162817size_t dj = 0u, j = 0u;2818for (; j < roiw16; dj += 64, j += 16)2819{2820internal::prefetch(uv + j);2821internal::prefetch(y1 + j);2822internal::prefetch(y2 + j);2823#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)2824CONVERTYUV420TORGB(4, d0, d1, q6, q5)2825#else2826convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj);2827#endif2828}2829for (; j + 2 <= size.width; j+=2, dj += 8)2830{2831convertYUV420ToRGB<4, 0, 1>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj);2832}2833}2834#else2835(void)size;2836(void)yBase;2837(void)yStride;2838(void)uvBase;2839(void)uvStride;2840(void)dstBase;2841(void)dstStride;2842#endif2843}28442845} // namespace CAROTENE_NS284628472848