// Path: blob/master/3rdparty/carotene/src/convert_scale.cpp
// (scraped listing header: 16337 views)
/*1* By downloading, copying, installing or using the software you agree to this license.2* If you do not agree to this license, do not download, install,3* copy or use the software.4*5*6* License Agreement7* For Open Source Computer Vision Library8* (3-clause BSD License)9*10* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.11* Third party copyrights are property of their respective owners.12*13* Redistribution and use in source and binary forms, with or without modification,14* are permitted provided that the following conditions are met:15*16* * Redistributions of source code must retain the above copyright notice,17* this list of conditions and the following disclaimer.18*19* * Redistributions in binary form must reproduce the above copyright notice,20* this list of conditions and the following disclaimer in the documentation21* and/or other materials provided with the distribution.22*23* * Neither the names of the copyright holders nor the names of the contributors24* may be used to endorse or promote products derived from this software25* without specific prior written permission.26*27* This software is provided by the copyright holders and contributors "as is" and28* any express or implied warranties, including, but not limited to, the implied29* warranties of merchantability and fitness for a particular purpose are disclaimed.30* In no event shall copyright holders or contributors be liable for any direct,31* indirect, incidental, special, exemplary, or consequential damages32* (including, but not limited to, procurement of substitute goods or services;33* loss of use, data, or profits; or business interruption) however caused34* and on any theory of liability, whether in contract, strict liability,35* or tort (including negligence or otherwise) arising in any way out of36* the use of this software, even if advised of the possibility of such damage.37*/3839#include "common.hpp"4041namespace CAROTENE_NS {4243#ifdef CAROTENE_NEON4445#define 
CVTS_FUNC(T1, T2, SIMD_SIZE, CVTINIT, CVTROW) \46void convertScale(const Size2D &_size, \47const T1 * srcBase, ptrdiff_t srcStride, \48T2 * dstBase, ptrdiff_t dstStride, \49f64 alpha, f64 beta) \50{ \51internal::assertSupportedConfiguration(); \52Size2D size(_size); \53if (srcStride == dstStride && \54srcStride == (ptrdiff_t)(size.width)) \55{ \56size.width *= size.height; \57size.height = 1; \58} \59const ptrdiff_t sstep = srcStride / sizeof(T1); \60const ptrdiff_t dstep = dstStride / sizeof(T2); \61const size_t w = size.width & ~(SIMD_SIZE-1); \62if (size.width >= SIMD_SIZE) \63{ \64const T1* _src = srcBase; \65T2* _dst = dstBase; \66CVTINIT \67for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \68CVTROW \69} \70if(w < size.width) \71{ \72const T1* _src = srcBase; \73T2* _dst = dstBase; \74for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \75for(size_t i = w; i < size.width; i++ ) \76_dst[i] = internal::saturate_cast<T2>(_src[i]*alpha + beta); \77} \78}7980#define CVTS_FUNC1(T1, SIMD_SIZE, CVTSINIT, CVTSROW) \81void convertScale(const Size2D &_size, \82const T1 * srcBase, ptrdiff_t srcStride, \83T1 * dstBase, ptrdiff_t dstStride, \84f64 alpha, f64 beta) \85{ \86internal::assertSupportedConfiguration(); \87Size2D size(_size); \88if (srcStride == dstStride && \89srcStride == (ptrdiff_t)(size.width)) \90{ \91size.width *= size.height; \92size.height = 1; \93} \94const ptrdiff_t sstep = srcStride / sizeof(T1); \95const ptrdiff_t dstep = dstStride / sizeof(T1); \96const size_t w = size.width & ~(SIMD_SIZE-1); \97if (size.width >= SIMD_SIZE) \98{ \99const T1* _src = srcBase; \100T1* _dst = dstBase; \101CVTSINIT \102for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \103CVTSROW \104} \105if(w < size.width) \106{ \107const T1* _src = srcBase; \108T1* _dst = dstBase; \109for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \110for(size_t i = w; i < size.width; i++ ) \111_dst[i] = 
internal::saturate_cast<T1>(_src[i]*alpha + beta); \112} \113}114115#else116117#define CVTS_FUNC(T1, T2, SIMD_SIZE, CVTINIT, CVTROW) \118void convertScale(const Size2D &, \119const T1 *, ptrdiff_t, \120T2 *, ptrdiff_t, \121f64, f64) \122{ \123internal::assertSupportedConfiguration(); \124}125126#define CVTS_FUNC1(T1, SIMD_SIZE, CVTSINIT, CVTSROW) \127void convertScale(const Size2D &, \128const T1 *, ptrdiff_t, \129T1 *, ptrdiff_t, \130f64, f64) \131{ \132internal::assertSupportedConfiguration(); \133}134135#endif136137#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)138CVTS_FUNC1(u8, 16,139register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);140register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,141{142for (size_t i = 0; i < w; i += 16)143{144internal::prefetch(_src + i);145__asm__ (146"vld1.8 {d4-d5}, [%[src]] \n\t"147"vmovl.u8 q3, d4 \n\t"148"vmovl.u8 q4, d5 \n\t"149"vmovl.u16 q5, d6 \n\t"150"vmovl.u16 q6, d7 \n\t"151"vmovl.u16 q7, d8 \n\t"152"vmovl.u16 q8, d9 \n\t"153"vcvt.f32.u32 q9, q5 \n\t"154"vcvt.f32.u32 q10, q6 \n\t"155"vcvt.f32.u32 q11, q7 \n\t"156"vcvt.f32.u32 q12, q8 \n\t"157"vmul.f32 q13, q9, q0 \n\t"158"vmul.f32 q14, q10, q0 \n\t"159"vmul.f32 q15, q11, q0 \n\t"160"vmul.f32 q2, q12, q0 \n\t"161"vadd.f32 q3, q13, q1 \n\t"162"vadd.f32 q4, q14, q1 \n\t"163"vadd.f32 q5, q15, q1 \n\t"164"vadd.f32 q6, q2, q1 \n\t"165"vcvt.s32.f32 q7, q3 \n\t"166"vcvt.s32.f32 q8, q4 \n\t"167"vcvt.s32.f32 q9, q5 \n\t"168"vcvt.s32.f32 q10, q6 \n\t"169"vqmovun.s32 d22, q7 \n\t"170"vqmovun.s32 d23, q8 \n\t"171"vqmovun.s32 d24, q9 \n\t"172"vqmovun.s32 d25, q10 \n\t"173"vqmovn.u16 d26, q11 \n\t"174"vqmovn.u16 d27, q12 \n\t"175"vst1.8 {d26-d27}, [%[dst1]] \n\t"176: /*no output*/177: [src] "r" (_src + i),178[dst1] "r" (_dst + i + 0),179"w" (vscale), "w" (vshift)180: 
"d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"181);182}183})184#else185CVTS_FUNC1(u8, 16,186float32x4_t vscale = vdupq_n_f32((f32)alpha);187float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,188{189for (size_t i = 0; i < w; i += 16)190{191internal::prefetch(_src + i);192uint8x16_t vline = vld1q_u8(_src + i);193uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline));194uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));195uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16));196uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));197uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16));198uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));199float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);200float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);201float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);202float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);203vline1_f32 = vmulq_f32(vline1_f32, vscale);204vline2_f32 = vmulq_f32(vline2_f32, vscale);205vline3_f32 = vmulq_f32(vline3_f32, vscale);206vline4_f32 = vmulq_f32(vline4_f32, vscale);207vline1_f32 = vaddq_f32(vline1_f32, vshift);208vline2_f32 = vaddq_f32(vline2_f32, vshift);209vline3_f32 = vaddq_f32(vline3_f32, vshift);210vline4_f32 = vaddq_f32(vline4_f32, vshift);211int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);212int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);213int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);214int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);215uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));216uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));217vst1q_u8(_dst + i, vcombine_u8(vqmovn_u16(vRes1_u16), vqmovn_u16(vRes2_u16)));218}219})220#endif221222#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)223CVTS_FUNC(u8, s8, 16,224register float32x4_t vscale asm 
("q0") = vdupq_n_f32((f32)alpha);225register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,226{227for (size_t i = 0; i < w; i += 16)228{229internal::prefetch(_src + i);230__asm__ (231"vld1.8 {d4-d5}, [%[src]] \n\t"232"vmovl.u8 q3, d4 \n\t"233"vmovl.u8 q4, d5 \n\t"234"vmovl.u16 q5, d6 \n\t"235"vmovl.u16 q6, d7 \n\t"236"vmovl.u16 q7, d8 \n\t"237"vmovl.u16 q8, d9 \n\t"238"vcvt.f32.u32 q9, q5 \n\t"239"vcvt.f32.u32 q10, q6 \n\t"240"vcvt.f32.u32 q11, q7 \n\t"241"vcvt.f32.u32 q12, q8 \n\t"242"vmul.f32 q13, q9, q0 \n\t"243"vmul.f32 q14, q10, q0 \n\t"244"vmul.f32 q15, q11, q0 \n\t"245"vmul.f32 q2, q12, q0 \n\t"246"vadd.f32 q3, q13, q1 \n\t"247"vadd.f32 q4, q14, q1 \n\t"248"vadd.f32 q5, q15, q1 \n\t"249"vadd.f32 q6, q2, q1 \n\t"250"vcvt.s32.f32 q7, q3 \n\t"251"vcvt.s32.f32 q8, q4 \n\t"252"vcvt.s32.f32 q9, q5 \n\t"253"vcvt.s32.f32 q10, q6 \n\t"254"vqmovn.s32 d22, q7 \n\t"255"vqmovn.s32 d23, q8 \n\t"256"vqmovn.s32 d24, q9 \n\t"257"vqmovn.s32 d25, q10 \n\t"258"vqmovn.s16 d26, q11 \n\t"259"vqmovn.s16 d27, q12 \n\t"260"vst1.8 {d26-d27}, [%[dst1]] \n\t"261: //no output262: [src] "r" (_src + i),263[dst1] "r" (_dst + i + 0),264"w" (vscale), "w" (vshift)265: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"266);267}268})269#else270CVTS_FUNC(u8, s8, 16,271float32x4_t vscale = vdupq_n_f32((f32)alpha);272float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,273{274for (size_t i = 0; i < w; i += 16)275{276internal::prefetch(_src + i);277uint8x16_t vline = vld1q_u8(_src + i);278uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline));279uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));280uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16));281uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));282uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16));283uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));284float32x4_t vline1_f32 
= vcvtq_f32_u32(vline1_u32);285float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);286float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);287float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);288vline1_f32 = vmulq_f32(vline1_f32, vscale);289vline2_f32 = vmulq_f32(vline2_f32, vscale);290vline3_f32 = vmulq_f32(vline3_f32, vscale);291vline4_f32 = vmulq_f32(vline4_f32, vscale);292vline1_f32 = vaddq_f32(vline1_f32, vshift);293vline2_f32 = vaddq_f32(vline2_f32, vshift);294vline3_f32 = vaddq_f32(vline3_f32, vshift);295vline4_f32 = vaddq_f32(vline4_f32, vshift);296int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);297int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);298int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);299int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);300int16x8_t vRes1_u16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));301int16x8_t vRes2_u16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));302vst1q_s8(_dst + i, vcombine_s8(vqmovn_s16(vRes1_u16), vqmovn_s16(vRes2_u16)));303}304})305#endif306307#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)308CVTS_FUNC(u8, u16, 16,309register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);310register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,311{312for (size_t i = 0; i < w; i += 16)313{314internal::prefetch(_src + i);315__asm__ (316"vld1.8 {d4-d5}, [%[src]] \n\t"317"vmovl.u8 q3, d4 \n\t"318"vmovl.u8 q4, d5 \n\t"319"vmovl.u16 q5, d6 \n\t"320"vmovl.u16 q6, d7 \n\t"321"vmovl.u16 q7, d8 \n\t"322"vmovl.u16 q8, d9 \n\t"323"vcvt.f32.u32 q9, q5 \n\t"324"vcvt.f32.u32 q10, q6 \n\t"325"vcvt.f32.u32 q11, q7 \n\t"326"vcvt.f32.u32 q12, q8 \n\t"327"vmul.f32 q13, q9, q0 \n\t"328"vmul.f32 q14, q10, q0 \n\t"329"vmul.f32 q15, q11, q0 \n\t"330"vmul.f32 q2, q12, q0 \n\t"331"vadd.f32 q3, q13, q1 \n\t"332"vadd.f32 q4, q14, q1 \n\t"333"vadd.f32 q5, q15, q1 \n\t"334"vadd.f32 q6, q2, q1 \n\t"335"vcvt.s32.f32 q7, q3 \n\t"336"vcvt.s32.f32 q8, q4 \n\t"337"vcvt.s32.f32 q9, q5 
\n\t"338"vcvt.s32.f32 q10, q6 \n\t"339"vqmovun.s32 d22, q7 \n\t"340"vqmovun.s32 d23, q8 \n\t"341"vqmovun.s32 d24, q9 \n\t"342"vqmovun.s32 d25, q10 \n\t"343"vst1.16 {d22-d23}, [%[dst1]] \n\t"344"vst1.16 {d24-d25}, [%[dst2]] \n\t"345: /*no output*/346: [src] "r" (_src + i),347[dst1] "r" (_dst + i + 0),348[dst2] "r" (_dst + i + 8),349"w" (vscale), "w" (vshift)350: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"351);352}353})354#else355CVTS_FUNC(u8, u16, 16,356float32x4_t vscale = vdupq_n_f32((f32)alpha);357float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,358{359for (size_t i = 0; i < w; i += 16)360{361internal::prefetch(_src + i);362uint8x16_t vline = vld1q_u8(_src + i);363uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline));364uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));365uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16));366uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));367uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16));368uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));369float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);370float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);371float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);372float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);373vline1_f32 = vmulq_f32(vline1_f32, vscale);374vline2_f32 = vmulq_f32(vline2_f32, vscale);375vline3_f32 = vmulq_f32(vline3_f32, vscale);376vline4_f32 = vmulq_f32(vline4_f32, vscale);377vline1_f32 = vaddq_f32(vline1_f32, vshift);378vline2_f32 = vaddq_f32(vline2_f32, vshift);379vline3_f32 = vaddq_f32(vline3_f32, vshift);380vline4_f32 = vaddq_f32(vline4_f32, vshift);381int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);382int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);383int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);384int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);385vst1q_u16(_dst + i + 0, 
vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32)));386vst1q_u16(_dst + i + 8, vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32)));387}388})389#endif390391#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)392CVTS_FUNC(u8, s16, 16,393register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);394register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,395{396for (size_t i = 0; i < w; i += 16)397{398internal::prefetch(_src + i);399__asm__ (400"vld1.8 {d4-d5}, [%[src]] \n\t"401"vmovl.u8 q3, d4 \n\t"402"vmovl.u8 q4, d5 \n\t"403"vmovl.u16 q5, d6 \n\t"404"vmovl.u16 q6, d7 \n\t"405"vmovl.u16 q7, d8 \n\t"406"vmovl.u16 q8, d9 \n\t"407"vcvt.f32.u32 q9, q5 \n\t"408"vcvt.f32.u32 q10, q6 \n\t"409"vcvt.f32.u32 q11, q7 \n\t"410"vcvt.f32.u32 q12, q8 \n\t"411"vmul.f32 q13, q9, q0 \n\t"412"vmul.f32 q14, q10, q0 \n\t"413"vmul.f32 q15, q11, q0 \n\t"414"vmul.f32 q2, q12, q0 \n\t"415"vadd.f32 q3, q13, q1 \n\t"416"vadd.f32 q4, q14, q1 \n\t"417"vadd.f32 q5, q15, q1 \n\t"418"vadd.f32 q6, q2, q1 \n\t"419"vcvt.s32.f32 q7, q3 \n\t"420"vcvt.s32.f32 q8, q4 \n\t"421"vcvt.s32.f32 q9, q5 \n\t"422"vcvt.s32.f32 q10, q6 \n\t"423"vqmovn.s32 d22, q7 \n\t"424"vqmovn.s32 d23, q8 \n\t"425"vqmovn.s32 d24, q9 \n\t"426"vqmovn.s32 d25, q10 \n\t"427"vst1.16 {d22-d23}, [%[dst1]] \n\t"428"vst1.16 {d24-d25}, [%[dst2]] \n\t"429: //no output430: [src] "r" (_src + i),431[dst1] "r" (_dst + i + 0),432[dst2] "r" (_dst + i + 8),433"w" (vscale), "w" (vshift)434: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"435);436}437})438#else439CVTS_FUNC(u8, s16, 16,440float32x4_t vscale = vdupq_n_f32((f32)alpha);441float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,442{443for (size_t i = 0; i < w; i += 16)444{445internal::prefetch(_src + i);446uint8x16_t vline = vld1q_u8(_src + i);447uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline));448uint16x8_t 
vline2_u16 = vmovl_u8(vget_high_u8(vline));449uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16));450uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));451uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16));452uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));453float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);454float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);455float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);456float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);457vline1_f32 = vmulq_f32(vline1_f32, vscale);458vline2_f32 = vmulq_f32(vline2_f32, vscale);459vline3_f32 = vmulq_f32(vline3_f32, vscale);460vline4_f32 = vmulq_f32(vline4_f32, vscale);461vline1_f32 = vaddq_f32(vline1_f32, vshift);462vline2_f32 = vaddq_f32(vline2_f32, vshift);463vline3_f32 = vaddq_f32(vline3_f32, vshift);464vline4_f32 = vaddq_f32(vline4_f32, vshift);465int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);466int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);467int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);468int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);469vst1q_s16(_dst + i + 0, vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32)));470vst1q_s16(_dst + i + 8, vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32)));471}472})473#endif474475#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)476CVTS_FUNC(u8, s32, 16,477register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);478register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,479{480for (size_t i = 0; i < w; i += 16)481{482internal::prefetch(_src + i);483__asm__ (484"vld1.8 {d4-d5}, [%[src]] \n\t"485"vmovl.u8 q3, d4 \n\t"486"vmovl.u8 q4, d5 \n\t"487"vmovl.u16 q5, d6 \n\t"488"vmovl.u16 q6, d7 \n\t"489"vmovl.u16 q7, d8 \n\t"490"vmovl.u16 q8, d9 \n\t"491"vcvt.f32.u32 q9, q5 \n\t"492"vcvt.f32.u32 q10, q6 \n\t"493"vcvt.f32.u32 q11, q7 \n\t"494"vcvt.f32.u32 q12, q8 \n\t"495"vmul.f32 q13, q9, q0 
\n\t"496"vmul.f32 q14, q10, q0 \n\t"497"vmul.f32 q15, q11, q0 \n\t"498"vmul.f32 q2, q12, q0 \n\t"499"vadd.f32 q3, q13, q1 \n\t"500"vadd.f32 q4, q14, q1 \n\t"501"vadd.f32 q5, q15, q1 \n\t"502"vadd.f32 q6, q2, q1 \n\t"503"vcvt.s32.f32 q7, q3 \n\t"504"vcvt.s32.f32 q8, q4 \n\t"505"vcvt.s32.f32 q9, q5 \n\t"506"vcvt.s32.f32 q10, q6 \n\t"507"vst1.32 {d14-d15}, [%[dst1]] \n\t"508"vst1.32 {d16-d17}, [%[dst2]] \n\t"509"vst1.32 {d18-d19}, [%[dst3]] \n\t"510"vst1.32 {d20-d21}, [%[dst4]] \n\t"511: /*no output*/512: [src] "r" (_src + i),513[dst1] "r" (_dst + i + 0),514[dst2] "r" (_dst + i + 4),515[dst3] "r" (_dst + i + 8),516[dst4] "r" (_dst + i + 12),517"w" (vscale), "w" (vshift)518: "d4","d5","d6","d7","d8","d9","d10",519"d11","d12","d13","d14","d15","d16","d17",520"d18","d19","d20","d21","d22","d23","d24",521"d25","d26","d27","d28","d29","d30","d31"522);523}524})525#else526CVTS_FUNC(u8, s32, 16,527float32x4_t vscale = vdupq_n_f32((f32)alpha);528float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,529{530for (size_t i = 0; i < w; i += 16)531{532internal::prefetch(_src + i);533uint8x16_t vline = vld1q_u8(_src + i);534uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline));535uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));536uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16));537uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));538uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16));539uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));540float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);541float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);542float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);543float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);544vline1_f32 = vmulq_f32(vline1_f32, vscale);545vline2_f32 = vmulq_f32(vline2_f32, vscale);546vline3_f32 = vmulq_f32(vline3_f32, vscale);547vline4_f32 = vmulq_f32(vline4_f32, vscale);548vline1_f32 = vaddq_f32(vline1_f32, vshift);549vline2_f32 = vaddq_f32(vline2_f32, vshift);550vline3_f32 = 
vaddq_f32(vline3_f32, vshift);551vline4_f32 = vaddq_f32(vline4_f32, vshift);552int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);553int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);554int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);555int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);556vst1q_s32(_dst + i + 0, vline1_s32);557vst1q_s32(_dst + i + 4, vline2_s32);558vst1q_s32(_dst + i + 8, vline3_s32);559vst1q_s32(_dst + i + 12, vline4_s32);560}561})562#endif563564#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)565CVTS_FUNC(u8, f32, 16,566register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);567register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,568{569for (size_t i = 0; i < w; i += 16)570{571internal::prefetch(_src + i);572__asm__ (573"vld1.8 {d4-d5}, [%[src]] \n\t"574"vmovl.u8 q3, d4 \n\t"575"vmovl.u8 q4, d5 \n\t"576"vmovl.u16 q5, d6 \n\t"577"vmovl.u16 q6, d7 \n\t"578"vmovl.u16 q7, d8 \n\t"579"vmovl.u16 q8, d9 \n\t"580"vcvt.f32.u32 q9, q5 \n\t"581"vcvt.f32.u32 q10, q6 \n\t"582"vcvt.f32.u32 q11, q7 \n\t"583"vcvt.f32.u32 q12, q8 \n\t"584"vmul.f32 q13, q9, q0 \n\t"585"vmul.f32 q14, q10, q0 \n\t"586"vmul.f32 q15, q11, q0 \n\t"587"vmul.f32 q2, q12, q0 \n\t"588"vadd.f32 q3, q13, q1 \n\t"589"vadd.f32 q4, q14, q1 \n\t"590"vadd.f32 q5, q15, q1 \n\t"591"vadd.f32 q6, q2, q1 \n\t"592"vst1.32 {d6-d7}, [%[dst1]] \n\t"593"vst1.32 {d8-d9}, [%[dst2]] \n\t"594"vst1.32 {d10-d11}, [%[dst3]] \n\t"595"vst1.32 {d12-d13}, [%[dst4]] \n\t"596: /*no output*/597: [src] "r" (_src + i),598[dst1] "r" (_dst + i + 0),599[dst2] "r" (_dst + i + 4),600[dst3] "r" (_dst + i + 8),601[dst4] "r" (_dst + i + 12),602"w" (vscale), "w" (vshift)603: "d4","d5","d6","d7","d8","d9","d10",604"d11","d12","d13","d14","d15","d16","d17",605"d18","d19","d20","d21","d22","d23","d24",606"d25","d26","d27","d28","d29","d30","d31"607);608}609})610#else611CVTS_FUNC(u8, f32, 16,612float32x4_t vscale = vdupq_n_f32((f32)alpha);613float32x4_t 
vshift = vdupq_n_f32((f32)beta);,614{615for (size_t i = 0; i < w; i += 16)616{617internal::prefetch(_src + i);618uint8x16_t vline = vld1q_u8(_src + i);619uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline));620uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));621uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16));622uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));623uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16));624uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));625float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);626float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);627float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);628float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);629vline1_f32 = vmulq_f32(vline1_f32, vscale);630vline2_f32 = vmulq_f32(vline2_f32, vscale);631vline3_f32 = vmulq_f32(vline3_f32, vscale);632vline4_f32 = vmulq_f32(vline4_f32, vscale);633vline1_f32 = vaddq_f32(vline1_f32, vshift);634vline2_f32 = vaddq_f32(vline2_f32, vshift);635vline3_f32 = vaddq_f32(vline3_f32, vshift);636vline4_f32 = vaddq_f32(vline4_f32, vshift);637vst1q_f32(_dst + i + 0, vline1_f32);638vst1q_f32(_dst + i + 4, vline2_f32);639vst1q_f32(_dst + i + 8, vline3_f32);640vst1q_f32(_dst + i + 12, vline4_f32);641}642})643#endif644645#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)646CVTS_FUNC(s8, u8, 16,647register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);648register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,649{650for (size_t i = 0; i < w; i += 16)651{652internal::prefetch(_src + i);653__asm__ (654"vld1.8 {d4-d5}, [%[src]] \n\t"655"vmovl.s8 q3, d4 \n\t"656"vmovl.s8 q4, d5 \n\t"657"vmovl.s16 q5, d6 \n\t"658"vmovl.s16 q6, d7 \n\t"659"vmovl.s16 q7, d8 \n\t"660"vmovl.s16 q8, d9 \n\t"661"vcvt.f32.s32 q9, q5 \n\t"662"vcvt.f32.s32 q10, q6 \n\t"663"vcvt.f32.s32 q11, q7 \n\t"664"vcvt.f32.s32 q12, q8 \n\t"665"vmul.f32 q13, q9, q0 \n\t"666"vmul.f32 q14, q10, q0 \n\t"667"vmul.f32 q15, q11, 
q0 \n\t"668"vmul.f32 q2, q12, q0 \n\t"669"vadd.f32 q3, q13, q1 \n\t"670"vadd.f32 q4, q14, q1 \n\t"671"vadd.f32 q5, q15, q1 \n\t"672"vadd.f32 q6, q2, q1 \n\t"673"vcvt.s32.f32 q7, q3 \n\t"674"vcvt.s32.f32 q8, q4 \n\t"675"vcvt.s32.f32 q9, q5 \n\t"676"vcvt.s32.f32 q10, q6 \n\t"677"vqmovun.s32 d22, q7 \n\t"678"vqmovun.s32 d23, q8 \n\t"679"vqmovun.s32 d24, q9 \n\t"680"vqmovun.s32 d25, q10 \n\t"681"vqmovn.u16 d26, q11 \n\t"682"vqmovn.u16 d27, q12 \n\t"683"vst1.8 {d26-d27}, [%[dst1]] \n\t"684: /*no output*/685: [src] "r" (_src + i),686[dst1] "r" (_dst + i + 0),687"w" (vscale), "w" (vshift)688: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"689);690}691})692#else693CVTS_FUNC(s8, u8, 16,694float32x4_t vscale = vdupq_n_f32((f32)alpha);695float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,696{697for (size_t i = 0; i < w; i += 16)698{699internal::prefetch(_src + i);700int8x16_t vline = vld1q_s8(_src + i);701int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline));702int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));703int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16));704int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));705int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16));706int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));707float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);708float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);709float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);710float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);711vline1_f32 = vmulq_f32(vline1_f32, vscale);712vline2_f32 = vmulq_f32(vline2_f32, vscale);713vline3_f32 = vmulq_f32(vline3_f32, vscale);714vline4_f32 = vmulq_f32(vline4_f32, vscale);715vline1_f32 = vaddq_f32(vline1_f32, vshift);716vline2_f32 = vaddq_f32(vline2_f32, vshift);717vline3_f32 = vaddq_f32(vline3_f32, vshift);718vline4_f32 = vaddq_f32(vline4_f32, vshift);719vline1_s32 = 
vcvtq_s32_f32(vline1_f32);720vline2_s32 = vcvtq_s32_f32(vline2_f32);721vline3_s32 = vcvtq_s32_f32(vline3_f32);722vline4_s32 = vcvtq_s32_f32(vline4_f32);723uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));724uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));725vst1q_u8(_dst + i, vcombine_u8(vqmovn_u16(vRes1_u16), vqmovn_u16(vRes2_u16)));726}727})728#endif729730#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)731CVTS_FUNC1(s8, 16,732register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);733register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,734{735for (size_t i = 0; i < w; i += 16)736{737internal::prefetch(_src + i);738__asm__ (739"vld1.8 {d4-d5}, [%[src]] \n\t"740"vmovl.s8 q3, d4 \n\t"741"vmovl.s8 q4, d5 \n\t"742"vmovl.s16 q5, d6 \n\t"743"vmovl.s16 q6, d7 \n\t"744"vmovl.s16 q7, d8 \n\t"745"vmovl.s16 q8, d9 \n\t"746"vcvt.f32.s32 q9, q5 \n\t"747"vcvt.f32.s32 q10, q6 \n\t"748"vcvt.f32.s32 q11, q7 \n\t"749"vcvt.f32.s32 q12, q8 \n\t"750"vmul.f32 q13, q9, q0 \n\t"751"vmul.f32 q14, q10, q0 \n\t"752"vmul.f32 q15, q11, q0 \n\t"753"vmul.f32 q2, q12, q0 \n\t"754"vadd.f32 q3, q13, q1 \n\t"755"vadd.f32 q4, q14, q1 \n\t"756"vadd.f32 q5, q15, q1 \n\t"757"vadd.f32 q6, q2, q1 \n\t"758"vcvt.s32.f32 q7, q3 \n\t"759"vcvt.s32.f32 q8, q4 \n\t"760"vcvt.s32.f32 q9, q5 \n\t"761"vcvt.s32.f32 q10, q6 \n\t"762"vqmovn.s32 d22, q7 \n\t"763"vqmovn.s32 d23, q8 \n\t"764"vqmovn.s32 d24, q9 \n\t"765"vqmovn.s32 d25, q10 \n\t"766"vqmovn.s16 d26, q11 \n\t"767"vqmovn.s16 d27, q12 \n\t"768"vst1.8 {d26-d27}, [%[dst1]] \n\t"769: /*no output*/770: [src] "r" (_src + i),771[dst1] "r" (_dst + i + 0),772"w" (vscale), "w" (vshift)773: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"774);775}776})777#else778CVTS_FUNC1(s8, 16,779float32x4_t vscale = 
vdupq_n_f32((f32)alpha);780float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,781{782for (size_t i = 0; i < w; i += 16)783{784internal::prefetch(_src + i);785int8x16_t vline = vld1q_s8(_src + i);786int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline));787int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));788int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16));789int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));790int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16));791int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));792float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);793float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);794float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);795float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);796vline1_f32 = vmulq_f32(vline1_f32, vscale);797vline2_f32 = vmulq_f32(vline2_f32, vscale);798vline3_f32 = vmulq_f32(vline3_f32, vscale);799vline4_f32 = vmulq_f32(vline4_f32, vscale);800vline1_f32 = vaddq_f32(vline1_f32, vshift);801vline2_f32 = vaddq_f32(vline2_f32, vshift);802vline3_f32 = vaddq_f32(vline3_f32, vshift);803vline4_f32 = vaddq_f32(vline4_f32, vshift);804vline1_s32 = vcvtq_s32_f32(vline1_f32);805vline2_s32 = vcvtq_s32_f32(vline2_f32);806vline3_s32 = vcvtq_s32_f32(vline3_f32);807vline4_s32 = vcvtq_s32_f32(vline4_f32);808int16x8_t vRes1_s16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));809int16x8_t vRes2_s16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));810vst1q_s8(_dst + i, vcombine_s8(vqmovn_s16(vRes1_s16), vqmovn_s16(vRes2_s16)));811}812})813#endif814815#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)816CVTS_FUNC(s8, u16, 16,817register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);818register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,819{820for (size_t i = 0; i < w; i += 16)821{822internal::prefetch(_src + i);823__asm__ (824"vld1.8 {d4-d5}, [%[src]] \n\t"825"vmovl.s8 q3, d4 \n\t"826"vmovl.s8 q4, d5 
\n\t"827"vmovl.s16 q5, d6 \n\t"828"vmovl.s16 q6, d7 \n\t"829"vmovl.s16 q7, d8 \n\t"830"vmovl.s16 q8, d9 \n\t"831"vcvt.f32.s32 q9, q5 \n\t"832"vcvt.f32.s32 q10, q6 \n\t"833"vcvt.f32.s32 q11, q7 \n\t"834"vcvt.f32.s32 q12, q8 \n\t"835"vmul.f32 q13, q9, q0 \n\t"836"vmul.f32 q14, q10, q0 \n\t"837"vmul.f32 q15, q11, q0 \n\t"838"vmul.f32 q2, q12, q0 \n\t"839"vadd.f32 q3, q13, q1 \n\t"840"vadd.f32 q4, q14, q1 \n\t"841"vadd.f32 q5, q15, q1 \n\t"842"vadd.f32 q6, q2, q1 \n\t"843"vcvt.s32.f32 q7, q3 \n\t"844"vcvt.s32.f32 q8, q4 \n\t"845"vcvt.s32.f32 q9, q5 \n\t"846"vcvt.s32.f32 q10, q6 \n\t"847"vqmovun.s32 d22, q7 \n\t"848"vqmovun.s32 d23, q8 \n\t"849"vqmovun.s32 d24, q9 \n\t"850"vqmovun.s32 d25, q10 \n\t"851"vst1.16 {d22-d23}, [%[dst1]] \n\t"852"vst1.16 {d24-d25}, [%[dst2]] \n\t"853: /*no output*/854: [src] "r" (_src + i),855[dst1] "r" (_dst + i + 0),856[dst2] "r" (_dst + i + 8),857"w" (vscale), "w" (vshift)858: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"859);860}861})862#else863CVTS_FUNC(s8, u16, 16,864float32x4_t vscale = vdupq_n_f32((f32)alpha);865float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,866{867for (size_t i = 0; i < w; i += 16)868{869internal::prefetch(_src + i);870int8x16_t vline = vld1q_s8(_src + i);871int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline));872int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));873int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16));874int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));875int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16));876int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));877float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);878float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);879float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);880float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);881vline1_f32 = vmulq_f32(vline1_f32, vscale);882vline2_f32 = 
vmulq_f32(vline2_f32, vscale);
        // (continuation of the preceding conversion's intrinsics loop; the
        //  start of that block lies outside this chunk)
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        vline3_s32 = vcvtq_s32_f32(vline3_f32);
        vline4_s32 = vcvtq_s32_f32(vline4_f32);
        // saturating narrow s32 -> u16 and store 16 results per iteration
        uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
        uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
        vst1q_u16(_dst + i + 0, vRes1_u16);
        vst1q_u16(_dst + i + 8, vRes2_u16);
    }
})
#endif

// convertScale s8 -> s16: dst = saturate_cast<s16>(src * alpha + beta).
// The +0.5f folded into the shift gives round-to-nearest for the later
// truncating float->int conversion (vcvt truncates toward zero; correct for
// non-negative rounded values).  First branch: hand-scheduled inline asm for
// 32-bit ARM GCC; second branch: plain NEON intrinsics.
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
CVTS_FUNC(s8, s16, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]]              \n\t"
            "vmovl.s8 q3, d4                       \n\t"
            "vmovl.s8 q4, d5                       \n\t"
            "vmovl.s16 q5, d6                      \n\t"
            "vmovl.s16 q6, d7                      \n\t"
            "vmovl.s16 q7, d8                      \n\t"
            "vmovl.s16 q8, d9                      \n\t"
            "vcvt.f32.s32 q9, q5                   \n\t"
            "vcvt.f32.s32 q10, q6                  \n\t"
            "vcvt.f32.s32 q11, q7                  \n\t"
            "vcvt.f32.s32 q12, q8                  \n\t"
            "vmul.f32 q13, q9, q0                  \n\t"
            "vmul.f32 q14, q10, q0                 \n\t"
            "vmul.f32 q15, q11, q0                 \n\t"
            "vmul.f32 q2, q12, q0                  \n\t"
            "vadd.f32 q3, q13, q1                  \n\t"
            "vadd.f32 q4, q14, q1                  \n\t"
            "vadd.f32 q5, q15, q1                  \n\t"
            "vadd.f32 q6, q2, q1                   \n\t"
            "vcvt.s32.f32 q7, q3                   \n\t"
            "vcvt.s32.f32 q8, q4                   \n\t"
            "vcvt.s32.f32 q9, q5                   \n\t"
            "vcvt.s32.f32 q10, q6                  \n\t"
            "vqmovn.s32 d22, q7                    \n\t"
            "vqmovn.s32 d23, q8                    \n\t"
            "vqmovn.s32 d24, q9                    \n\t"
            "vqmovn.s32 d25, q10                   \n\t"
            "vst1.16 {d22-d23}, [%[dst1]]          \n\t"
            "vst1.16 {d24-d25}, [%[dst2]]          \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 8),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
#else
CVTS_FUNC(s8, s16, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        // widen s8 -> s16 -> s32, scale/shift in f32, narrow back with saturation
        int8x16_t vline = vld1q_s8(_src + i);
        int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline));
        int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
        int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16));
        int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
        float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        vline3_s32 = vcvtq_s32_f32(vline3_f32);
        vline4_s32 = vcvtq_s32_f32(vline4_f32);
        int16x8_t vRes1_s16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
        int16x8_t vRes2_s16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
        vst1q_s16(_dst + i + 0, vRes1_s16);
        vst1q_s16(_dst + i + 8, vRes2_s16);
    }
})
#endif

// convertScale s8 -> s32.  Asm path only for GCC 4.x (< 4.7) on 32-bit ARM,
// where the compiler's intrinsics codegen was poor.
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(s8, s32, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]]              \n\t"
            "vmovl.s8 q3, d4                       \n\t"
            "vmovl.s8 q4, d5                       \n\t"
            "vmovl.s16 q5, d6                      \n\t"
            "vmovl.s16 q6, d7                      \n\t"
            "vmovl.s16 q7, d8                      \n\t"
            "vmovl.s16 q8, d9                      \n\t"
            "vcvt.f32.s32 q9, q5                   \n\t"
            "vcvt.f32.s32 q10, q6                  \n\t"
            "vcvt.f32.s32 q11, q7                  \n\t"
            "vcvt.f32.s32 q12, q8                  \n\t"
            "vmul.f32 q13, q9, q0                  \n\t"
            "vmul.f32 q14, q10, q0                 \n\t"
            "vmul.f32 q15, q11, q0                 \n\t"
            "vmul.f32 q2, q12, q0                  \n\t"
            "vadd.f32 q3, q13, q1                  \n\t"
            "vadd.f32 q4, q14, q1                  \n\t"
            "vadd.f32 q5, q15, q1                  \n\t"
            "vadd.f32 q6, q2, q1                   \n\t"
            "vcvt.s32.f32 q7, q3                   \n\t"
            "vcvt.s32.f32 q8, q4                   \n\t"
            "vcvt.s32.f32 q9, q5                   \n\t"
            "vcvt.s32.f32 q10, q6                  \n\t"
            "vst1.32 {d14-d15}, [%[dst1]]          \n\t"
            "vst1.32 {d16-d17}, [%[dst2]]          \n\t"
            "vst1.32 {d18-d19}, [%[dst3]]          \n\t"
            "vst1.32 {d20-d21}, [%[dst4]]          \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              [dst3] "r" (_dst + i + 8),
              [dst4] "r" (_dst + i + 12),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10",
              "d11","d12","d13","d14","d15","d16","d17",
              "d18","d19","d20","d21","d22","d23","d24",
              "d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
#else
CVTS_FUNC(s8, s32, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        int8x16_t vline = vld1q_s8(_src + i);
        int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline));
        int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
        int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16));
        int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
        float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        vline3_s32 = vcvtq_s32_f32(vline3_f32);
        vline4_s32 = vcvtq_s32_f32(vline4_f32);
        vst1q_s32(_dst + i + 0,  vline1_s32);
        vst1q_s32(_dst + i + 4,  vline2_s32);
        vst1q_s32(_dst + i + 8,  vline3_s32);
        vst1q_s32(_dst + i + 12, vline4_s32);
    }
})
#endif

// convertScale s8 -> f32.  No +0.5f here: the result stays in float, so no
// rounding correction is needed.
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(s8, f32, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]]              \n\t"
            "vmovl.s8 q3, d4                       \n\t"
            "vmovl.s8 q4, d5                       \n\t"
            "vmovl.s16 q5, d6                      \n\t"
            "vmovl.s16 q6, d7                      \n\t"
            "vmovl.s16 q7, d8                      \n\t"
            "vmovl.s16 q8, d9                      \n\t"
            "vcvt.f32.s32 q9, q5                   \n\t"
            "vcvt.f32.s32 q10, q6                  \n\t"
            "vcvt.f32.s32 q11, q7                  \n\t"
            "vcvt.f32.s32 q12, q8                  \n\t"
            "vmul.f32 q13, q9, q0                  \n\t"
            "vmul.f32 q14, q10, q0                 \n\t"
            "vmul.f32 q15, q11, q0                 \n\t"
            "vmul.f32 q2, q12, q0                  \n\t"
            "vadd.f32 q3, q13, q1                  \n\t"
            "vadd.f32 q4, q14, q1                  \n\t"
            "vadd.f32 q5, q15, q1                  \n\t"
            "vadd.f32 q6, q2, q1                   \n\t"
            "vst1.32 {d6-d7}, [%[dst1]]            \n\t"
            "vst1.32 {d8-d9}, [%[dst2]]            \n\t"
            "vst1.32 {d10-d11}, [%[dst3]]          \n\t"
            "vst1.32 {d12-d13}, [%[dst4]]          \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              [dst3] "r" (_dst + i + 8),
              [dst4] "r" (_dst + i + 12),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10",
              "d11","d12","d13","d14","d15","d16","d17",
              "d18","d19","d20","d21","d22","d23","d24",
              "d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
#else
CVTS_FUNC(s8, f32, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        int8x16_t vline = vld1q_s8(_src + i);
        int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline));
        int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
        int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16));
        int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
        float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        vst1q_f32(_dst + i + 0,  vline1_f32);
        vst1q_f32(_dst + i + 4,  vline2_f32);
        vst1q_f32(_dst + i + 8,  vline3_f32);
        vst1q_f32(_dst + i + 12, vline4_f32);
    }
})
#endif

// convertScale u16 -> u8: double saturating narrow (s32 -> s16 -> u8).
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(u16, u8, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src1]]             \n\t"
            "vmovl.u16 q3, d4                      \n\t"
            "vmovl.u16 q4, d5                      \n\t"
            "vcvt.f32.u32 q5, q3                   \n\t"
            "vcvt.f32.u32 q6, q4                   \n\t"
            "vmul.f32 q7, q5, q0                   \n\t"
            "vmul.f32 q8, q6, q0                   \n\t"
            "vadd.f32 q9, q7, q1                   \n\t"
            "vadd.f32 q10, q8, q1                  \n\t"
            "vcvt.s32.f32 q11, q9                  \n\t"
            "vcvt.s32.f32 q12, q10                 \n\t"
            "vqmovn.s32 d26, q11                   \n\t"
            "vqmovn.s32 d27, q12                   \n\t"
            "vqmovun.s16 d28, q13                  \n\t"
            "vst1.8 {d28}, [%[dst]]                \n\t"
            : /*no output*/
            : [src1] "r" (_src + i),
              [dst] "r" (_dst + i + 0),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28"
        );
    }
})
#else
CVTS_FUNC(u16, u8, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        uint16x8_t vline = vld1q_u16(_src + i);
        uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline));
        uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
        float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int16x4_t vRes1 = vqmovn_s32(vline1_s32);
        int16x4_t vRes2 = vqmovn_s32(vline2_s32);
        uint8x8_t vRes = vqmovun_s16(vcombine_s16(vRes1, vRes2));
        vst1_u8(_dst + i, vRes);
    }
})
#endif

// convertScale u16 -> s8.
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(u16, s8, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src1]]             \n\t"
            "vmovl.u16 q3, d4                      \n\t"
            "vmovl.u16 q4, d5                      \n\t"
            "vcvt.f32.u32 q5, q3                   \n\t"
            "vcvt.f32.u32 q6, q4                   \n\t"
            "vmul.f32 q7, q5, q0                   \n\t"
            "vmul.f32 q8, q6, q0                   \n\t"
            "vadd.f32 q9, q7, q1                   \n\t"
            "vadd.f32 q10, q8, q1                  \n\t"
            "vcvt.s32.f32 q11, q9                  \n\t"
            "vcvt.s32.f32 q12, q10                 \n\t"
            "vqmovn.s32 d26, q11                   \n\t"
            "vqmovn.s32 d27, q12                   \n\t"
            "vqmovn.s16 d28, q13                   \n\t"
            "vst1.8 {d28}, [%[dst]]                \n\t"
            : /*no output*/
            : [src1] "r" (_src + i),
              [dst] "r" (_dst + i + 0),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28"
        );
    }
})
#else
CVTS_FUNC(u16, s8, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        uint16x8_t vline = vld1q_u16(_src + i);
        uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline));
        uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
        float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int16x4_t vRes1 = vqmovn_s32(vline1_s32);
        int16x4_t vRes2 = vqmovn_s32(vline2_s32);
        int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
        vst1_s8(_dst + i, vRes);
    }
})
#endif

// convertScale u16 -> u16 (in-type scaling).
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC1(u16, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.16 {d4-d5}, [%[src]]             \n\t"
            "vmovl.u16 q3, d4                      \n\t"
            "vmovl.u16 q4, d5                      \n\t"
            "vcvt.f32.u32 q5, q3                   \n\t"
            "vcvt.f32.u32 q6, q4                   \n\t"
            "vmul.f32 q7, q5, q0                   \n\t"
            "vmul.f32 q8, q6, q0                   \n\t"
            "vadd.f32 q9, q7, q1                   \n\t"
            "vadd.f32 q10, q8, q1                  \n\t"
            "vcvt.s32.f32 q11, q9                  \n\t"
            "vcvt.s32.f32 q12, q10                 \n\t"
            "vqmovun.s32 d26, q11                  \n\t"
            "vqmovun.s32 d27, q12                  \n\t"
            "vst1.16 {d26-d27}, [%[dst]]           \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst] "r" (_dst + i + 0),
              "w" (vshift), "w" (vscale)
            // NOTE(review): d4/d5 are written by the vld1.16 above but are not
            // in this clobber list — verify against upstream before touching.
            : "d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27"
        );
    }
})
#else
CVTS_FUNC1(u16, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        uint16x8_t vline = vld1q_u16(_src + i);
        uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline));
        uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
        float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
        uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
        vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
    }
})
#endif

// convertScale u16 -> s16.
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(u16, s16, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.16 {d4-d5}, [%[src]]             \n\t"
            "vmovl.u16 q3, d4                      \n\t"
            "vmovl.u16 q4, d5                      \n\t"
            "vcvt.f32.u32 q5, q3                   \n\t"
            "vcvt.f32.u32 q6, q4                   \n\t"
            "vmul.f32 q7, q5, q0                   \n\t"
            "vmul.f32 q8, q6, q0                   \n\t"
            "vadd.f32 q9, q7, q1                   \n\t"
            "vadd.f32 q10, q8, q1                  \n\t"
            "vcvt.s32.f32 q11, q9                  \n\t"
            "vcvt.s32.f32 q12, q10                 \n\t"
            "vqmovn.s32 d26, q11                   \n\t"
            "vqmovn.s32 d27, q12                   \n\t"
            "vst1.16 {d26-d27}, [%[dst]]           \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst] "r" (_dst + i + 0),
              "w" (vshift), "w" (vscale)
            // NOTE(review): d4/d5 written by vld1.16 but absent from the
            // clobber list — verify against upstream.
            : "d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27"
        );
    }
})
#else
CVTS_FUNC(u16, s16, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        uint16x8_t vline = vld1q_u16(_src + i);
        uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline));
        uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
        float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int16x4_t vRes1 = vqmovn_s32(vline1_s32);
        int16x4_t vRes2 = vqmovn_s32(vline2_s32);
        vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
    }
})
#endif

// convertScale u16 -> s32: no narrowing store, results written as two q-regs.
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(u16, s32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.16 {d4-d5}, [%[src]]             \n\t"
            "vmovl.u16 q3, d4                      \n\t"
            "vmovl.u16 q4, d5                      \n\t"
            "vcvt.f32.u32 q5, q3                   \n\t"
            "vcvt.f32.u32 q6, q4                   \n\t"
            "vmul.f32 q7, q5, q0                   \n\t"
            "vmul.f32 q8, q6, q0                   \n\t"
            "vadd.f32 q9, q7, q1                   \n\t"
            "vadd.f32 q10, q8, q1                  \n\t"
            "vcvt.s32.f32 q11, q9                  \n\t"
            "vcvt.s32.f32 q12, q10                 \n\t"
            "vst1.32 {d22-d23}, [%[dst1]]          \n\t"
            "vst1.32 {d24-d25}, [%[dst2]]          \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i),
              [dst2] "r" (_dst + i + 4),
              "w" (vshift), "w" (vscale)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25"
        );
    }
})
#else
CVTS_FUNC(u16, s32, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        uint16x8_t vline = vld1q_u16(_src + i);
        uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline));
        uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
        float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        vst1q_s32(_dst + i + 0, vline1_s32);
        vst1q_s32(_dst + i + 4, vline2_s32);
    }
})
#endif

// convertScale u16 -> f32 (no rounding correction; float result).
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(u16, f32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.16 {d4-d5}, [%[src]]             \n\t"
            "vmovl.u16 q3, d4                      \n\t"
            "vmovl.u16 q4, d5                      \n\t"
            "vcvt.f32.u32 q5, q3                   \n\t"
            "vcvt.f32.u32 q6, q4                   \n\t"
            "vmul.f32 q7, q5, q0                   \n\t"
            "vmul.f32 q8, q6, q0                   \n\t"
            "vadd.f32 q9, q7, q1                   \n\t"
            "vadd.f32 q10, q8, q1                  \n\t"
            "vst1.32 {d18-d19}, [%[dst1]]          \n\t"
            "vst1.32 {d20-d21}, [%[dst2]]          \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21"
        );
    }
})
#else
CVTS_FUNC(u16, f32, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        uint16x8_t vline = vld1q_u16(_src + i);
        uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline));
        uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
        float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vst1q_f32(_dst + i + 0, vline1_f32);
        vst1q_f32(_dst + i + 4, vline2_f32);
    }
})
#endif

// convertScale s16 -> u8.
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(s16, u8, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src1]]             \n\t"
            "vmovl.s16 q3, d4                      \n\t"
            "vmovl.s16 q4, d5                      \n\t"
            "vcvt.f32.s32 q5, q3                   \n\t"
            "vcvt.f32.s32 q6, q4                   \n\t"
            "vmul.f32 q7, q5, q0                   \n\t"
            "vmul.f32 q8, q6, q0                   \n\t"
            "vadd.f32 q9, q7, q1                   \n\t"
            "vadd.f32 q10, q8, q1                  \n\t"
            "vcvt.s32.f32 q11, q9                  \n\t"
            "vcvt.s32.f32 q12, q10                 \n\t"
            "vqmovn.s32 d26, q11                   \n\t"
            "vqmovn.s32 d27, q12                   \n\t"
            "vqmovun.s16 d28, q13                  \n\t"
            "vst1.8 {d28}, [%[dst]]                \n\t"
            : /*no output*/
            : [src1] "r" (_src + i),
              [dst] "r" (_dst + i + 0),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28"
        );
    }
})
#else
CVTS_FUNC(s16, u8, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int16x8_t vline = vld1q_s16(_src + i);
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int16x4_t vRes1 = vqmovn_s32(vline1_s32);
        int16x4_t vRes2 = vqmovn_s32(vline2_s32);
        uint8x8_t vRes = vqmovun_s16(vcombine_s16(vRes1, vRes2));
        vst1_u8(_dst + i, vRes);
    }
})
#endif

// convertScale s16 -> s8.
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(s16, s8, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src1]]             \n\t"
            "vmovl.s16 q3, d4                      \n\t"
            "vmovl.s16 q4, d5                      \n\t"
            "vcvt.f32.s32 q5, q3                   \n\t"
            "vcvt.f32.s32 q6, q4                   \n\t"
            "vmul.f32 q7, q5, q0                   \n\t"
            "vmul.f32 q8, q6, q0                   \n\t"
            "vadd.f32 q9, q7, q1                   \n\t"
            "vadd.f32 q10, q8, q1                  \n\t"
            "vcvt.s32.f32 q11, q9                  \n\t"
            "vcvt.s32.f32 q12, q10                 \n\t"
            "vqmovn.s32 d26, q11                   \n\t"
            "vqmovn.s32 d27, q12                   \n\t"
            "vqmovn.s16 d28, q13                   \n\t"
            "vst1.8 {d28}, [%[dst]]                \n\t"
            : /*no output*/
            : [src1] "r" (_src + i),
              [dst] "r" (_dst + i + 0),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28"
        );
    }
})
#else
CVTS_FUNC(s16, s8, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int16x8_t vline = vld1q_s16(_src + i);
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int16x4_t vRes1 = vqmovn_s32(vline1_s32);
        int16x4_t vRes2 = vqmovn_s32(vline2_s32);
        int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
        vst1_s8(_dst + i, vRes);
    }
})
#endif

// convertScale s16 -> u16.
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(s16, u16, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.16 {d4-d5}, [%[src]]             \n\t"
            "vmovl.s16 q3, d4                      \n\t"
            "vmovl.s16 q4, d5                      \n\t"
            "vcvt.f32.s32 q5, q3                   \n\t"
            "vcvt.f32.s32 q6, q4                   \n\t"
            "vmul.f32 q7, q5, q0                   \n\t"
            "vmul.f32 q8, q6, q0                   \n\t"
            "vadd.f32 q9, q7, q1                   \n\t"
            "vadd.f32 q10, q8, q1                  \n\t"
            "vcvt.s32.f32 q11, q9                  \n\t"
            "vcvt.s32.f32 q12, q10                 \n\t"
            "vqmovun.s32 d26, q11                  \n\t"
            "vqmovun.s32 d27, q12                  \n\t"
            "vst1.16 {d26-d27}, [%[dst]]           \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst] "r" (_dst + i + 0),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27"
        );
    }
})
#else
CVTS_FUNC(s16, u16, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int16x8_t vline = vld1q_s16(_src + i);
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
        uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
        vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
    }
})
#endif

// convertScale s16 -> s16 (in-type scaling).
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC1(s16, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.16 {d4-d5}, [%[src]]             \n\t"
            "vmovl.s16 q3, d4                      \n\t"
            "vmovl.s16 q4, d5                      \n\t"
            "vcvt.f32.s32 q5, q3                   \n\t"
            "vcvt.f32.s32 q6, q4                   \n\t"
            "vmul.f32 q7, q5, q0                   \n\t"
            "vmul.f32 q8, q6, q0                   \n\t"
            "vadd.f32 q9, q7, q1                   \n\t"
            "vadd.f32 q10, q8, q1                  \n\t"
            "vcvt.s32.f32 q11, q9                  \n\t"
            "vcvt.s32.f32 q12, q10                 \n\t"
            "vqmovn.s32 d26, q11                   \n\t"
            "vqmovn.s32 d27, q12                   \n\t"
            "vst1.16 {d26-d27}, [%[dst]]           \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst] "r" (_dst + i + 0),
              "w" (vshift), "w" (vscale)
            // NOTE(review): d4/d5 written by vld1.16 but absent from the
            // clobber list — verify against upstream.
            : "d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27"
        );
    }
})
#else
CVTS_FUNC1(s16, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int16x8_t vline = vld1q_s16(_src + i);
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int16x4_t vRes1 = vqmovn_s32(vline1_s32);
        int16x4_t vRes2 = vqmovn_s32(vline2_s32);
        vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
    }
})
#endif

// convertScale s16 -> s32.
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(s16, s32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.16 {d4-d5}, [%[src]]             \n\t"
            "vmovl.s16 q3, d4                      \n\t"
            "vmovl.s16 q4, d5                      \n\t"
            "vcvt.f32.s32 q5, q3                   \n\t"
            "vcvt.f32.s32 q6, q4                   \n\t"
            "vmul.f32 q7, q5, q0                   \n\t"
            "vmul.f32 q8, q6, q0                   \n\t"
            "vadd.f32 q9, q7, q1                   \n\t"
            "vadd.f32 q10, q8, q1                  \n\t"
            "vcvt.s32.f32 q11, q9                  \n\t"
            "vcvt.s32.f32 q12, q10                 \n\t"
            "vst1.32 {d22-d23}, [%[dst1]]          \n\t"
            "vst1.32 {d24-d25}, [%[dst2]]          \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25"
        );
    }
})
#else
CVTS_FUNC(s16, s32, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int16x8_t vline = vld1q_s16(_src + i);
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        vst1q_s32(_dst + i + 0, vline1_s32);
        vst1q_s32(_dst + i + 4, vline2_s32);
    }
})
#endif

// convertScale s16 -> f32 (no rounding correction; float result).
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(s16, f32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.16 {d4-d5}, [%[src]]             \n\t"
            "vmovl.s16 q3, d4                      \n\t"
            "vmovl.s16 q4, d5                      \n\t"
            "vcvt.f32.s32 q5, q3                   \n\t"
            "vcvt.f32.s32 q6, q4                   \n\t"
            "vmul.f32 q7, q5, q0                   \n\t"
            "vmul.f32 q8, q6, q0                   \n\t"
            "vadd.f32 q9, q7, q1                   \n\t"
            "vadd.f32 q10, q8, q1                  \n\t"
            "vst1.32 {d18-d19}, [%[dst1]]          \n\t"
            "vst1.32 {d20-d21}, [%[dst2]]          \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21"
        );
    }
})
#else
CVTS_FUNC(s16, f32, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int16x8_t vline = vld1q_s16(_src + i);
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vst1q_f32(_dst + i + 0, vline1_f32);
        vst1q_f32(_dst + i + 4, vline2_f32);
    }
})
#endif

// convertScale s32 -> u8: loads two q-registers directly (no widening needed).
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(s32, u8, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]]            \n\t"
            "vld1.32 {d6-d7}, [%[src2]]            \n\t"
            "vcvt.f32.s32 q4, q2                   \n\t"
            "vcvt.f32.s32 q5, q3                   \n\t"
            "vmul.f32 q6, q4, q0                   \n\t"
            "vmul.f32 q7, q5, q0                   \n\t"
            "vadd.f32 q8, q6, q1                   \n\t"
            "vadd.f32 q9, q7, q1                   \n\t"
            "vcvt.s32.f32 q10, q8                  \n\t"
            "vcvt.s32.f32 q11, q9                  \n\t"
            "vqmovun.s32 d24, q10                  \n\t"
            "vqmovun.s32 d25, q11                  \n\t"
            "vqmovn.u16 d26, q12                   \n\t"
            "vst1.8 {d26}, [%[dst]]                \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26"
        );
    }
})
#else
CVTS_FUNC(s32, u8, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int32x4_t vline1_s32 = vld1q_s32(_src + i + 0);
        int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
        uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
        uint8x8_t vRes = vqmovn_u16(vcombine_u16(vRes1, vRes2));
        vst1_u8(_dst + i, vRes);
    }
})
#endif

// convertScale s32 -> s8.
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(s32, s8, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]]            \n\t"
            "vld1.32 {d6-d7}, [%[src2]]            \n\t"
            "vcvt.f32.s32 q4, q2                   \n\t"
            "vcvt.f32.s32 q5, q3                   \n\t"
            "vmul.f32 q6, q4, q0                   \n\t"
            "vmul.f32 q7, q5, q0                   \n\t"
            "vadd.f32 q8, q6, q1                   \n\t"
            "vadd.f32 q9, q7, q1                   \n\t"
            "vcvt.s32.f32 q10, q8                  \n\t"
            "vcvt.s32.f32 q11, q9                  \n\t"
            "vqmovn.s32 d24, q10                   \n\t"
            "vqmovn.s32 d25, q11                   \n\t"
            "vqmovn.s16 d26, q12                   \n\t"
            "vst1.8 {d26}, [%[dst]]                \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26"
        );
    }
})
#else
CVTS_FUNC(s32, s8, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int32x4_t vline1_s32 = vld1q_s32(_src + i + 0);
        int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int16x4_t vRes1 = vqmovn_s32(vline1_s32);
        int16x4_t vRes2 = vqmovn_s32(vline2_s32);
        int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
        vst1_s8(_dst + i, vRes);
    }
})
#endif

// convertScale s32 -> u16.
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(s32, u16, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]]            \n\t"
            "vld1.32 {d6-d7}, [%[src2]]            \n\t"
            "vcvt.f32.s32 q4, q2                   \n\t"
            "vcvt.f32.s32 q5, q3                   \n\t"
            "vmul.f32 q6, q4, q0                   \n\t"
            "vmul.f32 q7, q5, q0                   \n\t"
            "vadd.f32 q8, q6, q1                   \n\t"
            "vadd.f32 q9, q7, q1                   \n\t"
            "vcvt.s32.f32 q10, q8                  \n\t"
            "vcvt.s32.f32 q11, q9                  \n\t"
            "vqmovun.s32 d24, q10                  \n\t"
            "vqmovun.s32 d25, q11                  \n\t"
            "vst1.16 {d24-d25}, [%[dst]]           \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25"
        );
    }
})
#else
CVTS_FUNC(s32, u16, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int32x4_t vline1_s32 = vld1q_s32(_src + i + 0);
        int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
        uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
        vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
    }
})
#endif

#if 
!defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)2010CVTS_FUNC(s32, s16, 8,2011register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);2012register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,2013{2014for (size_t i = 0; i < w; i += 8)2015{2016internal::prefetch(_src + i);2017__asm__ (2018"vld1.32 {d4-d5}, [%[src1]] \n\t"2019"vld1.32 {d6-d7}, [%[src2]] \n\t"2020"vcvt.f32.s32 q4, q2 \n\t"2021"vcvt.f32.s32 q5, q3 \n\t"2022"vmul.f32 q6, q4, q0 \n\t"2023"vmul.f32 q7, q5, q0 \n\t"2024"vadd.f32 q8, q6, q1 \n\t"2025"vadd.f32 q9, q7, q1 \n\t"2026"vcvt.s32.f32 q10, q8 \n\t"2027"vcvt.s32.f32 q11, q9 \n\t"2028"vqmovn.s32 d24, q10 \n\t"2029"vqmovn.s32 d25, q11 \n\t"2030"vst1.8 {d24-d25}, [%[dst]] \n\t"2031: /*no output*/2032: [src1] "r" (_src + i + 0),2033[src2] "r" (_src + i + 4),2034[dst] "r" (_dst + i),2035"w" (vscale), "w" (vshift)2036: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25"2037);2038}2039})2040#else2041CVTS_FUNC(s32, s16, 8,2042float32x4_t vscale = vdupq_n_f32((f32)alpha);2043float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,2044{2045for (size_t i = 0; i < w; i += 8)2046{2047internal::prefetch(_src + i);2048int32x4_t vline1_s32 = vld1q_s32(_src + i + 0);2049int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);2050float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);2051float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);2052vline1_f32 = vmulq_f32(vline1_f32, vscale);2053vline2_f32 = vmulq_f32(vline2_f32, vscale);2054vline1_f32 = vaddq_f32(vline1_f32, vshift);2055vline2_f32 = vaddq_f32(vline2_f32, vshift);2056vline1_s32 = vcvtq_s32_f32(vline1_f32);2057vline2_s32 = vcvtq_s32_f32(vline2_f32);2058int16x4_t vRes1 = vqmovn_s32(vline1_s32);2059int16x4_t vRes2 = vqmovn_s32(vline2_s32);2060vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));2061}2062})2063#endif20642065#if !defined(__aarch64__) && defined(__GNUC__) && 
__GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)2066CVTS_FUNC1(s32, 8,2067register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);2068register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,2069{2070for (size_t i = 0; i < w; i += 8)2071{2072internal::prefetch(_src + i);2073__asm__ (2074"vld1.32 {d4-d5}, [%[src1]] \n\t"2075"vld1.32 {d6-d7}, [%[src2]] \n\t"2076"vcvt.f32.s32 q4, q2 \n\t"2077"vcvt.f32.s32 q5, q3 \n\t"2078"vmul.f32 q6, q4, q0 \n\t"2079"vmul.f32 q7, q5, q0 \n\t"2080"vadd.f32 q8, q6, q1 \n\t"2081"vadd.f32 q9, q7, q1 \n\t"2082"vcvt.s32.f32 q10, q8 \n\t"2083"vcvt.s32.f32 q11, q9 \n\t"2084"vst1.32 {d20-d21}, [%[dst1]] \n\t"2085"vst1.32 {d22-d23}, [%[dst2]] \n\t"2086: /*no output*/2087: [src1] "r" (_src + i + 0),2088[src2] "r" (_src + i + 4),2089[dst1] "r" (_dst + i + 0),2090[dst2] "r" (_dst + i + 4),2091"w" (vscale), "w" (vshift)2092: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"2093);2094}2095})2096#else2097CVTS_FUNC1(s32, 8,2098float32x4_t vscale = vdupq_n_f32((f32)alpha);2099float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,2100{2101for (size_t i = 0; i < w; i += 8)2102{2103internal::prefetch(_src + i);2104int32x4_t vline1_s32 = vld1q_s32(_src + i + 0);2105int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);2106float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);2107float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);2108vline1_f32 = vmulq_f32(vline1_f32, vscale);2109vline2_f32 = vmulq_f32(vline2_f32, vscale);2110vline1_f32 = vaddq_f32(vline1_f32, vshift);2111vline2_f32 = vaddq_f32(vline2_f32, vshift);2112vline1_s32 = vcvtq_s32_f32(vline1_f32);2113vline2_s32 = vcvtq_s32_f32(vline2_f32);2114vst1q_s32(_dst + i + 0, vline1_s32);2115vst1q_s32(_dst + i + 4, vline2_s32);2116}2117})2118#endif21192120#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)2121CVTS_FUNC(s32, f32, 8,2122register float32x4_t vscale asm 
("q0") = vdupq_n_f32((f32)alpha);2123register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,2124{2125for (size_t i = 0; i < w; i += 8)2126{2127internal::prefetch(_src + i);2128__asm__ (2129"vld1.32 {d4-d5}, [%[src1]] \n\t"2130"vld1.32 {d6-d7}, [%[src2]] \n\t"2131"vcvt.f32.s32 q4, q2 \n\t"2132"vcvt.f32.s32 q5, q3 \n\t"2133"vmul.f32 q6, q4, q0 \n\t"2134"vmul.f32 q7, q5, q0 \n\t"2135"vadd.f32 q8, q6, q1 \n\t"2136"vadd.f32 q9, q7, q1 \n\t"2137"vst1.32 {d16-d17}, [%[dst1]] \n\t"2138"vst1.32 {d18-d19}, [%[dst2]] \n\t"2139: /*no output*/2140: [src1] "r" (_src + i),2141[src2] "r" (_src + i + 4),2142[dst1] "r" (_dst + i),2143[dst2] "r" (_dst + i + 4),2144"w" (vscale), "w" (vshift)2145: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"2146);2147}2148})2149#else2150CVTS_FUNC(s32, f32, 8,2151float32x4_t vscale = vdupq_n_f32((f32)alpha);2152float32x4_t vshift = vdupq_n_f32((f32)beta);,2153{2154for (size_t i = 0; i < w; i += 8)2155{2156internal::prefetch(_src + i);2157int32x4_t vline1_s32 = vld1q_s32(_src + i + 0);2158int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);2159float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);2160float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);2161vline1_f32 = vmulq_f32(vline1_f32, vscale);2162vline2_f32 = vmulq_f32(vline2_f32, vscale);2163vline1_f32 = vaddq_f32(vline1_f32, vshift);2164vline2_f32 = vaddq_f32(vline2_f32, vshift);2165vst1q_f32(_dst + i + 0, vline1_f32);2166vst1q_f32(_dst + i + 4, vline2_f32);2167}2168})2169#endif21702171#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)2172CVTS_FUNC(f32, u8, 8,2173register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)((1 << 16)*alpha));2174register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)((1 << 16)*beta));2175register uint32x4_t vmask asm ("q2") = vdupq_n_u32(1<<16);,2176{2177for (size_t i = 0; i < w; i += 8)2178{2179internal::prefetch(_src + i);2180__asm__ (2181"vld1.32 {d6-d7}, 
[%[src1]] \n\t"2182"vld1.32 {d8-d9}, [%[src2]] \n\t"2183"vmul.f32 q5, q3, q0 \n\t"2184"vmul.f32 q6, q4, q0 \n\t"2185"vadd.f32 q7, q5, q1 \n\t"2186"vadd.f32 q8, q6, q1 \n\t"2187"vcvt.u32.f32 q9, q7 \n\t"2188"vcvt.u32.f32 q10, q8 \n\t"2189"vbic q11, q2, q6 \n\t"2190"vbic q12, q2, q7 \n\t"2191"vshr.u32 q13, q11, #16 \n\t"2192"vshr.u32 q14, q12, #16 \n\t"2193"vqsub.u32 q7, q9, q13 \n\t"2194"vqsub.u32 q8, q10, q14 \n\t"2195"vqrshrn.u32 d22, q7, #16 \n\t"2196"vqrshrn.u32 d23, q8, #16 \n\t"2197"vqmovn.u16 d30, q11 \n\t"2198"vst1.8 {d30}, [%[dst]] \n\t"2199: /*no output*/2200: [src1] "r" (_src + i + 0),2201[src2] "r" (_src + i + 4),2202[dst] "r" (_dst + i),2203"w" (vscale), "w" (vshift), "w" (vmask)2204: "d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30"2205);2206}2207})2208#else2209CVTS_FUNC(f32, u8, 8,2210float32x4_t vscale = vdupq_n_f32((f32)((1 << 16)*alpha));2211float32x4_t vshift = vdupq_n_f32((f32)((1 << 16)*beta));2212uint32x4_t vmask = vdupq_n_u32(1<<16);,2213{2214for (size_t i = 0; i < w; i += 8)2215{2216internal::prefetch(_src + i);2217float32x4_t vline1_f32 = vld1q_f32(_src + i + 0);2218float32x4_t vline2_f32 = vld1q_f32(_src + i + 4);22192220vline1_f32 = vmulq_f32(vline1_f32, vscale);2221vline2_f32 = vmulq_f32(vline2_f32, vscale);2222float32x4_t vline1Shifted_f32 = vaddq_f32(vline1_f32, vshift);2223float32x4_t vline2Shifted_f32 = vaddq_f32(vline2_f32, vshift);2224uint32x4_t vline1_u32 = vcvtq_u32_f32(vline1Shifted_f32);2225uint32x4_t vline2_u32 = vcvtq_u32_f32(vline2Shifted_f32);2226uint32x4_t vline1Mask = vbicq_u32(vmask, vreinterpretq_u32_f32(vline2_f32));2227uint32x4_t vline2Mask = vbicq_u32(vmask, vreinterpretq_u32_f32(vline1Shifted_f32));2228vline1Mask = vshrq_n_u32(vline1Mask, 16);2229vline2Mask = vshrq_n_u32(vline2Mask, 16);2230vline1_u32 = vqsubq_u32(vline1_u32, vline1Mask);2231vline2_u32 = vqsubq_u32(vline2_u32, vline2Mask);2232uint16x4_t vRes1 = 
vqrshrn_n_u32(vline1_u32, 16);2233uint16x4_t vRes2 = vqrshrn_n_u32(vline2_u32, 16);2234uint8x8_t vRes = vqmovn_u16(vcombine_u16(vRes1, vRes2));22352236vst1_u8(_dst + i, vRes);2237}2238})2239#endif22402241#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)2242CVTS_FUNC(f32, s8, 8,2243register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);2244register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,2245{2246for (size_t i = 0; i < w; i += 8)2247{2248internal::prefetch(_src + i);2249__asm__ (2250"vld1.32 {d4-d5}, [%[src1]] \n\t"2251"vld1.32 {d6-d7}, [%[src2]] \n\t"2252"vmul.f32 q4, q2, q0 \n\t"2253"vmul.f32 q5, q3, q0 \n\t"2254"vadd.f32 q6, q4, q1 \n\t"2255"vadd.f32 q7, q5, q1 \n\t"2256"vcvt.s32.f32 q8, q6 \n\t"2257"vcvt.s32.f32 q9, q7 \n\t"2258"vqmovn.s32 d14, q8 \n\t"2259"vqmovn.s32 d15, q9 \n\t"2260"vqmovn.s16 d16, q7 \n\t"2261"vst1.8 {d16}, [%[dst]] \n\t"2262: /*no output*/2263: [src1] "r" (_src + i + 0),2264[src2] "r" (_src + i + 4),2265[dst] "r" (_dst + i),2266"w" (vscale), "w" (vshift)2267: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"2268);2269}2270})2271#else2272CVTS_FUNC(f32, s8, 8,2273float32x4_t vscale = vdupq_n_f32((f32)alpha);2274float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,2275{2276for (size_t i = 0; i < w; i += 8)2277{2278internal::prefetch(_src + i);2279float32x4_t vline1_f32 = vld1q_f32(_src + i + 0);2280float32x4_t vline2_f32 = vld1q_f32(_src + i + 4);2281vline1_f32 = vmulq_f32(vline1_f32, vscale);2282vline2_f32 = vmulq_f32(vline2_f32, vscale);2283vline1_f32 = vaddq_f32(vline1_f32, vshift);2284vline2_f32 = vaddq_f32(vline2_f32, vshift);2285int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);2286int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);2287int16x4_t vRes1 = vqmovn_s32(vline1_s32);2288int16x4_t vRes2 = vqmovn_s32(vline2_s32);2289int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));2290vst1_s8(_dst + i, 
vRes);2291}2292})2293#endif22942295#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)2296CVTS_FUNC(f32, u16, 8,2297register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);2298register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,2299{2300for (size_t i = 0; i < w; i += 8)2301{2302internal::prefetch(_src + i);2303__asm__ (2304"vld1.32 {d4-d5}, [%[src1]] \n\t"2305"vld1.32 {d6-d7}, [%[src2]] \n\t"2306"vmul.f32 q4, q2, q0 \n\t"2307"vmul.f32 q5, q3, q0 \n\t"2308"vadd.f32 q6, q4, q1 \n\t"2309"vadd.f32 q7, q5, q1 \n\t"2310"vcvt.u32.f32 q8, q6 \n\t"2311"vcvt.u32.f32 q9, q7 \n\t"2312"vqmovn.u32 d8, q8 \n\t"2313"vqmovn.u32 d9, q9 \n\t"2314"vst1.16 {d8-d9}, [%[dst]] \n\t"2315: /*no output*/2316: [src1] "r" (_src + i + 0),2317[src2] "r" (_src + i + 4),2318[dst] "r" (_dst + i),2319"w" (vscale), "w" (vshift)2320: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"2321);2322}2323})2324#else2325CVTS_FUNC(f32, u16, 8,2326float32x4_t vscale = vdupq_n_f32((f32)alpha);2327float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,2328{2329for (size_t i = 0; i < w; i += 8)2330{2331internal::prefetch(_src + i);2332float32x4_t vline1_f32 = vld1q_f32(_src + i + 0);2333float32x4_t vline2_f32 = vld1q_f32(_src + i + 4);2334vline1_f32 = vmulq_f32(vline1_f32, vscale);2335vline2_f32 = vmulq_f32(vline2_f32, vscale);2336vline1_f32 = vaddq_f32(vline1_f32, vshift);2337vline2_f32 = vaddq_f32(vline2_f32, vshift);2338uint32x4_t vline1_u32 = vcvtq_u32_f32(vline1_f32);2339uint32x4_t vline2_u32 = vcvtq_u32_f32(vline2_f32);2340uint16x4_t vRes1 = vqmovn_u32(vline1_u32);2341uint16x4_t vRes2 = vqmovn_u32(vline2_u32);2342vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));2343}2344})2345#endif23462347#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)2348CVTS_FUNC(f32, s16, 8,2349register float32x4_t vscale asm ("q0") = 
vdupq_n_f32((f32)alpha);2350register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,2351{2352for (size_t i = 0; i < w; i += 8)2353{2354internal::prefetch(_src + i);2355__asm__ (2356"vld1.32 {d4-d5}, [%[src1]] \n\t"2357"vld1.32 {d6-d7}, [%[src2]] \n\t"2358"vmul.f32 q4, q2, q0 \n\t"2359"vmul.f32 q5, q3, q0 \n\t"2360"vadd.f32 q6, q4, q1 \n\t"2361"vadd.f32 q7, q5, q1 \n\t"2362"vcvt.s32.f32 q8, q6 \n\t"2363"vcvt.s32.f32 q9, q7 \n\t"2364"vqmovn.s32 d8, q8 \n\t"2365"vqmovn.s32 d9, q9 \n\t"2366"vst1.16 {d8-d9}, [%[dst]] \n\t"2367: /*no output*/2368: [src1] "r" (_src + i + 0),2369[src2] "r" (_src + i + 4),2370[dst] "r" (_dst + i),2371"w" (vscale), "w" (vshift)2372: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"2373);2374}2375})2376#else2377CVTS_FUNC(f32, s16, 8,2378float32x4_t vscale = vdupq_n_f32((f32)alpha);2379float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,2380{2381for (size_t i = 0; i < w; i += 8)2382{2383internal::prefetch(_src + i);2384float32x4_t vline1_f32 = vld1q_f32(_src + i + 0);2385float32x4_t vline2_f32 = vld1q_f32(_src + i + 4);2386vline1_f32 = vmulq_f32(vline1_f32, vscale);2387vline2_f32 = vmulq_f32(vline2_f32, vscale);2388vline1_f32 = vaddq_f32(vline1_f32, vshift);2389vline2_f32 = vaddq_f32(vline2_f32, vshift);2390int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);2391int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);2392int16x4_t vRes1 = vqmovn_s32(vline1_s32);2393int16x4_t vRes2 = vqmovn_s32(vline2_s32);2394vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));2395}2396})2397#endif23982399#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)2400CVTS_FUNC(f32, s32, 8,2401register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);2402register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,2403{2404for (size_t i = 0; i < w; i += 8)2405{2406internal::prefetch(_src + i);2407__asm__ (2408"vld1.32 {d4-d5}, [%[src1]] \n\t"2409"vld1.32 
{d6-d7}, [%[src2]] \n\t"2410"vmul.f32 q4, q2, q0 \n\t"2411"vmul.f32 q5, q3, q0 \n\t"2412"vadd.f32 q6, q4, q1 \n\t"2413"vadd.f32 q7, q5, q1 \n\t"2414"vcvt.s32.f32 q4, q6 \n\t"2415"vcvt.s32.f32 q5, q7 \n\t"2416"vst1.32 {d8-d9}, [%[dst1]] \n\t"2417"vst1.32 {d10-d11}, [%[dst2]] \n\t"2418: //no output2419: [src1] "r" (_src + i),2420[src2] "r" (_src + i + 4),2421[dst1] "r" (_dst + i),2422[dst2] "r" (_dst + i + 4),2423"w" (vscale), "w" (vshift)2424: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15"2425);2426}2427})2428#else2429CVTS_FUNC(f32, s32, 8,2430float32x4_t vscale = vdupq_n_f32((f32)alpha);2431float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,2432{2433for (size_t i = 0; i < w; i += 8)2434{2435internal::prefetch(_src + i);2436float32x4_t vline1_f32 = vld1q_f32(_src + i + 0);2437float32x4_t vline2_f32 = vld1q_f32(_src + i + 4);2438vline1_f32 = vmulq_f32(vline1_f32, vscale);2439vline2_f32 = vmulq_f32(vline2_f32, vscale);2440vline1_f32 = vaddq_f32(vline1_f32, vshift);2441vline2_f32 = vaddq_f32(vline2_f32, vshift);2442int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);2443int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);2444vst1q_s32(_dst + i + 0, vline1_s32);2445vst1q_s32(_dst + i + 4, vline2_s32);2446}2447})2448#endif24492450#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)2451CVTS_FUNC1(f32, 8,2452register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);2453register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,2454{2455for (size_t i = 0; i < w; i += 8)2456{2457internal::prefetch(_src + i);2458__asm__ (2459"vld1.32 {d4-d5}, [%[src1]] \n\t"2460"vld1.32 {d6-d7}, [%[src2]] \n\t"2461"vmul.f32 q4, q2, q0 \n\t"2462"vmul.f32 q5, q3, q0 \n\t"2463"vadd.f32 q6, q4, q1 \n\t"2464"vadd.f32 q7, q5, q1 \n\t"2465"vst1.32 {d12-d13}, [%[dst1]] \n\t"2466"vst1.32 {d14-d15}, [%[dst2]] \n\t"2467: /*no output*/2468: [src1] "r" (_src + i + 0),2469[src2] "r" (_src + i + 4),2470[dst1] "r" (_dst + i + 
0),2471[dst2] "r" (_dst + i + 4),2472"w" (vscale), "w" (vshift)2473: "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"2474);2475}2476})2477#else2478CVTS_FUNC1(f32, 8,2479float32x4_t vscale = vdupq_n_f32((f32)alpha);2480float32x4_t vshift = vdupq_n_f32((f32)beta);,2481{2482for (size_t i = 0; i < w; i += 8)2483{2484internal::prefetch(_src + i);2485float32x4_t vline1_f32 = vld1q_f32(_src + i + 0);2486float32x4_t vline2_f32 = vld1q_f32(_src + i + 4);2487vline1_f32 = vmulq_f32(vline1_f32, vscale);2488vline2_f32 = vmulq_f32(vline2_f32, vscale);2489vline1_f32 = vaddq_f32(vline1_f32, vshift);2490vline2_f32 = vaddq_f32(vline2_f32, vshift);2491vst1q_f32(_dst + i + 0, vline1_f32);2492vst1q_f32(_dst + i + 4, vline2_f32);2493}2494})2495#endif24962497} // namespace CAROTENE_NS249824992500