Path: blob/master/thirdparty/cvtt/ConvectionKernels_ParallelMath.h
9898 views
/*
Convection Texture Tools
Copyright (c) 2018-2019 Eric Lasota

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject
to the following conditions:

The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

*/
#pragma once
#ifndef __CVTT_PARALLELMATH_H__
#define __CVTT_PARALLELMATH_H__

#include "ConvectionKernels.h"
#include "ConvectionKernels_Config.h"

#ifdef CVTT_USE_SSE2
#include <emmintrin.h>
#endif

#include <float.h>
#include <assert.h>
#include <string.h>
#include <algorithm>
#include <math.h>

#define UNREFERENCED_PARAMETER(n) ((void)n)

// Parallel math implementation
//
// After preprocessor defs are handled, what this should do is expose the following types:
// SInt16 - Signed 16-bit integer
// UInt16 - Unsigned 16-bit integer
// UInt15 - Unsigned 15-bit integer
// SInt32 - Signed 32-bit integer
// UInt31 - Unsigned 31-bit integer
// AInt16 - 16-bit integer of unknown signedness (only used for storage)
// Int16CompFlag - Comparison flags from comparing 16-bit integers
// Int32CompFlag - Comparison flags from comparing 32-bit integers
// 
FloatCompFlag - Comparison flags from comparing 32-bit floats55//56// The reason for these distinctions are that depending on the instruction set, signed or unsigned versions of certain ops57// (particularly max, min, compares, and right shift) may not be available. In cases where ops are not available, it's58// necessary to do high bit manipulations to accomplish the operation with 16-bit numbers. The 15-bit and 31-bit uint types59// can elide the bit flips if unsigned versions are not available.6061namespace cvtt62{63#ifdef CVTT_USE_SSE264// SSE2 version65struct ParallelMath66{67typedef uint16_t ScalarUInt16;68typedef int16_t ScalarSInt16;6970template<unsigned int TRoundingMode>71struct RoundForScope72{73unsigned int m_oldCSR;7475RoundForScope()76{77m_oldCSR = _mm_getcsr();78_mm_setcsr((m_oldCSR & ~_MM_ROUND_MASK) | (TRoundingMode));79}8081~RoundForScope()82{83_mm_setcsr(m_oldCSR);84}85};8687struct RoundTowardZeroForScope : RoundForScope<_MM_ROUND_TOWARD_ZERO>88{89};9091struct RoundTowardNearestForScope : RoundForScope<_MM_ROUND_NEAREST>92{93};9495struct RoundUpForScope : RoundForScope<_MM_ROUND_UP>96{97};9899struct RoundDownForScope : RoundForScope<_MM_ROUND_DOWN>100{101};102103static const int ParallelSize = 8;104105enum Int16Subtype106{107IntSubtype_Signed,108IntSubtype_UnsignedFull,109IntSubtype_UnsignedTruncated,110IntSubtype_Abstract,111};112113template<int TSubtype>114struct VInt16115{116__m128i m_value;117118inline VInt16 operator+(int16_t other) const119{120VInt16 result;121result.m_value = _mm_add_epi16(m_value, _mm_set1_epi16(static_cast<int16_t>(other)));122return result;123}124125inline VInt16 operator+(const VInt16 &other) const126{127VInt16 result;128result.m_value = _mm_add_epi16(m_value, other.m_value);129return result;130}131132inline VInt16 operator|(const VInt16 &other) const133{134VInt16 result;135result.m_value = _mm_or_si128(m_value, other.m_value);136return result;137}138139inline VInt16 operator&(const VInt16 &other) const140{141VInt16 
result;142result.m_value = _mm_and_si128(m_value, other.m_value);143return result;144}145146inline VInt16 operator-(const VInt16 &other) const147{148VInt16 result;149result.m_value = _mm_sub_epi16(m_value, other.m_value);150return result;151}152153inline VInt16 operator<<(int bits) const154{155VInt16 result;156result.m_value = _mm_slli_epi16(m_value, bits);157return result;158}159160inline VInt16 operator^(const VInt16 &other) const161{162VInt16 result;163result.m_value = _mm_xor_si128(m_value, other.m_value);164return result;165}166};167168typedef VInt16<IntSubtype_Signed> SInt16;169typedef VInt16<IntSubtype_UnsignedFull> UInt16;170typedef VInt16<IntSubtype_UnsignedTruncated> UInt15;171typedef VInt16<IntSubtype_Abstract> AInt16;172173template<int TSubtype>174struct VInt32175{176__m128i m_values[2];177178inline VInt32 operator+(const VInt32& other) const179{180VInt32 result;181result.m_values[0] = _mm_add_epi32(m_values[0], other.m_values[0]);182result.m_values[1] = _mm_add_epi32(m_values[1], other.m_values[1]);183return result;184}185186inline VInt32 operator-(const VInt32& other) const187{188VInt32 result;189result.m_values[0] = _mm_sub_epi32(m_values[0], other.m_values[0]);190result.m_values[1] = _mm_sub_epi32(m_values[1], other.m_values[1]);191return result;192}193194inline VInt32 operator<<(const int other) const195{196VInt32 result;197result.m_values[0] = _mm_slli_epi32(m_values[0], other);198result.m_values[1] = _mm_slli_epi32(m_values[1], other);199return result;200}201202inline VInt32 operator|(const VInt32& other) const203{204VInt32 result;205result.m_values[0] = _mm_or_si128(m_values[0], other.m_values[0]);206result.m_values[1] = _mm_or_si128(m_values[1], other.m_values[1]);207return result;208}209};210211typedef VInt32<IntSubtype_Signed> SInt32;212typedef VInt32<IntSubtype_UnsignedTruncated> UInt31;213typedef VInt32<IntSubtype_UnsignedFull> UInt32;214typedef VInt32<IntSubtype_Abstract> AInt32;215216template<class TTargetType>217struct 
LosslessCast218{219#ifdef CVTT_PERMIT_ALIASING220template<int TSrcSubtype>221static const TTargetType& Cast(const VInt32<TSrcSubtype> &src)222{223return reinterpret_cast<VInt32<TSubtype>&>(src);224}225226template<int TSrcSubtype>227static const TTargetType& Cast(const VInt16<TSrcSubtype> &src)228{229return reinterpret_cast<VInt16<TSubtype>&>(src);230}231#else232template<int TSrcSubtype>233static TTargetType Cast(const VInt32<TSrcSubtype> &src)234{235TTargetType result;236result.m_values[0] = src.m_values[0];237result.m_values[1] = src.m_values[1];238return result;239}240241template<int TSrcSubtype>242static TTargetType Cast(const VInt16<TSrcSubtype> &src)243{244TTargetType result;245result.m_value = src.m_value;246return result;247}248#endif249};250251struct Int64252{253__m128i m_values[4];254};255256struct Float257{258__m128 m_values[2];259260inline Float operator+(const Float &other) const261{262Float result;263result.m_values[0] = _mm_add_ps(m_values[0], other.m_values[0]);264result.m_values[1] = _mm_add_ps(m_values[1], other.m_values[1]);265return result;266}267268inline Float operator+(float other) const269{270Float result;271result.m_values[0] = _mm_add_ps(m_values[0], _mm_set1_ps(other));272result.m_values[1] = _mm_add_ps(m_values[1], _mm_set1_ps(other));273return result;274}275276inline Float operator-(const Float& other) const277{278Float result;279result.m_values[0] = _mm_sub_ps(m_values[0], other.m_values[0]);280result.m_values[1] = _mm_sub_ps(m_values[1], other.m_values[1]);281return result;282}283284inline Float operator-() const285{286Float result;287result.m_values[0] = _mm_sub_ps(_mm_setzero_ps(), m_values[0]);288result.m_values[1] = _mm_sub_ps(_mm_setzero_ps(), m_values[1]);289return result;290}291292inline Float operator*(const Float& other) const293{294Float result;295result.m_values[0] = _mm_mul_ps(m_values[0], other.m_values[0]);296result.m_values[1] = _mm_mul_ps(m_values[1], other.m_values[1]);297return result;298}299300inline Float 
operator*(float other) const301{302Float result;303result.m_values[0] = _mm_mul_ps(m_values[0], _mm_set1_ps(other));304result.m_values[1] = _mm_mul_ps(m_values[1], _mm_set1_ps(other));305return result;306}307308inline Float operator/(const Float &other) const309{310Float result;311result.m_values[0] = _mm_div_ps(m_values[0], other.m_values[0]);312result.m_values[1] = _mm_div_ps(m_values[1], other.m_values[1]);313return result;314}315316inline Float operator/(float other) const317{318Float result;319result.m_values[0] = _mm_div_ps(m_values[0], _mm_set1_ps(other));320result.m_values[1] = _mm_div_ps(m_values[1], _mm_set1_ps(other));321return result;322}323};324325struct Int16CompFlag326{327__m128i m_value;328329inline Int16CompFlag operator&(const Int16CompFlag &other) const330{331Int16CompFlag result;332result.m_value = _mm_and_si128(m_value, other.m_value);333return result;334}335336inline Int16CompFlag operator|(const Int16CompFlag &other) const337{338Int16CompFlag result;339result.m_value = _mm_or_si128(m_value, other.m_value);340return result;341}342};343344struct Int32CompFlag345{346__m128i m_values[2];347348inline Int32CompFlag operator&(const Int32CompFlag &other) const349{350Int32CompFlag result;351result.m_values[0] = _mm_and_si128(m_values[0], other.m_values[0]);352result.m_values[1] = _mm_and_si128(m_values[1], other.m_values[1]);353return result;354}355356inline Int32CompFlag operator|(const Int32CompFlag &other) const357{358Int32CompFlag result;359result.m_values[0] = _mm_or_si128(m_values[0], other.m_values[0]);360result.m_values[1] = _mm_or_si128(m_values[1], other.m_values[1]);361return result;362}363};364365struct FloatCompFlag366{367__m128 m_values[2];368369inline FloatCompFlag operator&(const FloatCompFlag &other) const370{371FloatCompFlag result;372result.m_values[0] = _mm_and_ps(m_values[0], other.m_values[0]);373result.m_values[1] = _mm_and_ps(m_values[1], other.m_values[1]);374return result;375}376377inline FloatCompFlag operator|(const 
FloatCompFlag &other) const378{379FloatCompFlag result;380result.m_values[0] = _mm_or_ps(m_values[0], other.m_values[0]);381result.m_values[1] = _mm_or_ps(m_values[1], other.m_values[1]);382return result;383}384};385386template<int TSubtype>387static VInt16<TSubtype> AbstractAdd(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)388{389VInt16<TSubtype> result;390result.m_value = _mm_add_epi16(a.m_value, b.m_value);391return result;392}393394template<int TSubtype>395static VInt16<TSubtype> AbstractSubtract(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)396{397VInt16<TSubtype> result;398result.m_value = _mm_sub_epi16(a.m_value, b.m_value);399return result;400}401402static Float Select(const FloatCompFlag &flag, const Float &a, const Float &b)403{404Float result;405for (int i = 0; i < 2; i++)406result.m_values[i] = _mm_or_ps(_mm_and_ps(flag.m_values[i], a.m_values[i]), _mm_andnot_ps(flag.m_values[i], b.m_values[i]));407return result;408}409410template<int TSubtype>411static VInt16<TSubtype> Select(const Int16CompFlag &flag, const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)412{413VInt16<TSubtype> result;414result.m_value = _mm_or_si128(_mm_and_si128(flag.m_value, a.m_value), _mm_andnot_si128(flag.m_value, b.m_value));415return result;416}417418template<int TSubtype>419static VInt16<TSubtype> SelectOrZero(const Int16CompFlag &flag, const VInt16<TSubtype> &a)420{421VInt16<TSubtype> result;422result.m_value = _mm_and_si128(flag.m_value, a.m_value);423return result;424}425426template<int TSubtype>427static void ConditionalSet(VInt16<TSubtype> &dest, const Int16CompFlag &flag, const VInt16<TSubtype> &src)428{429dest.m_value = _mm_or_si128(_mm_andnot_si128(flag.m_value, dest.m_value), _mm_and_si128(flag.m_value, src.m_value));430}431432template<int TSubtype>433static void ConditionalSet(VInt32<TSubtype> &dest, const Int16CompFlag &flag, const VInt32<TSubtype> &src)434{435__m128i lowFlags = _mm_unpacklo_epi16(flag.m_value, flag.m_value);436__m128i highFlags = 
_mm_unpackhi_epi16(flag.m_value, flag.m_value);437dest.m_values[0] = _mm_or_si128(_mm_andnot_si128(lowFlags, dest.m_values[0]), _mm_and_si128(lowFlags, src.m_values[0]));438dest.m_values[1] = _mm_or_si128(_mm_andnot_si128(highFlags, dest.m_values[1]), _mm_and_si128(highFlags, src.m_values[1]));439}440441static void ConditionalSet(ParallelMath::Int16CompFlag &dest, const Int16CompFlag &flag, const ParallelMath::Int16CompFlag &src)442{443dest.m_value = _mm_or_si128(_mm_andnot_si128(flag.m_value, dest.m_value), _mm_and_si128(flag.m_value, src.m_value));444}445446static SInt16 ConditionalNegate(const Int16CompFlag &flag, const SInt16 &v)447{448SInt16 result;449result.m_value = _mm_add_epi16(_mm_xor_si128(flag.m_value, v.m_value), _mm_srli_epi16(flag.m_value, 15));450return result;451}452453template<int TSubtype>454static void NotConditionalSet(VInt16<TSubtype> &dest, const Int16CompFlag &flag, const VInt16<TSubtype> &src)455{456dest.m_value = _mm_or_si128(_mm_and_si128(flag.m_value, dest.m_value), _mm_andnot_si128(flag.m_value, src.m_value));457}458459static void ConditionalSet(Float &dest, const FloatCompFlag &flag, const Float &src)460{461for (int i = 0; i < 2; i++)462dest.m_values[i] = _mm_or_ps(_mm_andnot_ps(flag.m_values[i], dest.m_values[i]), _mm_and_ps(flag.m_values[i], src.m_values[i]));463}464465static void NotConditionalSet(Float &dest, const FloatCompFlag &flag, const Float &src)466{467for (int i = 0; i < 2; i++)468dest.m_values[i] = _mm_or_ps(_mm_and_ps(flag.m_values[i], dest.m_values[i]), _mm_andnot_ps(flag.m_values[i], src.m_values[i]));469}470471static void MakeSafeDenominator(Float& v)472{473ConditionalSet(v, Equal(v, MakeFloatZero()), MakeFloat(1.0f));474}475476static SInt16 TruncateToPrecisionSigned(const SInt16 &v, int precision)477{478int lostBits = 16 - precision;479if (lostBits == 0)480return v;481482SInt16 result;483result.m_value = _mm_srai_epi16(_mm_slli_epi16(v.m_value, lostBits), lostBits);484return result;485}486487static UInt16 
TruncateToPrecisionUnsigned(const UInt16 &v, int precision)488{489int lostBits = 16 - precision;490if (lostBits == 0)491return v;492493UInt16 result;494result.m_value = _mm_srli_epi16(_mm_slli_epi16(v.m_value, lostBits), lostBits);495return result;496}497498static UInt16 Min(const UInt16 &a, const UInt16 &b)499{500__m128i bitFlip = _mm_set1_epi16(-32768);501502UInt16 result;503result.m_value = _mm_xor_si128(_mm_min_epi16(_mm_xor_si128(a.m_value, bitFlip), _mm_xor_si128(b.m_value, bitFlip)), bitFlip);504return result;505}506507static SInt16 Min(const SInt16 &a, const SInt16 &b)508{509SInt16 result;510result.m_value = _mm_min_epi16(a.m_value, b.m_value);511return result;512}513514static UInt15 Min(const UInt15 &a, const UInt15 &b)515{516UInt15 result;517result.m_value = _mm_min_epi16(a.m_value, b.m_value);518return result;519}520521static Float Min(const Float &a, const Float &b)522{523Float result;524for (int i = 0; i < 2; i++)525result.m_values[i] = _mm_min_ps(a.m_values[i], b.m_values[i]);526return result;527}528529static UInt16 Max(const UInt16 &a, const UInt16 &b)530{531__m128i bitFlip = _mm_set1_epi16(-32768);532533UInt16 result;534result.m_value = _mm_xor_si128(_mm_max_epi16(_mm_xor_si128(a.m_value, bitFlip), _mm_xor_si128(b.m_value, bitFlip)), bitFlip);535return result;536}537538static SInt16 Max(const SInt16 &a, const SInt16 &b)539{540SInt16 result;541result.m_value = _mm_max_epi16(a.m_value, b.m_value);542return result;543}544545static UInt15 Max(const UInt15 &a, const UInt15 &b)546{547UInt15 result;548result.m_value = _mm_max_epi16(a.m_value, b.m_value);549return result;550}551552static Float Max(const Float &a, const Float &b)553{554Float result;555for (int i = 0; i < 2; i++)556result.m_values[i] = _mm_max_ps(a.m_values[i], b.m_values[i]);557return result;558}559560static Float Clamp(const Float &v, float min, float max)561{562Float result;563for (int i = 0; i < 2; i++)564result.m_values[i] = _mm_max_ps(_mm_min_ps(v.m_values[i], _mm_set1_ps(max)), 
_mm_set1_ps(min));565return result;566}567568static Float Reciprocal(const Float &v)569{570Float result;571for (int i = 0; i < 2; i++)572result.m_values[i] = _mm_rcp_ps(v.m_values[i]);573return result;574}575576static void ConvertLDRInputs(const PixelBlockU8* inputBlocks, int pxOffset, int channel, UInt15 &chOut)577{578int16_t values[8];579for (int i = 0; i < 8; i++)580values[i] = inputBlocks[i].m_pixels[pxOffset][channel];581582chOut.m_value = _mm_set_epi16(values[7], values[6], values[5], values[4], values[3], values[2], values[1], values[0]);583}584585static void ConvertHDRInputs(const PixelBlockF16* inputBlocks, int pxOffset, int channel, SInt16 &chOut)586{587int16_t values[8];588for (int i = 0; i < 8; i++)589values[i] = inputBlocks[i].m_pixels[pxOffset][channel];590591chOut.m_value = _mm_set_epi16(values[7], values[6], values[5], values[4], values[3], values[2], values[1], values[0]);592}593594static Float MakeFloat(float v)595{596Float f;597f.m_values[0] = f.m_values[1] = _mm_set1_ps(v);598return f;599}600601static Float MakeFloatZero()602{603Float f;604f.m_values[0] = f.m_values[1] = _mm_setzero_ps();605return f;606}607608static UInt16 MakeUInt16(uint16_t v)609{610UInt16 result;611result.m_value = _mm_set1_epi16(static_cast<short>(v));612return result;613}614615static SInt16 MakeSInt16(int16_t v)616{617SInt16 result;618result.m_value = _mm_set1_epi16(static_cast<short>(v));619return result;620}621622static AInt16 MakeAInt16(int16_t v)623{624AInt16 result;625result.m_value = _mm_set1_epi16(static_cast<short>(v));626return result;627}628629static UInt15 MakeUInt15(uint16_t v)630{631UInt15 result;632result.m_value = _mm_set1_epi16(static_cast<short>(v));633return result;634}635636static SInt32 MakeSInt32(int32_t v)637{638SInt32 result;639result.m_values[0] = _mm_set1_epi32(v);640result.m_values[1] = _mm_set1_epi32(v);641return result;642}643644static UInt31 MakeUInt31(uint32_t v)645{646UInt31 result;647result.m_values[0] = 
_mm_set1_epi32(v);648result.m_values[1] = _mm_set1_epi32(v);649return result;650}651652static uint16_t Extract(const UInt16 &v, int offset)653{654return reinterpret_cast<const uint16_t*>(&v.m_value)[offset];655}656657static int16_t Extract(const SInt16 &v, int offset)658{659return reinterpret_cast<const int16_t*>(&v.m_value)[offset];660}661662static uint16_t Extract(const UInt15 &v, int offset)663{664return reinterpret_cast<const uint16_t*>(&v.m_value)[offset];665}666667static int16_t Extract(const AInt16 &v, int offset)668{669return reinterpret_cast<const int16_t*>(&v.m_value)[offset];670}671672static int32_t Extract(const SInt32 &v, int offset)673{674return reinterpret_cast<const int32_t*>(&v.m_values[offset >> 2])[offset & 3];675}676677static float Extract(const Float &v, int offset)678{679return reinterpret_cast<const float*>(&v.m_values[offset >> 2])[offset & 3];680}681682static bool Extract(const ParallelMath::Int16CompFlag &v, int offset)683{684return reinterpret_cast<const int16_t*>(&v.m_value)[offset] != 0;685}686687static void PutUInt16(UInt16 &dest, int offset, uint16_t v)688{689reinterpret_cast<uint16_t*>(&dest)[offset] = v;690}691692static void PutUInt15(UInt15 &dest, int offset, uint16_t v)693{694reinterpret_cast<uint16_t*>(&dest)[offset] = v;695}696697static void PutSInt16(SInt16 &dest, int offset, int16_t v)698{699reinterpret_cast<int16_t*>(&dest)[offset] = v;700}701702static float ExtractFloat(const Float& v, int offset)703{704return reinterpret_cast<const float*>(&v)[offset];705}706707static void PutFloat(Float &dest, int offset, float v)708{709reinterpret_cast<float*>(&dest)[offset] = v;710}711712static void PutBoolInt16(Int16CompFlag &dest, int offset, bool v)713{714reinterpret_cast<int16_t*>(&dest)[offset] = v ? 
-1 : 0;715}716717static Int32CompFlag Less(const UInt31 &a, const UInt31 &b)718{719Int32CompFlag result;720result.m_values[0] = _mm_cmplt_epi32(a.m_values[0], b.m_values[0]);721result.m_values[1] = _mm_cmplt_epi32(a.m_values[1], b.m_values[1]);722return result;723}724725static Int16CompFlag Less(const SInt16 &a, const SInt16 &b)726{727Int16CompFlag result;728result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value);729return result;730}731732static Int16CompFlag Less(const UInt15 &a, const UInt15 &b)733{734Int16CompFlag result;735result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value);736return result;737}738739static Int16CompFlag LessOrEqual(const UInt15 &a, const UInt15 &b)740{741Int16CompFlag result;742result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value);743return result;744}745746static FloatCompFlag Less(const Float &a, const Float &b)747{748FloatCompFlag result;749for (int i = 0; i < 2; i++)750result.m_values[i] = _mm_cmplt_ps(a.m_values[i], b.m_values[i]);751return result;752}753754static FloatCompFlag LessOrEqual(const Float &a, const Float &b)755{756FloatCompFlag result;757for (int i = 0; i < 2; i++)758result.m_values[i] = _mm_cmple_ps(a.m_values[i], b.m_values[i]);759return result;760}761762template<int TSubtype>763static Int16CompFlag Equal(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)764{765Int16CompFlag result;766result.m_value = _mm_cmpeq_epi16(a.m_value, b.m_value);767return result;768}769770static FloatCompFlag Equal(const Float &a, const Float &b)771{772FloatCompFlag result;773for (int i = 0; i < 2; i++)774result.m_values[i] = _mm_cmpeq_ps(a.m_values[i], b.m_values[i]);775return result;776}777778static Int16CompFlag Equal(const Int16CompFlag &a, const Int16CompFlag &b)779{780Int16CompFlag notResult;781notResult.m_value = _mm_xor_si128(a.m_value, b.m_value);782return Not(notResult);783}784785static Float ToFloat(const UInt16 &v)786{787Float result;788result.m_values[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v.m_value, 
_mm_setzero_si128()));789result.m_values[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v.m_value, _mm_setzero_si128()));790return result;791}792793static UInt31 ToUInt31(const UInt16 &v)794{795UInt31 result;796result.m_values[0] = _mm_unpacklo_epi16(v.m_value, _mm_setzero_si128());797result.m_values[1] = _mm_unpackhi_epi16(v.m_value, _mm_setzero_si128());798return result;799}800801static SInt32 ToInt32(const UInt16 &v)802{803SInt32 result;804result.m_values[0] = _mm_unpacklo_epi16(v.m_value, _mm_setzero_si128());805result.m_values[1] = _mm_unpackhi_epi16(v.m_value, _mm_setzero_si128());806return result;807}808809static SInt32 ToInt32(const UInt15 &v)810{811SInt32 result;812result.m_values[0] = _mm_unpacklo_epi16(v.m_value, _mm_setzero_si128());813result.m_values[1] = _mm_unpackhi_epi16(v.m_value, _mm_setzero_si128());814return result;815}816817static SInt32 ToInt32(const SInt16 &v)818{819SInt32 result;820result.m_values[0] = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), v.m_value), 16);821result.m_values[1] = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), v.m_value), 16);822return result;823}824825static Float ToFloat(const SInt16 &v)826{827Float result;828result.m_values[0] = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), v.m_value), 16));829result.m_values[1] = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), v.m_value), 16));830return result;831}832833static Float ToFloat(const UInt15 &v)834{835Float result;836result.m_values[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v.m_value, _mm_setzero_si128()));837result.m_values[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v.m_value, _mm_setzero_si128()));838return result;839}840841static Float ToFloat(const UInt31 &v)842{843Float result;844result.m_values[0] = _mm_cvtepi32_ps(v.m_values[0]);845result.m_values[1] = _mm_cvtepi32_ps(v.m_values[1]);846return result;847}848849static Int16CompFlag FloatFlagToInt16(const FloatCompFlag &v)850{851__m128i lo = 
_mm_castps_si128(v.m_values[0]);852__m128i hi = _mm_castps_si128(v.m_values[1]);853854Int16CompFlag result;855result.m_value = _mm_packs_epi32(lo, hi);856return result;857}858859static FloatCompFlag Int16FlagToFloat(const Int16CompFlag &v)860{861__m128i lo = _mm_unpacklo_epi16(v.m_value, v.m_value);862__m128i hi = _mm_unpackhi_epi16(v.m_value, v.m_value);863864FloatCompFlag result;865result.m_values[0] = _mm_castsi128_ps(lo);866result.m_values[1] = _mm_castsi128_ps(hi);867return result;868}869870static Int16CompFlag Int32FlagToInt16(const Int32CompFlag &v)871{872__m128i lo = v.m_values[0];873__m128i hi = v.m_values[1];874875Int16CompFlag result;876result.m_value = _mm_packs_epi32(lo, hi);877return result;878}879880static Int16CompFlag MakeBoolInt16(bool b)881{882Int16CompFlag result;883if (b)884result.m_value = _mm_set1_epi16(-1);885else886result.m_value = _mm_setzero_si128();887return result;888}889890static FloatCompFlag MakeBoolFloat(bool b)891{892FloatCompFlag result;893if (b)894result.m_values[0] = result.m_values[1] = _mm_castsi128_ps(_mm_set1_epi32(-1));895else896result.m_values[0] = result.m_values[1] = _mm_setzero_ps();897return result;898}899900static Int16CompFlag AndNot(const Int16CompFlag &a, const Int16CompFlag &b)901{902Int16CompFlag result;903result.m_value = _mm_andnot_si128(b.m_value, a.m_value);904return result;905}906907static Int16CompFlag Not(const Int16CompFlag &b)908{909Int16CompFlag result;910result.m_value = _mm_xor_si128(b.m_value, _mm_set1_epi32(-1));911return result;912}913914static Int32CompFlag Not(const Int32CompFlag &b)915{916Int32CompFlag result;917result.m_values[0] = _mm_xor_si128(b.m_values[0], _mm_set1_epi32(-1));918result.m_values[1] = _mm_xor_si128(b.m_values[1], _mm_set1_epi32(-1));919return result;920}921922static UInt16 RoundAndConvertToU16(const Float &v, const void* /*roundingMode*/)923{924__m128i lo = _mm_cvtps_epi32(_mm_add_ps(v.m_values[0], _mm_set1_ps(-32768)));925__m128i hi = 
_mm_cvtps_epi32(_mm_add_ps(v.m_values[1], _mm_set1_ps(-32768)));926927__m128i packed = _mm_packs_epi32(lo, hi);928929UInt16 result;930result.m_value = _mm_xor_si128(packed, _mm_set1_epi16(-32768));931return result;932}933934static UInt15 RoundAndConvertToU15(const Float &v, const void* /*roundingMode*/)935{936__m128i lo = _mm_cvtps_epi32(v.m_values[0]);937__m128i hi = _mm_cvtps_epi32(v.m_values[1]);938939__m128i packed = _mm_packs_epi32(lo, hi);940941UInt15 result;942result.m_value = _mm_packs_epi32(lo, hi);943return result;944}945946static SInt16 RoundAndConvertToS16(const Float &v, const void* /*roundingMode*/)947{948__m128i lo = _mm_cvtps_epi32(v.m_values[0]);949__m128i hi = _mm_cvtps_epi32(v.m_values[1]);950951__m128i packed = _mm_packs_epi32(lo, hi);952953SInt16 result;954result.m_value = _mm_packs_epi32(lo, hi);955return result;956}957958static Float Sqrt(const Float &f)959{960Float result;961for (int i = 0; i < 2; i++)962result.m_values[i] = _mm_sqrt_ps(f.m_values[i]);963return result;964}965966static UInt16 Abs(const SInt16 &a)967{968__m128i signBitsXor = _mm_srai_epi16(a.m_value, 15);969__m128i signBitsAdd = _mm_srli_epi16(a.m_value, 15);970971UInt16 result;972result.m_value = _mm_add_epi16(_mm_xor_si128(a.m_value, signBitsXor), signBitsAdd);973return result;974}975976static Float Abs(const Float& a)977{978__m128 invMask = _mm_set1_ps(-0.0f);979980Float result;981result.m_values[0] = _mm_andnot_ps(invMask, a.m_values[0]);982result.m_values[1] = _mm_andnot_ps(invMask, a.m_values[1]);983return result;984}985986static UInt16 SqDiffUInt8(const UInt15 &a, const UInt15 &b)987{988__m128i diff = _mm_sub_epi16(a.m_value, b.m_value);989990UInt16 result;991result.m_value = _mm_mullo_epi16(diff, diff);992return result;993}994995static Float SqDiffSInt16(const SInt16 &a, const SInt16 &b)996{997__m128i diffU = _mm_sub_epi16(_mm_max_epi16(a.m_value, b.m_value), _mm_min_epi16(a.m_value, b.m_value));998999__m128i mulHi = _mm_mulhi_epu16(diffU, diffU);1000__m128i mulLo = 
_mm_mullo_epi16(diffU, diffU);1001__m128i sqDiffHi = _mm_unpackhi_epi16(mulLo, mulHi);1002__m128i sqDiffLo = _mm_unpacklo_epi16(mulLo, mulHi);10031004Float result;1005result.m_values[0] = _mm_cvtepi32_ps(sqDiffLo);1006result.m_values[1] = _mm_cvtepi32_ps(sqDiffHi);10071008return result;1009}10101011static Float TwosCLHalfToFloat(const SInt16 &v)1012{1013__m128i absV = _mm_add_epi16(_mm_xor_si128(v.m_value, _mm_srai_epi16(v.m_value, 15)), _mm_srli_epi16(v.m_value, 15));10141015__m128i signBits = _mm_and_si128(v.m_value, _mm_set1_epi16(-32768));1016__m128i mantissa = _mm_and_si128(v.m_value, _mm_set1_epi16(0x03ff));1017__m128i exponent = _mm_and_si128(v.m_value, _mm_set1_epi16(0x7c00));10181019__m128i isDenormal = _mm_cmpeq_epi16(exponent, _mm_setzero_si128());10201021// Convert exponent to high-bits1022exponent = _mm_add_epi16(_mm_srli_epi16(exponent, 3), _mm_set1_epi16(14336));10231024__m128i denormalCorrectionHigh = _mm_and_si128(isDenormal, _mm_or_si128(signBits, _mm_set1_epi16(14336)));10251026__m128i highBits = _mm_or_si128(signBits, _mm_or_si128(exponent, _mm_srli_epi16(mantissa, 3)));1027__m128i lowBits = _mm_slli_epi16(mantissa, 13);10281029__m128i flow = _mm_unpacklo_epi16(lowBits, highBits);1030__m128i fhigh = _mm_unpackhi_epi16(lowBits, highBits);10311032__m128i correctionLow = _mm_unpacklo_epi16(_mm_setzero_si128(), denormalCorrectionHigh);1033__m128i correctionHigh = _mm_unpackhi_epi16(_mm_setzero_si128(), denormalCorrectionHigh);10341035Float result;1036result.m_values[0] = _mm_sub_ps(_mm_castsi128_ps(flow), _mm_castsi128_ps(correctionLow));1037result.m_values[1] = _mm_sub_ps(_mm_castsi128_ps(fhigh), _mm_castsi128_ps(correctionHigh));10381039return result;1040}10411042static Float SqDiff2CLFloat(const SInt16 &a, const Float &b)1043{1044Float fa = TwosCLHalfToFloat(a);10451046Float diff = fa - b;1047return diff * diff;1048}10491050static Float SqDiff2CL(const SInt16 &a, const SInt16 &b)1051{1052Float fa = TwosCLHalfToFloat(a);1053Float fb = 
TwosCLHalfToFloat(b);10541055Float diff = fa - fb;1056return diff * diff;1057}10581059static Float SqDiff2CLFloat(const SInt16 &a, float aWeight, const Float &b)1060{1061Float fa = TwosCLHalfToFloat(a) * aWeight;10621063Float diff = fa - b;1064return diff * diff;1065}10661067static UInt16 RightShift(const UInt16 &v, int bits)1068{1069UInt16 result;1070result.m_value = _mm_srli_epi16(v.m_value, bits);1071return result;1072}10731074static UInt31 RightShift(const UInt31 &v, int bits)1075{1076UInt31 result;1077result.m_values[0] = _mm_srli_epi32(v.m_values[0], bits);1078result.m_values[1] = _mm_srli_epi32(v.m_values[1], bits);1079return result;1080}10811082static SInt16 RightShift(const SInt16 &v, int bits)1083{1084SInt16 result;1085result.m_value = _mm_srai_epi16(v.m_value, bits);1086return result;1087}10881089static UInt15 RightShift(const UInt15 &v, int bits)1090{1091UInt15 result;1092result.m_value = _mm_srli_epi16(v.m_value, bits);1093return result;1094}10951096static SInt32 RightShift(const SInt32 &v, int bits)1097{1098SInt32 result;1099result.m_values[0] = _mm_srai_epi32(v.m_values[0], bits);1100result.m_values[1] = _mm_srai_epi32(v.m_values[1], bits);1101return result;1102}11031104static SInt16 ToSInt16(const SInt32 &v)1105{1106SInt16 result;1107result.m_value = _mm_packs_epi32(v.m_values[0], v.m_values[1]);1108return result;1109}11101111static SInt16 ToSInt16(const UInt16 &v)1112{1113SInt16 result;1114result.m_value = v.m_value;1115return result;1116}11171118static SInt16 ToSInt16(const UInt15 &v)1119{1120SInt16 result;1121result.m_value = v.m_value;1122return result;1123}11241125static UInt16 ToUInt16(const UInt32 &v)1126{1127__m128i low = _mm_srai_epi32(_mm_slli_epi32(v.m_values[0], 16), 16);1128__m128i high = _mm_srai_epi32(_mm_slli_epi32(v.m_values[1], 16), 16);11291130UInt16 result;1131result.m_value = _mm_packs_epi32(low, high);1132return result;1133}11341135static UInt16 ToUInt16(const UInt31 &v)1136{1137__m128i low = 
_mm_srai_epi32(_mm_slli_epi32(v.m_values[0], 16), 16);1138__m128i high = _mm_srai_epi32(_mm_slli_epi32(v.m_values[1], 16), 16);11391140UInt16 result;1141result.m_value = _mm_packs_epi32(low, high);1142return result;1143}11441145static UInt15 ToUInt15(const UInt31 &v)1146{1147UInt15 result;1148result.m_value = _mm_packs_epi32(v.m_values[0], v.m_values[1]);1149return result;1150}11511152static UInt15 ToUInt15(const SInt16 &v)1153{1154UInt15 result;1155result.m_value = v.m_value;1156return result;1157}11581159static UInt15 ToUInt15(const UInt16 &v)1160{1161UInt15 result;1162result.m_value = v.m_value;1163return result;1164}11651166static SInt32 XMultiply(const SInt16 &a, const SInt16 &b)1167{1168__m128i high = _mm_mulhi_epi16(a.m_value, b.m_value);1169__m128i low = _mm_mullo_epi16(a.m_value, b.m_value);11701171SInt32 result;1172result.m_values[0] = _mm_unpacklo_epi16(low, high);1173result.m_values[1] = _mm_unpackhi_epi16(low, high);1174return result;1175}11761177static SInt32 XMultiply(const SInt16 &a, const UInt15 &b)1178{1179__m128i high = _mm_mulhi_epi16(a.m_value, b.m_value);1180__m128i low = _mm_mullo_epi16(a.m_value, b.m_value);11811182SInt32 result;1183result.m_values[0] = _mm_unpacklo_epi16(low, high);1184result.m_values[1] = _mm_unpackhi_epi16(low, high);1185return result;1186}11871188static SInt32 XMultiply(const UInt15 &a, const SInt16 &b)1189{1190return XMultiply(b, a);1191}11921193static UInt32 XMultiply(const UInt16 &a, const UInt16 &b)1194{1195__m128i high = _mm_mulhi_epu16(a.m_value, b.m_value);1196__m128i low = _mm_mullo_epi16(a.m_value, b.m_value);11971198UInt32 result;1199result.m_values[0] = _mm_unpacklo_epi16(low, high);1200result.m_values[1] = _mm_unpackhi_epi16(low, high);1201return result;1202}12031204static UInt16 CompactMultiply(const UInt16 &a, const UInt15 &b)1205{1206UInt16 result;1207result.m_value = _mm_mullo_epi16(a.m_value, b.m_value);1208return result;1209}12101211static UInt16 CompactMultiply(const UInt15 &a, const UInt15 
&b)1212{1213UInt16 result;1214result.m_value = _mm_mullo_epi16(a.m_value, b.m_value);1215return result;1216}12171218static SInt16 CompactMultiply(const SInt16 &a, const UInt15 &b)1219{1220SInt16 result;1221result.m_value = _mm_mullo_epi16(a.m_value, b.m_value);1222return result;1223}12241225static SInt16 CompactMultiply(const SInt16 &a, const SInt16 &b)1226{1227SInt16 result;1228result.m_value = _mm_mullo_epi16(a.m_value, b.m_value);1229return result;1230}12311232static UInt31 XMultiply(const UInt15 &a, const UInt15 &b)1233{1234__m128i high = _mm_mulhi_epu16(a.m_value, b.m_value);1235__m128i low = _mm_mullo_epi16(a.m_value, b.m_value);12361237UInt31 result;1238result.m_values[0] = _mm_unpacklo_epi16(low, high);1239result.m_values[1] = _mm_unpackhi_epi16(low, high);1240return result;1241}12421243static UInt31 XMultiply(const UInt16 &a, const UInt15 &b)1244{1245__m128i high = _mm_mulhi_epu16(a.m_value, b.m_value);1246__m128i low = _mm_mullo_epi16(a.m_value, b.m_value);12471248UInt31 result;1249result.m_values[0] = _mm_unpacklo_epi16(low, high);1250result.m_values[1] = _mm_unpackhi_epi16(low, high);1251return result;1252}12531254static UInt31 XMultiply(const UInt15 &a, const UInt16 &b)1255{1256return XMultiply(b, a);1257}12581259static bool AnySet(const Int16CompFlag &v)1260{1261return _mm_movemask_epi8(v.m_value) != 0;1262}12631264static bool AllSet(const Int16CompFlag &v)1265{1266return _mm_movemask_epi8(v.m_value) == 0xffff;1267}12681269static bool AnySet(const FloatCompFlag &v)1270{1271return _mm_movemask_ps(v.m_values[0]) != 0 || _mm_movemask_ps(v.m_values[1]) != 0;1272}12731274static bool AllSet(const FloatCompFlag &v)1275{1276return _mm_movemask_ps(v.m_values[0]) == 0xf && _mm_movemask_ps(v.m_values[1]) == 0xf;1277}1278};12791280#else1281// Scalar version1282struct ParallelMath1283{1284struct RoundTowardZeroForScope1285{1286};12871288struct RoundTowardNearestForScope1289{1290};12911292struct RoundUpForScope1293{1294};12951296struct 
RoundDownForScope1297{1298};12991300static const int ParallelSize = 1;13011302enum Int16Subtype1303{1304IntSubtype_Signed,1305IntSubtype_UnsignedFull,1306IntSubtype_UnsignedTruncated,1307IntSubtype_Abstract,1308};13091310typedef int32_t SInt16;1311typedef int32_t UInt15;1312typedef int32_t UInt16;1313typedef int32_t AInt16;13141315typedef int32_t SInt32;1316typedef int32_t UInt31;1317typedef int32_t UInt32;1318typedef int32_t AInt32;13191320typedef int32_t ScalarUInt16;1321typedef int32_t ScalarSInt16;13221323typedef float Float;13241325template<class TTargetType>1326struct LosslessCast1327{1328static const int32_t& Cast(const int32_t &src)1329{1330return src;1331}1332};13331334typedef bool Int16CompFlag;1335typedef bool FloatCompFlag;13361337static int32_t AbstractAdd(const int32_t &a, const int32_t &b)1338{1339return a + b;1340}13411342static int32_t AbstractSubtract(const int32_t &a, const int32_t &b)1343{1344return a - b;1345}13461347static float Select(bool flag, float a, float b)1348{1349return flag ? a : b;1350}13511352static int32_t Select(bool flag, int32_t a, int32_t b)1353{1354return flag ? a : b;1355}13561357static int32_t SelectOrZero(bool flag, int32_t a)1358{1359return flag ? a : 0;1360}13611362static void ConditionalSet(int32_t& dest, bool flag, int32_t src)1363{1364if (flag)1365dest = src;1366}13671368static void ConditionalSet(bool& dest, bool flag, bool src)1369{1370if (flag)1371dest = src;1372}13731374static int32_t ConditionalNegate(bool flag, int32_t v)1375{1376return (flag) ? 
-v : v;1377}13781379static void NotConditionalSet(int32_t& dest, bool flag, int32_t src)1380{1381if (!flag)1382dest = src;1383}13841385static void ConditionalSet(float& dest, bool flag, float src)1386{1387if (flag)1388dest = src;1389}13901391static void NotConditionalSet(float& dest, bool flag, float src)1392{1393if (!flag)1394dest = src;1395}13961397static void MakeSafeDenominator(float& v)1398{1399if (v == 0.0f)1400v = 1.0f;1401}14021403static int32_t SignedRightShift(int32_t v, int bits)1404{1405return v >> bits;1406}14071408static int32_t TruncateToPrecisionSigned(int32_t v, int precision)1409{1410v = (v << (32 - precision)) & 0xffffffff;1411return SignedRightShift(v, 32 - precision);1412}14131414static int32_t TruncateToPrecisionUnsigned(int32_t v, int precision)1415{1416return v & ((1 << precision) - 1);1417}14181419static int32_t Min(int32_t a, int32_t b)1420{1421if (a < b)1422return a;1423return b;1424}14251426static float Min(float a, float b)1427{1428if (a < b)1429return a;1430return b;1431}14321433static int32_t Max(int32_t a, int32_t b)1434{1435if (a > b)1436return a;1437return b;1438}14391440static float Max(float a, float b)1441{1442if (a > b)1443return a;1444return b;1445}14461447static float Abs(float a)1448{1449return fabsf(a);1450}14511452static int32_t Abs(int32_t a)1453{1454if (a < 0)1455return -a;1456return a;1457}14581459static float Clamp(float v, float min, float max)1460{1461if (v < min)1462return min;1463if (v > max)1464return max;1465return v;1466}14671468static float Reciprocal(float v)1469{1470return 1.0f / v;1471}14721473static void ConvertLDRInputs(const PixelBlockU8* inputBlocks, int pxOffset, int channel, int32_t& chOut)1474{1475chOut = inputBlocks[0].m_pixels[pxOffset][channel];1476}14771478static void ConvertHDRInputs(const PixelBlockF16* inputBlocks, int pxOffset, int channel, int32_t& chOut)1479{1480chOut = inputBlocks[0].m_pixels[pxOffset][channel];1481}14821483static float MakeFloat(float v)1484{1485return 
v;1486}14871488static float MakeFloatZero()1489{1490return 0.0f;1491}14921493static int32_t MakeUInt16(uint16_t v)1494{1495return v;1496}14971498static int32_t MakeSInt16(int16_t v)1499{1500return v;1501}15021503static int32_t MakeAInt16(int16_t v)1504{1505return v;1506}15071508static int32_t MakeUInt15(uint16_t v)1509{1510return v;1511}15121513static int32_t MakeSInt32(int32_t v)1514{1515return v;1516}15171518static int32_t MakeUInt31(int32_t v)1519{1520return v;1521}15221523static int32_t Extract(int32_t v, int offset)1524{1525UNREFERENCED_PARAMETER(offset);1526return v;1527}15281529static bool Extract(bool v, int offset)1530{1531UNREFERENCED_PARAMETER(offset);1532return v;1533}15341535static float Extract(float v, int offset)1536{1537UNREFERENCED_PARAMETER(offset);1538return v;1539}15401541static void PutUInt16(int32_t &dest, int offset, ParallelMath::ScalarUInt16 v)1542{1543UNREFERENCED_PARAMETER(offset);1544dest = v;1545}15461547static void PutUInt15(int32_t &dest, int offset, ParallelMath::ScalarUInt16 v)1548{1549UNREFERENCED_PARAMETER(offset);1550dest = v;1551}15521553static void PutSInt16(int32_t &dest, int offset, ParallelMath::ScalarSInt16 v)1554{1555UNREFERENCED_PARAMETER(offset);1556dest = v;1557}15581559static float ExtractFloat(float v, int offset)1560{1561UNREFERENCED_PARAMETER(offset);1562return v;1563}15641565static void PutFloat(float &dest, int offset, float v)1566{1567UNREFERENCED_PARAMETER(offset);1568dest = v;1569}15701571static void PutBoolInt16(bool &dest, int offset, bool v)1572{1573UNREFERENCED_PARAMETER(offset);1574dest = v;1575}15761577static bool Less(int32_t a, int32_t b)1578{1579return a < b;1580}15811582static bool Less(float a, float b)1583{1584return a < b;1585}15861587static bool LessOrEqual(int32_t a, int32_t b)1588{1589return a < b;1590}15911592static bool LessOrEqual(float a, float b)1593{1594return a < b;1595}15961597static bool Equal(int32_t a, int32_t b)1598{1599return a == b;1600}16011602static bool Equal(float a, float 
b)1603{1604return a == b;1605}16061607static float ToFloat(int32_t v)1608{1609return static_cast<float>(v);1610}16111612static int32_t ToUInt31(int32_t v)1613{1614return v;1615}16161617static int32_t ToInt32(int32_t v)1618{1619return v;1620}16211622static bool FloatFlagToInt16(bool v)1623{1624return v;1625}16261627static bool Int32FlagToInt16(bool v)1628{1629return v;1630}16311632static bool Int16FlagToFloat(bool v)1633{1634return v;1635}16361637static bool MakeBoolInt16(bool b)1638{1639return b;1640}16411642static bool MakeBoolFloat(bool b)1643{1644return b;1645}16461647static bool AndNot(bool a, bool b)1648{1649return a && !b;1650}16511652static bool Not(bool b)1653{1654return !b;1655}16561657static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundTowardZeroForScope *rtz)1658{1659UNREFERENCED_PARAMETER(rtz);1660return static_cast<int>(v);1661}16621663static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundUpForScope *ru)1664{1665UNREFERENCED_PARAMETER(ru);1666return static_cast<int>(ceilf(v));1667}16681669static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundDownForScope *rd)1670{1671UNREFERENCED_PARAMETER(rd);1672return static_cast<int>(floorf(v));1673}16741675static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundTowardNearestForScope *rtn)1676{1677UNREFERENCED_PARAMETER(rtn);1678return static_cast<int>(floorf(v + 0.5f));1679}16801681template<class TRoundMode>1682static int32_t RoundAndConvertToU16(float v, const TRoundMode *roundingMode)1683{1684return RoundAndConvertToInt(v, roundingMode);1685}16861687template<class TRoundMode>1688static int32_t RoundAndConvertToU15(float v, const TRoundMode *roundingMode)1689{1690return RoundAndConvertToInt(v, roundingMode);1691}16921693template<class TRoundMode>1694static int32_t RoundAndConvertToS16(float v, const TRoundMode *roundingMode)1695{1696return RoundAndConvertToInt(v, roundingMode);1697}16981699static float Sqrt(float f)1700{1701return 
sqrtf(f);1702}17031704static int32_t SqDiffUInt8(int32_t a, int32_t b)1705{1706int32_t delta = a - b;1707return delta * delta;1708}17091710static int32_t SqDiffInt16(int32_t a, int32_t b)1711{1712int32_t delta = a - b;1713return delta * delta;1714}17151716static int32_t SqDiffSInt16(int32_t a, int32_t b)1717{1718int32_t delta = a - b;1719return delta * delta;1720}17211722static float TwosCLHalfToFloat(int32_t v)1723{1724int32_t absV = (v < 0) ? -v : v;17251726int32_t signBits = (absV & -32768);1727int32_t mantissa = (absV & 0x03ff);1728int32_t exponent = (absV & 0x7c00);17291730bool isDenormal = (exponent == 0);17311732// Convert exponent to high-bits1733exponent = (exponent >> 3) + 14336;17341735int32_t denormalCorrection = (isDenormal ? (signBits | 14336) : 0) << 16;17361737int32_t fBits = ((exponent | signBits) << 16) | (mantissa << 13);17381739float f, correction;1740memcpy(&f, &fBits, 4);1741memcpy(&correction, &denormalCorrection, 4);17421743return f - correction;1744}17451746static Float SqDiff2CLFloat(const SInt16 &a, const Float &b)1747{1748Float fa = TwosCLHalfToFloat(a);17491750Float diff = fa - b;1751return diff * diff;1752}17531754static Float SqDiff2CL(const SInt16 &a, const SInt16 &b)1755{1756Float fa = TwosCLHalfToFloat(a);1757Float fb = TwosCLHalfToFloat(b);17581759Float diff = fa - fb;1760return diff * diff;1761}17621763static Float SqDiff2CLFloat(const SInt16 &a, float aWeight, const Float &b)1764{1765Float fa = TwosCLHalfToFloat(a) * aWeight;17661767Float diff = fa - b;1768return diff * diff;1769}17701771static int32_t RightShift(int32_t v, int bits)1772{1773return SignedRightShift(v, bits);1774}17751776static int32_t ToSInt16(int32_t v)1777{1778return v;1779}17801781static int32_t ToUInt16(int32_t v)1782{1783return v;1784}17851786static int32_t ToUInt15(int32_t v)1787{1788return v;1789}17901791static int32_t XMultiply(int32_t a, int32_t b)1792{1793return a * b;1794}17951796static int32_t CompactMultiply(int32_t a, int32_t b)1797{1798return a * 
b;1799}18001801static bool AnySet(bool v)1802{1803return v;1804}18051806static bool AllSet(bool v)1807{1808return v;1809}1810};18111812#endif1813}18141815#endif181618171818