// Path: blob/21.2-virgl/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp
// 4574 views
/****************************************************************************
 * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 ****************************************************************************/
#pragma once

// NOTE: everything below is wrapped in "#if 0" and is therefore never
// compiled.  It is a reference listing of the SIMD wrapper interface: every
// member is a declaration only, annotated with pseudo-code describing the
// value it returns.
#if 0
//===========================================================================
// Placeholder name representing either SIMD4, SIMD256, or SIMD16 structures.
//===========================================================================
struct SIMD256 // or SIMD4 or SIMD16
{
    //=======================================================================
    // SIMD Types
    //
    // These typedefs are examples.  The SIMD256 and SIMD16 implementations will
    // use different base types with this same naming.
    using Float   = __m256;  // Packed single-precision float vector
    using Double  = __m256d; // Packed double-precision float vector
    using Integer = __m256i; // Packed integer vector (mutable element widths)
    using Mask    = uint8_t; // Integer representing mask bits

    //=======================================================================
    // Standard interface
    // (available in both SIMD256 and SIMD16 widths)
    //=======================================================================

    //-----------------------------------------------------------------------
    // Single precision floating point arithmetic operations
    //-----------------------------------------------------------------------
    static Float add_ps(Float a, Float b);            // return a + b
    static Float div_ps(Float a, Float b);            // return a / b
    static Float fmadd_ps(Float a, Float b, Float c); // return (a * b) + c
    static Float fmsub_ps(Float a, Float b, Float c); // return (a * b) - c
    static Float max_ps(Float a, Float b);            // return (a > b) ? a : b
    static Float min_ps(Float a, Float b);            // return (a < b) ? a : b
    static Float mul_ps(Float a, Float b);            // return a * b
    static Float rcp_ps(Float a);                     // return 1.0f / a
    static Float rsqrt_ps(Float a);                   // return 1.0f / sqrt(a)
    static Float sub_ps(Float a, Float b);            // return a - b

    // Rounding behaviour selector for round_ps.  The first group selects the
    // rounding direction, the second group selects exception behaviour, and
    // the remaining enumerators are the named direction | exception
    // combinations.
    enum class RoundMode
    {
        TO_NEAREST_INT = 0x00, // Round to nearest integer == TRUNCATE(value + (signof(value))0.5)
        TO_NEG_INF     = 0x01, // Round to negative infinity
        TO_POS_INF     = 0x02, // Round to positive infinity
        TO_ZERO        = 0x03, // Round to 0 a.k.a. truncate
        CUR_DIRECTION  = 0x04, // Round in direction set in MXCSR register

        RAISE_EXC = 0x00, // Raise exception on overflow
        NO_EXC    = 0x08, // Suppress exceptions

        NINT        = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(RAISE_EXC),
        NINT_NOEXC  = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(NO_EXC),
        FLOOR       = static_cast<int>(TO_NEG_INF) | static_cast<int>(RAISE_EXC),
        FLOOR_NOEXC = static_cast<int>(TO_NEG_INF) | static_cast<int>(NO_EXC),
        CEIL        = static_cast<int>(TO_POS_INF) | static_cast<int>(RAISE_EXC),
        CEIL_NOEXC  = static_cast<int>(TO_POS_INF) | static_cast<int>(NO_EXC),
        TRUNC       = static_cast<int>(TO_ZERO) | static_cast<int>(RAISE_EXC),
        TRUNC_NOEXC = static_cast<int>(TO_ZERO) | static_cast<int>(NO_EXC),
        RINT        = static_cast<int>(CUR_DIRECTION) | static_cast<int>(RAISE_EXC),
        NEARBYINT   = static_cast<int>(CUR_DIRECTION) | static_cast<int>(NO_EXC),
    };

    // return round_func(a)
    //
    // round_func is chosen based on the RMT template parameter.  See the
    // documentation for the RoundMode enumeration above.
    template <RoundMode RMT>
    static Float round_ps(Float a); // return round(a)

    //-----------------------------------------------------------------------
    // Integer (various width) arithmetic operations
    //-----------------------------------------------------------------------
    static Integer abs_epi32(Integer a);            // return absolute_value(a) (int32)
    static Integer add_epi32(Integer a, Integer b); // return a + b (int32)
    static Integer add_epi8(Integer a, Integer b);  // return a + b (int8)
    static Integer adds_epu8(Integer a, Integer b); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
    static Integer max_epi32(Integer a, Integer b); // return (a > b) ? a : b (int32)
    static Integer max_epu32(Integer a, Integer b); // return (a > b) ? a : b (uint32)
    static Integer min_epi32(Integer a, Integer b); // return (a < b) ? a : b (int32)
    static Integer min_epu32(Integer a, Integer b); // return (a < b) ? a : b (uint32)
    static Integer mul_epi32(Integer a, Integer b); // return a * b (int32)

    // return (a * b) & 0xFFFFFFFF
    //
    // Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
    // and store the low 32 bits of the intermediate integers in dst.
    static Integer mullo_epi32(Integer a, Integer b);

    static Integer sub_epi32(Integer a, Integer b); // return a - b (int32)
    static Integer sub_epi64(Integer a, Integer b); // return a - b (int64)
    static Integer subs_epu8(Integer a, Integer b); // return (b > a) ? 0 : (a - b) (uint8)

    //-----------------------------------------------------------------------
    // Logical operations
    //-----------------------------------------------------------------------
    static Float   and_ps(Float a, Float b);        // return a & b (float treated as int)
    static Integer and_si(Integer a, Integer b);    // return a & b (int)
    static Float   andnot_ps(Float a, Float b);     // return (~a) & b (float treated as int)
    static Integer andnot_si(Integer a, Integer b); // return (~a) & b (int)
    static Float   or_ps(Float a, Float b);         // return a | b (float treated as int)
    static Integer or_si(Integer a, Integer b);     // return a | b (int)
    static Float   xor_ps(Float a, Float b);        // return a ^ b (float treated as int)
    static Integer xor_si(Integer a, Integer b);    // return a ^ b (int)

    //-----------------------------------------------------------------------
    // Shift operations
    //-----------------------------------------------------------------------
    template <int ImmT>
    static Integer slli_epi32(Integer a);            // return a << ImmT
    static Integer sllv_epi32(Integer a, Integer b); // return a << b
    template <int ImmT>
    static Integer srai_epi32(Integer a);            // return a >> ImmT (int32)
    template <int ImmT>
    static Integer srli_epi32(Integer a);            // return a >> ImmT (uint32)
    template <int ImmT>                              // for each 128-bit lane:
    static Integer srli_si(Integer a);               //     return a >> (ImmT*8) (uint)
    template <int ImmT>
    static Float   srlisi_ps(Float a);               // same as srli_si, but with Float cast to int
    static Integer srlv_epi32(Integer a, Integer b); // return a >> b (uint32)

    //-----------------------------------------------------------------------
    // Conversion operations
    //-----------------------------------------------------------------------
    static Float   castpd_ps(Double a);       // return *(Float*)(&a)
    static Integer castps_si(Float a);        // return *(Integer*)(&a)
    static Double  castsi_pd(Integer a);      // return *(Double*)(&a)
    static Double  castps_pd(Float a);        // return *(Double*)(&a)
    static Float   castsi_ps(Integer a);      // return *(Float*)(&a)
    static Float   cvtepi32_ps(Integer a);    // return (float)a (int32 --> float)
    static Integer cvtepu8_epi16(Integer a);  // return (int16)a (uint8 --> int16)
    static Integer cvtepu8_epi32(Integer a);  // return (int32)a (uint8 --> int32)
    static Integer cvtepu16_epi32(Integer a); // return (int32)a (uint16 --> int32)
    static Integer cvtepu16_epi64(Integer a); // return (int64)a (uint16 --> int64)
    static Integer cvtepu32_epi64(Integer a); // return (int64)a (uint32 --> int64)
    static Integer cvtps_epi32(Float a);      // return (int32)a (float --> int32)
    static Integer cvttps_epi32(Float a);     // return (int32)a (rnd_to_zero(float) --> int32)

    //-----------------------------------------------------------------------
    // Comparison operations
    //-----------------------------------------------------------------------

    // Comparison types used with cmp_ps:
    //      - ordered comparisons are always false if either operand is NaN
    //      - unordered comparisons are always true if either operand is NaN
    //      - signaling comparisons raise an exception if either operand is NaN
    //      - non-signaling comparisons will never raise an exception
    //
    // Ordered:   return (a != NaN) && (b != NaN) && (a cmp b)
    // Unordered: return (a == NaN) || (b == NaN) || (a cmp b)
    enum class CompareType
    {
        EQ_OQ    = 0x00, // Equal (ordered, nonsignaling)
        LT_OS    = 0x01, // Less-than (ordered, signaling)
        LE_OS    = 0x02, // Less-than-or-equal (ordered, signaling)
        UNORD_Q  = 0x03, // Unordered (nonsignaling)
        NEQ_UQ   = 0x04, // Not-equal (unordered, nonsignaling)
        NLT_US   = 0x05, // Not-less-than (unordered, signaling)
        NLE_US   = 0x06, // Not-less-than-or-equal (unordered, signaling)
        ORD_Q    = 0x07, // Ordered (nonsignaling)
        EQ_UQ    = 0x08, // Equal (unordered, non-signaling)
        NGE_US   = 0x09, // Not-greater-than-or-equal (unordered, signaling)
        NGT_US   = 0x0A, // Not-greater-than (unordered, signaling)
        FALSE_OQ = 0x0B, // False (ordered, nonsignaling)
        NEQ_OQ   = 0x0C, // Not-equal (ordered, non-signaling)
        GE_OS    = 0x0D, // Greater-than-or-equal (ordered, signaling)
        GT_OS    = 0x0E, // Greater-than (ordered, signaling)
        TRUE_UQ  = 0x0F, // True (unordered, non-signaling)
        EQ_OS    = 0x10, // Equal (ordered, signaling)
        LT_OQ    = 0x11, // Less-than (ordered, nonsignaling)
        LE_OQ    = 0x12, // Less-than-or-equal (ordered, nonsignaling)
        UNORD_S  = 0x13, // Unordered (signaling)
        NEQ_US   = 0x14, // Not-equal (unordered, signaling)
        NLT_UQ   = 0x15, // Not-less-than (unordered, nonsignaling)
        NLE_UQ   = 0x16, // Not-less-than-or-equal (unordered, nonsignaling)
        ORD_S    = 0x17, // Ordered (signaling)
        EQ_US    = 0x18, // Equal (unordered, signaling)
        NGE_UQ   = 0x19, // Not-greater-than-or-equal (unordered, nonsignaling)
        NGT_UQ   = 0x1A, // Not-greater-than (unordered, nonsignaling)
        FALSE_OS = 0x1B, // False (ordered, signaling)
        NEQ_OS   = 0x1C, // Not-equal (ordered, signaling)
        GE_OQ    = 0x1D, // Greater-than-or-equal (ordered, nonsignaling)
        GT_OQ    = 0x1E, // Greater-than (ordered, nonsignaling)
        TRUE_US  = 0x1F, // True (unordered, signaling)
    };

    // return a (CmpTypeT) b (float)
    //
    // See documentation for CompareType above for valid values for CmpTypeT.
    template <CompareType CmpTypeT>
    static Float   cmp_ps(Float a, Float b);          // return a (CmpTypeT) b (see above)
    static Float   cmpgt_ps(Float a, Float b);        // return cmp_ps<CompareType::GT_OQ>(a, b)
    static Float   cmple_ps(Float a, Float b);        // return cmp_ps<CompareType::LE_OQ>(a, b)
    static Float   cmplt_ps(Float a, Float b);        // return cmp_ps<CompareType::LT_OQ>(a, b)
    static Float   cmpneq_ps(Float a, Float b);       // return cmp_ps<CompareType::NEQ_OQ>(a, b)
    static Float   cmpeq_ps(Float a, Float b);        // return cmp_ps<CompareType::EQ_OQ>(a, b)
    static Float   cmpge_ps(Float a, Float b);        // return cmp_ps<CompareType::GE_OQ>(a, b)
    static Integer cmpeq_epi8(Integer a, Integer b);  // return a == b (int8)
    static Integer cmpeq_epi16(Integer a, Integer b); // return a == b (int16)
    static Integer cmpeq_epi32(Integer a, Integer b); // return a == b (int32)
    static Integer cmpeq_epi64(Integer a, Integer b); // return a == b (int64)
    static Integer cmpgt_epi8(Integer a, Integer b);  // return a > b (int8)
    static Integer cmpgt_epi16(Integer a, Integer b); // return a > b (int16)
    static Integer cmpgt_epi32(Integer a, Integer b); // return a > b (int32)
    static Integer cmpgt_epi64(Integer a, Integer b); // return a > b (int64)
    static Integer cmplt_epi32(Integer a, Integer b); // return a < b (int32)
    static bool    testz_ps(Float a, Float b);        // return all_lanes_zero(a & b) ? 1 : 0 (float)
    static bool    testz_si(Integer a, Integer b);    // return all_lanes_zero(a & b) ? 1 : 0 (int)

    //-----------------------------------------------------------------------
    // Blend / shuffle / permute operations
    //
    // NOTE(review): the shuffle_* / unpack* declarations carry no pseudo-code;
    // presumably they follow the same-named _mm256_* / _mm512_* intrinsics, as
    // the packs_* / packus_* entries state explicitly -- confirm against the
    // implementation.
    //-----------------------------------------------------------------------
    template <int ImmT>
    static Float   blend_ps(Float a, Float b);                     // return ImmT ? b : a (float)
    static Integer blendv_epi32(Integer a, Integer b, Float mask); // return mask ? b : a (int)
    static Float   blendv_ps(Float a, Float b, Float mask);        // return mask ? b : a (float)
    static Float   broadcast_ss(float const* p);                   // return *p (all elements in vector get same value)
    static Integer packs_epi16(Integer a, Integer b);              // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
    static Integer packs_epi32(Integer a, Integer b);              // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
    static Integer packus_epi16(Integer a, Integer b);             // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
    static Integer packus_epi32(Integer a, Integer b);             // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
    static Integer permute_epi32(Integer a, Integer swiz);         // return a[swiz[i]] for each 32-bit lane i (int32)
    static Float   permute_ps(Float a, Integer swiz);              // return a[swiz[i]] for each 32-bit lane i (float)
    template <int SwizT>
    static Integer shuffle_epi32(Integer a, Integer b);
    template <int SwizT>
    static Integer shuffle_epi64(Integer a, Integer b);
    static Integer shuffle_epi8(Integer a, Integer b);
    template <int SwizT>
    static Double  shuffle_pd(Double a, Double b);
    template <int SwizT>
    static Float   shuffle_ps(Float a, Float b);
    static Integer unpackhi_epi16(Integer a, Integer b);
    static Integer unpackhi_epi32(Integer a, Integer b);
    static Integer unpackhi_epi64(Integer a, Integer b);
    static Integer unpackhi_epi8(Integer a, Integer b);
    static Double  unpackhi_pd(Double a, Double b);
    static Float   unpackhi_ps(Float a, Float b);
    static Integer unpacklo_epi16(Integer a, Integer b);
    static Integer unpacklo_epi32(Integer a, Integer b);
    static Integer unpacklo_epi64(Integer a, Integer b);
    static Integer unpacklo_epi8(Integer a, Integer b);
    static Double  unpacklo_pd(Double a, Double b);
    static Float   unpacklo_ps(Float a, Float b);

    //-----------------------------------------------------------------------
    // Load / store operations
    //-----------------------------------------------------------------------

    // Multiplier applied to the gather index offsets (see i32gather_ps).
    enum class ScaleFactor
    {
        SF_1, // No scaling
        SF_2, // Scale offset by 2
        SF_4, // Scale offset by 4
        SF_8, // Scale offset by 8
    };

    template <ScaleFactor ScaleT = ScaleFactor::SF_1>
    static Float   i32gather_ps(float const* p, Integer idx); // return *(float*)(((int8*)p) + (idx * ScaleT))
    static Float   load1_ps(float const* p);                  // return *p (broadcast 1 value to all elements)
    static Float   load_ps(float const* p);                   // return *p (loads SIMD width elements from memory)
    static Integer load_si(Integer const* p);                 // return *p
    static Float   loadu_ps(float const* p);                  // return *p (same as load_ps but allows for unaligned mem)
    static Integer loadu_si(Integer const* p);                // return *p (same as load_si but allows for unaligned mem)

    // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
    //
    // ScaleT uses the same ScaleFactor type (and default) as i32gather_ps,
    // since it is forwarded to that operation.
    template <ScaleFactor ScaleT = ScaleFactor::SF_1>
    static Float mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask);

    static void    maskstore_ps(float* p, Integer mask, Float src);
    static int     movemask_epi8(Integer a);
    static int     movemask_pd(Double a);
    static int     movemask_ps(Float a);
    static Integer set1_epi32(int i);              // return i (all elements are same value)
    static Integer set1_epi8(char i);              // return i (all elements are same value)
    static Float   set1_ps(float f);               // return f (all elements are same value)
    static Float   setzero_ps();                   // return 0 (float)
    static Integer setzero_si();                   // return 0 (integer)
    static void    store_ps(float* p, Float a);    // *p = a   (stores all elements contiguously in memory)
    static void    store_si(Integer* p, Integer a); // *p = a
    static void    stream_ps(float* p, Float a);   // *p = a   (same as store_ps, but doesn't keep memory in cache)

    //=======================================================================
    // Legacy interface (available only in SIMD256 width)
    //=======================================================================

    static Float broadcast_ps(__m128 const* p);
    template <int ImmT>
    static __m128d extractf128_pd(Double a);
    template <int ImmT>
    static __m128  extractf128_ps(Float a);
    template <int ImmT>
    static __m128i extractf128_si(Integer a);
    template <int ImmT>
    static Double  insertf128_pd(Double a, __m128d b);
    template <int ImmT>
    static Float   insertf128_ps(Float a, __m128 b);
    template <int ImmT>
    static Integer insertf128_si(Integer a, __m128i b);
    // Integer halves are addressed through __m128i pointers, mirroring
    // storeu2_si below.
    static Integer loadu2_si(__m128i const* phi, __m128i const* plo);
    template <int ImmT>
    static Double  permute2f128_pd(Double a, Double b);
    template <int ImmT>
    static Float   permute2f128_ps(Float a, Float b);
    template <int ImmT>
    static Integer permute2f128_si(Integer a, Integer b);
    static Integer set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0);
    static void    storeu2_si(__m128i* phi, __m128i* plo, Integer src);

    //=======================================================================
    // Advanced masking interface (currently available only in SIMD16 width)
    //=======================================================================
};
#endif // #if 0