Path: blob/master/thirdparty/astcenc/astcenc_vecmathlib_sse_4.h
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/**
 * @brief 4x32-bit vectors, implemented using SSE.
 *
 * This module implements 4-wide 32-bit float, int, and mask vectors for x86
 * SSE. The implementation requires at least SSE2, but higher levels of SSE can
 * be selected at compile time to improve performance.
 *
 * There is a baseline level of functionality provided by all vector widths and
 * implementations. This is implemented using identical function signatures,
 * modulo data type, so we can use them as substitutable implementations in VLA
 * code.
 *
 * The 4-wide vectors are also used as a fixed-width type, and significantly
 * extend the functionality above that available to VLA code.
 */

#ifndef ASTC_VECMATHLIB_SSE_4_H_INCLUDED
#define ASTC_VECMATHLIB_SSE_4_H_INCLUDED

#ifndef ASTCENC_SIMD_INLINE
	#error "Include astcenc_vecmathlib.h, do not include directly"
#endif

#include <cstdio>
#include <cstring>

// ============================================================================
// vfloat4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide floats.
 */
struct vfloat4
{
	/**
	 * @brief Construct from zero-initialized value.
	 */
	ASTCENC_SIMD_INLINE vfloat4() = default;

	/**
	 * @brief Construct from 4 values loaded from an unaligned address.
	 *
	 * Consider using loada() which is better with vectors if data is aligned
	 * to vector length.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat4(const float *p)
	{
		m = _mm_loadu_ps(p);
	}

	/**
	 * @brief Construct from 1 scalar value replicated across all lanes.
	 *
	 * Consider using zero() for constexpr zeros.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat4(float a)
	{
		m = _mm_set1_ps(a);
	}

	/**
	 * @brief Construct from 4 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat4(float a, float b, float c, float d)
	{
		m = _mm_set_ps(d, c, b, a);
	}

	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat4(__m128 a)
	{
		m = a;
	}

	/**
	 * @brief Get the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE float lane() const
	{
		return _mm_cvtss_f32(_mm_shuffle_ps(m, m, l));
	}

	/**
	 * @brief Set the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE void set_lane(float a)
	{
#if ASTCENC_SSE >= 41
		__m128 v = _mm_set1_ps(a);
		m = _mm_insert_ps(m, v, l << 6 | l << 4);
#else
		alignas(16) float idx[4];
		_mm_store_ps(idx, m);
		idx[l] = a;
		m = _mm_load_ps(idx);
#endif
	}

	/**
	 * @brief Factory that returns a vector of zeros.
	 */
	static ASTCENC_SIMD_INLINE vfloat4 zero()
	{
		return vfloat4(_mm_setzero_ps());
	}
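	// Usage sketch (illustrative comment, not upstream documentation): lane
	// indices are compile-time template arguments, so accesses map to
	// immediate-operand shuffles, e.g.:
	//
	//     vfloat4 v(1.0f, 2.0f, 3.0f, 4.0f);
	//     v.set_lane<2>(9.0f);
	//     float x = v.lane<2>();   // x == 9.0f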
	/**
	 * @brief Factory that returns a replicated scalar loaded from memory.
	 */
	static ASTCENC_SIMD_INLINE vfloat4 load1(const float* p)
	{
		return vfloat4(_mm_load_ps1(p));
	}

	/**
	 * @brief Factory that returns a vector loaded from 16B aligned memory.
	 */
	static ASTCENC_SIMD_INLINE vfloat4 loada(const float* p)
	{
		return vfloat4(_mm_load_ps(p));
	}

	/**
	 * @brief Return a swizzled float 2.
	 */
	template <int l0, int l1> ASTCENC_SIMD_INLINE vfloat4 swz() const
	{
		vfloat4 result(_mm_shuffle_ps(m, m, l0 | l1 << 2));
		result.set_lane<2>(0.0f);
		result.set_lane<3>(0.0f);
		return result;
	}

	/**
	 * @brief Return a swizzled float 3.
	 */
	template <int l0, int l1, int l2> ASTCENC_SIMD_INLINE vfloat4 swz() const
	{
		vfloat4 result(_mm_shuffle_ps(m, m, l0 | l1 << 2 | l2 << 4));
		result.set_lane<3>(0.0f);
		return result;
	}

	/**
	 * @brief Return a swizzled float 4.
	 */
	template <int l0, int l1, int l2, int l3> ASTCENC_SIMD_INLINE vfloat4 swz() const
	{
		return vfloat4(_mm_shuffle_ps(m, m, l0 | l1 << 2 | l2 << 4 | l3 << 6));
	}

	/**
	 * @brief The vector ...
	 */
	__m128 m;
};

// ============================================================================
// vint4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide ints.
 */
struct vint4
{
	/**
	 * @brief Construct from zero-initialized value.
	 */
	ASTCENC_SIMD_INLINE vint4() = default;

	/**
	 * @brief Construct from 4 values loaded from an unaligned address.
	 *
	 * Consider using loada() which is better with vectors if data is aligned
	 * to vector length.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(const int *p)
	{
		m = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
	}

	/**
	 * @brief Construct from 4 uint8_t loaded from an unaligned address.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(const uint8_t *p)
	{
		// _mm_loadu_si32 would be nicer syntax, but missing on older GCC
		__m128i t = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(p));

#if ASTCENC_SSE >= 41
		m = _mm_cvtepu8_epi32(t);
#else
		t = _mm_unpacklo_epi8(t, _mm_setzero_si128());
		m = _mm_unpacklo_epi16(t, _mm_setzero_si128());
#endif
	}

	/**
	 * @brief Construct from 1 scalar value replicated across all lanes.
	 *
	 * Consider using zero() for constexpr zeros.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(int a)
	{
		m = _mm_set1_epi32(a);
	}

	/**
	 * @brief Construct from 4 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(int a, int b, int c, int d)
	{
		m = _mm_set_epi32(d, c, b, a);
	}

	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(__m128i a)
	{
		m = a;
	}

	/**
	 * @brief Get the scalar from a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE int lane() const
	{
		return _mm_cvtsi128_si32(_mm_shuffle_epi32(m, l));
	}

	/**
	 * @brief Set the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE void set_lane(int a)
	{
#if ASTCENC_SSE >= 41
		m = _mm_insert_epi32(m, a, l);
#else
		alignas(16) int idx[4];
		_mm_store_si128(reinterpret_cast<__m128i*>(idx), m);
		idx[l] = a;
		m = _mm_load_si128(reinterpret_cast<const __m128i*>(idx));
#endif
	}

	/**
	 * @brief Factory that returns a vector of zeros.
	 */
	static ASTCENC_SIMD_INLINE vint4 zero()
	{
		return vint4(_mm_setzero_si128());
	}
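	// Note (clarifying comment): the vint4(const uint8_t*) constructor above
	// widens 4 bytes into one 32-bit lane each, whereas the load(const uint8_t*)
	// factory below reinterprets 16 raw bytes as 4 packed 32-bit lanes.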
	/**
	 * @brief Factory that returns a replicated scalar loaded from memory.
	 */
	static ASTCENC_SIMD_INLINE vint4 load1(const int* p)
	{
		return vint4(*p);
	}

	/**
	 * @brief Factory that returns a vector loaded from unaligned memory.
	 */
	static ASTCENC_SIMD_INLINE vint4 load(const uint8_t* p)
	{
#if ASTCENC_SSE >= 41
		return vint4(_mm_lddqu_si128(reinterpret_cast<const __m128i*>(p)));
#else
		return vint4(_mm_loadu_si128(reinterpret_cast<const __m128i*>(p)));
#endif
	}

	/**
	 * @brief Factory that returns a vector loaded from 16B aligned memory.
	 */
	static ASTCENC_SIMD_INLINE vint4 loada(const int* p)
	{
		return vint4(_mm_load_si128(reinterpret_cast<const __m128i*>(p)));
	}

	/**
	 * @brief Factory that returns a vector containing the lane IDs.
	 */
	static ASTCENC_SIMD_INLINE vint4 lane_id()
	{
		return vint4(_mm_set_epi32(3, 2, 1, 0));
	}

	/**
	 * @brief The vector ...
	 */
	__m128i m;
};

// ============================================================================
// vmask4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide control plane masks.
 */
struct vmask4
{
	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vmask4(__m128 a)
	{
		m = a;
	}

	/**
	 * @brief Construct from an existing SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vmask4(__m128i a)
	{
		m = _mm_castsi128_ps(a);
	}

	/**
	 * @brief Construct from 1 scalar value.
	 */
	ASTCENC_SIMD_INLINE explicit vmask4(bool a)
	{
		vint4 mask(a == false ? 0 : -1);
		m = _mm_castsi128_ps(mask.m);
	}

	/**
	 * @brief Construct from 4 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vmask4(bool a, bool b, bool c, bool d)
	{
		vint4 mask(a == false ? 0 : -1,
		           b == false ? 0 : -1,
		           c == false ? 0 : -1,
		           d == false ? 0 : -1);

		m = _mm_castsi128_ps(mask.m);
	}
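	// Note (clarifying comment): each mask lane is stored as all-zero bits
	// (false) or all-one bits (true) in a float-typed register, which is why
	// the bool constructors expand to 0 / -1 before the bitcast. This layout
	// is what both the float and integer select() implementations rely on.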
	/**
	 * @brief Get the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE bool lane() const
	{
		return _mm_cvtss_f32(_mm_shuffle_ps(m, m, l)) != 0.0f;
	}

	/**
	 * @brief The vector ...
	 */
	__m128 m;
};

// ============================================================================
// vmask4 operators and functions
// ============================================================================

/**
 * @brief Overload: mask union (or).
 */
ASTCENC_SIMD_INLINE vmask4 operator|(vmask4 a, vmask4 b)
{
	return vmask4(_mm_or_ps(a.m, b.m));
}

/**
 * @brief Overload: mask intersect (and).
 */
ASTCENC_SIMD_INLINE vmask4 operator&(vmask4 a, vmask4 b)
{
	return vmask4(_mm_and_ps(a.m, b.m));
}

/**
 * @brief Overload: mask difference (xor).
 */
ASTCENC_SIMD_INLINE vmask4 operator^(vmask4 a, vmask4 b)
{
	return vmask4(_mm_xor_ps(a.m, b.m));
}

/**
 * @brief Overload: mask invert (not).
 */
ASTCENC_SIMD_INLINE vmask4 operator~(vmask4 a)
{
	return vmask4(_mm_xor_si128(_mm_castps_si128(a.m), _mm_set1_epi32(-1)));
}

/**
 * @brief Return a 4-bit mask code indicating mask status.
 *
 * bit0 = lane 0
 */
ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
{
	return static_cast<unsigned int>(_mm_movemask_ps(a.m));
}

/**
 * @brief True if any lanes are enabled, false otherwise.
 */
ASTCENC_SIMD_INLINE bool any(vmask4 a)
{
	return mask(a) != 0;
}

/**
 * @brief True if all lanes are enabled, false otherwise.
 */
ASTCENC_SIMD_INLINE bool all(vmask4 a)
{
	return mask(a) == 0xF;
}

// ============================================================================
// vint4 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, vint4 b)
{
	return vint4(_mm_add_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, vint4 b)
{
	return vint4(_mm_sub_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, vint4 b)
{
#if ASTCENC_SSE >= 41
	return vint4(_mm_mullo_epi32(a.m, b.m));
#else
	__m128i t1 = _mm_mul_epu32(a.m, b.m);
	__m128i t2 = _mm_mul_epu32(
	    _mm_srli_si128(a.m, 4),
	    _mm_srli_si128(b.m, 4));
	__m128i r = _mm_unpacklo_epi32(
	    _mm_shuffle_epi32(t1, _MM_SHUFFLE(0, 0, 2, 0)),
	    _mm_shuffle_epi32(t2, _MM_SHUFFLE(0, 0, 2, 0)));
	return vint4(r);
#endif
}

/**
 * @brief Overload: vector bit invert.
 */
ASTCENC_SIMD_INLINE vint4 operator~(vint4 a)
{
	return vint4(_mm_xor_si128(a.m, _mm_set1_epi32(-1)));
}

/**
 * @brief Overload: vector by vector bitwise or.
 */
ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, vint4 b)
{
	return vint4(_mm_or_si128(a.m, b.m));
}

/**
 * @brief Overload: vector by vector bitwise and.
 */
ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, vint4 b)
{
	return vint4(_mm_and_si128(a.m, b.m));
}

/**
 * @brief Overload: vector by vector bitwise xor.
 */
ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, vint4 b)
{
	return vint4(_mm_xor_si128(a.m, b.m));
}
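// Usage sketch (illustrative only, not upstream documentation): the integer
// comparisons below return a vmask4 rather than a vint4, so they compose
// directly with select(), e.g. a branchless clamp of negative lanes to zero:
//
//     vint4 clamped = select(a, vint4::zero(), a < vint4::zero());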
/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask4 operator==(vint4 a, vint4 b)
{
	return vmask4(_mm_cmpeq_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask4 operator!=(vint4 a, vint4 b)
{
	return ~vmask4(_mm_cmpeq_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask4 operator<(vint4 a, vint4 b)
{
	return vmask4(_mm_cmplt_epi32(a.m, b.m));
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask4 operator>(vint4 a, vint4 b)
{
	return vmask4(_mm_cmpgt_epi32(a.m, b.m));
}

/**
 * @brief Logical shift left.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a)
{
	return vint4(_mm_slli_epi32(a.m, s));
}

/**
 * @brief Logical shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 lsr(vint4 a)
{
	return vint4(_mm_srli_epi32(a.m, s));
}

/**
 * @brief Arithmetic shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 asr(vint4 a)
{
	return vint4(_mm_srai_epi32(a.m, s));
}

/**
 * @brief Return the min vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint4 min(vint4 a, vint4 b)
{
#if ASTCENC_SSE >= 41
	return vint4(_mm_min_epi32(a.m, b.m));
#else
	vmask4 d = a < b;
	__m128i ap = _mm_and_si128(_mm_castps_si128(d.m), a.m);
	__m128i bp = _mm_andnot_si128(_mm_castps_si128(d.m), b.m);
	return vint4(_mm_or_si128(ap, bp));
#endif
}

/**
 * @brief Return the max vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint4 max(vint4 a, vint4 b)
{
#if ASTCENC_SSE >= 41
	return vint4(_mm_max_epi32(a.m, b.m));
#else
	vmask4 d = a > b;
	__m128i ap = _mm_and_si128(_mm_castps_si128(d.m), a.m);
	__m128i bp = _mm_andnot_si128(_mm_castps_si128(d.m), b.m);
	return vint4(_mm_or_si128(ap, bp));
#endif
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
{
	a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1))));
	a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(1, 0, 3, 2))));
	return a;
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vint4 hmax(vint4 a)
{
	a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1))));
	a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(1, 0, 3, 2))));
	return a;
}

/**
 * @brief Store a vector to a 16B aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vint4 a, int* p)
{
	_mm_store_si128(reinterpret_cast<__m128i*>(p), a.m);
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vint4 a, int* p)
{
	// Cast due to missing intrinsics
	_mm_storeu_ps(reinterpret_cast<float*>(p), _mm_castsi128_ps(a.m));
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vint4 a, uint8_t* p)
{
	std::memcpy(p, &a.m, sizeof(int) * 4);
}

/**
 * @brief Store lowest N (vector width) bytes into an unaligned address.
 */
ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
{
	// Cast due to missing intrinsics
	_mm_store_ss(reinterpret_cast<float*>(p), _mm_castsi128_ps(a.m));
}
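// Usage sketch (illustrative only): store_nbytes() writes just the low 4 bytes
// of the vector; pack_and_store_low_bytes() below builds on it to emit a vint4
// holding one 0-255 value per lane as 4 contiguous bytes:
//
//     pack_and_store_low_bytes(color, dst);   // writes dst[0..3]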
/**
 * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
 */
ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint4 a, uint8_t* p)
{
#if ASTCENC_SSE >= 41
	__m128i shuf = _mm_set_epi8(0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  12, 8, 4, 0);
	a = vint4(_mm_shuffle_epi8(a.m, shuf));
	store_nbytes(a, p);
#else
	__m128i va = _mm_unpacklo_epi8(a.m, _mm_shuffle_epi32(a.m, _MM_SHUFFLE(1, 1, 1, 1)));
	__m128i vb = _mm_unpackhi_epi8(a.m, _mm_shuffle_epi32(a.m, _MM_SHUFFLE(3, 3, 3, 3)));
	a = vint4(_mm_unpacklo_epi16(va, vb));
	store_nbytes(a, p);
#endif
}

/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vint4 select(vint4 a, vint4 b, vmask4 cond)
{
	__m128i condi = _mm_castps_si128(cond.m);

#if ASTCENC_SSE >= 41
	return vint4(_mm_blendv_epi8(a.m, b.m, condi));
#else
	return vint4(_mm_or_si128(_mm_and_si128(condi, b.m), _mm_andnot_si128(condi, a.m)));
#endif
}

// ============================================================================
// vfloat4 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, vfloat4 b)
{
	return vfloat4(_mm_add_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, vfloat4 b)
{
	return vfloat4(_mm_sub_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, vfloat4 b)
{
	return vfloat4(_mm_mul_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by vector division.
 */
ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, vfloat4 b)
{
	return vfloat4(_mm_div_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask4 operator==(vfloat4 a, vfloat4 b)
{
	return vmask4(_mm_cmpeq_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask4 operator!=(vfloat4 a, vfloat4 b)
{
	return vmask4(_mm_cmpneq_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask4 operator<(vfloat4 a, vfloat4 b)
{
	return vmask4(_mm_cmplt_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask4 operator>(vfloat4 a, vfloat4 b)
{
	return vmask4(_mm_cmpgt_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by vector less than or equal.
 */
ASTCENC_SIMD_INLINE vmask4 operator<=(vfloat4 a, vfloat4 b)
{
	return vmask4(_mm_cmple_ps(a.m, b.m));
}

/**
 * @brief Overload: vector by vector greater than or equal.
 */
ASTCENC_SIMD_INLINE vmask4 operator>=(vfloat4 a, vfloat4 b)
{
	return vmask4(_mm_cmpge_ps(a.m, b.m));
}

/**
 * @brief Return the min vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, vfloat4 b)
{
	// Do not reorder - second operand will return if either is NaN
	return vfloat4(_mm_min_ps(a.m, b.m));
}

/**
 * @brief Return the max vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, vfloat4 b)
{
	// Do not reorder - second operand will return if either is NaN
	return vfloat4(_mm_max_ps(a.m, b.m));
}

/**
 * @brief Return the absolute value of the float vector.
 */
ASTCENC_SIMD_INLINE vfloat4 abs(vfloat4 a)
{
	return vfloat4(_mm_max_ps(_mm_sub_ps(_mm_setzero_ps(), a.m), a.m));
}
/**
 * @brief Return a float rounded to the nearest integer value.
 */
ASTCENC_SIMD_INLINE vfloat4 round(vfloat4 a)
{
#if ASTCENC_SSE >= 41
	constexpr int flags = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
	return vfloat4(_mm_round_ps(a.m, flags));
#else
	__m128 v = a.m;
	__m128 neg_zero = _mm_castsi128_ps(_mm_set1_epi32(static_cast<int>(0x80000000)));
	__m128 no_fraction = _mm_set1_ps(8388608.0f);
	__m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
	__m128 sign = _mm_and_ps(v, neg_zero);
	__m128 s_magic = _mm_or_ps(no_fraction, sign);
	__m128 r1 = _mm_add_ps(v, s_magic);
	r1 = _mm_sub_ps(r1, s_magic);
	__m128 r2 = _mm_and_ps(v, abs_mask);
	__m128 mask = _mm_cmple_ps(r2, no_fraction);
	r2 = _mm_andnot_ps(mask, v);
	r1 = _mm_and_ps(r1, mask);
	return vfloat4(_mm_xor_ps(r1, r2));
#endif
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat4 hmin(vfloat4 a)
{
	a = min(a, vfloat4(_mm_shuffle_ps(a.m, a.m, _MM_SHUFFLE(0, 0, 3, 2))));
	a = min(a, vfloat4(_mm_shuffle_ps(a.m, a.m, _MM_SHUFFLE(0, 0, 0, 1))));
	return vfloat4(_mm_shuffle_ps(a.m, a.m, _MM_SHUFFLE(0, 0, 0, 0)));
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat4 hmax(vfloat4 a)
{
	a = max(a, vfloat4(_mm_shuffle_ps(a.m, a.m, _MM_SHUFFLE(0, 0, 3, 2))));
	a = max(a, vfloat4(_mm_shuffle_ps(a.m, a.m, _MM_SHUFFLE(0, 0, 0, 1))));
	return vfloat4(_mm_shuffle_ps(a.m, a.m, _MM_SHUFFLE(0, 0, 0, 0)));
}

/**
 * @brief Return the horizontal sum of a vector as a scalar.
 */
ASTCENC_SIMD_INLINE float hadd_s(vfloat4 a)
{
	// Add top and bottom halves, lane 1/0
	__m128 t = _mm_add_ps(a.m, _mm_movehl_ps(a.m, a.m));

	// Add top and bottom halves, lane 0 (_mm_hadd_ps exists but slow)
	t = _mm_add_ss(t, _mm_shuffle_ps(t, t, 0x55));

	return _mm_cvtss_f32(t);
}

/**
 * @brief Return the sqrt of the lanes in the vector.
 */
ASTCENC_SIMD_INLINE vfloat4 sqrt(vfloat4 a)
{
	return vfloat4(_mm_sqrt_ps(a.m));
}

/**
 * @brief Return lanes from @c b if @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
{
#if ASTCENC_SSE >= 41
	return vfloat4(_mm_blendv_ps(a.m, b.m, cond.m));
#else
	return vfloat4(_mm_or_ps(_mm_and_ps(cond.m, b.m), _mm_andnot_ps(cond.m, a.m)));
#endif
}

/**
 * @brief Load a vector of gathered results from an array.
 */
ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
{
#if ASTCENC_AVX >= 2 && ASTCENC_X86_GATHERS != 0
	return vfloat4(_mm_i32gather_ps(base, indices.m, 4));
#else
	alignas(16) int idx[4];
	storea(indices, idx);
	return vfloat4(base[idx[0]], base[idx[1]], base[idx[2]], base[idx[3]]);
#endif
}

/**
 * @brief Load a vector of gathered results from an array using byte indices from memory.
 */
template<>
ASTCENC_SIMD_INLINE vfloat4 gatherf_byte_inds<vfloat4>(const float* base, const uint8_t* indices)
{
	// Experimentally, in this particular use case (byte indices in memory),
	// using 4 separate scalar loads is appreciably faster than using gathers
	// even if they're available, on every x86 uArch tried, so always do the
	// separate loads even when ASTCENC_X86_GATHERS is enabled.
	//
	// Tested on:
	// - Intel Skylake-X, Coffee Lake, Crestmont, Redwood Cove
	// - AMD Zen 2, Zen 4
	return vfloat4(base[indices[0]], base[indices[1]], base[indices[2]], base[indices[3]]);
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vfloat4 a, float* p)
{
	_mm_storeu_ps(p, a.m);
}
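// Usage sketch (illustrative only): hadd_s() combined with operator* gives a
// scalar dot product on builds where the native _mm_dp_ps implementations at
// the bottom of this file are not compiled in:
//
//     float d = hadd_s(a * b);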
/**
 * @brief Store a vector to a 16B aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vfloat4 a, float* p)
{
	_mm_store_ps(p, a.m);
}

/**
 * @brief Return an integer value for a float vector, using truncation.
 */
ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
{
	return vint4(_mm_cvttps_epi32(a.m));
}

/**
 * @brief Return an integer value for a float vector, using round-to-nearest.
 */
ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
{
	a = a + vfloat4(0.5f);
	return vint4(_mm_cvttps_epi32(a.m));
}

/**
 * @brief Return a float value for an integer vector.
 */
ASTCENC_SIMD_INLINE vfloat4 int_to_float(vint4 a)
{
	return vfloat4(_mm_cvtepi32_ps(a.m));
}

/**
 * @brief Return a float16 value for a float vector, using round-to-nearest.
 */
ASTCENC_SIMD_INLINE vint4 float_to_float16(vfloat4 a)
{
#if ASTCENC_F16C >= 1
	__m128i packedf16 = _mm_cvtps_ph(a.m, 0);
	__m128i f16 = _mm_cvtepu16_epi32(packedf16);
	return vint4(f16);
#else
	return vint4(
	    float_to_sf16(a.lane<0>()),
	    float_to_sf16(a.lane<1>()),
	    float_to_sf16(a.lane<2>()),
	    float_to_sf16(a.lane<3>()));
#endif
}

/**
 * @brief Return a float16 value for a float scalar, using round-to-nearest.
 */
static inline uint16_t float_to_float16(float a)
{
#if ASTCENC_F16C >= 1
	__m128i f16 = _mm_cvtps_ph(_mm_set1_ps(a), 0);
	return static_cast<uint16_t>(_mm_cvtsi128_si32(f16));
#else
	return float_to_sf16(a);
#endif
}

/**
 * @brief Return a float value for a float16 vector.
 */
ASTCENC_SIMD_INLINE vfloat4 float16_to_float(vint4 a)
{
#if ASTCENC_F16C >= 1
	__m128i packed = _mm_packs_epi32(a.m, a.m);
	__m128 f32 = _mm_cvtph_ps(packed);
	return vfloat4(f32);
#else
	return vfloat4(
	    sf16_to_float(static_cast<uint16_t>(a.lane<0>())),
	    sf16_to_float(static_cast<uint16_t>(a.lane<1>())),
	    sf16_to_float(static_cast<uint16_t>(a.lane<2>())),
	    sf16_to_float(static_cast<uint16_t>(a.lane<3>())));
#endif
}

/**
 * @brief Return a float value for a float16 scalar.
 */
ASTCENC_SIMD_INLINE float float16_to_float(uint16_t a)
{
#if ASTCENC_F16C >= 1
	__m128i packed = _mm_set1_epi16(static_cast<short>(a));
	__m128 f32 = _mm_cvtph_ps(packed);
	return _mm_cvtss_f32(f32);
#else
	return sf16_to_float(a);
#endif
}

/**
 * @brief Return a float value as an integer bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the first half of that flip.
 */
ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a)
{
	return vint4(_mm_castps_si128(a.m));
}
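// Usage sketch (illustrative only): the float_as_int() / int_as_float() pair
// enables IEEE 754 bit-pattern tricks, e.g. extracting the biased exponent
// field of each lane:
//
//     vint4 biased_exp = lsr<23>(float_as_int(v)) & vint4(0xFF);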
/**
 * @brief Return an integer value as a float bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the second half of that flip.
 */
ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v)
{
	return vfloat4(_mm_castsi128_ps(v.m));
}

/*
 * Table structure for a 16x 8-bit entry table.
 */
struct vtable4_16x8 {
#if ASTCENC_SSE >= 41
	vint4 t0;
#else
	const uint8_t* data;
#endif
};

/*
 * Table structure for a 32x 8-bit entry table.
 */
struct vtable4_32x8 {
#if ASTCENC_SSE >= 41
	vint4 t0;
	vint4 t1;
#else
	const uint8_t* data;
#endif
};

/*
 * Table structure for a 64x 8-bit entry table.
 */
struct vtable4_64x8 {
#if ASTCENC_SSE >= 41
	vint4 t0;
	vint4 t1;
	vint4 t2;
	vint4 t3;
#else
	const uint8_t* data;
#endif
};

/**
 * @brief Prepare a vtable lookup table for a 16x 8-bit entry table.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
	vtable4_16x8& table,
	const uint8_t* data
) {
#if ASTCENC_SSE >= 41
	table.t0 = vint4::load(data);
#else
	table.data = data;
#endif
}

/**
 * @brief Prepare a vtable lookup table for a 32x 8-bit entry table.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
	vtable4_32x8& table,
	const uint8_t* data
) {
#if ASTCENC_SSE >= 41
	table.t0 = vint4::load(data);
	table.t1 = vint4::load(data + 16);

	table.t1 = table.t1 ^ table.t0;
#else
	table.data = data;
#endif
}

/**
 * @brief Prepare a vtable lookup table for a 64x 8-bit entry table.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
	vtable4_64x8& table,
	const uint8_t* data
) {
#if ASTCENC_SSE >= 41
	table.t0 = vint4::load(data);
	table.t1 = vint4::load(data + 16);
	table.t2 = vint4::load(data + 32);
	table.t3 = vint4::load(data + 48);

	table.t3 = table.t3 ^ table.t2;
	table.t2 = table.t2 ^ table.t1;
	table.t1 = table.t1 ^ table.t0;
#else
	table.data = data;
#endif
}

/**
 * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices.
 */
ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
	const vtable4_16x8& tbl,
	vint4 idx
) {
#if ASTCENC_SSE >= 41
	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
	__m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));

	__m128i result = _mm_shuffle_epi8(tbl.t0.m, idxx);
	return vint4(result);
#else
	return vint4(tbl.data[idx.lane<0>()],
	             tbl.data[idx.lane<1>()],
	             tbl.data[idx.lane<2>()],
	             tbl.data[idx.lane<3>()]);
#endif
}

/**
 * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices.
 */
ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
	const vtable4_32x8& tbl,
	vint4 idx
) {
#if ASTCENC_SSE >= 41
	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
	__m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));

	__m128i result = _mm_shuffle_epi8(tbl.t0.m, idxx);
	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));

	__m128i result2 = _mm_shuffle_epi8(tbl.t1.m, idxx);
	result = _mm_xor_si128(result, result2);

	return vint4(result);
#else
	return vint4(tbl.data[idx.lane<0>()],
	             tbl.data[idx.lane<1>()],
	             tbl.data[idx.lane<2>()],
	             tbl.data[idx.lane<3>()]);
#endif
}
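// Note (clarifying comment) on the XOR chaining used by vtable_prepare() and
// the SSE4.1 lookups: each 16-byte chunk is stored pre-XORed with the previous
// chunk. During a lookup the same index is reused for every chunk, subtracting
// 16 per step; once an index goes negative _mm_shuffle_epi8 returns zero, and
// the spurious hits on earlier chunks cancel out in the XOR accumulation,
// leaving exactly the entry from the chunk the index really falls in.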
/**
 * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices.
 */
ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
	const vtable4_64x8& tbl,
	vint4 idx
) {
#if ASTCENC_SSE >= 41
	// Set index byte MSB to 1 for unused bytes so shuffle returns zero
	__m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));

	__m128i result = _mm_shuffle_epi8(tbl.t0.m, idxx);
	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));

	__m128i result2 = _mm_shuffle_epi8(tbl.t1.m, idxx);
	result = _mm_xor_si128(result, result2);
	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));

	result2 = _mm_shuffle_epi8(tbl.t2.m, idxx);
	result = _mm_xor_si128(result, result2);
	idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));

	result2 = _mm_shuffle_epi8(tbl.t3.m, idxx);
	result = _mm_xor_si128(result, result2);

	return vint4(result);
#else
	return vint4(tbl.data[idx.lane<0>()],
	             tbl.data[idx.lane<1>()],
	             tbl.data[idx.lane<2>()],
	             tbl.data[idx.lane<3>()]);
#endif
}

/**
 * @brief Return a vector of interleaved RGBA data.
 *
 * Input vectors have the value stored in the bottom 8 bits of each lane,
 * with high bits set to zero.
 *
 * Output vector stores a single RGBA texel packed in each lane.
 */
ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a)
{
	// Workaround an XCode compiler internal fault; note this is slower than
	// slli_epi32 so we should revert this when we get the opportunity
#if defined(__APPLE__)
	__m128i value = r.m;
	value = _mm_add_epi32(value, _mm_bslli_si128(g.m, 1));
	value = _mm_add_epi32(value, _mm_bslli_si128(b.m, 2));
	value = _mm_add_epi32(value, _mm_bslli_si128(a.m, 3));
	return vint4(value);
#else
	__m128i value = r.m;
	value = _mm_add_epi32(value, _mm_slli_epi32(g.m, 8));
	value = _mm_add_epi32(value, _mm_slli_epi32(b.m, 16));
	value = _mm_add_epi32(value, _mm_slli_epi32(a.m, 24));
	return vint4(value);
#endif
}

/**
 * @brief Store a single vector lane to an unaligned address.
 */
ASTCENC_SIMD_INLINE void store_lane(uint8_t* base, int data)
{
	std::memcpy(base, &data, sizeof(int));
}

/**
 * @brief Store a vector, skipping masked lanes.
 *
 * All masked lanes must be at the end of vector, after all non-masked lanes.
 */
ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint4 data, vmask4 mask)
{
#if ASTCENC_AVX >= 2
	_mm_maskstore_epi32(reinterpret_cast<int*>(base), _mm_castps_si128(mask.m), data.m);
#else
	// Note - we cannot use _mm_maskmoveu_si128 as the underlying hardware doesn't guarantee
	// fault suppression on masked lanes so we can get page faults at the end of an image.
	if (mask.lane<3>() != 0.0f)
	{
		store(data, base);
	}
	else if (mask.lane<2>() != 0.0f)
	{
		store_lane(base + 0, data.lane<0>());
		store_lane(base + 4, data.lane<1>());
		store_lane(base + 8, data.lane<2>());
	}
	else if (mask.lane<1>() != 0.0f)
	{
		store_lane(base + 0, data.lane<0>());
		store_lane(base + 4, data.lane<1>());
	}
	else if (mask.lane<0>() != 0.0f)
	{
		store_lane(base + 0, data.lane<0>());
	}
#endif
}

#if defined(ASTCENC_NO_INVARIANCE) && (ASTCENC_SSE >= 41)

#define ASTCENC_USE_NATIVE_DOT_PRODUCT 1

/**
 * @brief Return the dot product for the full 4 lanes, returning scalar.
 */
ASTCENC_SIMD_INLINE float dot_s(vfloat4 a, vfloat4 b)
{
	return _mm_cvtss_f32(_mm_dp_ps(a.m, b.m, 0xFF));
}

/**
 * @brief Return the dot product for the full 4 lanes, returning vector.
 */
ASTCENC_SIMD_INLINE vfloat4 dot(vfloat4 a, vfloat4 b)
{
	return vfloat4(_mm_dp_ps(a.m, b.m, 0xFF));
}
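// Note (clarifying comment) for the dot3 variants below: the 0x77 immediate
// restricts both the multiply mask and the result broadcast to lanes 0-2, so
// the w lanes of the inputs never contribute and the w lane of the result is
// always zero.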
/**
 * @brief Return the dot product for the bottom 3 lanes, returning scalar.
 */
ASTCENC_SIMD_INLINE float dot3_s(vfloat4 a, vfloat4 b)
{
	return _mm_cvtss_f32(_mm_dp_ps(a.m, b.m, 0x77));
}

/**
 * @brief Return the dot product for the bottom 3 lanes, returning vector.
 */
ASTCENC_SIMD_INLINE vfloat4 dot3(vfloat4 a, vfloat4 b)
{
	return vfloat4(_mm_dp_ps(a.m, b.m, 0x77));
}

#endif // #if defined(ASTCENC_NO_INVARIANCE) && (ASTCENC_SSE >= 41)

#if ASTCENC_POPCNT >= 1

#define ASTCENC_USE_NATIVE_POPCOUNT 1

/**
 * @brief Population bit count.
 *
 * @param v The value to population count.
 *
 * @return The number of 1 bits.
 */
ASTCENC_SIMD_INLINE int popcount(uint64_t v)
{
#if !defined(__x86_64__) && !defined(_M_AMD64)
	return static_cast<int>(__builtin_popcountll(v));
#else
	return static_cast<int>(_mm_popcnt_u64(v));
#endif
}

#endif // ASTCENC_POPCNT >= 1

#endif // #ifndef ASTC_VECMATHLIB_SSE_4_H_INCLUDED