Path: blob/master/thirdparty/astcenc/astcenc_vecmathlib_none_4.h
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2025 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/**
 * @brief 4x32-bit vectors, implemented using plain C++.
 *
 * This module implements 4-wide 32-bit float, int, and mask vectors. This
 * module provides a scalar fallback for VLA code, primarily useful for
 * debugging VLA algorithms without the complexity of handling SIMD. Only the
 * baseline level of functionality needed to support VLA is provided.
 *
 * Note that the vector conditional operators implemented by this module are
 * designed to behave like SIMD conditional operators that generate lane masks.
 * Rather than returning 0/1 booleans like normal C++ code they will return
 * 0/-1 to give a full lane-width bitmask.
 *
 * Note that the documentation for this module still talks about "vectors" to
 * help developers think about the implied VLA behavior when writing optimized
 * paths.
 */

#ifndef ASTC_VECMATHLIB_NONE_4_H_INCLUDED
#define ASTC_VECMATHLIB_NONE_4_H_INCLUDED

#ifndef ASTCENC_SIMD_INLINE
	#error "Include astcenc_vecmathlib.h, do not include directly"
#endif

#include <algorithm>
#include <cstdio>
#include <cstring>
#include <cfenv>

// ============================================================================
// vfloat4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide floats.
 */
struct vfloat4
{
	/**
	 * @brief Construct from zero-initialized value.
	 */
	ASTCENC_SIMD_INLINE vfloat4() = default;

	/**
	 * @brief Construct from 4 values loaded from an unaligned address.
	 *
	 * Consider using loada() which is better with wider VLA vectors if data is
	 * aligned to vector length.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat4(const float* p)
	{
		m[0] = p[0];
		m[1] = p[1];
		m[2] = p[2];
		m[3] = p[3];
	}

	/**
	 * @brief Construct from 1 scalar value replicated across all lanes.
	 *
	 * Consider using zero() for constexpr zeros.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat4(float a)
	{
		m[0] = a;
		m[1] = a;
		m[2] = a;
		m[3] = a;
	}

	/**
	 * @brief Construct from 4 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vfloat4(float a, float b, float c, float d)
	{
		m[0] = a;
		m[1] = b;
		m[2] = c;
		m[3] = d;
	}

	/**
	 * @brief Get the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE float lane() const
	{
		return m[l];
	}

	/**
	 * @brief Set the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE void set_lane(float a)
	{
		m[l] = a;
	}

	/**
	 * @brief Factory that returns a vector of zeros.
	 */
	static ASTCENC_SIMD_INLINE vfloat4 zero()
	{
		return vfloat4(0.0f);
	}

	/**
	 * @brief Factory that returns a replicated scalar loaded from memory.
	 */
	static ASTCENC_SIMD_INLINE vfloat4 load1(const float* p)
	{
		return vfloat4(*p);
	}

	/**
	 * @brief Factory that returns a vector loaded from aligned memory.
	 */
	static ASTCENC_SIMD_INLINE vfloat4 loada(const float* p)
	{
		return vfloat4(p);
	}

	/**
	 * @brief Return a swizzled float 2.
	 */
	template <int l0, int l1> ASTCENC_SIMD_INLINE vfloat4 swz() const
	{
		return vfloat4(lane<l0>(), lane<l1>(), 0.0f, 0.0f);
	}

	/**
	 * @brief Return a swizzled float 3.
	 */
	template <int l0, int l1, int l2> ASTCENC_SIMD_INLINE vfloat4 swz() const
	{
		return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), 0.0f);
	}

	/**
	 * @brief Return a swizzled float 4.
	 */
	template <int l0, int l1, int l2, int l3> ASTCENC_SIMD_INLINE vfloat4 swz() const
	{
		return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), lane<l3>());
	}

	/**
	 * @brief The vector ...
	 */
	float m[4];
};
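
// Illustrative note (not part of the upstream astcenc sources): lane 0 is the
// first element in memory and maps to the LSB end of a native SIMD register,
// and unused lanes of the narrow swizzles are zero-filled. A minimal sketch,
// assuming this header is pulled in via astcenc_vecmathlib.h:
//
//     vfloat4 v(1.0f, 2.0f, 3.0f, 4.0f);
//     float x = v.lane<0>();          // 1.0f
//     vfloat4 zyx = v.swz<2, 1, 0>(); // (3.0f, 2.0f, 1.0f, 0.0f)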

// ============================================================================
// vint4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide ints.
 */
struct vint4
{
	/**
	 * @brief Construct from zero-initialized value.
	 */
	ASTCENC_SIMD_INLINE vint4() = default;

	/**
	 * @brief Construct from 4 values loaded from an unaligned address.
	 *
	 * Consider using vint4::loada() which is better with wider VLA vectors
	 * if data is aligned.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(const int* p)
	{
		m[0] = p[0];
		m[1] = p[1];
		m[2] = p[2];
		m[3] = p[3];
	}

	/**
	 * @brief Construct from 4 uint8_t loaded from an unaligned address.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(const uint8_t *p)
	{
		m[0] = p[0];
		m[1] = p[1];
		m[2] = p[2];
		m[3] = p[3];
	}

	/**
	 * @brief Construct from 4 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(int a, int b, int c, int d)
	{
		m[0] = a;
		m[1] = b;
		m[2] = c;
		m[3] = d;
	}

	/**
	 * @brief Construct from 1 scalar value replicated across all lanes.
	 *
	 * Consider using zero() for constexpr zeros.
	 */
	ASTCENC_SIMD_INLINE explicit vint4(int a)
	{
		m[0] = a;
		m[1] = a;
		m[2] = a;
		m[3] = a;
	}

	/**
	 * @brief Get the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE int lane() const
	{
		return m[l];
	}

	/**
	 * @brief Set the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE void set_lane(int a)
	{
		m[l] = a;
	}

	/**
	 * @brief Factory that returns a vector of zeros.
	 */
	static ASTCENC_SIMD_INLINE vint4 zero()
	{
		return vint4(0);
	}

	/**
	 * @brief Factory that returns a replicated scalar loaded from memory.
	 */
	static ASTCENC_SIMD_INLINE vint4 load1(const int* p)
	{
		return vint4(*p);
	}

	/**
	 * @brief Factory that returns a vector loaded from unaligned memory.
	 */
	static ASTCENC_SIMD_INLINE vint4 load(const uint8_t* p)
	{
		vint4 data;
		std::memcpy(&data.m, p, 4 * sizeof(int));
		return data;
	}

	/**
	 * @brief Factory that returns a vector loaded from 16B aligned memory.
	 */
	static ASTCENC_SIMD_INLINE vint4 loada(const int* p)
	{
		return vint4(p);
	}

	/**
	 * @brief Factory that returns a vector containing the lane IDs.
	 */
	static ASTCENC_SIMD_INLINE vint4 lane_id()
	{
		return vint4(0, 1, 2, 3);
	}

	/**
	 * @brief The vector ...
	 */
	int m[4];
};
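
// Illustrative note (not part of the upstream astcenc sources): the
// vint4(const uint8_t*) constructor widens 4 bytes into 4 int lanes, while
// vint4::load(const uint8_t*) reinterprets 16 bytes as 4 packed ints. A
// minimal sketch, assuming a hypothetical 16-byte buffer:
//
//     uint8_t bytes[16] = { 1, 2, 3, 4 /* remaining bytes zero */ };
//     vint4 widened(bytes);               // lanes: 1, 2, 3, 4
//     vint4 raw = vint4::load(bytes);     // lanes: raw 32-bit reads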

// ============================================================================
// vmask4 data type
// ============================================================================

/**
 * @brief Data type for 4-wide control plane masks.
 */
struct vmask4
{
	/**
	 * @brief Construct from an existing mask value.
	 */
	ASTCENC_SIMD_INLINE explicit vmask4(int* p)
	{
		m[0] = p[0];
		m[1] = p[1];
		m[2] = p[2];
		m[3] = p[3];
	}

	/**
	 * @brief Construct from 1 scalar value.
	 */
	ASTCENC_SIMD_INLINE explicit vmask4(bool a)
	{
		m[0] = a == false ? 0 : -1;
		m[1] = a == false ? 0 : -1;
		m[2] = a == false ? 0 : -1;
		m[3] = a == false ? 0 : -1;
	}

	/**
	 * @brief Construct from 4 scalar values.
	 *
	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
	 */
	ASTCENC_SIMD_INLINE explicit vmask4(bool a, bool b, bool c, bool d)
	{
		m[0] = a == false ? 0 : -1;
		m[1] = b == false ? 0 : -1;
		m[2] = c == false ? 0 : -1;
		m[3] = d == false ? 0 : -1;
	}

	/**
	 * @brief Get the scalar value of a single lane.
	 */
	template <int l> ASTCENC_SIMD_INLINE bool lane() const
	{
		return m[l] != 0;
	}

	/**
	 * @brief The vector ...
	 */
	int m[4];
};

// ============================================================================
// vmask4 operators and functions
// ============================================================================

/**
 * @brief Overload: mask union (or).
 */
ASTCENC_SIMD_INLINE vmask4 operator|(vmask4 a, vmask4 b)
{
	return vmask4(a.m[0] | b.m[0],
	              a.m[1] | b.m[1],
	              a.m[2] | b.m[2],
	              a.m[3] | b.m[3]);
}

/**
 * @brief Overload: mask intersect (and).
 */
ASTCENC_SIMD_INLINE vmask4 operator&(vmask4 a, vmask4 b)
{
	return vmask4(a.m[0] & b.m[0],
	              a.m[1] & b.m[1],
	              a.m[2] & b.m[2],
	              a.m[3] & b.m[3]);
}

/**
 * @brief Overload: mask difference (xor).
 */
ASTCENC_SIMD_INLINE vmask4 operator^(vmask4 a, vmask4 b)
{
	return vmask4(a.m[0] ^ b.m[0],
	              a.m[1] ^ b.m[1],
	              a.m[2] ^ b.m[2],
	              a.m[3] ^ b.m[3]);
}

/**
 * @brief Overload: mask invert (not).
 */
ASTCENC_SIMD_INLINE vmask4 operator~(vmask4 a)
{
	return vmask4(~a.m[0],
	              ~a.m[1],
	              ~a.m[2],
	              ~a.m[3]);
}

/**
 * @brief Return a 4-bit mask code indicating mask status.
 *
 * bit0 = lane 0
 */
ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
{
	return (a.m[0] & 0x1) |
	       (a.m[1] & 0x2) |
	       (a.m[2] & 0x4) |
	       (a.m[3] & 0x8);
}

/**
 * @brief True if any lanes are enabled, false otherwise.
 */
ASTCENC_SIMD_INLINE bool any(vmask4 a)
{
	return mask(a) != 0;
}

/**
 * @brief True if all lanes are enabled, false otherwise.
 */
ASTCENC_SIMD_INLINE bool all(vmask4 a)
{
	return mask(a) == 0xF;
}
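
// Illustrative note (not part of the upstream astcenc sources): mask lanes
// hold 0 (false) or -1 (all bits set), and mask() packs one bit per lane with
// bit0 = lane 0, which is what any() and all() test. A minimal sketch:
//
//     vmask4 m(true, false, true, false); // lanes: -1, 0, -1, 0
//     unsigned int bits = mask(m);        // 0x5 (bits 0 and 2 set)
//     bool some  = any(m);                // true
//     bool every = all(m);                // false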

// ============================================================================
// vint4 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, vint4 b)
{
	return vint4(a.m[0] + b.m[0],
	             a.m[1] + b.m[1],
	             a.m[2] + b.m[2],
	             a.m[3] + b.m[3]);
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, vint4 b)
{
	return vint4(a.m[0] - b.m[0],
	             a.m[1] - b.m[1],
	             a.m[2] - b.m[2],
	             a.m[3] - b.m[3]);
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, vint4 b)
{
	return vint4(a.m[0] * b.m[0],
	             a.m[1] * b.m[1],
	             a.m[2] * b.m[2],
	             a.m[3] * b.m[3]);
}

/**
 * @brief Overload: vector bit invert.
 */
ASTCENC_SIMD_INLINE vint4 operator~(vint4 a)
{
	return vint4(~a.m[0],
	             ~a.m[1],
	             ~a.m[2],
	             ~a.m[3]);
}

/**
 * @brief Overload: vector by vector bitwise or.
 */
ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, vint4 b)
{
	return vint4(a.m[0] | b.m[0],
	             a.m[1] | b.m[1],
	             a.m[2] | b.m[2],
	             a.m[3] | b.m[3]);
}

/**
 * @brief Overload: vector by vector bitwise and.
 */
ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, vint4 b)
{
	return vint4(a.m[0] & b.m[0],
	             a.m[1] & b.m[1],
	             a.m[2] & b.m[2],
	             a.m[3] & b.m[3]);
}

/**
 * @brief Overload: vector by vector bitwise xor.
 */
ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, vint4 b)
{
	return vint4(a.m[0] ^ b.m[0],
	             a.m[1] ^ b.m[1],
	             a.m[2] ^ b.m[2],
	             a.m[3] ^ b.m[3]);
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask4 operator==(vint4 a, vint4 b)
{
	return vmask4(a.m[0] == b.m[0],
	              a.m[1] == b.m[1],
	              a.m[2] == b.m[2],
	              a.m[3] == b.m[3]);
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask4 operator!=(vint4 a, vint4 b)
{
	return vmask4(a.m[0] != b.m[0],
	              a.m[1] != b.m[1],
	              a.m[2] != b.m[2],
	              a.m[3] != b.m[3]);
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask4 operator<(vint4 a, vint4 b)
{
	return vmask4(a.m[0] < b.m[0],
	              a.m[1] < b.m[1],
	              a.m[2] < b.m[2],
	              a.m[3] < b.m[3]);
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask4 operator>(vint4 a, vint4 b)
{
	return vmask4(a.m[0] > b.m[0],
	              a.m[1] > b.m[1],
	              a.m[2] > b.m[2],
	              a.m[3] > b.m[3]);
}

/**
 * @brief Logical shift left.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a)
{
	// Cast to unsigned to avoid shift in/out of sign bit undefined behavior
	unsigned int as0 = static_cast<unsigned int>(a.m[0]) << s;
	unsigned int as1 = static_cast<unsigned int>(a.m[1]) << s;
	unsigned int as2 = static_cast<unsigned int>(a.m[2]) << s;
	unsigned int as3 = static_cast<unsigned int>(a.m[3]) << s;

	return vint4(static_cast<int>(as0),
	             static_cast<int>(as1),
	             static_cast<int>(as2),
	             static_cast<int>(as3));
}

/**
 * @brief Logical shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 lsr(vint4 a)
{
	// Cast to unsigned to avoid shift in/out of sign bit undefined behavior
	unsigned int as0 = static_cast<unsigned int>(a.m[0]) >> s;
	unsigned int as1 = static_cast<unsigned int>(a.m[1]) >> s;
	unsigned int as2 = static_cast<unsigned int>(a.m[2]) >> s;
	unsigned int as3 = static_cast<unsigned int>(a.m[3]) >> s;

	return vint4(static_cast<int>(as0),
	             static_cast<int>(as1),
	             static_cast<int>(as2),
	             static_cast<int>(as3));
}

/**
 * @brief Arithmetic shift right.
 */
template <int s> ASTCENC_SIMD_INLINE vint4 asr(vint4 a)
{
	return vint4(a.m[0] >> s,
	             a.m[1] >> s,
	             a.m[2] >> s,
	             a.m[3] >> s);
}
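
// Illustrative note (not part of the upstream astcenc sources): the shift
// amount is a compile-time template parameter, and lsr()/asr() differ only in
// how they treat the sign bit. A minimal sketch:
//
//     vint4 v(-16, 16, 256, 1);
//     vint4 a = lsl<4>(v);  // lanes: -256, 256, 4096, 16
//     vint4 b = asr<4>(v);  // lanes: -1, 1, 16, 0 (sign-filled)
//     vint4 c = lsr<4>(v);  // lane 0 becomes 0x0FFFFFFF (zero-filled)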

/**
 * @brief Return the min vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint4 min(vint4 a, vint4 b)
{
	return vint4(a.m[0] < b.m[0] ? a.m[0] : b.m[0],
	             a.m[1] < b.m[1] ? a.m[1] : b.m[1],
	             a.m[2] < b.m[2] ? a.m[2] : b.m[2],
	             a.m[3] < b.m[3] ? a.m[3] : b.m[3]);
}

/**
 * @brief Return the max vector of two vectors.
 */
ASTCENC_SIMD_INLINE vint4 max(vint4 a, vint4 b)
{
	return vint4(a.m[0] > b.m[0] ? a.m[0] : b.m[0],
	             a.m[1] > b.m[1] ? a.m[1] : b.m[1],
	             a.m[2] > b.m[2] ? a.m[2] : b.m[2],
	             a.m[3] > b.m[3] ? a.m[3] : b.m[3]);
}

/**
 * @brief Return the horizontal minimum of a single vector.
 */
ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
{
	int b = std::min(a.m[0], a.m[1]);
	int c = std::min(a.m[2], a.m[3]);
	return vint4(std::min(b, c));
}

/**
 * @brief Return the horizontal maximum of a single vector.
 */
ASTCENC_SIMD_INLINE vint4 hmax(vint4 a)
{
	int b = std::max(a.m[0], a.m[1]);
	int c = std::max(a.m[2], a.m[3]);
	return vint4(std::max(b, c));
}

/**
 * @brief Store a vector to an aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vint4 a, int* p)
{
	p[0] = a.m[0];
	p[1] = a.m[1];
	p[2] = a.m[2];
	p[3] = a.m[3];
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vint4 a, int* p)
{
	p[0] = a.m[0];
	p[1] = a.m[1];
	p[2] = a.m[2];
	p[3] = a.m[3];
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vint4 a, uint8_t* p)
{
	std::memcpy(p, a.m, sizeof(int) * 4);
}

/**
 * @brief Store lowest N (vector width) bytes into an unaligned address.
 */
ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
{
	std::memcpy(p, a.m, sizeof(uint8_t) * 4);
}

/**
 * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector and
 *        store to an unaligned address.
 */
ASTCENC_SIMD_INLINE void pack_and_store_low_bytes(vint4 a, uint8_t* p)
{
	int b0 = a.m[0] & 0xFF;
	int b1 = a.m[1] & 0xFF;
	int b2 = a.m[2] & 0xFF;
	int b3 = a.m[3] & 0xFF;

#if !defined(ASTCENC_BIG_ENDIAN)
	int b = b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
#else
	int b = b3 | (b2 << 8) | (b1 << 16) | (b0 << 24);
#endif
	a = vint4(b, 0, 0, 0);
	store_nbytes(a, p);
}

/**
 * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vint4 select(vint4 a, vint4 b, vmask4 cond)
{
	return vint4((cond.m[0] & static_cast<int>(0x80000000)) ? b.m[0] : a.m[0],
	             (cond.m[1] & static_cast<int>(0x80000000)) ? b.m[1] : a.m[1],
	             (cond.m[2] & static_cast<int>(0x80000000)) ? b.m[2] : a.m[2],
	             (cond.m[3] & static_cast<int>(0x80000000)) ? b.m[3] : a.m[3]);
}
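
// Illustrative note (not part of the upstream astcenc sources): select() keys
// off the MSB of each mask lane, so the full 0/-1 lane masks produced by the
// comparison operators behave as expected. A minimal sketch:
//
//     vint4 a(1, 2, 3, 4);
//     vint4 b(9, 9, 9, 9);
//     vmask4 take_b = a > vint4(2);    // lanes: 0, 0, -1, -1
//     vint4 r = select(a, b, take_b);  // lanes: 1, 2, 9, 9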

// ============================================================================
// vfloat4 operators and functions
// ============================================================================

/**
 * @brief Overload: vector by vector addition.
 */
ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, vfloat4 b)
{
	return vfloat4(a.m[0] + b.m[0],
	               a.m[1] + b.m[1],
	               a.m[2] + b.m[2],
	               a.m[3] + b.m[3]);
}

/**
 * @brief Overload: vector by vector subtraction.
 */
ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, vfloat4 b)
{
	return vfloat4(a.m[0] - b.m[0],
	               a.m[1] - b.m[1],
	               a.m[2] - b.m[2],
	               a.m[3] - b.m[3]);
}

/**
 * @brief Overload: vector by vector multiplication.
 */
ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, vfloat4 b)
{
	return vfloat4(a.m[0] * b.m[0],
	               a.m[1] * b.m[1],
	               a.m[2] * b.m[2],
	               a.m[3] * b.m[3]);
}

/**
 * @brief Overload: vector by vector division.
 */
ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, vfloat4 b)
{
	return vfloat4(a.m[0] / b.m[0],
	               a.m[1] / b.m[1],
	               a.m[2] / b.m[2],
	               a.m[3] / b.m[3]);
}

/**
 * @brief Overload: vector by vector equality.
 */
ASTCENC_SIMD_INLINE vmask4 operator==(vfloat4 a, vfloat4 b)
{
	return vmask4(a.m[0] == b.m[0],
	              a.m[1] == b.m[1],
	              a.m[2] == b.m[2],
	              a.m[3] == b.m[3]);
}

/**
 * @brief Overload: vector by vector inequality.
 */
ASTCENC_SIMD_INLINE vmask4 operator!=(vfloat4 a, vfloat4 b)
{
	return vmask4(a.m[0] != b.m[0],
	              a.m[1] != b.m[1],
	              a.m[2] != b.m[2],
	              a.m[3] != b.m[3]);
}

/**
 * @brief Overload: vector by vector less than.
 */
ASTCENC_SIMD_INLINE vmask4 operator<(vfloat4 a, vfloat4 b)
{
	return vmask4(a.m[0] < b.m[0],
	              a.m[1] < b.m[1],
	              a.m[2] < b.m[2],
	              a.m[3] < b.m[3]);
}

/**
 * @brief Overload: vector by vector greater than.
 */
ASTCENC_SIMD_INLINE vmask4 operator>(vfloat4 a, vfloat4 b)
{
	return vmask4(a.m[0] > b.m[0],
	              a.m[1] > b.m[1],
	              a.m[2] > b.m[2],
	              a.m[3] > b.m[3]);
}

/**
 * @brief Overload: vector by vector less than or equal.
 */
ASTCENC_SIMD_INLINE vmask4 operator<=(vfloat4 a, vfloat4 b)
{
	return vmask4(a.m[0] <= b.m[0],
	              a.m[1] <= b.m[1],
	              a.m[2] <= b.m[2],
	              a.m[3] <= b.m[3]);
}

/**
 * @brief Overload: vector by vector greater than or equal.
 */
ASTCENC_SIMD_INLINE vmask4 operator>=(vfloat4 a, vfloat4 b)
{
	return vmask4(a.m[0] >= b.m[0],
	              a.m[1] >= b.m[1],
	              a.m[2] >= b.m[2],
	              a.m[3] >= b.m[3]);
}

/**
 * @brief Return the min vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, vfloat4 b)
{
	return vfloat4(a.m[0] < b.m[0] ? a.m[0] : b.m[0],
	               a.m[1] < b.m[1] ? a.m[1] : b.m[1],
	               a.m[2] < b.m[2] ? a.m[2] : b.m[2],
	               a.m[3] < b.m[3] ? a.m[3] : b.m[3]);
}

/**
 * @brief Return the max vector of two vectors.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, vfloat4 b)
{
	return vfloat4(a.m[0] > b.m[0] ? a.m[0] : b.m[0],
	               a.m[1] > b.m[1] ? a.m[1] : b.m[1],
	               a.m[2] > b.m[2] ? a.m[2] : b.m[2],
	               a.m[3] > b.m[3] ? a.m[3] : b.m[3]);
}
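
// Illustrative note (not part of the upstream astcenc sources): the ordered
// comparisons above are false whenever a NaN is involved, so the @c b lane
// wins, as the documentation states. A minimal sketch:
//
//     vfloat4 n(std::nanf(""));
//     vfloat4 one(1.0f);
//     vfloat4 r0 = min(n, one); // lanes: 1.0f (b returned)
//     vfloat4 r1 = min(one, n); // lanes: NaN  (b returned)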

/**
 * @brief Return the absolute value of the float vector.
 */
ASTCENC_SIMD_INLINE vfloat4 abs(vfloat4 a)
{
	return vfloat4(std::abs(a.m[0]),
	               std::abs(a.m[1]),
	               std::abs(a.m[2]),
	               std::abs(a.m[3]));
}

/**
 * @brief Return a float rounded to the nearest integer value.
 */
ASTCENC_SIMD_INLINE vfloat4 round(vfloat4 a)
{
	assert(std::fegetround() == FE_TONEAREST);
	return vfloat4(std::nearbyint(a.m[0]),
	               std::nearbyint(a.m[1]),
	               std::nearbyint(a.m[2]),
	               std::nearbyint(a.m[3]));
}

/**
 * @brief Return the horizontal minimum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat4 hmin(vfloat4 a)
{
	float tmp1 = std::min(a.m[0], a.m[1]);
	float tmp2 = std::min(a.m[2], a.m[3]);
	return vfloat4(std::min(tmp1, tmp2));
}

/**
 * @brief Return the horizontal maximum of a vector.
 */
ASTCENC_SIMD_INLINE vfloat4 hmax(vfloat4 a)
{
	float tmp1 = std::max(a.m[0], a.m[1]);
	float tmp2 = std::max(a.m[2], a.m[3]);
	return vfloat4(std::max(tmp1, tmp2));
}

/**
 * @brief Return the horizontal sum of a vector.
 */
ASTCENC_SIMD_INLINE float hadd_s(vfloat4 a)
{
	// Use halving add, gives invariance with SIMD versions
	return (a.m[0] + a.m[2]) + (a.m[1] + a.m[3]);
}

/**
 * @brief Return the sqrt of the lanes in the vector.
 */
ASTCENC_SIMD_INLINE vfloat4 sqrt(vfloat4 a)
{
	return vfloat4(std::sqrt(a.m[0]),
	               std::sqrt(a.m[1]),
	               std::sqrt(a.m[2]),
	               std::sqrt(a.m[3]));
}

/**
 * @brief Return lanes from @c b if MSB of @c cond is set, else @c a.
 */
ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
{
	return vfloat4((cond.m[0] & static_cast<int>(0x80000000)) ? b.m[0] : a.m[0],
	               (cond.m[1] & static_cast<int>(0x80000000)) ? b.m[1] : a.m[1],
	               (cond.m[2] & static_cast<int>(0x80000000)) ? b.m[2] : a.m[2],
	               (cond.m[3] & static_cast<int>(0x80000000)) ? b.m[3] : a.m[3]);
}

/**
 * @brief Load a vector of gathered results from an array.
 */
ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
{
	return vfloat4(base[indices.m[0]],
	               base[indices.m[1]],
	               base[indices.m[2]],
	               base[indices.m[3]]);
}

/**
 * @brief Load a vector of gathered results from an array using byte indices from memory.
 */
template<>
ASTCENC_SIMD_INLINE vfloat4 gatherf_byte_inds<vfloat4>(const float* base, const uint8_t* indices)
{
	return vfloat4(base[indices[0]],
	               base[indices[1]],
	               base[indices[2]],
	               base[indices[3]]);
}

/**
 * @brief Store a vector to an unaligned memory address.
 */
ASTCENC_SIMD_INLINE void store(vfloat4 a, float* ptr)
{
	ptr[0] = a.m[0];
	ptr[1] = a.m[1];
	ptr[2] = a.m[2];
	ptr[3] = a.m[3];
}

/**
 * @brief Store a vector to an aligned memory address.
 */
ASTCENC_SIMD_INLINE void storea(vfloat4 a, float* ptr)
{
	ptr[0] = a.m[0];
	ptr[1] = a.m[1];
	ptr[2] = a.m[2];
	ptr[3] = a.m[3];
}

/**
 * @brief Return an integer value for a float vector, using truncation.
 */
ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
{
	return vint4(static_cast<int>(a.m[0]),
	             static_cast<int>(a.m[1]),
	             static_cast<int>(a.m[2]),
	             static_cast<int>(a.m[3]));
}

/**
 * @brief Return an integer value for a float vector, using round-to-nearest.
 */
ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
{
	a = a + vfloat4(0.5f);
	return vint4(static_cast<int>(a.m[0]),
	             static_cast<int>(a.m[1]),
	             static_cast<int>(a.m[2]),
	             static_cast<int>(a.m[3]));
}

/**
 * @brief Return a float value for an integer vector.
 */
ASTCENC_SIMD_INLINE vfloat4 int_to_float(vint4 a)
{
	return vfloat4(static_cast<float>(a.m[0]),
	               static_cast<float>(a.m[1]),
	               static_cast<float>(a.m[2]),
	               static_cast<float>(a.m[3]));
}

/**
 * @brief Return a float16 value for a float vector, using round-to-nearest.
 */
ASTCENC_SIMD_INLINE vint4 float_to_float16(vfloat4 a)
{
	return vint4(
		float_to_sf16(a.lane<0>()),
		float_to_sf16(a.lane<1>()),
		float_to_sf16(a.lane<2>()),
		float_to_sf16(a.lane<3>()));
}

/**
 * @brief Return a float16 value for a float scalar, using round-to-nearest.
 */
static inline uint16_t float_to_float16(float a)
{
	return float_to_sf16(a);
}

/**
 * @brief Return a float value for a float16 vector.
 */
ASTCENC_SIMD_INLINE vfloat4 float16_to_float(vint4 a)
{
	return vfloat4(
		sf16_to_float(static_cast<uint16_t>(a.lane<0>())),
		sf16_to_float(static_cast<uint16_t>(a.lane<1>())),
		sf16_to_float(static_cast<uint16_t>(a.lane<2>())),
		sf16_to_float(static_cast<uint16_t>(a.lane<3>())));
}

/**
 * @brief Return a float value for a float16 scalar.
 */
ASTCENC_SIMD_INLINE float float16_to_float(uint16_t a)
{
	return sf16_to_float(a);
}

/**
 * @brief Return a float value as an integer bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the first half of that flip.
 */
ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a)
{
	vint4 r;
	std::memcpy(r.m, a.m, 4 * 4);
	return r;
}

/**
 * @brief Return an integer value as a float bit pattern (i.e. no conversion).
 *
 * It is a common trick to convert floats into integer bit patterns, perform
 * some bit hackery based on knowledge they are IEEE 754 layout, and then
 * convert them back again. This is the second half of that flip.
 */
ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 a)
{
	vfloat4 r;
	std::memcpy(r.m, a.m, 4 * 4);
	return r;
}
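
// Illustrative note (not part of the upstream astcenc sources): a typical use
// of the bit-pattern round trip described above is sign-bit manipulation with
// integer operators. A minimal sketch that clears the sign bit (i.e. abs):
//
//     vfloat4 v(-1.0f, 2.0f, -3.0f, 4.0f);
//     vint4 bits = float_as_int(v) & vint4(0x7FFFFFFF);
//     vfloat4 a = int_as_float(bits); // lanes: 1.0f, 2.0f, 3.0f, 4.0f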

/*
 * Table structure for a 16x 8-bit entry table.
 */
struct vtable4_16x8 {
	const uint8_t* data;
};

/*
 * Table structure for a 32x 8-bit entry table.
 */
struct vtable4_32x8 {
	const uint8_t* data;
};

/*
 * Table structure for a 64x 8-bit entry table.
 */
struct vtable4_64x8 {
	const uint8_t* data;
};

/**
 * @brief Prepare a vtable lookup table for a 16x 8-bit entry table.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
	vtable4_16x8& table,
	const uint8_t* data
) {
	table.data = data;
}

/**
 * @brief Prepare a vtable lookup table for a 32x 8-bit entry table.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
	vtable4_32x8& table,
	const uint8_t* data
) {
	table.data = data;
}

/**
 * @brief Prepare a vtable lookup table for a 64x 8-bit entry table.
 */
ASTCENC_SIMD_INLINE void vtable_prepare(
	vtable4_64x8& table,
	const uint8_t* data
) {
	table.data = data;
}

/**
 * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices.
 */
ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
	const vtable4_16x8& table,
	vint4 idx
) {
	return vint4(table.data[idx.lane<0>()],
	             table.data[idx.lane<1>()],
	             table.data[idx.lane<2>()],
	             table.data[idx.lane<3>()]);
}

/**
 * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices.
 */
ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
	const vtable4_32x8& table,
	vint4 idx
) {
	return vint4(table.data[idx.lane<0>()],
	             table.data[idx.lane<1>()],
	             table.data[idx.lane<2>()],
	             table.data[idx.lane<3>()]);
}

/**
 * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices.
 */
ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit(
	const vtable4_64x8& table,
	vint4 idx
) {
	return vint4(table.data[idx.lane<0>()],
	             table.data[idx.lane<1>()],
	             table.data[idx.lane<2>()],
	             table.data[idx.lane<3>()]);
}
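
// Illustrative note (not part of the upstream astcenc sources): the vtable
// types pair a prepare step with per-lane lookups; in this scalar backend the
// lookup is simply an array index per lane. A minimal sketch, assuming a
// hypothetical 16-entry table:
//
//     static const uint8_t table[16] = { /* 16 entries */ };
//     vtable4_16x8 t;
//     vtable_prepare(t, table);
//     vint4 idx(0, 3, 7, 15);
//     vint4 r = vtable_lookup_32bit(t, idx); // r.m[i] == table[idx.m[i]]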

/**
 * @brief Return a vector of interleaved RGBA data.
 *
 * Input vectors have the value stored in the bottom 8 bits of each lane,
 * with high bits set to zero.
 *
 * Output vector stores a single RGBA texel packed in each lane.
 */
ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a)
{
#if !defined(ASTCENC_BIG_ENDIAN)
	return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a);
#else
	return a + lsl<8>(b) + lsl<16>(g) + lsl<24>(r);
#endif
}

/**
 * @brief Store a single vector lane to an unaligned address.
 */
ASTCENC_SIMD_INLINE void store_lane(uint8_t* base, int data)
{
	std::memcpy(base, &data, sizeof(int));
}

/**
 * @brief Store a vector, skipping masked lanes.
 *
 * All masked lanes must be at the end of vector, after all non-masked lanes.
 * Input is a byte array of at least 4 bytes per unmasked entry.
 */
ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint4 data, vmask4 mask)
{
	if (mask.m[3])
	{
		store(data, base);
	}
	else if (mask.m[2])
	{
		store_lane(base + 0, data.lane<0>());
		store_lane(base + 4, data.lane<1>());
		store_lane(base + 8, data.lane<2>());
	}
	else if (mask.m[1])
	{
		store_lane(base + 0, data.lane<0>());
		store_lane(base + 4, data.lane<1>());
	}
	else if (mask.m[0])
	{
		store_lane(base + 0, data.lane<0>());
	}
}

#endif // #ifndef ASTC_VECMATHLIB_NONE_4_H_INCLUDED