Path: blob/master/thirdparty/astcenc/astcenc_vecmathlib_common_4.h
9896 views
// SPDX-License-Identifier: Apache-2.01// ----------------------------------------------------------------------------2// Copyright 2020-2025 Arm Limited3//4// Licensed under the Apache License, Version 2.0 (the "License"); you may not5// use this file except in compliance with the License. You may obtain a copy6// of the License at:7//8// http://www.apache.org/licenses/LICENSE-2.09//10// Unless required by applicable law or agreed to in writing, software11// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT12// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the13// License for the specific language governing permissions and limitations14// under the License.15// ----------------------------------------------------------------------------1617/**18* @brief Generic 4x32-bit vector functions.19*20* This module implements generic 4-wide vector functions that are valid for21* all instruction sets, typically implemented using lower level 4-wide22* operations that are ISA-specific.23*/2425#ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED26#define ASTC_VECMATHLIB_COMMON_4_H_INCLUDED2728#ifndef ASTCENC_SIMD_INLINE29#error "Include astcenc_vecmathlib.h, do not include directly"30#endif3132#include <cstdio>33#include <limits>3435// ============================================================================36// vint4 operators and functions37// ============================================================================3839/**40* @brief Overload: vector by scalar addition.41*/42ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, int b)43{44return a + vint4(b);45}4647/**48* @brief Overload: vector by vector incremental addition.49*/50ASTCENC_SIMD_INLINE vint4& operator+=(vint4& a, const vint4& b)51{52a = a + b;53return a;54}5556/**57* @brief Overload: vector by scalar subtraction.58*/59ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, int b)60{61return a - vint4(b);62}6364/**65* @brief Overload: vector by scalar multiplication.66*/67ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, int b)68{69return a * vint4(b);70}7172/**73* @brief Overload: vector by scalar bitwise or.74*/75ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, int b)76{77return a | vint4(b);78}7980/**81* @brief Overload: vector by scalar bitwise and.82*/83ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, int b)84{85return a & vint4(b);86}8788/**89* @brief Overload: vector by scalar bitwise xor.90*/91ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, int b)92{93return a ^ vint4(b);94}9596/**97* @brief Return the clamped value between min and max.98*/99ASTCENC_SIMD_INLINE vint4 clamp(int minv, int maxv, vint4 a)100{101return min(max(a, vint4(minv)), vint4(maxv));102}103104/**105* @brief Return the horizontal sum of RGB vector lanes as a scalar.106*/107ASTCENC_SIMD_INLINE int hadd_rgb_s(vint4 a)108{109return a.lane<0>() + a.lane<1>() + a.lane<2>();110}111112/**113* @brief Return the horizontal minimum of a vector.114*/115ASTCENC_SIMD_INLINE int hmin_s(vint4 a)116{117return hmin(a).lane<0>();118}119120/**121* @brief Generate a vint4 from a size_t.122*/123ASTCENC_SIMD_INLINE vint4 vint4_from_size(size_t a)124{125assert(a <= std::numeric_limits<int>::max());126return vint4(static_cast<int>(a));127}128129/**130* @brief Return the horizontal maximum of a vector.131*/132ASTCENC_SIMD_INLINE int hmax_s(vint4 a)133{134return hmax(a).lane<0>();135}136137// ============================================================================138// vfloat4 operators and functions139// ============================================================================140141/**142* @brief Overload: vector by vector incremental addition.143*/144ASTCENC_SIMD_INLINE vfloat4& operator+=(vfloat4& a, const vfloat4& b)145{146a = a + b;147return a;148}149150/**151* @brief Overload: vector by scalar addition.152*/153ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, float b)154{155return a + vfloat4(b);156}157158/**159* @brief Overload: vector by scalar subtraction.160*/161ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, float b)162{163return a - vfloat4(b);164}165166/**167* @brief Overload: vector by scalar multiplication.168*/169ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, float b)170{171return a * vfloat4(b);172}173174/**175* @brief Overload: scalar by vector multiplication.176*/177ASTCENC_SIMD_INLINE vfloat4 operator*(float a, vfloat4 b)178{179return vfloat4(a) * b;180}181182/**183* @brief Overload: vector by scalar division.184*/185ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, float b)186{187return a / vfloat4(b);188}189190/**191* @brief Overload: scalar by vector division.192*/193ASTCENC_SIMD_INLINE vfloat4 operator/(float a, vfloat4 b)194{195return vfloat4(a) / b;196}197198/**199* @brief Return the min vector of a vector and a scalar.200*201* If either lane value is NaN, @c b will be returned for that lane.202*/203ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, float b)204{205return min(a, vfloat4(b));206}207208/**209* @brief Return the max vector of a vector and a scalar.210*211* If either lane value is NaN, @c b will be returned for that lane.212*/213ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, float b)214{215return max(a, vfloat4(b));216}217218/**219* @brief Return the clamped value between min and max.220*221* It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN222* then @c min will be returned for that lane.223*/224ASTCENC_SIMD_INLINE vfloat4 clamp(float minv, float maxv, vfloat4 a)225{226// Do not reorder - second operand will return if either is NaN227return min(max(a, minv), maxv);228}229230/**231* @brief Return the clamped value between 0.0f and 1.0f.232*233* If @c a is NaN then zero will be returned for that lane.234*/235ASTCENC_SIMD_INLINE vfloat4 clampzo(vfloat4 a)236{237// Do not reorder - second operand will return if either is NaN238return min(max(a, vfloat4::zero()), 1.0f);239}240241/**242* @brief Return the horizontal minimum of a vector.243*/244ASTCENC_SIMD_INLINE float hmin_s(vfloat4 a)245{246return hmin(a).lane<0>();247}248249/**250* @brief Return the horizontal min of RGB vector lanes as a scalar.251*/252ASTCENC_SIMD_INLINE float hmin_rgb_s(vfloat4 a)253{254a.set_lane<3>(a.lane<0>());255return hmin_s(a);256}257258/**259* @brief Return the horizontal maximum of a vector.260*/261ASTCENC_SIMD_INLINE float hmax_s(vfloat4 a)262{263return hmax(a).lane<0>();264}265266/**267* @brief Accumulate lane-wise sums for a vector.268*/269ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a)270{271accum = accum + a;272}273274/**275* @brief Accumulate lane-wise sums for a masked vector.276*/277ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a, vmask4 m)278{279a = select(vfloat4::zero(), a, m);280haccumulate(accum, a);281}282283/**284* @brief Return the horizontal sum of RGB vector lanes as a scalar.285*/286ASTCENC_SIMD_INLINE float hadd_rgb_s(vfloat4 a)287{288return a.lane<0>() + a.lane<1>() + a.lane<2>();289}290291#if !defined(ASTCENC_USE_NATIVE_DOT_PRODUCT)292293/**294* @brief Return the dot product for the full 4 lanes, returning scalar.295*/296ASTCENC_SIMD_INLINE float dot_s(vfloat4 a, vfloat4 b)297{298vfloat4 m = a * b;299return hadd_s(m);300}301302/**303* @brief Return the dot product for the full 4 lanes, returning vector.304*/305ASTCENC_SIMD_INLINE vfloat4 dot(vfloat4 a, vfloat4 b)306{307vfloat4 m = a * b;308return vfloat4(hadd_s(m));309}310311/**312* @brief Return the dot product for the bottom 3 lanes, returning scalar.313*/314ASTCENC_SIMD_INLINE float dot3_s(vfloat4 a, vfloat4 b)315{316vfloat4 m = a * b;317return hadd_rgb_s(m);318}319320/**321* @brief Return the dot product for the bottom 3 lanes, returning vector.322*/323ASTCENC_SIMD_INLINE vfloat4 dot3(vfloat4 a, vfloat4 b)324{325vfloat4 m = a * b;326float d3 = hadd_rgb_s(m);327return vfloat4(d3, d3, d3, 0.0f);328}329330#endif331332#if !defined(ASTCENC_USE_NATIVE_POPCOUNT)333334/**335* @brief Population bit count.336*337* @param v The value to population count.338*339* @return The number of 1 bits.340*/341static inline int popcount(uint64_t v)342{343uint64_t mask1 = 0x5555555555555555ULL;344uint64_t mask2 = 0x3333333333333333ULL;345uint64_t mask3 = 0x0F0F0F0F0F0F0F0FULL;346v -= (v >> 1) & mask1;347v = (v & mask2) + ((v >> 2) & mask2);348v += v >> 4;349v &= mask3;350v *= 0x0101010101010101ULL;351v >>= 56;352return static_cast<int>(v);353}354355#endif356357/**358* @brief Apply signed bit transfer.359*360* @param input0 The first encoded endpoint.361* @param input1 The second encoded endpoint.362*/363static ASTCENC_SIMD_INLINE void bit_transfer_signed(364vint4& input0,365vint4& input1366) {367input1 = lsr<1>(input1) | (input0 & 0x80);368input0 = lsr<1>(input0) & 0x3F;369370vmask4 mask = (input0 & 0x20) != vint4::zero();371input0 = select(input0, input0 - 0x40, mask);372}373374/**375* @brief Debug function to print a vector of ints.376*/377ASTCENC_SIMD_INLINE void print(vint4 a)378{379ASTCENC_ALIGNAS int v[4];380storea(a, v);381printf("v4_i32:\n %8d %8d %8d %8d\n",382v[0], v[1], v[2], v[3]);383}384385/**386* @brief Debug function to print a vector of ints.387*/388ASTCENC_SIMD_INLINE void printx(vint4 a)389{390ASTCENC_ALIGNAS int v[4];391storea(a, v);392393unsigned int uv[4];394std::memcpy(uv, v, sizeof(int) * 4);395396printf("v4_i32:\n %08x %08x %08x %08x\n",397uv[0], uv[1], uv[2], uv[3]);398}399400/**401* @brief Debug function to print a vector of floats.402*/403ASTCENC_SIMD_INLINE void print(vfloat4 a)404{405ASTCENC_ALIGNAS float v[4];406storea(a, v);407printf("v4_f32:\n %0.4f %0.4f %0.4f %0.4f\n",408static_cast<double>(v[0]), static_cast<double>(v[1]),409static_cast<double>(v[2]), static_cast<double>(v[3]));410}411412/**413* @brief Debug function to print a vector of masks.414*/415ASTCENC_SIMD_INLINE void print(vmask4 a)416{417print(select(vint4(0), vint4(1), a));418}419420#endif // #ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED421422423