Path: blob/master/thirdparty/astcenc/astcenc_vecmathlib.h
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2025 Arm Limited
// Copyright 2008 Jose Fonseca
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

/*
 * This module implements vector support for floats, ints, and vector lane
 * control masks. It provides access to both explicit vector width types, and
 * flexible N-wide types where N is determined at compile time.
 *
 * The design of this module encourages use of vector length agnostic code,
 * via the vint, vfloat, and vmask types. These take on the widest SIMD vector
 * width that is available at compile time. The current vector width is
 * accessible, e.g. for loop strides, via the ASTCENC_SIMD_WIDTH constant.
 *
 * Explicit scalar types are accessible via the vint1, vfloat1, and vmask1
 * types. These are provided primarily for prototyping and algorithm debugging
 * of VLA implementations.
 *
 * Explicit 4-wide types are accessible via the vint4, vfloat4, and vmask4
 * types. These are provided for use by VLA code, but are also expected to be
 * used as fixed-width types, and are backed by a reference C++ fallback for
 * use on platforms without SIMD intrinsics.
 *
 * Explicit 8-wide types are accessible via the vint8, vfloat8, and vmask8
 * types. These are provided for use by VLA code, and are not expected to be
 * used as fixed-width types in normal code. No reference C implementation is
 * provided on platforms without underlying SIMD intrinsics.
 *
 * With the current implementation ISA support is provided for:
 *
 *     * 1-wide for scalar reference
 *     * 4-wide for Armv8-A NEON
 *     * 4-wide for x86-64 SSE2
 *     * 4-wide for x86-64 SSE4.1
 *     * 8-wide for Armv8-A SVE
 *     * 8-wide for x86-64 AVX2
 */
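// A minimal illustrative sketch (not part of the library) of the VLA style
// this header enables. It assumes only names declared below: vfloat, vmask,
// loada, storea, select, and ASTCENC_SIMD_WIDTH, and that count is already a
// multiple of the vector width.
//
//     void clamp_negatives_to_zero(float* data, size_t count)
//     {
//         for (size_t i = 0; i < count; i += ASTCENC_SIMD_WIDTH)
//         {
//             vfloat value = loada(data + i);           // Aligned vector load
//             vmask is_neg = value < vfloat::zero();    // Per-lane compare
//             value = select(value, vfloat::zero(), is_neg);
//             storea(value, data + i);                  // Aligned vector store
//         }
//     }
//
// The same source compiles to NEON, SSE, SVE, AVX2, or scalar code depending
// on which of the configuration blocks below is selected.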
#ifndef ASTC_VECMATHLIB_H_INCLUDED
#define ASTC_VECMATHLIB_H_INCLUDED

#if ASTCENC_SSE != 0 || ASTCENC_AVX != 0
#include <immintrin.h>
#endif

#if ASTCENC_SVE != 0
#include <arm_sve.h>
#include <arm_neon_sve_bridge.h>
#endif

#if ASTCENC_NEON != 0
#include <arm_neon.h>
#endif

#if !defined(__clang__) && defined(_MSC_VER)
#define ASTCENC_SIMD_INLINE __forceinline
#define ASTCENC_NO_INLINE
#elif defined(__GNUC__) && !defined(__clang__)
#define ASTCENC_SIMD_INLINE __attribute__((always_inline)) inline
#define ASTCENC_NO_INLINE __attribute__ ((noinline))
#else
#define ASTCENC_SIMD_INLINE __attribute__((always_inline, nodebug)) inline
#define ASTCENC_NO_INLINE __attribute__ ((noinline))
#endif

template<typename T> T gatherf_byte_inds(const float* base, const uint8_t* indices);

#if ASTCENC_AVX >= 2
// If we have AVX2 expose 8-wide VLA.
#include "astcenc_vecmathlib_sse_4.h"
#include "astcenc_vecmathlib_common_4.h"
#include "astcenc_vecmathlib_avx2_8.h"

#define ASTCENC_SIMD_WIDTH 8

using vfloat = vfloat8;

#if defined(ASTCENC_NO_INVARIANCE)
using vfloatacc = vfloat8;
#else
using vfloatacc = vfloat4;
#endif

using vint = vint8;
using vmask = vmask8;

using vtable_16x8 = vtable8_16x8;
using vtable_32x8 = vtable8_32x8;
using vtable_64x8 = vtable8_64x8;

constexpr auto loada = vfloat8::loada;
constexpr auto load1 = vfloat8::load1;
constexpr auto vint_from_size = vint8_from_size;

#elif ASTCENC_SSE >= 20
// If we have SSE expose 4-wide VLA, and 4-wide fixed width.
#include "astcenc_vecmathlib_sse_4.h"
#include "astcenc_vecmathlib_common_4.h"

#define ASTCENC_SIMD_WIDTH 4

using vfloat = vfloat4;
using vfloatacc = vfloat4;
using vint = vint4;
using vmask = vmask4;

using vtable_16x8 = vtable4_16x8;
using vtable_32x8 = vtable4_32x8;
using vtable_64x8 = vtable4_64x8;

constexpr auto loada = vfloat4::loada;
constexpr auto load1 = vfloat4::load1;
constexpr auto vint_from_size = vint4_from_size;

#elif ASTCENC_SVE == 8
// Check the compiler is configured with fixed-length 256-bit SVE.
#if !defined(__ARM_FEATURE_SVE_BITS) || (__ARM_FEATURE_SVE_BITS != 256)
#error "__ARM_FEATURE_SVE_BITS is not set to 256 bits"
#endif

// If we have SVE configured as 8-wide, expose 8-wide VLA.
#include "astcenc_vecmathlib_neon_4.h"
#include "astcenc_vecmathlib_common_4.h"
#include "astcenc_vecmathlib_sve_8.h"

#define ASTCENC_SIMD_WIDTH 8

using vfloat = vfloat8;

#if defined(ASTCENC_NO_INVARIANCE)
using vfloatacc = vfloat8;
#else
using vfloatacc = vfloat4;
#endif

using vint = vint8;
using vmask = vmask8;

using vtable_16x8 = vtable8_16x8;
using vtable_32x8 = vtable8_32x8;
using vtable_64x8 = vtable8_64x8;

constexpr auto loada = vfloat8::loada;
constexpr auto load1 = vfloat8::load1;
constexpr auto vint_from_size = vint8_from_size;

#elif ASTCENC_NEON > 0
// If we have NEON expose 4-wide VLA.
#include "astcenc_vecmathlib_neon_4.h"
#include "astcenc_vecmathlib_common_4.h"

#define ASTCENC_SIMD_WIDTH 4

using vfloat = vfloat4;
using vfloatacc = vfloat4;
using vint = vint4;
using vmask = vmask4;

using vtable_16x8 = vtable4_16x8;
using vtable_32x8 = vtable4_32x8;
using vtable_64x8 = vtable4_64x8;

constexpr auto loada = vfloat4::loada;
constexpr auto load1 = vfloat4::load1;
constexpr auto vint_from_size = vint4_from_size;

#else
// If we have nothing expose 4-wide VLA, and 4-wide fixed width.

// Note: We no longer expose the 1-wide scalar fallback because it is not
// invariant with the 4-wide path, due to algorithms that use horizontal
// operations that accumulate a local vector sum before accumulating into
// a running sum.
//
// For 4 items adding into an accumulator using 1-wide vectors the sum is:
//
//     result = ((((sum + l0) + l1) + l2) + l3)
//
// ... whereas the accumulator for a 4-wide vector sum is:
//
//     result = sum + ((l0 + l2) + (l1 + l3))
//
// In "normal maths" this is the same, but the floating point reassociation
// differences mean that these will not produce the same result.

#include "astcenc_vecmathlib_none_4.h"
#include "astcenc_vecmathlib_common_4.h"

#define ASTCENC_SIMD_WIDTH 4

using vfloat = vfloat4;
using vfloatacc = vfloat4;
using vint = vint4;
using vmask = vmask4;

using vtable_16x8 = vtable4_16x8;
using vtable_32x8 = vtable4_32x8;
using vtable_64x8 = vtable4_64x8;

constexpr auto loada = vfloat4::loada;
constexpr auto load1 = vfloat4::load1;
constexpr auto vint_from_size = vint4_from_size;
#endif
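// The invariance concern noted above can be reproduced with plain scalar C++.
// This is an illustrative sketch, not part of the library: summing the same
// four values in the two association orders from the note may differ in the
// low bits, because IEEE 754 float addition is not associative.
//
//     float l0 = 0.1f, l1 = 0.2f, l2 = 0.3f, l3 = 0.4f;
//     float serial   = ((l0 + l1) + l2) + l3;    // 1-wide accumulation order
//     float pairwise = (l0 + l2) + (l1 + l3);    // 4-wide horizontal order
//     // serial == pairwise is NOT guaranteed.
//
// This is also why vfloatacc stays 4-wide by default on the 8-wide paths:
// keeping the accumulator width fixed keeps results invariant across ISAs,
// unless ASTCENC_NO_INVARIANCE is defined.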
/**
 * @brief Round a count down to the largest multiple of the SIMD width.
 *
 * Assumes that the vector width is a power of two.
 *
 * @param count   The unrounded value.
 *
 * @return The rounded value.
 */
ASTCENC_SIMD_INLINE size_t round_down_to_simd_multiple_vla(size_t count)
{
	return count & static_cast<size_t>(~(ASTCENC_SIMD_WIDTH - 1));
}

/**
 * @brief Round a count up to the smallest multiple of the SIMD width.
 *
 * Assumes that the vector width is a power of two.
 *
 * @param count   The unrounded value.
 *
 * @return The rounded value.
 */
ASTCENC_SIMD_INLINE size_t round_up_to_simd_multiple_vla(size_t count)
{
	size_t multiples = (count + ASTCENC_SIMD_WIDTH - 1) / ASTCENC_SIMD_WIDTH;
	return multiples * ASTCENC_SIMD_WIDTH;
}
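// Illustrative sketch (not part of the library) of the intended use of these
// helpers: iterate whole vectors up to the rounded-down boundary, then handle
// any remainder with scalar code.
//
//     size_t simd_count = round_down_to_simd_multiple_vla(count);
//     size_t i = 0;
//     for ( ; i < simd_count; i += ASTCENC_SIMD_WIDTH)
//     {
//         vfloat value = loada(data + i);
//         // ... vector processing ...
//     }
//     for ( ; i < count; i++)
//     {
//         // ... scalar processing of data[i] ...
//     }
//
// round_up_to_simd_multiple_vla() is the complement, typically used to size
// over-allocated buffers so a vector loop can safely run past count.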
/**
 * @brief Return @c a with lanes negated if the @c b lane is negative.
 */
ASTCENC_SIMD_INLINE vfloat change_sign(vfloat a, vfloat b)
{
	vint ia = float_as_int(a);
	vint ib = float_as_int(b);
	vint sign_mask(static_cast<int>(0x80000000));
	vint r = ia ^ (ib & sign_mask);
	return int_as_float(r);
}

/**
 * @brief Return fast, but approximate, vector atan(x).
 *
 * Max error of this implementation is 0.004883.
 */
ASTCENC_SIMD_INLINE vfloat atan(vfloat x)
{
	vmask c = abs(x) > vfloat(1.0f);
	vfloat z = change_sign(vfloat(astc::PI_OVER_TWO), x);
	vfloat y = select(x, vfloat(1.0f) / x, c);
	y = y / (y * y * vfloat(0.28f) + vfloat(1.0f));
	return select(y, z - y, c);
}

/**
 * @brief Return fast, but approximate, vector atan2(y, x).
 */
ASTCENC_SIMD_INLINE vfloat atan2(vfloat y, vfloat x)
{
	vfloat z = atan(abs(y / x));
	vmask xmask = x < vfloat::zero();
	return change_sign(select(z, vfloat(astc::PI) - z, xmask), y);
}

/**
 * @brief Factory that returns a unit length 4 component vfloat4.
 */
static ASTCENC_SIMD_INLINE vfloat4 unit4()
{
	return vfloat4(0.5f);
}

/**
 * @brief Factory that returns a unit length 3 component vfloat4.
 */
static ASTCENC_SIMD_INLINE vfloat4 unit3()
{
	float val = 0.577350258827209473f;
	return vfloat4(val, val, val, 0.0f);
}

/**
 * @brief Factory that returns a unit length 2 component vfloat4.
 */
static ASTCENC_SIMD_INLINE vfloat4 unit2()
{
	float val = 0.707106769084930420f;
	return vfloat4(val, val, 0.0f, 0.0f);
}

/**
 * @brief Factory that returns a 3 component vfloat4.
 */
static ASTCENC_SIMD_INLINE vfloat4 vfloat3(float a, float b, float c)
{
	return vfloat4(a, b, c, 0.0f);
}

/**
 * @brief Factory that returns a 2 component vfloat4.
 */
static ASTCENC_SIMD_INLINE vfloat4 vfloat2(float a, float b)
{
	return vfloat4(a, b, 0.0f, 0.0f);
}

/**
 * @brief Normalize a non-zero length vector to unit length.
 */
static ASTCENC_SIMD_INLINE vfloat4 normalize(vfloat4 a)
{
	vfloat4 length = dot(a, a);
	return a / sqrt(length);
}

/**
 * @brief Normalize a vector, returning @c safe if the length is zero.
 */
static ASTCENC_SIMD_INLINE vfloat4 normalize_safe(vfloat4 a, vfloat4 safe)
{
	vfloat4 length = dot(a, a);
	if (length.lane<0>() != 0.0f)
	{
		return a / sqrt(length);
	}

	return safe;
}

// Horner-form polynomial evaluation helpers: c0 + x * (c1 + x * (c2 + ...))
#define POLY0(x, c0) ( c0)
#define POLY1(x, c0, c1) ((POLY0(x, c1) * x) + c0)
#define POLY2(x, c0, c1, c2) ((POLY1(x, c1, c2) * x) + c0)
#define POLY3(x, c0, c1, c2, c3) ((POLY2(x, c1, c2, c3) * x) + c0)
#define POLY4(x, c0, c1, c2, c3, c4) ((POLY3(x, c1, c2, c3, c4) * x) + c0)
#define POLY5(x, c0, c1, c2, c3, c4, c5) ((POLY4(x, c1, c2, c3, c4, c5) * x) + c0)

/**
 * @brief Compute an approximate exp2(x) for each lane in the vector.
 *
 * Based on 5th degree minimax polynomials, ported from this blog
 * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
 */
static ASTCENC_SIMD_INLINE vfloat4 exp2(vfloat4 x)
{
	x = clamp(-126.99999f, 129.0f, x);

	vint4 ipart = float_to_int(x - 0.5f);
	vfloat4 fpart = x - int_to_float(ipart);

	// Integer contrib, using 1 << ipart
	vfloat4 iexp = int_as_float(lsl<23>(ipart + 127));

	// Fractional contrib, using polynomial fit of 2^x in range [-0.5, 0.5)
	vfloat4 fexp = POLY5(fpart,
	                     9.9999994e-1f,
	                     6.9315308e-1f,
	                     2.4015361e-1f,
	                     5.5826318e-2f,
	                     8.9893397e-3f,
	                     1.8775767e-3f);

	return iexp * fexp;
}

/**
 * @brief Compute an approximate log2(x) for each lane in the vector.
 *
 * Based on 5th degree minimax polynomials, ported from this blog
 * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
 */
static ASTCENC_SIMD_INLINE vfloat4 log2(vfloat4 x)
{
	vint4 exp(0x7F800000);
	vint4 mant(0x007FFFFF);
	vint4 one(0x3F800000);

	vint4 i = float_as_int(x);

	vfloat4 e = int_to_float(lsr<23>(i & exp) - 127);

	vfloat4 m = int_as_float((i & mant) | one);

	// Polynomial fit of log2(x)/(x - 1), for x in range [1, 2)
	vfloat4 p = POLY4(m,
	                  2.8882704548164776201f,
	                  -2.52074962577807006663f,
	                  1.48116647521213171641f,
	                  -0.465725644288844778798f,
	                  0.0596515482674574969533f);

	// Increases the polynomial degree, but ensures that log2(1) == 0
	p = p * (m - 1.0f);

	return p + e;
}

/**
 * @brief Compute an approximate pow(x, y) for each lane in the vector.
 *
 * Power function based on the exp2(log2(x) * y) transform.
 */
static ASTCENC_SIMD_INLINE vfloat4 pow(vfloat4 x, vfloat4 y)
{
	vmask4 zero_mask = y == vfloat4(0.0f);
	vfloat4 estimate = exp2(log2(x) * y);

	// Guarantee that y == 0 returns exactly 1.0f
	return select(estimate, vfloat4(1.0f), zero_mask);
}
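// Illustrative sketch (not part of the library) of the fast-math trade-off:
// exp2() and log2() are polynomial fits, and pow() composes both, so results
// are close to, but not bit-identical with, the standard library. For
// example, applying a gamma-style transfer to four values at once:
//
//     vfloat4 linear(0.25f, 0.5f, 0.75f, 1.0f);
//     vfloat4 encoded = pow(linear, vfloat4(1.0f / 2.2f));
//
// The y == 0 special case in pow() above guarantees pow(x, 0) == 1.0f, but
// other edge cases (negative x, NaN) follow the approximation, not the
// C standard library behavior.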
/**
 * @brief Count the leading zeros for each lane in @c a.
 *
 * Valid for all data values of @c a; will return a per-lane value [0, 32].
 */
static ASTCENC_SIMD_INLINE vint4 clz(vint4 a)
{
	// This function is a horrible abuse of floating point exponents to convert
	// the original integer value into a 2^N encoding we can recover easily.

	// Convert to float without risk of rounding up by keeping only top 8 bits.
	// This trick is guaranteed to keep top 8 bits and clear the 9th.
	a = (~lsr<8>(a)) & a;
	a = float_as_int(int_to_float(a));

	// Extract and unbias exponent
	a = vint4(127 + 31) - lsr<23>(a);

	// Clamp result to a valid 32-bit range
	return clamp(0, 32, a);
}

/**
 * @brief Return lanewise 2^a for each lane in @c a.
 *
 * Use of signed int means that this is only valid for values in range [0, 31].
 */
static ASTCENC_SIMD_INLINE vint4 two_to_the_n(vint4 a)
{
	// 2^30 is the largest signed number that can be represented
	assert(all(a < vint4(31)));

	// This function is a horrible abuse of floating point to use the exponent
	// and float conversion to generate a 2^N multiple.

	// Bias the exponent
	vint4 exp = a + 127;
	exp = lsl<23>(exp);

	// Reinterpret the bits as a float, and then convert to an int
	vfloat4 f = int_as_float(exp);
	return float_to_int(f);
}

/**
 * @brief Convert unorm16 [0, 65535] to float16 in range [0, 1].
 */
static ASTCENC_SIMD_INLINE vint4 unorm16_to_sf16(vint4 p)
{
	vint4 fp16_one = vint4(0x3C00);
	vint4 fp16_small = lsl<8>(p);

	vmask4 is_one = p == vint4(0xFFFF);
	vmask4 is_small = p < vint4(4);

	// Manually inline clz() on Visual Studio to avoid release build codegen bug
	// see https://github.com/ARM-software/astc-encoder/issues/259
#if !defined(__clang__) && defined(_MSC_VER)
	vint4 a = (~lsr<8>(p)) & p;
	a = float_as_int(int_to_float(a));
	a = vint4(127 + 31) - lsr<23>(a);
	vint4 lz = clamp(0, 32, a) - 16;
#else
	vint4 lz = clz(p) - 16;
#endif

	p = p * two_to_the_n(lz + 1);
	p = p & vint4(0xFFFF);

	p = lsr<6>(p);

	p = p | lsl<10>(vint4(14) - lz);

	vint4 r = select(p, fp16_one, is_one);
	r = select(r, fp16_small, is_small);
	return r;
}

/**
 * @brief Convert 16-bit LNS to float16.
 */
static ASTCENC_SIMD_INLINE vint4 lns_to_sf16(vint4 p)
{
	vint4 mc = p & 0x7FF;
	vint4 ec = lsr<11>(p);

	vint4 mc_512 = mc * 3;
	vmask4 mask_512 = mc < vint4(512);

	vint4 mc_1536 = mc * 4 - 512;
	vmask4 mask_1536 = mc < vint4(1536);

	vint4 mc_else = mc * 5 - 2048;

	vint4 mt = mc_else;
	mt = select(mt, mc_1536, mask_1536);
	mt = select(mt, mc_512, mask_512);

	vint4 res = lsl<10>(ec) | lsr<3>(mt);
	return min(res, vint4(0x7BFF));
}

/**
 * @brief Extract the mantissa and exponent of a float value.
 *
 * @param      a     The input value.
 * @param[out] exp   The output exponent.
 *
 * @return The mantissa.
 */
static ASTCENC_SIMD_INLINE vfloat4 frexp(vfloat4 a, vint4& exp)
{
	// Interpret the bits as an integer
	vint4 ai = float_as_int(a);

	// Extract and unbias the exponent
	exp = (lsr<23>(ai) & 0xFF) - 126;

	// Extract and unbias the mantissa
	vint4 manti = (ai & static_cast<int>(0x807FFFFF)) | 0x3F000000;
	return int_as_float(manti);
}
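// Worked example of the exponent trick used by clz() and two_to_the_n() above
// (illustrative only). For a lane holding a = 0x00000100 (1 << 8):
//
//     int_to_float(a)  == 256.0f, whose biased IEEE 754 exponent is 127 + 8
//     lsr<23>(...)     == 135
//     (127 + 31) - 135 == 23, the leading-zero count of 1 << 8
//
// The (~lsr<8>(a)) & a masking step only matters for values with more than
// 24 significant bits, where the float conversion could otherwise round up
// and bump the exponent.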
/**
 * @brief Convert float to 16-bit LNS.
 */
static ASTCENC_SIMD_INLINE vfloat4 float_to_lns(vfloat4 a)
{
	vint4 exp;
	vfloat4 mant = frexp(a, exp);

	// Do these early before we start messing about ...
	vmask4 mask_underflow_nan = ~(a > vfloat4(1.0f / 67108864.0f));
	vmask4 mask_infinity = a >= vfloat4(65536.0f);

	// If input is smaller than 2^-14, multiply by 2^25 and don't bias.
	vmask4 exp_lt_m13 = exp < vint4(-13);

	vfloat4 a1a = a * 33554432.0f;
	vint4 expa = vint4::zero();

	vfloat4 a1b = (mant - 0.5f) * 4096;
	vint4 expb = exp + 14;

	a = select(a1b, a1a, exp_lt_m13);
	exp = select(expb, expa, exp_lt_m13);

	vmask4 a_lt_384 = a < vfloat4(384.0f);
	vmask4 a_lt_1408 = a <= vfloat4(1408.0f);

	vfloat4 a2a = a * (4.0f / 3.0f);
	vfloat4 a2b = a + 128.0f;
	vfloat4 a2c = (a + 512.0f) * (4.0f / 5.0f);

	a = a2c;
	a = select(a, a2b, a_lt_1408);
	a = select(a, a2a, a_lt_384);

	a = a + (int_to_float(exp) * 2048.0f) + 1.0f;

	a = select(a, vfloat4(65535.0f), mask_infinity);
	a = select(a, vfloat4::zero(), mask_underflow_nan);

	return a;
}

namespace astc
{

static ASTCENC_SIMD_INLINE float pow(float x, float y)
{
	return pow(vfloat4(x), vfloat4(y)).lane<0>();
}

}

#endif // #ifndef ASTC_VECMATHLIB_H_INCLUDED