Path: blob/master/thirdparty/astcenc/astcenc_mathlib_softfloat.cpp
9896 views
// SPDX-License-Identifier: Apache-2.01// ----------------------------------------------------------------------------2// Copyright 2011-2021 Arm Limited3//4// Licensed under the Apache License, Version 2.0 (the "License"); you may not5// use this file except in compliance with the License. You may obtain a copy6// of the License at:7//8// http://www.apache.org/licenses/LICENSE-2.09//10// Unless required by applicable law or agreed to in writing, software11// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT12// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the13// License for the specific language governing permissions and limitations14// under the License.15// ----------------------------------------------------------------------------1617/**18* @brief Soft-float library for IEEE-754.19*/20#if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0)2122#include "astcenc_mathlib.h"2324/* sized soft-float types. These are mapped to the sized integer25types of C99, instead of C's floating-point types; this is because26the library needs to maintain exact, bit-level control on all27operations on these data types. */28typedef uint16_t sf16;29typedef uint32_t sf32;3031/******************************************32helper functions and their lookup tables33******************************************/34/* count leading zeros functions. Only used when the input is nonzero. */3536#if defined(__GNUC__) && (defined(__i386) || defined(__amd64))37#elif defined(__arm__) && defined(__ARMCC_VERSION)38#elif defined(__arm__) && defined(__GNUC__)39#else40/* table used for the slow default versions. */41static const uint8_t clz_table[256] =42{438, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,443, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,452, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,462, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,471, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,481, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,491, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,501, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,510, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,520, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,530, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,540, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,550, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,560, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,570, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,580, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 059};60#endif6162/*6332-bit count-leading-zeros function: use the Assembly instruction whenever possible. */64static uint32_t clz32(uint32_t inp)65{66#if defined(__GNUC__) && (defined(__i386) || defined(__amd64))67uint32_t bsr;68__asm__("bsrl %1, %0": "=r"(bsr):"r"(inp | 1));69return 31 - bsr;70#else71#if defined(__arm__) && defined(__ARMCC_VERSION)72return __clz(inp); /* armcc builtin */73#else74#if defined(__arm__) && defined(__GNUC__)75uint32_t lz;76__asm__("clz %0, %1": "=r"(lz):"r"(inp));77return lz;78#else79/* slow default version */80uint32_t summa = 24;81if (inp >= UINT32_C(0x10000))82{83inp >>= 16;84summa -= 16;85}86if (inp >= UINT32_C(0x100))87{88inp >>= 8;89summa -= 8;90}91return summa + clz_table[inp];92#endif93#endif94#endif95}9697/* the five rounding modes that IEEE-754r defines */98typedef enum99{100SF_UP = 0, /* round towards positive infinity */101SF_DOWN = 1, /* round towards negative infinity */102SF_TOZERO = 2, /* round towards zero */103SF_NEARESTEVEN = 3, /* round toward nearest value; if mid-between, round to even value */104SF_NEARESTAWAY = 4 /* round toward nearest value; if mid-between, round away from zero */105} roundmode;106107108static uint32_t rtne_shift32(uint32_t inp, uint32_t shamt)109{110uint32_t vl1 = UINT32_C(1) << shamt;111uint32_t inp2 = inp + (vl1 >> 1); /* added 0.5 ULP */112uint32_t msk = (inp | UINT32_C(1)) & vl1; /* nonzero if odd. '| 1' forces it to 1 if the shamt is 0. */113msk--; /* negative if even, nonnegative if odd. */114inp2 -= (msk >> 31); /* subtract epsilon before shift if even. */115inp2 >>= shamt;116return inp2;117}118119static uint32_t rtna_shift32(uint32_t inp, uint32_t shamt)120{121uint32_t vl1 = (UINT32_C(1) << shamt) >> 1;122inp += vl1;123inp >>= shamt;124return inp;125}126127static uint32_t rtup_shift32(uint32_t inp, uint32_t shamt)128{129uint32_t vl1 = UINT32_C(1) << shamt;130inp += vl1;131inp--;132inp >>= shamt;133return inp;134}135136/* convert from FP16 to FP32. */137static sf32 sf16_to_sf32(sf16 inp)138{139uint32_t inpx = inp;140141/*142This table contains, for every FP16 sign/exponent value combination,143the difference between the input FP16 value and the value obtained144by shifting the correct FP32 result right by 13 bits.145This table allows us to handle every case except denormals and NaN146with just 1 table lookup, 2 shifts and 1 add.147*/148149#define WITH_MSB(a) (UINT32_C(a) | (1u << 31))150static const uint32_t tbl[64] =151{152WITH_MSB(0x00000), 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000,1530x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000,1540x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000,1550x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, WITH_MSB(0x38000),156WITH_MSB(0x38000), 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000,1570x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000,1580x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000,1590x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, WITH_MSB(0x70000)160};161162uint32_t res = tbl[inpx >> 10];163res += inpx;164165/* Normal cases: MSB of 'res' not set. */166if ((res & WITH_MSB(0)) == 0)167{168return res << 13;169}170171/* Infinity and Zero: 10 LSB of 'res' not set. */172if ((res & 0x3FF) == 0)173{174return res << 13;175}176177/* NaN: the exponent field of 'inp' is non-zero. */178if ((inpx & 0x7C00) != 0)179{180/* All NaNs are quietened. */181return (res << 13) | 0x400000;182}183184/* Denormal cases */185uint32_t sign = (inpx & 0x8000) << 16;186uint32_t mskval = inpx & 0x7FFF;187uint32_t leadingzeroes = clz32(mskval);188mskval <<= leadingzeroes;189return (mskval >> 8) + ((0x85 - leadingzeroes) << 23) + sign;190}191192/* Conversion routine that converts from FP32 to FP16. It supports denormals and all rounding modes. If a NaN is given as input, it is quietened. */193static sf16 sf32_to_sf16(sf32 inp, roundmode rmode)194{195/* for each possible sign/exponent combination, store a case index. This gives a 512-byte table */196static const uint8_t tab[512] {1970, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,19810, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,19910, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,20010, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,20110, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,20210, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,20310, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,20420, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,20530, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 40,20640, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,20740, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,20840, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,20940, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,21040, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,21140, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,21240, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 50,2132145, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,21515, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,21615, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,21715, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,21815, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,21915, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,22015, 15, 15, 15, 15, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,22125, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35,22235, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 45,22345, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,22445, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,22545, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,22645, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,22745, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,22845, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,22945, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 55,230};231232/* many of the cases below use a case-dependent magic constant. So we look up a magic constant before actually performing the switch. This table allows us to group cases, thereby minimizing code233size. */234static const uint32_t tabx[60] {235UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x80000000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),236UINT32_C(1), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x8001), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),237UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),238UINT32_C(0xC8001FFF), UINT32_C(0xC8000000), UINT32_C(0xC8000000), UINT32_C(0xC8000FFF), UINT32_C(0xC8001000),239UINT32_C(0x58000000), UINT32_C(0x38001FFF), UINT32_C(0x58000000), UINT32_C(0x58000FFF), UINT32_C(0x58001000),240UINT32_C(0x7C00), UINT32_C(0x7BFF), UINT32_C(0x7BFF), UINT32_C(0x7C00), UINT32_C(0x7C00),241UINT32_C(0xFBFF), UINT32_C(0xFC00), UINT32_C(0xFBFF), UINT32_C(0xFC00), UINT32_C(0xFC00),242UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000),243UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000)244};245246uint32_t p;247uint32_t idx = rmode + tab[inp >> 23];248uint32_t vlx = tabx[idx];249switch (idx)250{251/*252Positive number which may be Infinity or NaN.253We need to check whether it is NaN; if it is, quieten it by setting the top bit of the mantissa.254(If we don't do this quieting, then a NaN that is distinguished only by having255its low-order bits set, would be turned into an INF. */256case 50:257case 51:258case 52:259case 53:260case 54:261case 55:262case 56:263case 57:264case 58:265case 59:266/*267the input value is 0x7F800000 or 0xFF800000 if it is INF.268By subtracting 1, we get 7F7FFFFF or FF7FFFFF, that is, bit 23 becomes zero.269For NaNs, however, this operation will keep bit 23 with the value 1.270We can then extract bit 23, and logical-OR bit 9 of the result with this271bit in order to quieten the NaN (a Quiet NaN is a NaN where the top bit272of the mantissa is set.)273*/274p = (inp - 1) & UINT32_C(0x800000); /* zero if INF, nonzero if NaN. */275return static_cast<sf16>(((inp + vlx) >> 13) | (p >> 14));276/*277positive, exponent = 0, round-mode == UP; need to check whether number actually is 0.278If it is, then return 0, else return 1 (the smallest representable nonzero number)279*/280case 0:281/*282-inp will set the MSB if the input number is nonzero.283Thus (-inp) >> 31 will turn into 0 if the input number is 0 and 1 otherwise.284*/285return static_cast<sf16>(static_cast<uint32_t>((-static_cast<int32_t>(inp))) >> 31);286287/*288negative, exponent = , round-mode == DOWN, need to check whether number is289actually 0. If it is, return 0x8000 ( float -0.0 )290Else return the smallest negative number ( 0x8001 ) */291case 6:292/*293in this case 'vlx' is 0x80000000. By subtracting the input value from it,294we obtain a value that is 0 if the input value is in fact zero and has295the MSB set if it isn't. We then right-shift the value by 31 places to296get a value that is 0 if the input is -0.0 and 1 otherwise.297*/298return static_cast<sf16>(((vlx - inp) >> 31) + UINT32_C(0x8000));299300/*301for all other cases involving underflow/overflow, we don't need to302do actual tests; we just return 'vlx'.303*/304case 1:305case 2:306case 3:307case 4:308case 5:309case 7:310case 8:311case 9:312case 10:313case 11:314case 12:315case 13:316case 14:317case 15:318case 16:319case 17:320case 18:321case 19:322case 40:323case 41:324case 42:325case 43:326case 44:327case 45:328case 46:329case 47:330case 48:331case 49:332return static_cast<sf16>(vlx);333334/*335for normal numbers, 'vlx' is the difference between the FP32 value of a number and the336FP16 representation of the same number left-shifted by 13 places. In addition, a rounding constant is337baked into 'vlx': for rounding-away-from zero, the constant is 2^13 - 1, causing roundoff away338from zero. for round-to-nearest away, the constant is 2^12, causing roundoff away from zero.339for round-to-nearest-even, the constant is 2^12 - 1. This causes correct round-to-nearest-even340except for odd input numbers. For odd input numbers, we need to add 1 to the constant. */341342/* normal number, all rounding modes except round-to-nearest-even: */343case 30:344case 31:345case 32:346case 34:347case 35:348case 36:349case 37:350case 39:351return static_cast<sf16>((inp + vlx) >> 13);352353/* normal number, round-to-nearest-even. */354case 33:355case 38:356p = inp + vlx;357p += (inp >> 13) & 1;358return static_cast<sf16>(p >> 13);359360/*361the various denormal cases. These are not expected to be common, so their performance is a bit362less important. For each of these cases, we need to extract an exponent and a mantissa363(including the implicit '1'!), and then right-shift the mantissa by a shift-amount that364depends on the exponent. The shift must apply the correct rounding mode. 'vlx' is used to supply the365sign of the resulting denormal number.366*/367case 21:368case 22:369case 25:370case 27:371/* denormal, round towards zero. */372p = 126 - ((inp >> 23) & 0xFF);373return static_cast<sf16>((((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000)) >> p) | vlx);374case 20:375case 26:376/* denormal, round away from zero. */377p = 126 - ((inp >> 23) & 0xFF);378return static_cast<sf16>(rtup_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx);379case 24:380case 29:381/* denormal, round to nearest-away */382p = 126 - ((inp >> 23) & 0xFF);383return static_cast<sf16>(rtna_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx);384case 23:385case 28:386/* denormal, round to nearest-even. */387p = 126 - ((inp >> 23) & 0xFF);388return static_cast<sf16>(rtne_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx);389}390391return 0;392}393394/* convert from soft-float to native-float */395float sf16_to_float(uint16_t p)396{397if32 i;398i.u = sf16_to_sf32(p);399return i.f;400}401402/* convert from native-float to soft-float */403uint16_t float_to_sf16(float p)404{405if32 i;406i.f = p;407return sf32_to_sf16(i.u, SF_NEARESTEVEN);408}409410#endif411412413