CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/Core/MIPS/MIPSVFPUFallbacks.cpp
Views: 1401
#include <cmath>12#include "Common/BitScan.h"34#include "Core/MIPS/MIPSVFPUFallbacks.h"5#include "Core/MIPS/MIPSVFPUUtils.h"67// MIPSVFPUUtils now has the high precision instructions implemented by fp648// in https://github.com/hrydgard/ppsspp/pull/16984 .9//10// These are our older approximations that are quite good but has flaws,11// but we need them to fall back to if the table files are missing.12//13// Note that currently, some of the new functions are not integrated in the JIT14// and are thus not normally used anyway.1516// First the "trivial" fallbacks where we haven't done any accuracy work previously.1718float vfpu_asin_fallback(float angle) {19return (float)(asinf(angle) / M_PI_2);20}2122float vfpu_rcp_fallback(float x) {23return 1.0f / x;24}2526float vfpu_log2_fallback(float x) {27return log2f(x);28}2930float vfpu_exp2_fallback(float x) {31return exp2f(x);32}3334// Flushes the angle to 0 if exponent smaller than this in vfpu_sin/vfpu_cos/vfpu_sincos.35// Was measured to be around 0x68, but GTA on Mac is somehow super sensitive36// to the shape of the sine curve which seem to be very slightly different.37//38// So setting a lower value.39#define PRECISION_EXP_THRESHOLD 0x654041union float2int {42uint32_t i;43float f;44};4546float vfpu_sqrt_fallback(float a) {47float2int val;48val.f = a;4950if ((val.i & 0xff800000) == 0x7f800000) {51if ((val.i & 0x007fffff) != 0) {52val.i = 0x7f800001;53}54return val.f;55}56if ((val.i & 0x7f800000) == 0) {57// Kill any sign.58val.i = 0;59return val.f;60}61if (val.i & 0x80000000) {62val.i = 0x7f800001;63return val.f;64}6566int k = get_exp(val.i);67uint32_t sp = get_mant(val.i);68int less_bits = k & 1;69k >>= 1;7071uint32_t z = 0x00C00000 >> less_bits;72int64_t halfsp = sp >> 1;73halfsp <<= 23 - less_bits;74for (int i = 0; i < 6; ++i) {75z = (z >> 1) + (uint32_t)(halfsp / z);76}7778val.i = ((k + 127) << 23) | ((z << less_bits) & 0x007FFFFF);79// The lower two bits never end up set on the PSP, it seems like.80val.i &= 0xFFFFFFFC;8182return val.f;83}8485static inline uint32_t mant_mul(uint32_t a, uint32_t b) {86uint64_t m = (uint64_t)a * (uint64_t)b;87if (m & 0x007FFFFF) {88m += 0x01437000;89}90return (uint32_t)(m >> 23);91}9293float vfpu_rsqrt_fallback(float a) {94float2int val;95val.f = a;9697if (val.i == 0x7f800000) {98return 0.0f;99}100if ((val.i & 0x7fffffff) > 0x7f800000) {101val.i = (val.i & 0x80000000) | 0x7f800001;102return val.f;103}104if ((val.i & 0x7f800000) == 0) {105val.i = (val.i & 0x80000000) | 0x7f800000;106return val.f;107}108if (val.i & 0x80000000) {109val.i = 0xff800001;110return val.f;111}112113int k = get_exp(val.i);114uint32_t sp = get_mant(val.i);115int less_bits = k & 1;116k = -(k >> 1);117118uint32_t z = 0x00800000 >> less_bits;119uint32_t halfsp = sp >> (1 + less_bits);120for (int i = 0; i < 6; ++i) {121uint32_t zsq = mant_mul(z, z);122uint32_t correction = 0x00C00000 - mant_mul(halfsp, zsq);123z = mant_mul(z, correction);124}125126int8_t shift = (int8_t)clz32_nonzero(z) - 8 + less_bits;127if (shift < 1) {128z >>= -shift;129k += -shift;130} else if (shift > 0) {131z <<= shift;132k -= shift;133}134135z >>= less_bits;136137val.i = ((k + 127) << 23) | (z & 0x007FFFFF);138val.i &= 0xFFFFFFFC;139140return val.f;141}142143float vfpu_sin_fallback(float a) {144float2int val;145val.f = a;146147int32_t k = get_uexp(val.i);148if (k == 255) {149val.i = (val.i & 0xFF800001) | 1;150return val.f;151}152153if (k < PRECISION_EXP_THRESHOLD) {154val.i &= 0x80000000;155return val.f;156}157158// Okay, now modulus by 4 to begin with (identical wave every 4.)159int32_t mantissa = get_mant(val.i);160if (k > 0x80) {161const uint8_t over = k & 0x1F;162mantissa = (mantissa << over) & 0x00FFFFFF;163k = 0x80;164}165// This subtracts off the 2. If we do, flip sign to inverse the wave.166if (k == 0x80 && mantissa >= (1 << 23)) {167val.i ^= 0x80000000;168mantissa -= 1 << 23;169}170171int8_t norm_shift = mantissa == 0 ? 32 : (int8_t)clz32_nonzero(mantissa) - 8;172mantissa <<= norm_shift;173k -= norm_shift;174175if (k <= 0 || mantissa == 0) {176val.i &= 0x80000000;177return val.f;178}179180// This is the value with modulus applied.181val.i = (val.i & 0x80000000) | (k << 23) | (mantissa & ~(1 << 23));182val.f = (float)sin((double)val.f * M_PI_2);183val.i &= 0xFFFFFFFC;184return val.f;185}186187float vfpu_cos_fallback(float a) {188float2int val;189val.f = a;190bool negate = false;191192int32_t k = get_uexp(val.i);193if (k == 255) {194// Note: unlike sin, cos always returns +NAN.195val.i = (val.i & 0x7F800001) | 1;196return val.f;197}198199if (k < PRECISION_EXP_THRESHOLD)200return 1.0f;201202// Okay, now modulus by 4 to begin with (identical wave every 4.)203int32_t mantissa = get_mant(val.i);204if (k > 0x80) {205const uint8_t over = k & 0x1F;206mantissa = (mantissa << over) & 0x00FFFFFF;207k = 0x80;208}209// This subtracts off the 2. If we do, negate the result value.210if (k == 0x80 && mantissa >= (1 << 23)) {211mantissa -= 1 << 23;212negate = true;213}214215int8_t norm_shift = mantissa == 0 ? 32 : (int8_t)clz32_nonzero(mantissa) - 8;216mantissa <<= norm_shift;217k -= norm_shift;218219if (k <= 0 || mantissa == 0)220return negate ? -1.0f : 1.0f;221222// This is the value with modulus applied.223val.i = (val.i & 0x80000000) | (k << 23) | (mantissa & ~(1 << 23));224if (val.f == 1.0f || val.f == -1.0f) {225return negate ? 0.0f : -0.0f;226}227val.f = (float)cos((double)val.f * M_PI_2);228val.i &= 0xFFFFFFFC;229return negate ? -val.f : val.f;230}231232void vfpu_sincos_fallback(float a, float &s, float &c) {233float2int val;234val.f = a;235// For sin, negate the input, for cos negate the output.236bool negate = false;237238int32_t k = get_uexp(val.i);239if (k == 255) {240val.i = (val.i & 0xFF800001) | 1;241s = val.f;242val.i &= 0x7F800001;243c = val.f;244return;245}246247if (k < PRECISION_EXP_THRESHOLD) {248val.i &= 0x80000000;249s = val.f;250c = 1.0f;251return;252}253254// Okay, now modulus by 4 to begin with (identical wave every 4.)255int32_t mantissa = get_mant(val.i);256if (k > 0x80) {257const uint8_t over = k & 0x1F;258mantissa = (mantissa << over) & 0x00FFFFFF;259k = 0x80;260}261// This subtracts off the 2. If we do, flip signs.262if (k == 0x80 && mantissa >= (1 << 23)) {263mantissa -= 1 << 23;264negate = true;265}266267int8_t norm_shift = mantissa == 0 ? 32 : (int8_t)clz32_nonzero(mantissa) - 8;268mantissa <<= norm_shift;269k -= norm_shift;270271if (k <= 0 || mantissa == 0) {272val.i &= 0x80000000;273if (negate)274val.i ^= 0x80000000;275s = val.f;276c = negate ? -1.0f : 1.0f;277return;278}279280// This is the value with modulus applied.281val.i = (val.i & 0x80000000) | (k << 23) | (mantissa & ~(1 << 23));282float2int i_sine, i_cosine;283if (val.f == 1.0f) {284i_sine.f = negate ? -1.0f : 1.0f;285i_cosine.f = negate ? 0.0f : -0.0f;286} else if (val.f == -1.0f) {287i_sine.f = negate ? 1.0f : -1.0f;288i_cosine.f = negate ? 0.0f : -0.0f;289} else if (negate) {290i_sine.f = (float)sin((double)-val.f * M_PI_2);291i_cosine.f = -(float)cos((double)val.f * M_PI_2);292} else {293double angle = (double)val.f * M_PI_2;294#if defined(__linux__)295double d_sine;296double d_cosine;297sincos(angle, &d_sine, &d_cosine);298i_sine.f = (float)d_sine;299i_cosine.f = (float)d_cosine;300#else301i_sine.f = (float)sin(angle);302i_cosine.f = (float)cos(angle);303#endif304}305306i_sine.i &= 0xFFFFFFFC;307i_cosine.i &= 0xFFFFFFFC;308s = i_sine.f;309c = i_cosine.f;310return;311}312313314