CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/Core/MIPS/MIPSVFPUUtils.cpp
Views: 1401
// Copyright (c) 2012- PPSSPP Project.12// This program is free software: you can redistribute it and/or modify3// it under the terms of the GNU General Public License as published by4// the Free Software Foundation, version 2.0 or later versions.56// This program is distributed in the hope that it will be useful,7// but WITHOUT ANY WARRANTY; without even the implied warranty of8// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the9// GNU General Public License 2.0 for more details.1011// A copy of the GPL 2.0 should have been included with the program.12// If not, see http://www.gnu.org/licenses/1314// Official git repository and contact information can be found at15// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.1617#include <cstdint>18#include <limits>19#include <cstdio>20#include <cstring>2122#include "Common/BitScan.h"23#include "Common/CommonFuncs.h"24#include "Common/File/VFS/VFS.h"25#include "Common/StringUtils.h"26#include "Core/Reporting.h"27#include "Core/MIPS/MIPS.h"28#include "Core/MIPS/MIPSVFPUUtils.h"29#include "Core/MIPS/MIPSVFPUFallbacks.h"3031#ifdef _MSC_VER32#pragma warning(disable: 4146)33#endif3435union float2int {36uint32_t i;37float f;38};3940void GetVectorRegs(u8 regs[4], VectorSize N, int vectorReg) {41int mtx = (vectorReg >> 2) & 7;42int col = vectorReg & 3;43int row = 0;44int length = 0;45int transpose = (vectorReg>>5) & 1;4647switch (N) {48case V_Single: transpose = 0; row=(vectorReg>>5)&3; length = 1; break;49case V_Pair: row=(vectorReg>>5)&2; length = 2; break;50case V_Triple: row=(vectorReg>>6)&1; length = 3; break;51case V_Quad: row=(vectorReg>>5)&2; length = 4; break;52default: _assert_msg_(false, "%s: Bad vector size", __FUNCTION__);53}5455for (int i = 0; i < length; i++) {56int index = mtx * 4;57if (transpose)58index += ((row+i)&3) + col*32;59else60index += col + ((row+i)&3)*32;61regs[i] = index;62}63}6465void GetMatrixRegs(u8 regs[16], MatrixSize N, int matrixReg) {66int mtx = (matrixReg >> 2) & 7;67int 
col = matrixReg & 3;6869int row = 0;70int side = 0;71int transpose = (matrixReg >> 5) & 1;7273switch (N) {74case M_1x1: transpose = 0; row = (matrixReg >> 5) & 3; side = 1; break;75case M_2x2: row = (matrixReg >> 5) & 2; side = 2; break;76case M_3x3: row = (matrixReg >> 6) & 1; side = 3; break;77case M_4x4: row = (matrixReg >> 5) & 2; side = 4; break;78default: _assert_msg_(false, "%s: Bad matrix size", __FUNCTION__);79}8081for (int i = 0; i < side; i++) {82for (int j = 0; j < side; j++) {83int index = mtx * 4;84if (transpose)85index += ((row+i)&3) + ((col+j)&3)*32;86else87index += ((col+j)&3) + ((row+i)&3)*32;88regs[j*4 + i] = index;89}90}91}9293int GetMatrixName(int matrix, MatrixSize msize, int column, int row, bool transposed) {94// TODO: Fix (?)95int name = (matrix * 4) | (transposed << 5);96switch (msize) {97case M_4x4:98if (row || column) {99ERROR_LOG(Log::JIT, "GetMatrixName: Invalid row %i or column %i for size %i", row, column, msize);100}101break;102103case M_3x3:104if (row & ~2) {105ERROR_LOG(Log::JIT, "GetMatrixName: Invalid row %i for size %i", row, msize);106}107if (column & ~2) {108ERROR_LOG(Log::JIT, "GetMatrixName: Invalid col %i for size %i", column, msize);109}110name |= (row << 6) | column;111break;112113case M_2x2:114if (row & ~2) {115ERROR_LOG(Log::JIT, "GetMatrixName: Invalid row %i for size %i", row, msize);116}117if (column & ~2) {118ERROR_LOG(Log::JIT, "GetMatrixName: Invalid col %i for size %i", column, msize);119}120name |= (row << 5) | column;121break;122123default: _assert_msg_(false, "%s: Bad matrix size", __FUNCTION__);124}125126return name;127}128129int GetColumnName(int matrix, MatrixSize msize, int column, int offset) {130return matrix * 4 + column + offset * 32;131}132133int GetRowName(int matrix, MatrixSize msize, int column, int offset) {134return 0x20 | (matrix * 4 + column + offset * 32);135}136137void GetMatrixColumns(int matrixReg, MatrixSize msize, u8 vecs[4]) {138int n = GetMatrixSide(msize);139140int col = matrixReg & 
3;141int row = (matrixReg >> 5) & 2;142int transpose = (matrixReg >> 5) & 1;143144for (int i = 0; i < n; i++) {145vecs[i] = (transpose << 5) | (row << 5) | (matrixReg & 0x1C) | (i + col);146}147}148149void GetMatrixRows(int matrixReg, MatrixSize msize, u8 vecs[4]) {150int n = GetMatrixSide(msize);151int col = matrixReg & 3;152int row = (matrixReg >> 5) & 2;153154int swappedCol = row ? (msize == M_3x3 ? 1 : 2) : 0;155int swappedRow = col ? 2 : 0;156int transpose = ((matrixReg >> 5) & 1) ^ 1;157158for (int i = 0; i < n; i++) {159vecs[i] = (transpose << 5) | (swappedRow << 5) | (matrixReg & 0x1C) | (i + swappedCol);160}161}162163void ReadVector(float *rd, VectorSize size, int reg) {164int row;165int length;166switch (size) {167case V_Single: rd[0] = currentMIPS->v[voffset[reg]]; return; // transpose = 0; row=(reg>>5)&3; length = 1; break;168case V_Pair: row=(reg>>5)&2; length = 2; break;169case V_Triple: row=(reg>>6)&1; length = 3; break;170case V_Quad: row=(reg>>5)&2; length = 4; break;171default: length = 0; break;172}173int transpose = (reg >> 5) & 1;174const int mtx = ((reg << 2) & 0x70);175const int col = reg & 3;176// NOTE: We now skip the voffset lookups.177if (transpose) {178const int base = mtx + col;179for (int i = 0; i < length; i++)180rd[i] = currentMIPS->v[base + ((row + i) & 3) * 4];181} else {182const int base = mtx + col * 4;183for (int i = 0; i < length; i++)184rd[i] = currentMIPS->v[base + ((row + i) & 3)];185}186}187188void WriteVector(const float *rd, VectorSize size, int reg) {189int row;190int length;191192switch (size) {193case V_Single: if (!currentMIPS->VfpuWriteMask(0)) currentMIPS->v[voffset[reg]] = rd[0]; return; // transpose = 0; row=(reg>>5)&3; length = 1; break;194case V_Pair: row=(reg>>5)&2; length = 2; break;195case V_Triple: row=(reg>>6)&1; length = 3; break;196case V_Quad: row=(reg>>5)&2; length = 4; break;197default: length = 0; break;198}199200const int mtx = ((reg << 2) & 0x70);201const int col = reg & 3;202bool transpose = (reg 
>> 5) & 1;203// NOTE: We now skip the voffset lookups.204if (transpose) {205const int base = mtx + col;206if (currentMIPS->VfpuWriteMask() == 0) {207for (int i = 0; i < length; i++)208currentMIPS->v[base + ((row+i) & 3) * 4] = rd[i];209} else {210for (int i = 0; i < length; i++) {211if (!currentMIPS->VfpuWriteMask(i)) {212currentMIPS->v[base + ((row+i) & 3) * 4] = rd[i];213}214}215}216} else {217const int base = mtx + col * 4;218if (currentMIPS->VfpuWriteMask() == 0) {219for (int i = 0; i < length; i++)220currentMIPS->v[base + ((row + i) & 3)] = rd[i];221} else {222for (int i = 0; i < length; i++) {223if (!currentMIPS->VfpuWriteMask(i)) {224currentMIPS->v[base + ((row + i) & 3)] = rd[i];225}226}227}228}229}230231u32 VFPURewritePrefix(int ctrl, u32 remove, u32 add) {232u32 prefix = currentMIPS->vfpuCtrl[ctrl];233return (prefix & ~remove) | add;234}235236void ReadMatrix(float *rd, MatrixSize size, int reg) {237int row = 0;238int side = 0;239int transpose = (reg >> 5) & 1;240241switch (size) {242case M_1x1: transpose = 0; row = (reg >> 5) & 3; side = 1; break;243case M_2x2: row = (reg >> 5) & 2; side = 2; break;244case M_3x3: row = (reg >> 6) & 1; side = 3; break;245case M_4x4: row = (reg >> 5) & 2; side = 4; break;246default: side = 0; break;247}248249int mtx = (reg >> 2) & 7;250int col = reg & 3;251252// The voffset ordering is now integrated in these formulas,253// eliminating a table lookup.254const float *v = currentMIPS->v + (size_t)mtx * 16;255if (transpose) {256if (side == 4 && col == 0 && row == 0) {257// Fast path: Simple 4x4 transpose. 
TODO: Optimize.258for (int j = 0; j < 4; j++) {259for (int i = 0; i < 4; i++) {260rd[j * 4 + i] = v[i * 4 + j];261}262}263} else {264for (int j = 0; j < side; j++) {265for (int i = 0; i < side; i++) {266int index = ((row + i) & 3) * 4 + ((col + j) & 3);267rd[j * 4 + i] = v[index];268}269}270}271} else {272if (side == 4 && col == 0 && row == 0) {273// Fast path274memcpy(rd, v, sizeof(float) * 16); // rd[j * 4 + i] = v[j * 4 + i];275} else {276for (int j = 0; j < side; j++) {277for (int i = 0; i < side; i++) {278int index = ((col + j) & 3) * 4 + ((row + i) & 3);279rd[j * 4 + i] = v[index];280}281}282}283}284}285286void WriteMatrix(const float *rd, MatrixSize size, int reg) {287int mtx = (reg>>2)&7;288int col = reg&3;289290int row;291int side;292int transpose = (reg >> 5) & 1;293294switch (size) {295case M_1x1: transpose = 0; row = (reg >> 5) & 3; side = 1; break;296case M_2x2: row = (reg >> 5) & 2; side = 2; break;297case M_3x3: row = (reg >> 6) & 1; side = 3; break;298case M_4x4: row = (reg >> 5) & 2; side = 4; break;299default: side = 0;300}301302if (currentMIPS->VfpuWriteMask() != 0) {303ERROR_LOG_REPORT(Log::CPU, "Write mask used with vfpu matrix instruction.");304}305306// The voffset ordering is now integrated in these formulas,307// eliminating a table lookup.308float *v = currentMIPS->v + (size_t)mtx * 16;309if (transpose) {310if (side == 4 && row == 0 && col == 0 && currentMIPS->VfpuWriteMask() == 0x0) {311// Fast path: Simple 4x4 transpose. 
TODO: Optimize.312for (int j = 0; j < side; j++) {313for (int i = 0; i < side; i++) {314v[i * 4 + j] = rd[j * 4 + i];315}316}317} else {318for (int j = 0; j < side; j++) {319for (int i = 0; i < side; i++) {320if (j != side - 1 || !currentMIPS->VfpuWriteMask(i)) {321int index = ((row + i) & 3) * 4 + ((col + j) & 3);322v[index] = rd[j * 4 + i];323}324}325}326}327} else {328if (side == 4 && row == 0 && col == 0 && currentMIPS->VfpuWriteMask() == 0x0) {329memcpy(v, rd, sizeof(float) * 16); // v[j * 4 + i] = rd[j * 4 + i];330} else {331for (int j = 0; j < side; j++) {332for (int i = 0; i < side; i++) {333if (j != side - 1 || !currentMIPS->VfpuWriteMask(i)) {334int index = ((col + j) & 3) * 4 + ((row + i) & 3);335v[index] = rd[j * 4 + i];336}337}338}339}340}341}342343int GetVectorOverlap(int vec1, VectorSize size1, int vec2, VectorSize size2) {344// Different matrices? Can't overlap, return early.345if (((vec1 >> 2) & 7) != ((vec2 >> 2) & 7))346return 0;347348int n1 = GetNumVectorElements(size1);349int n2 = GetNumVectorElements(size2);350u8 regs1[4];351u8 regs2[4];352GetVectorRegs(regs1, size1, vec1);353GetVectorRegs(regs2, size1, vec2);354int count = 0;355for (int i = 0; i < n1; i++) {356for (int j = 0; j < n2; j++) {357if (regs1[i] == regs2[j])358count++;359}360}361return count;362}363364VectorSize GetHalfVectorSizeSafe(VectorSize sz) {365switch (sz) {366case V_Pair: return V_Single;367case V_Quad: return V_Pair;368default: return V_Invalid;369}370}371372VectorSize GetHalfVectorSize(VectorSize sz) {373VectorSize res = GetHalfVectorSizeSafe(sz);374_assert_msg_(res != V_Invalid, "%s: Bad vector size", __FUNCTION__);375return res;376}377378VectorSize GetDoubleVectorSizeSafe(VectorSize sz) {379switch (sz) {380case V_Single: return V_Pair;381case V_Pair: return V_Quad;382default: return V_Invalid;383}384}385386VectorSize GetDoubleVectorSize(VectorSize sz) {387VectorSize res = GetDoubleVectorSizeSafe(sz);388_assert_msg_(res != V_Invalid, "%s: Bad vector size", 
__FUNCTION__);389return res;390}391392VectorSize GetVectorSizeSafe(MatrixSize sz) {393switch (sz) {394case M_1x1: return V_Single;395case M_2x2: return V_Pair;396case M_3x3: return V_Triple;397case M_4x4: return V_Quad;398default: return V_Invalid;399}400}401402VectorSize GetVectorSize(MatrixSize sz) {403VectorSize res = GetVectorSizeSafe(sz);404_assert_msg_(res != V_Invalid, "%s: Bad vector size", __FUNCTION__);405return res;406}407408MatrixSize GetMatrixSizeSafe(VectorSize sz) {409switch (sz) {410case V_Single: return M_1x1;411case V_Pair: return M_2x2;412case V_Triple: return M_3x3;413case V_Quad: return M_4x4;414default: return M_Invalid;415}416}417418MatrixSize GetMatrixSize(VectorSize sz) {419MatrixSize res = GetMatrixSizeSafe(sz);420_assert_msg_(res != M_Invalid, "%s: Bad vector size", __FUNCTION__);421return res;422}423424VectorSize MatrixVectorSizeSafe(MatrixSize sz) {425switch (sz) {426case M_1x1: return V_Single;427case M_2x2: return V_Pair;428case M_3x3: return V_Triple;429case M_4x4: return V_Quad;430default: return V_Invalid;431}432}433434VectorSize MatrixVectorSize(MatrixSize sz) {435VectorSize res = MatrixVectorSizeSafe(sz);436_assert_msg_(res != V_Invalid, "%s: Bad matrix size", __FUNCTION__);437return res;438}439440int GetMatrixSideSafe(MatrixSize sz) {441switch (sz) {442case M_1x1: return 1;443case M_2x2: return 2;444case M_3x3: return 3;445case M_4x4: return 4;446default: return 0;447}448}449450int GetMatrixSide(MatrixSize sz) {451int res = GetMatrixSideSafe(sz);452_assert_msg_(res != 0, "%s: Bad matrix size", __FUNCTION__);453return res;454}455456// TODO: Optimize457MatrixOverlapType GetMatrixOverlap(int mtx1, int mtx2, MatrixSize msize) {458int n = GetMatrixSide(msize);459460if (mtx1 == mtx2)461return OVERLAP_EQUAL;462463u8 m1[16];464u8 m2[16];465GetMatrixRegs(m1, msize, mtx1);466GetMatrixRegs(m2, msize, mtx2);467468// Simply do an exhaustive search.469for (int x = 0; x < n; x++) {470for (int y = 0; y < n; y++) {471int val = m1[y * 4 + 
x];472for (int a = 0; a < n; a++) {473for (int b = 0; b < n; b++) {474if (m2[a * 4 + b] == val) {475return OVERLAP_PARTIAL;476}477}478}479}480}481482return OVERLAP_NONE;483}484485std::string GetVectorNotation(int reg, VectorSize size) {486int mtx = (reg>>2)&7;487int col = reg&3;488int row = 0;489int transpose = (reg>>5)&1;490char c;491switch (size) {492case V_Single: transpose=0; c='S'; row=(reg>>5)&3; break;493case V_Pair: c='C'; row=(reg>>5)&2; break;494case V_Triple: c='C'; row=(reg>>6)&1; break;495case V_Quad: c='C'; row=(reg>>5)&2; break;496default: c='?'; break;497}498if (transpose && c == 'C') c='R';499if (transpose)500return StringFromFormat("%c%i%i%i", c, mtx, row, col);501return StringFromFormat("%c%i%i%i", c, mtx, col, row);502}503504std::string GetMatrixNotation(int reg, MatrixSize size) {505int mtx = (reg>>2)&7;506int col = reg&3;507int row = 0;508int transpose = (reg>>5)&1;509char c;510switch (size)511{512case M_2x2: c='M'; row=(reg>>5)&2; break;513case M_3x3: c='M'; row=(reg>>6)&1; break;514case M_4x4: c='M'; row=(reg>>5)&2; break;515default: c='?'; break;516}517if (transpose && c=='M') c='E';518if (transpose)519return StringFromFormat("%c%i%i%i", c, mtx, row, col);520return StringFromFormat("%c%i%i%i", c, mtx, col, row);521}522523bool GetVFPUCtrlMask(int reg, u32 *mask) {524switch (reg) {525case VFPU_CTRL_SPREFIX:526case VFPU_CTRL_TPREFIX:527*mask = 0x000FFFFF;528return true;529case VFPU_CTRL_DPREFIX:530*mask = 0x00000FFF;531return true;532case VFPU_CTRL_CC:533*mask = 0x0000003F;534return true;535case VFPU_CTRL_INF4:536*mask = 0xFFFFFFFF;537return true;538case VFPU_CTRL_RSV5:539case VFPU_CTRL_RSV6:540case VFPU_CTRL_REV:541// Don't change anything, these regs are read only.542return false;543case VFPU_CTRL_RCX0:544case VFPU_CTRL_RCX1:545case VFPU_CTRL_RCX2:546case VFPU_CTRL_RCX3:547case VFPU_CTRL_RCX4:548case VFPU_CTRL_RCX5:549case VFPU_CTRL_RCX6:550case VFPU_CTRL_RCX7:551*mask = 0x3FFFFFFF;552return true;553default:554return 
false;555}556}557558float Float16ToFloat32(unsigned short l)559{560float2int f2i;561562unsigned short float16 = l;563unsigned int sign = (float16 >> VFPU_SH_FLOAT16_SIGN) & VFPU_MASK_FLOAT16_SIGN;564int exponent = (float16 >> VFPU_SH_FLOAT16_EXP) & VFPU_MASK_FLOAT16_EXP;565unsigned int fraction = float16 & VFPU_MASK_FLOAT16_FRAC;566567float f;568if (exponent == VFPU_FLOAT16_EXP_MAX)569{570f2i.i = sign << 31;571f2i.i |= 255 << 23;572f2i.i |= fraction;573f = f2i.f;574}575else if (exponent == 0 && fraction == 0)576{577f = sign == 1 ? -0.0f : 0.0f;578}579else580{581if (exponent == 0)582{583do584{585fraction <<= 1;586exponent--;587}588while (!(fraction & (VFPU_MASK_FLOAT16_FRAC + 1)));589590fraction &= VFPU_MASK_FLOAT16_FRAC;591}592593/* Convert to 32-bit single-precision IEEE754. */594f2i.i = sign << 31;595f2i.i |= (exponent + 112) << 23;596f2i.i |= fraction << 13;597f=f2i.f;598}599return f;600}601602// Reference C++ version.603static float vfpu_dot_cpp(const float a[4], const float b[4]) {604static const int EXTRA_BITS = 2;605float2int result;606float2int src[2];607608int32_t exps[4];609int32_t mants[4];610int32_t signs[4];611int32_t max_exp = 0;612int32_t last_inf = -1;613614for (int i = 0; i < 4; i++) {615src[0].f = a[i];616src[1].f = b[i];617618int32_t aexp = get_uexp(src[0].i);619int32_t bexp = get_uexp(src[1].i);620int32_t amant = get_mant(src[0].i) << EXTRA_BITS;621int32_t bmant = get_mant(src[1].i) << EXTRA_BITS;622623exps[i] = aexp + bexp - 127;624if (aexp == 255) {625// INF * 0 = NAN626if ((src[0].i & 0x007FFFFF) != 0 || bexp == 0) {627result.i = 0x7F800001;628return result.f;629}630mants[i] = get_mant(0) << EXTRA_BITS;631exps[i] = 255;632} else if (bexp == 255) {633if ((src[1].i & 0x007FFFFF) != 0 || aexp == 0) {634result.i = 0x7F800001;635return result.f;636}637mants[i] = get_mant(0) << EXTRA_BITS;638exps[i] = 255;639} else {640// TODO: Adjust precision?641uint64_t adjust = (uint64_t)amant * (uint64_t)bmant;642mants[i] = (adjust >> (23 + EXTRA_BITS)) & 
0x7FFFFFFF;643}644signs[i] = get_sign(src[0].i) ^ get_sign(src[1].i);645646if (exps[i] > max_exp) {647max_exp = exps[i];648}649if (exps[i] >= 255) {650// Infinity minus infinity is not a real number.651if (last_inf != -1 && signs[i] != last_inf) {652result.i = 0x7F800001;653return result.f;654}655last_inf = signs[i];656}657}658659int32_t mant_sum = 0;660for (int i = 0; i < 4; i++) {661int exp = max_exp - exps[i];662if (exp >= 32) {663mants[i] = 0;664} else {665mants[i] >>= exp;666}667if (signs[i]) {668mants[i] = -mants[i];669}670mant_sum += mants[i];671}672673uint32_t sign_sum = 0;674if (mant_sum < 0) {675sign_sum = 0x80000000;676mant_sum = -mant_sum;677}678679// Truncate off the extra bits now. We want to zero them for rounding purposes.680mant_sum >>= EXTRA_BITS;681682if (mant_sum == 0 || max_exp <= 0) {683return 0.0f;684}685686int8_t shift = (int8_t)clz32_nonzero(mant_sum) - 8;687if (shift < 0) {688// Round to even if we'd shift away a 0.5.689const uint32_t round_bit = 1 << (-shift - 1);690if ((mant_sum & round_bit) && (mant_sum & (round_bit << 1))) {691mant_sum += round_bit;692shift = (int8_t)clz32_nonzero(mant_sum) - 8;693} else if ((mant_sum & round_bit) && (mant_sum & (round_bit - 1))) {694mant_sum += round_bit;695shift = (int8_t)clz32_nonzero(mant_sum) - 8;696}697mant_sum >>= -shift;698max_exp += -shift;699} else {700mant_sum <<= shift;701max_exp -= shift;702}703_dbg_assert_msg_((mant_sum & 0x00800000) != 0, "Mantissa wrong: %08x", mant_sum);704705if (max_exp >= 255) {706max_exp = 255;707mant_sum = 0;708} else if (max_exp <= 0) {709return 0.0f;710}711712result.i = sign_sum | (max_exp << 23) | (mant_sum & 0x007FFFFF);713return result.f;714}715716#if defined(__SSE2__)717718#include <emmintrin.h>719720static inline __m128i mulhi32x4(__m128i a, __m128i b) {721__m128i m02 = _mm_mul_epu32(a, b);722__m128i m13 = _mm_mul_epu32(723_mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 1, 1)),724_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)));725__m128i 
m=_mm_unpacklo_epi32(726_mm_shuffle_epi32(m02, _MM_SHUFFLE(3, 2, 3, 1)),727_mm_shuffle_epi32(m13, _MM_SHUFFLE(3, 2, 3, 1)));728return m;729}730731// Values of rounding_mode:732// -1 - detect at runtime733// 0 - assume round-to-nearest-ties-to-even734// 1 - round yourself in integer math735template<int rounding_mode=-1>736static float vfpu_dot_sse2(const float a[4], const float b[4])737{738static const int EXTRA_BITS = 2;739740bool is_default_rounding_mode = (rounding_mode == 0);741if(rounding_mode == -1)742{743volatile float test05 = 5.9604644775390625e-08f; // 0.5*2^-23744volatile float test15 = 1.78813934326171875e-07f; // 1.5*2^-23745const float res15 = 1.0000002384185791015625f; // 1+2^-22746test05 += 1.0f;747test15 += 1.0f;748is_default_rounding_mode = (test05 == 1.0f && test15 == res15);749}750__m128 A = _mm_loadu_ps(a);751__m128 B = _mm_loadu_ps(b);752// Extract exponents.753__m128 exp_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7F800000));754__m128 eA = _mm_and_ps(A, exp_mask);755__m128 eB = _mm_and_ps(B, exp_mask);756__m128i exps = _mm_srli_epi32(_mm_add_epi32(757_mm_castps_si128(eA),758_mm_castps_si128(eB)),23);759// Find maximum exponent, stored as float32 in [1;2),760// so we can use _mm_max_ps() with normal arguments.761__m128 t = _mm_or_ps(_mm_castsi128_ps(exps), _mm_set1_ps(1.0f));762t = _mm_max_ps(t, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(t), _MM_SHUFFLE(2, 3, 0, 1))));763t = _mm_max_ps(t, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(t), _MM_SHUFFLE(1, 0, 3, 2))));764t = _mm_max_ps(t, _mm_castsi128_ps(_mm_set1_epi32(0x3F80007F)));765int32_t mexp = _mm_cvtsi128_si32(_mm_castps_si128(t)) & 511;766// NOTE: mexp is doubly-biased, same for exps.767int32_t max_exp = mexp - 127;768// Fall back on anything weird.769__m128 finiteA = _mm_sub_ps(A, A);770__m128 finiteB = _mm_sub_ps(B, B);771finiteA = _mm_cmpeq_ps(finiteA, finiteA);772finiteB = _mm_cmpeq_ps(finiteB, finiteB);773if(max_exp >= 255 || _mm_movemask_ps(_mm_and_ps(finiteA, finiteB)) 
!= 15) return vfpu_dot_cpp(a, b);774// Extract significands.775__m128i mA = _mm_or_si128(_mm_and_si128(_mm_castps_si128(A),_mm_set1_epi32(0x007FFFFF)),_mm_set1_epi32(0x00800000));776__m128i mB = _mm_or_si128(_mm_and_si128(_mm_castps_si128(B),_mm_set1_epi32(0x007FFFFF)),_mm_set1_epi32(0x00800000));777// Multiply.778// NOTE: vfpu_dot does multiplication as779// ((x<<EXTRA_BITS)*(y<<EXTRA_BITS))>>(23+EXTRA_BITS),780// here we do (x*y)>>(23-EXTRA_BITS-1),781// which produces twice the result (neither expression782// overflows in our case). We need that because our783// variable-shift scheme (below) must shift by at least 1 bit.784static const int s = 32-(23 - EXTRA_BITS - 1), s0 = s / 2,s1 = s - s0;785// We compute ((x*y)>>shift) as786// (((x*y)<<(32-shift))>>32), which we express as787// (((x<<s0)*(y<<s1))>>32) (neither shift overflows).788__m128i m = mulhi32x4(_mm_slli_epi32(mA, s0), _mm_slli_epi32(mB, s1));789// Shift according to max_exp. Since SSE2 doesn't have790// variable per-lane shifts, we multiply *again*,791// specifically, x>>y turns into (x<<(1<<(32-y)))>>32.792// We compute 1<<(32-y) using floating-point casts.793// NOTE: the cast for 1<<31 produces the correct value,794// since the _mm_cvttps_epi32 error code just happens795// to be 0x80000000.796// So (since we pre-multiplied m by 2), we need797// (m>>1)>>(mexp-exps),798// i.e. m>>(mexp+1-exps),799// i.e. (m<<(32-(mexp+1-exps)))>>32,800// i.e. (m<<(exps-(mexp-31)))>>32.801__m128i amounts = _mm_sub_epi32(exps, _mm_set1_epi32(mexp - 31));802// Clamp by 0. 
Both zero and negative amounts produce zero,803// since they correspond to right-shifting by 32 or more bits.804amounts = _mm_and_si128(amounts, _mm_cmpgt_epi32(amounts, _mm_set1_epi32(0)));805// Set up multipliers.806__m128i bits = _mm_add_epi32(_mm_set1_epi32(0x3F800000), _mm_slli_epi32(amounts, 23));807__m128i muls = _mm_cvttps_epi32(_mm_castsi128_ps(bits));808m = mulhi32x4(m, muls);809// Extract signs.810__m128i signs = _mm_cmpgt_epi32(811_mm_set1_epi32(0),812_mm_xor_si128(_mm_castps_si128(A), _mm_castps_si128(B)));813// Apply signs to m.814m = _mm_sub_epi32(_mm_xor_si128(m, signs), signs);815// Horizontal sum.816// See https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-sse-vector-sum-or-other-reduction817__m128i h64 = _mm_shuffle_epi32(m, _MM_SHUFFLE(1, 0, 3, 2));818__m128i s64 = _mm_add_epi32(h64, m);819__m128i h32 = _mm_shufflelo_epi16(s64, _MM_SHUFFLE(1, 0, 3, 2));820__m128i s32 = _mm_add_epi32(s64, h32);821int32_t mant_sum = _mm_cvtsi128_si32(s32);822823// The rest is scalar.824uint32_t sign_sum = 0;825if (mant_sum < 0) {826sign_sum = 0x80000000;827mant_sum = -mant_sum;828}829830// Truncate off the extra bits now. 
We want to zero them for rounding purposes.831mant_sum >>= EXTRA_BITS;832833if (mant_sum == 0 || max_exp <= 0) {834return 0.0f;835}836837if(is_default_rounding_mode)838{839float2int r;840r.f = (float)mant_sum;841mant_sum = (r.i & 0x007FFFFF) | 0x00800000;842max_exp += (r.i >> 23) - 0x96;843}844else845{846int8_t shift = (int8_t)clz32_nonzero(mant_sum) - 8;847if (shift < 0) {848// Round to even if we'd shift away a 0.5.849const uint32_t round_bit = 1 << (-shift - 1);850if ((mant_sum & round_bit) && (mant_sum & (round_bit << 1))) {851mant_sum += round_bit;852shift = (int8_t)clz32_nonzero(mant_sum) - 8;853} else if ((mant_sum & round_bit) && (mant_sum & (round_bit - 1))) {854mant_sum += round_bit;855shift = (int8_t)clz32_nonzero(mant_sum) - 8;856}857mant_sum >>= -shift;858max_exp += -shift;859} else {860mant_sum <<= shift;861max_exp -= shift;862}863_dbg_assert_msg_((mant_sum & 0x00800000) != 0, "Mantissa wrong: %08x", mant_sum);864}865866if (max_exp >= 255) {867max_exp = 255;868mant_sum = 0;869} else if (max_exp <= 0) {870return 0.0f;871}872873float2int result;874result.i = sign_sum | (max_exp << 23) | (mant_sum & 0x007FFFFF);875return result.f;876}877878#endif // defined(__SSE2__)879880float vfpu_dot(const float a[4], const float b[4]) {881#if defined(__SSE2__)882return vfpu_dot_sse2(a, b);883#else884return vfpu_dot_cpp(a, b);885#endif886}887888//==============================================================================889// The code below attempts to exactly match behaviour of890// PSP's vrnd instructions. 
See investigation starting around891// https://github.com/hrydgard/ppsspp/issues/16946#issuecomment-1467261209892// for details.893894// Redundant currently, since MIPSState::Init() already895// does this on its own, but left as-is to be self-contained.896void vrnd_init_default(uint32_t *rcx) {897rcx[0] = 0x00000001;898rcx[1] = 0x00000002;899rcx[2] = 0x00000004;900rcx[3] = 0x00000008;901rcx[4] = 0x00000000;902rcx[5] = 0x00000000;903rcx[6] = 0x00000000;904rcx[7] = 0x00000000;905}906907void vrnd_init(uint32_t seed, uint32_t *rcx) {908for(int i = 0; i < 8; ++i) rcx[i] =9090x3F800000u | // 1.0f mask.910((seed >> ((i / 4) * 16)) & 0xFFFFu) | // lower or upper half of the seed.911(((seed >> (4 * i)) & 0xF) << 16); // remaining nibble.912913}914915uint32_t vrnd_generate(uint32_t *rcx) {916// The actual RNG state appears to be 5 parts917// (32-bit each) stored into the registers as follows:918uint32_t A = (rcx[0] & 0xFFFFu) | (rcx[4] << 16);919uint32_t B = (rcx[1] & 0xFFFFu) | (rcx[5] << 16);920uint32_t C = (rcx[2] & 0xFFFFu) | (rcx[6] << 16);921uint32_t D = (rcx[3] & 0xFFFFu) | (rcx[7] << 16);922uint32_t E = (((rcx[0] >> 16) & 0xF) << 0) |923(((rcx[1] >> 16) & 0xF) << 4) |924(((rcx[2] >> 16) & 0xF) << 8) |925(((rcx[3] >> 16) & 0xF) << 12) |926(((rcx[4] >> 16) & 0xF) << 16) |927(((rcx[5] >> 16) & 0xF) << 20) |928(((rcx[6] >> 16) & 0xF) << 24) |929(((rcx[7] >> 16) & 0xF) << 28);930// Update.931// LCG with classic parameters.932A = 69069u * A + 1u; // NOTE: decimal constants.933// Xorshift, with classic parameters. Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs".934B ^= B << 13;935B ^= B >> 17;936B ^= B << 5;937// Sequence similar to Pell numbers ( https://en.wikipedia.org/wiki/Pell_number ),938// except with different starting values, and an occasional increment (E).939uint32_t t= 2u * D + C + E;940// NOTE: the details of how E-part is set are somewhat of a guess941// at the moment. 
The expression below looks weird, but does match942// the available test data.943E = uint32_t((uint64_t(C) + uint64_t(D >> 1) + uint64_t(E)) >> 32);944C = D;945D = t;946// Store.947rcx[0] = 0x3F800000u | (((E >> 0) & 0xF) << 16) | (A & 0xFFFFu);948rcx[1] = 0x3F800000u | (((E >> 4) & 0xF) << 16) | (B & 0xFFFFu);949rcx[2] = 0x3F800000u | (((E >> 8) & 0xF) << 16) | (C & 0xFFFFu);950rcx[3] = 0x3F800000u | (((E >> 12) & 0xF) << 16) | (D & 0xFFFFu);951rcx[4] = 0x3F800000u | (((E >> 16) & 0xF) << 16) | (A >> 16);952rcx[5] = 0x3F800000u | (((E >> 20) & 0xF) << 16) | (B >> 16);953rcx[6] = 0x3F800000u | (((E >> 24) & 0xF) << 16) | (C >> 16);954rcx[7] = 0x3F800000u | (((E >> 28) & 0xF) << 16) | (D >> 16);955// Return value.956return A + B + D;957}958959//==============================================================================960// The code below attempts to exactly match the output of961// several PSP's VFPU functions. For the sake of962// making lookup tables smaller the code is963// somewhat gnarly.964// Lookup tables sometimes store deltas from (explicitly computable)965// estimations, to allow to store them in smaller types.966// See https://github.com/hrydgard/ppsspp/issues/16946 for details.967968// Lookup tables.969// Note: these are never unloaded, and stay till program termination.970static uint32_t (*vfpu_sin_lut8192)=nullptr;971static int8_t (*vfpu_sin_lut_delta)[2]=nullptr;972static int16_t (*vfpu_sin_lut_interval_delta)=nullptr;973static uint8_t (*vfpu_sin_lut_exceptions)=nullptr;974975static int8_t (*vfpu_sqrt_lut)[2]=nullptr;976977static int8_t (*vfpu_rsqrt_lut)[2]=nullptr;978979static uint32_t (*vfpu_exp2_lut65536)=nullptr;980static uint8_t (*vfpu_exp2_lut)[2]=nullptr;981982static uint32_t (*vfpu_log2_lut65536)=nullptr;983static uint32_t (*vfpu_log2_lut65536_quadratic)=nullptr;984static uint8_t (*vfpu_log2_lut)[131072][2]=nullptr;985986static int32_t (*vfpu_asin_lut65536)[3]=nullptr;987static uint64_t (*vfpu_asin_lut_deltas)=nullptr;988static uint16_t 
(*vfpu_asin_lut_indices)=nullptr;

static int8_t (*vfpu_rcp_lut)[2]=nullptr;

// Loads a little-endian binary LUT file from the VFS into `ptr`.
// Returns true on success (or if the table was already loaded).
// On read failure or size mismatch, logs, frees the buffer, nulls `ptr`,
// and returns false so callers can fall back to the approximate paths.
template<typename T>
static inline bool load_vfpu_table(T *&ptr, const char *filename, size_t expected_size) {
#if COMMON_BIG_ENDIAN
	// Tables are little-endian.
#error Byteswap for VFPU tables not implemented
#endif
	if (ptr) return true; // Already loaded.
	size_t size = 0u;
	INFO_LOG(Log::CPU, "Loading '%s'...", filename);
	ptr = reinterpret_cast<decltype(&*ptr)>(g_VFS.ReadFile(filename, &size));
	if (!ptr || size != expected_size) {
		ERROR_LOG(Log::CPU, "Error loading '%s' (size=%u, expected: %u)", filename, (unsigned)size, (unsigned)expected_size);
		delete[] ptr;
		ptr = nullptr;
		return false;
	}
	INFO_LOG(Log::CPU, "Successfully loaded '%s'", filename);
	return true;
}

// Loads "vfpu/<name>.dat" into the table pointer named <name>.
#define LOAD_TABLE(name, expected_size)\
	load_vfpu_table(name,"vfpu/" #name ".dat",expected_size)

// Note: PSP sin/cos output only has 22 significant
// binary digits.
// Returns the size of one output quantum (ULP-like step) at magnitude x.
static inline uint32_t vfpu_sin_quantum(uint32_t x) {
	return x < 1u << 22?
		1u:
		1u << (32 - 22 - clz32_nonzero(x));
}

// Rounds x down to a multiple of its quantum (keeps 22 significant bits).
static inline uint32_t vfpu_sin_tr uncate_bits(u32 x) {
	return x & -vfpu_sin_quantum(x);
}

// Fixed-point sine core. Input arg is in [0, 0x00800000], spanning a
// quarter period; output is scaled so that 0x00800000 -> 0x10000000
// (i.e. sin == 1 maps to 2^28; callers multiply by 0x1p-28f).
static inline uint32_t vfpu_sin_fixed(uint32_t arg) {
	// Handle endpoints.
	if(arg == 0u) return 0u;
	if(arg == 0x00800000) return 0x10000000;
	// Get endpoints for 8192-wide interval.
	uint32_t L = vfpu_sin_lut8192[(arg >> 13) + 0];
	uint32_t H = vfpu_sin_lut8192[(arg >> 13) + 1];
	// Approximate endpoints for 64-wide interval via lerp.
	uint32_t A = L+(((H - L)*(((arg >> 6) & 127) + 0)) >> 7);
	uint32_t B = L+(((H - L)*(((arg >> 6) & 127) + 1)) >> 7);
	// Adjust endpoints from deltas, and increase working precision.
	uint64_t a = (uint64_t(A) << 5) + uint64_t(vfpu_sin_lut_delta[arg >> 6][0]) * vfpu_sin_quantum(A);
	uint64_t b = (uint64_t(B) << 5) + uint64_t(vfpu_sin_lut_delta[arg >> 6][1]) * vfpu_sin_quantum(B);
	// Compute approximation via lerp. Is off by at most 1 quantum.
	uint32_t v = uint32_t(((a * (64 - (arg & 63)) + b * (arg & 63)) >> 6) >> 5);
	v=vfpu_sin_truncate_bits(v);
	// Look up exceptions via binary search.
	// Note: vfpu_sin_lut_interval_delta stores
	// deltas from interval estimation.
	uint32_t lo = ((169u * ((arg >> 7) + 0)) >> 7)+uint32_t(vfpu_sin_lut_interval_delta[(arg >> 7) + 0]) + 16384u;
	uint32_t hi = ((169u * ((arg >> 7) + 1)) >> 7)+uint32_t(vfpu_sin_lut_interval_delta[(arg >> 7) + 1]) + 16384u;
	while(lo < hi) {
		uint32_t m = (lo + hi) / 2;
		// Note: vfpu_sin_lut_exceptions stores
		// index&127 (for each initial interval the
		// upper bits of index are the same, namely
		// arg&-128), plus direction (0 for +1, and
		// 128 for -1).
		uint32_t b = vfpu_sin_lut_exceptions[m];
		uint32_t e = (arg & -128u)+(b & 127u);
		if(e == arg) {
			v += vfpu_sin_quantum(v) * (b >> 7 ? -1u : +1u);
			break;
		}
		else if(e < arg) lo = m + 1;
		else hi = m;
	}
	return v;
}

// PSP-accurate vsin: argument is in quarter-turn units (x == 1 means 90
// degrees). Decomposes x into sign/exponent/significand, reduces the
// angle modulo the period via integer bit tricks, then evaluates the
// table-driven fixed-point core. Falls back to an approximation if the
// LUT files are unavailable.
float vfpu_sin(float x) {
	static bool loaded =
		LOAD_TABLE(vfpu_sin_lut8192, 4100)&&
		LOAD_TABLE(vfpu_sin_lut_delta, 262144)&&
		LOAD_TABLE(vfpu_sin_lut_interval_delta, 131074)&&
		LOAD_TABLE(vfpu_sin_lut_exceptions, 86938);
	if (!loaded)
		return vfpu_sin_fallback(x);
	uint32_t bits;
	memcpy(&bits, &x, sizeof(x));
	uint32_t sign = bits & 0x80000000u;
	uint32_t exponent = (bits >> 23) & 0xFFu;
	uint32_t significand = (bits & 0x007FFFFFu) | 0x00800000u;
	if(exponent == 0xFFu) {
		// Inf/NaN input: produce a NaN with the input's sign.
		// NOTE: this bitpattern is a signaling
		// NaN on x86, so maybe just return
		// a normal qNaN?
		float y;
		bits=sign ^ 0x7F800001u;
		memcpy(&y, &bits, sizeof(y));
		return y;
	}
	if(exponent < 0x7Fu) {
		// |x| < 1: shift the significand into fixed-point position.
		if(exponent < 0x7Fu-23u) significand = 0u;
		else significand >>= (0x7F - exponent);
	}
	else if(exponent > 0x7Fu) {
		// There is weirdness for large exponents.
		if(exponent - 0x7Fu >= 25u && exponent - 0x7Fu < 32u) significand = 0u;
		else if((exponent & 0x9Fu) == 0x9Fu) significand = 0u;
		else significand <<= (exponent - 0x7Fu);
	}
	// Bit 24 of the shifted significand selects the half-period, which
	// flips the sign; bit 23 selects which half of the quarter wave.
	sign ^= ((significand << 7) & 0x80000000u);
	significand &= 0x00FFFFFFu;
	if(significand > 0x00800000u) significand = 0x01000000u - significand;
	uint32_t ret = vfpu_sin_fixed(significand);
	return (sign ? -1.0f : +1.0f) * float(int32_t(ret)) * 3.7252903e-09f; // 0x1p-28f
}

// PSP-accurate vcos: same reduction as vfpu_sin, but with the input's
// sign dropped (cos is even) and the quarter-wave argument reflected.
float vfpu_cos(float x) {
	static bool loaded =
		LOAD_TABLE(vfpu_sin_lut8192, 4100)&&
		LOAD_TABLE(vfpu_sin_lut_delta, 262144)&&
		LOAD_TABLE(vfpu_sin_lut_interval_delta, 131074)&&
		LOAD_TABLE(vfpu_sin_lut_exceptions, 86938);
	if (!loaded)
		return vfpu_cos_fallback(x);
	uint32_t bits;
	memcpy(&bits, &x, sizeof(x));
	bits &= 0x7FFFFFFFu;
	uint32_t sign = 0u;
	uint32_t exponent = (bits >> 23) & 0xFFu;
	uint32_t significand = (bits & 0x007FFFFFu) | 0x00800000u;
	if(exponent == 0xFFu) {
		// Inf/NaN input: produce NaN (sign is 0 here).
		// NOTE: this bitpattern is a signaling
		// NaN on x86, so maybe just return
		// a normal qNaN?
		float y;
		bits = sign ^ 0x7F800001u;
		memcpy(&y, &bits, sizeof(y));
		return y;
	}
	if(exponent < 0x7Fu) {
		if(exponent < 0x7Fu - 23u) significand = 0u;
		else significand >>= (0x7F - exponent);
	}
	else if(exponent > 0x7Fu) {
		// There is weirdness for large exponents.
		if(exponent - 0x7Fu >= 25u && exponent - 0x7Fu < 32u) significand = 0u;
		else if((exponent & 0x9Fu) == 0x9Fu) significand = 0u;
		else significand <<= (exponent - 0x7Fu);
	}
	sign ^= ((significand << 7) & 0x80000000u);
	significand &= 0x00FFFFFFu;
	if(significand >= 0x00800000u) {
		// Reflect into the first quarter; cos flips sign there.
		significand = 0x01000000u - significand;
		sign ^= 0x80000000u;
	}
	// cos(t) == sin(quarter - t), hence the complemented argument.
	uint32_t ret = vfpu_sin_fixed(0x00800000u - significand);
	return (sign ? -1.0f : +1.0f) * float(int32_t(ret)) * 3.7252903e-09f; // 0x1p-28f
}

// Computes both sin and cos of a (quarter-turn units) into s and c.
void vfpu_sincos(float a, float &s, float &c) {
	// Just invoke both sin and cos.
	// Suboptimal but whatever.
	s = vfpu_sin(a);
	c = vfpu_cos(a);
}

// Integer square root of 2^23*x (rounded to zero).
// Input is in 2^23 <= x < 2^25, and representable in float.
static inline uint32_t isqrt23(uint32_t x) {
#if 0
	// Reference version.
	int dir=fesetround(FE_TOWARDZERO);
	uint32_t ret=uint32_t(int32_t(sqrtf(float(int32_t(x)) * 8388608.0f)));
	fesetround(dir);
	return ret;
#elif 1
	// Double version.
	// Verified to produce correct result for all valid inputs,
	// in all rounding modes, both in double and double-extended (x87)
	// precision.
	// Requires correctly-rounded sqrt (which on any IEEE-754 system
	// it should be).
	return uint32_t(int32_t(sqrt(double(x) * 8388608.0)));
#else
	// Pure integer version, if you don't like floating point.
	// Based on code from Hacker's Delight. See isqrt4 in
	// https://github.com/hcs0/Hackers-Delight/blob/master/isqrt.c.txt
	// Relatively slow.
	uint64_t t=uint64_t(x) << 23, m, y, b;
	m=0x4000000000000000ull;
	y=0;
	while(m != 0) // Do 32 times.
	{
		b=y | m;
		y=y >> 1;
		if(t >= b)
		{
			t = t - b;
			y = y | m;
		}
		m = m >> 2;
	}
	return y;
#endif
}

// Returns floating-point bitpattern.
// Square-root core: x indexes a 2^23-entry range; evaluates a LUT-corrected
// lerp between isqrt23 endpoint estimates, truncated to PSP precision.
static inline uint32_t vfpu_sqrt_fixed(uint32_t x) {
	// Endpoints of input.
	uint32_t lo =(x + 0u) & -64u;
	uint32_t hi = (x + 64u) & -64u;
	// Convert input to 9.23 fixed-point.
	lo = (lo >= 0x00400000u ? 4u * lo : 0x00800000u + 2u * lo);
	hi = (hi >= 0x00400000u ? 4u * hi : 0x00800000u + 2u * hi);
	// Estimate endpoints of output.
	uint32_t A = 0x3F000000u + isqrt23(lo);
	uint32_t B = 0x3F000000u + isqrt23(hi);
	// Apply deltas, and increase the working precision.
	uint64_t a = (uint64_t(A) << 4) + uint64_t(vfpu_sqrt_lut[x >> 6][0]);
	uint64_t b = (uint64_t(B) << 4) + uint64_t(vfpu_sqrt_lut[x >> 6][1]);
	uint32_t ret = uint32_t((a + (((b - a) * (x & 63)) >> 6)) >> 4);
	// Truncate lower 2 bits.
	ret &= -4u;
	return ret;
}

// PSP-accurate vsqrt. Special cases: denormals/zeros -> +0, other
// negatives -> NaN, inf -> inf, NaN -> NaN. Otherwise evaluates the
// fixed-point core on the mantissa and re-applies half the exponent.
float vfpu_sqrt(float x) {
	static bool loaded =
		LOAD_TABLE(vfpu_sqrt_lut, 262144);
	if (!loaded)
		return vfpu_sqrt_fallback(x);
	uint32_t bits;
	memcpy(&bits, &x, sizeof(bits));
	if((bits & 0x7FFFFFFFu) <= 0x007FFFFFu) {
		// Denormals (and zeroes) get +0, regardless
		// of sign.
		return +0.0f;
	}
	if(bits >> 31) {
		// Other negatives get NaN.
		bits = 0x7F800001u;
		memcpy(&x, &bits, sizeof(x));
		return x;
	}
	if((bits >> 23) == 255u) {
		// Inf/NaN gets Inf/NaN.
		bits = 0x7F800000u + ((bits & 0x007FFFFFu) != 0u);
		memcpy(&x, &bits, sizeof(bits));
		return x;
	}
	int32_t exponent = int32_t(bits >> 23) - 127;
	// Bottom bit of exponent (inverted) + significand (except bottom bit).
	uint32_t index = ((bits + 0x00800000u) >> 1) & 0x007FFFFFu;
	bits = vfpu_sqrt_fixed(index);
	bits += uint32_t(exponent >> 1) << 23;
	memcpy(&x, &bits, sizeof(bits));
	return x;
}

// Returns floor(2^33/sqrt(x)), for 2^22 <= x < 2^24.
static inline uint32_t rsqrt_floor22(uint32_t x) {
#if 1
	// Verified correct in all rounding directions,
	// by exhaustive search.
	return uint32_t(8589934592.0 / sqrt(double(x))); // 0x1p33
#else
	// Pure integer version, if you don't like floating point.
	// Based on code from Hacker's Delight. See isqrt4 in
	// https://github.com/hcs0/Hackers-Delight/blob/master/isqrt.c.txt
	// Relatively slow.
	uint64_t t=uint64_t(x) << 22, m, y, b;
	m = 0x4000000000000000ull;
	y = 0;
	while(m != 0) // Do 32 times.
	{
		b = y | m;
		y = y >> 1;
		if(t >= b)
		{
			t = t - b;
			y = y | m;
		}
		m = m >> 2;
	}
	y = (1ull << 44) / y;
	// Decrement if y > floor(2^33 / sqrt(x)).
	// This hack works because exhaustive
	// search (on [2^22;2^24]) says it does.
	if((y * y >> 3) * x > (1ull << 63) - 3ull * (((y & 7) == 6) << 21)) --y;
	return uint32_t(y);
#endif
}

// Returns floating-point bitpattern.
// Reciprocal-square-root core; same LUT-corrected lerp structure as
// vfpu_sqrt_fixed, built on rsqrt_floor22 endpoint estimates.
static inline uint32_t vfpu_rsqrt_fixed(uint32_t x) {
	// Endpoints of input.
	uint32_t lo = (x + 0u) & -64u;
	uint32_t hi = (x + 64u) & -64u;
	// Convert input to 10.22 fixed-point.
	lo = (lo >= 0x00400000u ? 2u * lo : 0x00400000u + lo);
	hi = (hi >= 0x00400000u ? 2u * hi : 0x00400000u + hi);
	// Estimate endpoints of output.
	uint32_t A = 0x3E800000u + 4u * rsqrt_floor22(lo);
	uint32_t B = 0x3E800000u + 4u * rsqrt_floor22(hi);
	// Apply deltas, and increase the working precision.
	uint64_t a = (uint64_t(A) << 4) + uint64_t(vfpu_rsqrt_lut[x >> 6][0]);
	uint64_t b = (uint64_t(B) << 4) + uint64_t(vfpu_rsqrt_lut[x >> 6][1]);
	// Evaluate via lerp.
	uint32_t ret = uint32_t((a + (((b - a) * (x & 63)) >> 6)) >> 4);
	// Truncate lower 2 bits.
	ret &= -4u;
	return ret;
}

// PSP-accurate vrsq (1/sqrt(x)). Special cases: denormals/zeros -> inf
// with input sign, other negatives -> -NaN, inf -> 0, NaN -> NaN.
// Otherwise the core result's exponent is shifted down by half the
// input exponent (subtracted, since the result is a reciprocal).
float vfpu_rsqrt(float x) {
	static bool loaded =
		LOAD_TABLE(vfpu_rsqrt_lut, 262144);
	if (!loaded)
		return vfpu_rsqrt_fallback(x);
	uint32_t bits;
	memcpy(&bits, &x, sizeof(bits));
	if((bits & 0x7FFFFFFFu) <= 0x007FFFFFu) {
		// Denormals (and zeroes) get inf of the same sign.
		bits = 0x7F800000u | (bits & 0x80000000u);
		memcpy(&x, &bits, sizeof(x));
		return x;
	}
	if(bits >> 31) {
		// Other negatives get negative NaN.
		bits = 0xFF800001u;
		memcpy(&x, &bits, sizeof(x));
		return x;
	}
	if((bits >> 23) == 255u) {
		// inf gets 0, NaN gets NaN.
		bits = ((bits & 0x007FFFFFu) ? 0x7F800001u : 0u);
		memcpy(&x, &bits, sizeof(bits));
		return x;
	}
	int32_t exponent = int32_t(bits >> 23) - 127;
	// Bottom bit of exponent (inverted) + significand (except bottom bit).
	uint32_t index = ((bits + 0x00800000u) >> 1) & 0x007FFFFFu;
	bits = vfpu_rsqrt_fixed(index);
	bits -= uint32_t(exponent >> 1) << 23;
	memcpy(&x, &bits, sizeof(bits));
	return x;
}

// Quantum (output step size) for the asin tables; asin keeps 23
// significant bits (vs. 22 for sin/cos).
static inline uint32_t vfpu_asin_quantum(uint32_t x) {
	return x<1u<<23?
		1u:
		1u<<(32-23-clz32_nonzero(x));
}

// Rounds x down to a multiple of its asin quantum.
static inline uint32_t vfpu_asin_truncate_bits(uint32_t x) {
	return x & -vfpu_asin_quantum(x);
}

// Input is fixed 9.23, output is fixed 2.30.
// Per-interval quadratic polynomial approximation (coefficients C[0..2]
// from vfpu_asin_lut65536), evaluated via Horner in 64-bit.
static inline uint32_t vfpu_asin_approx(uint32_t x) {
	const int32_t *C = vfpu_asin_lut65536[x >> 16];
	x &= 0xFFFFu;
	return vfpu_asin_truncate_bits(uint32_t((((((int64_t(C[2]) * x) >> 16) + int64_t(C[1])) * x) >> 16) + C[0]));
}

// Input is fixed 9.23, output is fixed 2.30.
// Polynomial approximation plus a per-input correction of -3..+4 quanta,
// packed 3 bits per entry, 21 entries per delta word.
static uint32_t vfpu_asin_fixed(uint32_t x) {
	if(x == 0u) return 0u;
	if(x == 1u << 23) return 1u << 30;
	uint32_t ret = vfpu_asin_approx(x);
	uint32_t index = vfpu_asin_lut_indices[x / 21u];
	uint64_t deltas = vfpu_asin_lut_deltas[index];
	return ret + (3u - uint32_t((deltas >> (3u * (x % 21u))) & 7u)) * vfpu_asin_quantum(ret);
}

// PSP-accurate vasin. Result is scaled by 0x1p-30 (2.30 fixed core
// output), so asin(1) == 1 in PSP quarter-turn convention.
// |x| > 1 yields NaN carrying the input's sign.
float vfpu_asin(float x) {
	static bool loaded =
		LOAD_TABLE(vfpu_asin_lut65536, 1536)&&
		LOAD_TABLE(vfpu_asin_lut_indices, 798916)&&
		LOAD_TABLE(vfpu_asin_lut_deltas, 517448);
	if (!loaded)
		return vfpu_asin_fallback(x);

	uint32_t bits;
	memcpy(&bits, &x, sizeof(x));
	uint32_t sign = bits & 0x80000000u;
	bits = bits & 0x7FFFFFFFu;
	if(bits > 0x3F800000u) {
		bits = 0x7F800001u ^ sign;
		memcpy(&x, &bits, sizeof(x));
		return x;
	}

	bits = vfpu_asin_fixed(uint32_t(int32_t(fabsf(x) * 8388608.0f))); // 0x1p23
	x=float(int32_t(bits)) * 9.31322574615478515625e-10f; // 0x1p-30
	if(sign) x = -x;
	return x;
}

// Coarse exp2 of the fractional part: LUT base for the top 7 bits plus a
// fixed quadratic correction of the low 16 bits. Output truncated to
// multiples of 4 (PSP precision).
static inline uint32_t vfpu_exp2_approx(uint32_t x) {
	if(x == 0x00800000u) return 0x00800000u;
	uint32_t a=vfpu_exp2_lut65536[x >> 16];
	x &= 0x0000FFFFu;
	uint32_t b = uint32_t(((2977151143ull * x) >> 23) + ((1032119999ull * (x * x)) >> 46));
	return (a + uint32_t((uint64_t(a + (1u << 23)) * uint64_t(b)) >> 32)) & -4u;
}

// exp2 core: LUT-corrected lerp between vfpu_exp2_approx endpoints,
// same structure as the sqrt/rsqrt cores.
static inline uint32_t vfpu_exp2_fixed(uint32_t x) {
	if(x == 0u) return 0u;
	if(x == 0x00800000u) return 0x00800000u;
	uint32_t A = vfpu_exp2_approx((x     ) & -64u);
	uint32_t B = vfpu_exp2_approx((x + 64) & -64u);
	uint64_t a = (A<<4)+vfpu_exp2_lut[x >> 6][0]-64u;
	uint64_t b = (B<<4)+vfpu_exp2_lut[x >> 6][1]-64u;
	uint32_t y = uint32_t((a + (((b - a) * (x & 63)) >> 6)) >> 4);
	y &= -4u;
	return y;
}

// PSP-accurate vexp2 (2^x). Special cases: denormals -> 1 (treated as
// 2^0), NaN -> NaN, x <= -126 -> 0, x >= 128 -> inf. Otherwise splits
// x*2^23 into integer exponent and fractional part for the core.
float vfpu_exp2(float x) {
	static bool loaded =
		LOAD_TABLE(vfpu_exp2_lut65536, 512)&&
		LOAD_TABLE(vfpu_exp2_lut, 262144);
	if (!loaded)
		return vfpu_exp2_fallback(x);
	int32_t bits;
	memcpy(&bits, &x, sizeof(bits));
	if((bits & 0x7FFFFFFF) <= 0x007FFFFF) {
		// Denormals are treated as 0.
		return 1.0f;
	}
	if(x != x) {
		// NaN gets NaN.
		bits = 0x7F800001u;
		memcpy(&x, &bits, sizeof(x));
		return x;
	}
	if(x <= -126.0f) {
		// Small numbers get 0 (exp2(-126) is smallest positive non-denormal).
		// But yes, -126.0f produces +0.0f.
		return 0.0f;
	}
	if(x >= +128.0f) {
		// Large numbers get infinity.
		bits = 0x7F800000u;
		memcpy(&x, &bits, sizeof(x));
		return x;
	}
	bits = int32_t(x * 0x1p23f);
	if(x < 0.0f) --bits; // Yes, really.
	bits = int32_t(0x3F800000) + (bits & int32_t(0xFF800000)) + int32_t(vfpu_exp2_fixed(bits & int32_t(0x007FFFFF)));
	memcpy(&x, &bits, sizeof(bits));
	return x;
}

// PSP-accurate vrexp2 (2^-x), defined directly via vfpu_exp2.
float vfpu_rexp2(float x) {
	return vfpu_exp2(-x);
}

// Input fixed 9.23, output fixed 10.22.
// Returns log2(1+x).
// Linear interpolation between 65536-wide LUT endpoints, plus a
// quadratic correction term.
static inline uint32_t vfpu_log2_approx(uint32_t x) {
	uint32_t a = vfpu_log2_lut65536[(x >> 16) + 0];
	uint32_t b = vfpu_log2_lut65536[(x >> 16) + 1];
	uint32_t c = vfpu_log2_lut65536_quadratic[x >> 16];
	x &= 0xFFFFu;
	uint64_t ret = uint64_t(a) * (0x10000u - x) + uint64_t(b) * x;
	uint64_t d = (uint64_t(c) * x * (0x10000u-x)) >> 40;
	ret += d;
	return uint32_t(ret >> 16);
}

// Matches PSP output on all known values.
// PSP-accurate vlog2. Special cases: denormals/zeros -> -inf, other
// negatives -> NaN, +inf -> +inf, NaN -> NaN. The result precision
// (quantum q = 2^d) depends on the magnitude of the exponent.
float vfpu_log2(float x) {
	static bool loaded =
		LOAD_TABLE(vfpu_log2_lut65536, 516)&&
		LOAD_TABLE(vfpu_log2_lut65536_quadratic, 512)&&
		LOAD_TABLE(vfpu_log2_lut, 2097152);
	if (!loaded)
		return vfpu_log2_fallback(x);
	uint32_t bits;
	memcpy(&bits, &x, sizeof(bits));
	if((bits & 0x7FFFFFFFu) <= 0x007FFFFFu) {
		// Denormals (and zeroes) get -inf.
		bits = 0xFF800000u;
		memcpy(&x, &bits, sizeof(x));
		return x;
	}
	if(bits & 0x80000000u) {
		// Other negatives get NaN.
		bits = 0x7F800001u;
		memcpy(&x, &bits, sizeof(x));
		return x;
	}
	if((bits >> 23) == 255u) {
		// NaN gets NaN, +inf gets +inf.
		bits = 0x7F800000u + ((bits & 0x007FFFFFu) != 0);
		memcpy(&x, &bits, sizeof(x));
		return x;
	}
	uint32_t e = (bits & 0x7F800000u) - 0x3F800000u;
	uint32_t i = bits & 0x007FFFFFu;
	if(e >> 31 && i >= 0x007FFE00u) {
		// Process 1-2^{-14}<=x*2^n<1 (for n>0) separately,
		// since the table doesn't give the right answer.
		float c = float(int32_t(~e) >> 23);
		// Note: if c is 0 the sign of -0 output is correct.
		return i < 0x007FFEF7u ? // 1-265*2^{-24}
			-3.05175781e-05f - c:
			-0.0f - c;
	}
	// d selects the precision level (quantum 2^d) based on |exponent|.
	int d = (e < 0x01000000u ? 0 : 8 - clz32_nonzero(e) - int(e >> 31));
	//assert(d >= 0 && d < 8);
	uint32_t q = 1u << d;
	uint32_t A = vfpu_log2_approx((i     ) & -64u) & -q;
	uint32_t B = vfpu_log2_approx((i + 64) & -64u) & -q;
	uint64_t a = (A << 6)+(uint64_t(vfpu_log2_lut[d][i >> 6][0]) - 80ull) * q;
	uint64_t b = (B << 6)+(uint64_t(vfpu_log2_lut[d][i >> 6][1]) - 80ull) * q;
	uint32_t v = uint32_t((a +(((b - a) * (i & 63)) >> 6)) >> 6);
	v &= -q;
	// XOR folds the integer exponent part with the fractional part
	// (works for negative e as well, hence the 2u*v scale).
	bits = e ^ (2u * v);
	x = float(int32_t(bits)) * 1.1920928955e-7f; // 0x1p-23f
	return x;
}

// Coarse reciprocal of 1.23 fixed-point (1 + i/2^23), as a float-style
// bitpattern base; exact integer division truncated to multiples of 4.
static inline uint32_t vfpu_rcp_approx(uint32_t i) {
	return 0x3E800000u + (uint32_t((1ull << 47) / ((1ull << 23) + i)) & -4u);
}

// PSP-accurate vrcp (1/x). Special cases: |x| > 0x7E800000 bitpattern
// (result would underflow, or input inf/NaN) -> signed 0 or NaN;
// zero exponent (denormal/zero) -> signed inf. Otherwise LUT-corrected
// lerp over vfpu_rcp_approx endpoints with the exponent negated.
float vfpu_rcp(float x) {
	static bool loaded =
		LOAD_TABLE(vfpu_rcp_lut, 262144);
	if (!loaded)
		return vfpu_rcp_fallback(x);
	uint32_t bits;
	memcpy(&bits, &x, sizeof(bits));
	uint32_t s = bits & 0x80000000u;
	uint32_t e = bits & 0x7F800000u;
	uint32_t i = bits & 0x007FFFFFu;
	if((bits & 0x7FFFFFFFu) > 0x7E800000u) {
		// NaN in -> signed NaN out; huge or inf in -> signed zero out.
		bits = (e == 0x7F800000u && i ? s ^ 0x7F800001u : s);
		memcpy(&x, &bits, sizeof(x));
		return x;
	}
	if(e==0u) {
		// Denormals and zeros -> inf of the same sign.
		bits = s^0x7F800000u;
		memcpy(&x, &bits, sizeof(x));
		return x;
	}
	uint32_t A = vfpu_rcp_approx((i     ) & -64u);
	uint32_t B = vfpu_rcp_approx((i + 64) & -64u);
	uint64_t a = (uint64_t(A) << 6) + uint64_t(vfpu_rcp_lut[i >> 6][0]) * 4u;
	uint64_t b = (uint64_t(B) << 6) + uint64_t(vfpu_rcp_lut[i >> 6][1]) * 4u;
	uint32_t v = uint32_t((a+(((b-a)*(i&63))>>6))>>6);
	v &= -4u;
	// Negate the exponent (0x3F800000 - e) and add the mantissa part.
	bits = s + (0x3F800000u - e) + v;
	memcpy(&x, &bits, sizeof(x));
	return x;
}

//==============================================================================

// VFPU one-time init. Table loading is currently lazy (each vfpu_*
// function loads its own tables on first call); the eager preload
// below is kept for reference but disabled.
void InitVFPU() {
#if 0
	// Load all in advance.
	LOAD_TABLE(vfpu_asin_lut65536          ,   1536);
	LOAD_TABLE(vfpu_asin_lut_deltas        , 517448);
	LOAD_TABLE(vfpu_asin_lut_indices       , 798916);
	LOAD_TABLE(vfpu_exp2_lut65536          ,    512);
	LOAD_TABLE(vfpu_exp2_lut               , 262144);
	LOAD_TABLE(vfpu_log2_lut65536          ,    516);
	LOAD_TABLE(vfpu_log2_lut65536_quadratic,    512);
	LOAD_TABLE(vfpu_log2_lut               , 2097152);
	LOAD_TABLE(vfpu_rcp_lut                , 262144);
	LOAD_TABLE(vfpu_rsqrt_lut              , 262144);
	LOAD_TABLE(vfpu_sin_lut8192            ,   4100);
	LOAD_TABLE(vfpu_sin_lut_delta          , 262144);
	LOAD_TABLE(vfpu_sin_lut_exceptions     ,  86938);
	LOAD_TABLE(vfpu_sin_lut_interval_delta , 131074);
	LOAD_TABLE(vfpu_sqrt_lut               , 262144);
#endif
}