CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!
Path: blob/master/Core/MIPS/IR/IRInterpreter.cpp
Views: 1401
#include <algorithm>1#include <cmath>23#include "ppsspp_config.h"4#include "Common/BitSet.h"5#include "Common/BitScan.h"6#include "Common/Common.h"7#include "Common/Data/Convert/SmallDataConvert.h"8#include "Common/Math/math_util.h"910#ifdef _M_SSE11#include <emmintrin.h>12#endif1314#if PPSSPP_ARCH(ARM_NEON)15#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)16#include <arm64_neon.h>17#else18#include <arm_neon.h>19#endif20#endif2122#include "Core/Core.h"23#include "Core/CoreTiming.h"24#include "Core/Debugger/Breakpoints.h"25#include "Core/HLE/HLE.h"26#include "Core/HLE/ReplaceTables.h"27#include "Core/MemMap.h"28#include "Core/MIPS/MIPS.h"29#include "Core/MIPS/MIPSTables.h"30#include "Core/MIPS/MIPSVFPUUtils.h"31#include "Core/MIPS/IR/IRInst.h"32#include "Core/MIPS/IR/IRInterpreter.h"33#include "Core/System.h"34#include "Core/MIPS/MIPSTracer.h"3536#ifdef mips37// Why do MIPS compilers define something so generic? Try to keep defined, at least...38#undef mips39#define mips mips40#endif4142alignas(16) static const float vec4InitValues[8][4] = {43{ 0.0f, 0.0f, 0.0f, 0.0f },44{ 1.0f, 1.0f, 1.0f, 1.0f },45{ -1.0f, -1.0f, -1.0f, -1.0f },46{ 1.0f, 0.0f, 0.0f, 0.0f },47{ 0.0f, 1.0f, 0.0f, 0.0f },48{ 0.0f, 0.0f, 1.0f, 0.0f },49{ 0.0f, 0.0f, 0.0f, 1.0f },50};5152alignas(16) static const uint32_t signBits[4] = {530x80000000, 0x80000000, 0x80000000, 0x80000000,54};5556alignas(16) static const uint32_t noSignMask[4] = {570x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF,58};5960alignas(16) static const uint32_t lowBytesMask[4] = {610x000000FF, 0x000000FF, 0x000000FF, 0x000000FF,62};6364u32 IRRunBreakpoint(u32 pc) {65// Should we skip this breakpoint?66uint32_t skipFirst = CBreakPoints::CheckSkipFirst();67if (skipFirst == pc || skipFirst == currentMIPS->pc)68return 0;6970// Did we already hit one?71if (coreState != CORE_RUNNING && coreState != CORE_NEXTFRAME)72return 1;7374CBreakPoints::ExecBreakPoint(pc);75return coreState != CORE_RUNNING ? 1 : 0;76}7778u32 IRRunMemCheck(u32 pc, u32 addr) {79// Should we skip this breakpoint?80uint32_t skipFirst = CBreakPoints::CheckSkipFirst();81if (skipFirst == pc || skipFirst == currentMIPS->pc)82return 0;8384// Did we already hit one?85if (coreState != CORE_RUNNING && coreState != CORE_NEXTFRAME)86return 1;8788CBreakPoints::ExecOpMemCheck(addr, pc);89return coreState != CORE_RUNNING ? 1 : 0;90}9192void IRApplyRounding(MIPSState *mips) {93u32 fcr1Bits = mips->fcr31 & 0x01000003;94// If these are 0, we just leave things as they are.95if (fcr1Bits) {96int rmode = fcr1Bits & 3;97bool ftz = (fcr1Bits & 0x01000000) != 0;98#if PPSSPP_ARCH(SSE2)99u32 csr = _mm_getcsr() & ~0x6000;100// Translate the rounding mode bits to X86, the same way as in Asm.cpp.101if (rmode & 1) {102rmode ^= 2;103}104csr |= rmode << 13;105106if (ftz) {107// Flush to zero108csr |= 0x8000;109}110_mm_setcsr(csr);111#elif PPSSPP_ARCH(ARM64) && !PPSSPP_PLATFORM(WINDOWS)112// On ARM64 we need to use inline assembly for a portable solution.113// Unfortunately we don't have this possibility on Windows with MSVC, so ifdeffed out above.114// Note that in the JIT, for fcvts, we use specific conversions. We could use the FCVTS variants115// directly through inline assembly.116u64 fpcr; // not really 64-bit, just to match the register size.117asm volatile ("mrs %0, fpcr" : "=r" (fpcr));118119// Translate MIPS to ARM rounding mode120static const u8 lookup[4] = {0, 3, 1, 2};121122fpcr &= ~(3 << 22); // Clear bits [23:22]123fpcr |= (lookup[rmode] << 22);124125if (ftz) {126fpcr |= 1 << 24;127}128// Write back the modified FPCR129asm volatile ("msr fpcr, %0" : : "r" (fpcr));130#endif131}132}133134void IRRestoreRounding() {135#if PPSSPP_ARCH(SSE2)136// TODO: We should avoid this if we didn't apply rounding in the first place.137// In the meantime, clear out FTZ and rounding mode bits.138u32 csr = _mm_getcsr();139csr &= ~(7 << 13);140_mm_setcsr(csr);141#elif PPSSPP_ARCH(ARM64) && !PPSSPP_PLATFORM(WINDOWS)142u64 fpcr; // not really 64-bit, just to match the regsiter size.143asm volatile ("mrs %0, fpcr" : "=r" (fpcr));144fpcr &= ~(7 << 22); // Clear bits [23:22] for rounding, 24 for FTZ145// Write back the modified FPCR146asm volatile ("msr fpcr, %0" : : "r" (fpcr));147#endif148}149150// We cannot use NEON on ARM32 here until we make it a hard dependency. We can, however, on ARM64.151u32 IRInterpret(MIPSState *mips, const IRInst *inst) {152while (true) {153switch (inst->op) {154case IROp::SetConst:155mips->r[inst->dest] = inst->constant;156break;157case IROp::SetConstF:158memcpy(&mips->f[inst->dest], &inst->constant, 4);159break;160case IROp::Add:161mips->r[inst->dest] = mips->r[inst->src1] + mips->r[inst->src2];162break;163case IROp::Sub:164mips->r[inst->dest] = mips->r[inst->src1] - mips->r[inst->src2];165break;166case IROp::And:167mips->r[inst->dest] = mips->r[inst->src1] & mips->r[inst->src2];168break;169case IROp::Or:170mips->r[inst->dest] = mips->r[inst->src1] | mips->r[inst->src2];171break;172case IROp::Xor:173mips->r[inst->dest] = mips->r[inst->src1] ^ mips->r[inst->src2];174break;175case IROp::Mov:176mips->r[inst->dest] = mips->r[inst->src1];177break;178case IROp::AddConst:179mips->r[inst->dest] = mips->r[inst->src1] + inst->constant;180break;181case IROp::OptAddConst: // For this one, it's worth having a "unary" variant of the above that only needs to read one register param.182mips->r[inst->dest] += inst->constant;183break;184case IROp::SubConst:185mips->r[inst->dest] = mips->r[inst->src1] - inst->constant;186break;187case IROp::AndConst:188mips->r[inst->dest] = mips->r[inst->src1] & inst->constant;189break;190case IROp::OptAndConst: // For this one, it's worth having a "unary" variant of the above that only needs to read one register param.191mips->r[inst->dest] &= inst->constant;192break;193case IROp::OrConst:194mips->r[inst->dest] = mips->r[inst->src1] | inst->constant;195break;196case IROp::OptOrConst:197mips->r[inst->dest] |= inst->constant;198break;199case IROp::XorConst:200mips->r[inst->dest] = mips->r[inst->src1] ^ inst->constant;201break;202case IROp::Neg:203mips->r[inst->dest] = (u32)(-(s32)mips->r[inst->src1]);204break;205case IROp::Not:206mips->r[inst->dest] = ~mips->r[inst->src1];207break;208case IROp::Ext8to32:209mips->r[inst->dest] = SignExtend8ToU32(mips->r[inst->src1]);210break;211case IROp::Ext16to32:212mips->r[inst->dest] = SignExtend16ToU32(mips->r[inst->src1]);213break;214case IROp::ReverseBits:215mips->r[inst->dest] = ReverseBits32(mips->r[inst->src1]);216break;217218case IROp::Load8:219mips->r[inst->dest] = Memory::ReadUnchecked_U8(mips->r[inst->src1] + inst->constant);220break;221case IROp::Load8Ext:222mips->r[inst->dest] = SignExtend8ToU32(Memory::ReadUnchecked_U8(mips->r[inst->src1] + inst->constant));223break;224case IROp::Load16:225mips->r[inst->dest] = Memory::ReadUnchecked_U16(mips->r[inst->src1] + inst->constant);226break;227case IROp::Load16Ext:228mips->r[inst->dest] = SignExtend16ToU32(Memory::ReadUnchecked_U16(mips->r[inst->src1] + inst->constant));229break;230case IROp::Load32:231mips->r[inst->dest] = Memory::ReadUnchecked_U32(mips->r[inst->src1] + inst->constant);232break;233case IROp::Load32Left:234{235u32 addr = mips->r[inst->src1] + inst->constant;236u32 shift = (addr & 3) * 8;237u32 mem = Memory::ReadUnchecked_U32(addr & 0xfffffffc);238u32 destMask = 0x00ffffff >> shift;239mips->r[inst->dest] = (mips->r[inst->dest] & destMask) | (mem << (24 - shift));240break;241}242case IROp::Load32Right:243{244u32 addr = mips->r[inst->src1] + inst->constant;245u32 shift = (addr & 3) * 8;246u32 mem = Memory::ReadUnchecked_U32(addr & 0xfffffffc);247u32 destMask = 0xffffff00 << (24 - shift);248mips->r[inst->dest] = (mips->r[inst->dest] & destMask) | (mem >> shift);249break;250}251case IROp::Load32Linked:252if (inst->dest != MIPS_REG_ZERO)253mips->r[inst->dest] = Memory::ReadUnchecked_U32(mips->r[inst->src1] + inst->constant);254mips->llBit = 1;255break;256case IROp::LoadFloat:257mips->f[inst->dest] = Memory::ReadUnchecked_Float(mips->r[inst->src1] + inst->constant);258break;259260case IROp::Store8:261Memory::WriteUnchecked_U8(mips->r[inst->src3], mips->r[inst->src1] + inst->constant);262break;263case IROp::Store16:264Memory::WriteUnchecked_U16(mips->r[inst->src3], mips->r[inst->src1] + inst->constant);265break;266case IROp::Store32:267Memory::WriteUnchecked_U32(mips->r[inst->src3], mips->r[inst->src1] + inst->constant);268break;269case IROp::Store32Left:270{271u32 addr = mips->r[inst->src1] + inst->constant;272u32 shift = (addr & 3) * 8;273u32 mem = Memory::ReadUnchecked_U32(addr & 0xfffffffc);274u32 memMask = 0xffffff00 << shift;275u32 result = (mips->r[inst->src3] >> (24 - shift)) | (mem & memMask);276Memory::WriteUnchecked_U32(result, addr & 0xfffffffc);277break;278}279case IROp::Store32Right:280{281u32 addr = mips->r[inst->src1] + inst->constant;282u32 shift = (addr & 3) * 8;283u32 mem = Memory::ReadUnchecked_U32(addr & 0xfffffffc);284u32 memMask = 0x00ffffff >> (24 - shift);285u32 result = (mips->r[inst->src3] << shift) | (mem & memMask);286Memory::WriteUnchecked_U32(result, addr & 0xfffffffc);287break;288}289case IROp::Store32Conditional:290if (mips->llBit) {291Memory::WriteUnchecked_U32(mips->r[inst->src3], mips->r[inst->src1] + inst->constant);292if (inst->dest != MIPS_REG_ZERO) {293mips->r[inst->dest] = 1;294}295} else if (inst->dest != MIPS_REG_ZERO) {296mips->r[inst->dest] = 0;297}298break;299case IROp::StoreFloat:300Memory::WriteUnchecked_Float(mips->f[inst->src3], mips->r[inst->src1] + inst->constant);301break;302303case IROp::LoadVec4:304{305u32 base = mips->r[inst->src1] + inst->constant;306// This compiles to a nice SSE load/store on x86, and hopefully similar on ARM.307memcpy(&mips->f[inst->dest], Memory::GetPointerUnchecked(base), 4 * 4);308break;309}310case IROp::StoreVec4:311{312u32 base = mips->r[inst->src1] + inst->constant;313memcpy((float *)Memory::GetPointerUnchecked(base), &mips->f[inst->dest], 4 * 4);314break;315}316317case IROp::Vec4Init:318{319memcpy(&mips->f[inst->dest], vec4InitValues[inst->src1], 4 * sizeof(float));320break;321}322323case IROp::Vec4Shuffle:324{325// Can't use the SSE shuffle here because it takes an immediate. pshufb with a table would work though,326// or a big switch - there are only 256 shuffles possible (4^4)327float temp[4];328for (int i = 0; i < 4; i++)329temp[i] = mips->f[inst->src1 + ((inst->src2 >> (i * 2)) & 3)];330const int dest = inst->dest;331for (int i = 0; i < 4; i++)332mips->f[dest + i] = temp[i];333break;334}335336case IROp::Vec4Blend:337{338const int dest = inst->dest;339const int src1 = inst->src1;340const int src2 = inst->src2;341const int constant = inst->constant;342// 90% of calls to this is inst->constant == 7 or inst->constant == 8. Some are 1 and 4, others very rare.343// Could use _mm_blendv_ps (SSE4+BMI), vbslq_f32 (ARM), __riscv_vmerge_vvm (RISC-V)344for (int i = 0; i < 4; i++)345mips->f[dest + i] = ((constant >> i) & 1) ? mips->f[src2 + i] : mips->f[src1 + i];346break;347}348349case IROp::Vec4Mov:350{351#if defined(_M_SSE)352_mm_store_ps(&mips->f[inst->dest], _mm_load_ps(&mips->f[inst->src1]));353#elif PPSSPP_ARCH(ARM_NEON)354vst1q_f32(&mips->f[inst->dest], vld1q_f32(&mips->f[inst->src1]));355#else356memcpy(&mips->f[inst->dest], &mips->f[inst->src1], 4 * sizeof(float));357#endif358break;359}360361case IROp::Vec4Add:362{363#if defined(_M_SSE)364_mm_store_ps(&mips->f[inst->dest], _mm_add_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));365#elif PPSSPP_ARCH(ARM_NEON)366vst1q_f32(&mips->f[inst->dest], vaddq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));367#else368for (int i = 0; i < 4; i++)369mips->f[inst->dest + i] = mips->f[inst->src1 + i] + mips->f[inst->src2 + i];370#endif371break;372}373374case IROp::Vec4Sub:375{376#if defined(_M_SSE)377_mm_store_ps(&mips->f[inst->dest], _mm_sub_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));378#elif PPSSPP_ARCH(ARM_NEON)379vst1q_f32(&mips->f[inst->dest], vsubq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));380#else381for (int i = 0; i < 4; i++)382mips->f[inst->dest + i] = mips->f[inst->src1 + i] - mips->f[inst->src2 + i];383#endif384break;385}386387case IROp::Vec4Mul:388{389#if defined(_M_SSE)390_mm_store_ps(&mips->f[inst->dest], _mm_mul_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));391#elif PPSSPP_ARCH(ARM_NEON)392vst1q_f32(&mips->f[inst->dest], vmulq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));393#else394for (int i = 0; i < 4; i++)395mips->f[inst->dest + i] = mips->f[inst->src1 + i] * mips->f[inst->src2 + i];396#endif397break;398}399400case IROp::Vec4Div:401{402#if defined(_M_SSE)403_mm_store_ps(&mips->f[inst->dest], _mm_div_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));404#elif PPSSPP_ARCH(ARM64_NEON)405vst1q_f32(&mips->f[inst->dest], vdivq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));406#else407for (int i = 0; i < 4; i++)408mips->f[inst->dest + i] = mips->f[inst->src1 + i] / mips->f[inst->src2 + i];409#endif410break;411}412413case IROp::Vec4Scale:414{415#if defined(_M_SSE)416_mm_store_ps(&mips->f[inst->dest], _mm_mul_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_set1_ps(mips->f[inst->src2])));417#elif PPSSPP_ARCH(ARM_NEON)418vst1q_f32(&mips->f[inst->dest], vmulq_lane_f32(vld1q_f32(&mips->f[inst->src1]), vdup_n_f32(mips->f[inst->src2]), 0));419#else420const float factor = mips->f[inst->src2];421for (int i = 0; i < 4; i++)422mips->f[inst->dest + i] = mips->f[inst->src1 + i] * factor;423#endif424break;425}426427case IROp::Vec4Neg:428{429#if defined(_M_SSE)430_mm_store_ps(&mips->f[inst->dest], _mm_xor_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps((const float *)signBits)));431#elif PPSSPP_ARCH(ARM_NEON)432vst1q_f32(&mips->f[inst->dest], vnegq_f32(vld1q_f32(&mips->f[inst->src1])));433#else434for (int i = 0; i < 4; i++)435mips->f[inst->dest + i] = -mips->f[inst->src1 + i];436#endif437break;438}439440case IROp::Vec4Abs:441{442#if defined(_M_SSE)443_mm_store_ps(&mips->f[inst->dest], _mm_and_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps((const float *)noSignMask)));444#elif PPSSPP_ARCH(ARM_NEON)445vst1q_f32(&mips->f[inst->dest], vabsq_f32(vld1q_f32(&mips->f[inst->src1])));446#else447for (int i = 0; i < 4; i++)448mips->f[inst->dest + i] = fabsf(mips->f[inst->src1 + i]);449#endif450break;451}452453case IROp::Vec2Unpack16To31:454{455const int dest = inst->dest;456const int src1 = inst->src1;457mips->fi[dest] = (mips->fi[src1] << 16) >> 1;458mips->fi[dest + 1] = (mips->fi[src1] & 0xFFFF0000) >> 1;459break;460}461462case IROp::Vec2Unpack16To32:463{464const int dest = inst->dest;465const int src1 = inst->src1;466mips->fi[dest] = (mips->fi[src1] << 16);467mips->fi[dest + 1] = (mips->fi[src1] & 0xFFFF0000);468break;469}470471case IROp::Vec4Unpack8To32:472{473#if defined(_M_SSE)474__m128i src = _mm_cvtsi32_si128(mips->fi[inst->src1]);475src = _mm_unpacklo_epi8(src, _mm_setzero_si128());476src = _mm_unpacklo_epi16(src, _mm_setzero_si128());477_mm_store_si128((__m128i *)&mips->fi[inst->dest], _mm_slli_epi32(src, 24));478#elif PPSSPP_ARCH(ARM_NEON) && 0 // Untested479const uint8x8_t value = (uint8x8_t)vdup_n_u32(mips->fi[inst->src1]);480const uint16x8_t value16 = vmovl_u8(value);481const uint32x4_t value32 = vshll_n_u16(vget_low_u16(value16), 24);482vst1q_u32(&mips->fi[inst->dest], value32);483#else484mips->fi[inst->dest] = (mips->fi[inst->src1] << 24);485mips->fi[inst->dest + 1] = (mips->fi[inst->src1] << 16) & 0xFF000000;486mips->fi[inst->dest + 2] = (mips->fi[inst->src1] << 8) & 0xFF000000;487mips->fi[inst->dest + 3] = (mips->fi[inst->src1]) & 0xFF000000;488#endif489break;490}491492case IROp::Vec2Pack32To16:493{494u32 val = mips->fi[inst->src1] >> 16;495mips->fi[inst->dest] = (mips->fi[inst->src1 + 1] & 0xFFFF0000) | val;496break;497}498499case IROp::Vec2Pack31To16:500{501// Used in Tekken 6502503u32 val = (mips->fi[inst->src1] >> 15) & 0xFFFF;504val |= (mips->fi[inst->src1 + 1] << 1) & 0xFFFF0000;505mips->fi[inst->dest] = val;506break;507}508509case IROp::Vec4Pack32To8:510{511// Removed previous SSE code due to the need for unsigned 16-bit pack, which I'm too lazy to work around the lack of in SSE2.512// pshufb or SSE4 instructions can be used instead.513u32 val = mips->fi[inst->src1] >> 24;514val |= (mips->fi[inst->src1 + 1] >> 16) & 0xFF00;515val |= (mips->fi[inst->src1 + 2] >> 8) & 0xFF0000;516val |= (mips->fi[inst->src1 + 3]) & 0xFF000000;517mips->fi[inst->dest] = val;518break;519}520521case IROp::Vec4Pack31To8:522{523// Used in Tekken 6524525// Removed previous SSE code due to the need for unsigned 16-bit pack, which I'm too lazy to work around the lack of in SSE2.526// pshufb or SSE4 instructions can be used instead.527#if PPSSPP_ARCH(ARM_NEON) && 0528// Untested529uint32x4_t value = vld1q_u32(&mips->fi[inst->src1]);530value = vshlq_n_u32(value, 1);531uint32x2_t halved = vshrn_n_u32(value, 8);532uint32x2_t halvedAgain = vshrn_n_u32(vcombine_u32(halved, vdup_n_u32(0)), 8);533mips->fi[inst->dest] = vget_lane_u32(halvedAgain, 0);534#else535u32 val = (mips->fi[inst->src1] >> 23) & 0xFF;536val |= (mips->fi[inst->src1 + 1] >> 15) & 0xFF00;537val |= (mips->fi[inst->src1 + 2] >> 7) & 0xFF0000;538val |= (mips->fi[inst->src1 + 3] << 1) & 0xFF000000;539mips->fi[inst->dest] = val;540#endif541break;542}543544case IROp::Vec2ClampToZero:545{546for (int i = 0; i < 2; i++) {547u32 val = mips->fi[inst->src1 + i];548mips->fi[inst->dest + i] = (int)val >= 0 ? val : 0;549}550break;551}552553case IROp::Vec4ClampToZero:554{555#if defined(_M_SSE)556// Trickery: Expand the sign bit, and use andnot to zero negative values.557__m128i val = _mm_load_si128((const __m128i *)&mips->fi[inst->src1]);558__m128i mask = _mm_srai_epi32(val, 31);559val = _mm_andnot_si128(mask, val);560_mm_store_si128((__m128i *)&mips->fi[inst->dest], val);561#else562const int src1 = inst->src1;563const int dest = inst->dest;564for (int i = 0; i < 4; i++) {565u32 val = mips->fi[src1 + i];566mips->fi[dest + i] = (int)val >= 0 ? val : 0;567}568#endif569break;570}571572case IROp::Vec4DuplicateUpperBitsAndShift1: // For vuc2i, the weird one.573{574const int src1 = inst->src1;575const int dest = inst->dest;576for (int i = 0; i < 4; i++) {577u32 val = mips->fi[src1 + i];578val = val | (val >> 8);579val = val | (val >> 16);580val >>= 1;581mips->fi[dest + i] = val;582}583break;584}585586case IROp::FCmpVfpuBit:587{588const int op = inst->dest & 0xF;589const int bit = inst->dest >> 4;590int result = 0;591switch (op) {592case VC_EQ: result = mips->f[inst->src1] == mips->f[inst->src2]; break;593case VC_NE: result = mips->f[inst->src1] != mips->f[inst->src2]; break;594case VC_LT: result = mips->f[inst->src1] < mips->f[inst->src2]; break;595case VC_LE: result = mips->f[inst->src1] <= mips->f[inst->src2]; break;596case VC_GT: result = mips->f[inst->src1] > mips->f[inst->src2]; break;597case VC_GE: result = mips->f[inst->src1] >= mips->f[inst->src2]; break;598case VC_EZ: result = mips->f[inst->src1] == 0.0f; break;599case VC_NZ: result = mips->f[inst->src1] != 0.0f; break;600case VC_EN: result = my_isnan(mips->f[inst->src1]); break;601case VC_NN: result = !my_isnan(mips->f[inst->src1]); break;602case VC_EI: result = my_isinf(mips->f[inst->src1]); break;603case VC_NI: result = !my_isinf(mips->f[inst->src1]); break;604case VC_ES: result = my_isnanorinf(mips->f[inst->src1]); break;605case VC_NS: result = !my_isnanorinf(mips->f[inst->src1]); break;606case VC_TR: result = 1; break;607case VC_FL: result = 0; break;608default:609result = 0;610}611if (result != 0) {612mips->vfpuCtrl[VFPU_CTRL_CC] |= (1 << bit);613} else {614mips->vfpuCtrl[VFPU_CTRL_CC] &= ~(1 << bit);615}616break;617}618619case IROp::FCmpVfpuAggregate:620{621const u32 mask = inst->dest;622const u32 cc = mips->vfpuCtrl[VFPU_CTRL_CC];623int anyBit = (cc & mask) ? 0x10 : 0x00;624int allBit = (cc & mask) == mask ? 0x20 : 0x00;625mips->vfpuCtrl[VFPU_CTRL_CC] = (cc & ~0x30) | anyBit | allBit;626break;627}628629case IROp::FCmovVfpuCC:630if (((mips->vfpuCtrl[VFPU_CTRL_CC] >> (inst->src2 & 0xf)) & 1) == ((u32)inst->src2 >> 7)) {631mips->f[inst->dest] = mips->f[inst->src1];632}633break;634635case IROp::Vec4Dot:636{637// Not quickly implementable on all platforms, unfortunately.638// Though, this is still pretty fast compared to one split into multiple IR instructions.639// This might be good though: https://gist.github.com/rikusalminen/3040241640float dot = mips->f[inst->src1] * mips->f[inst->src2];641for (int i = 1; i < 4; i++)642dot += mips->f[inst->src1 + i] * mips->f[inst->src2 + i];643mips->f[inst->dest] = dot;644break;645}646647case IROp::FSin:648mips->f[inst->dest] = vfpu_sin(mips->f[inst->src1]);649break;650case IROp::FCos:651mips->f[inst->dest] = vfpu_cos(mips->f[inst->src1]);652break;653case IROp::FRSqrt:654mips->f[inst->dest] = 1.0f / sqrtf(mips->f[inst->src1]);655break;656case IROp::FRecip:657mips->f[inst->dest] = 1.0f / mips->f[inst->src1];658break;659case IROp::FAsin:660mips->f[inst->dest] = vfpu_asin(mips->f[inst->src1]);661break;662663case IROp::ShlImm:664mips->r[inst->dest] = mips->r[inst->src1] << (int)inst->src2;665break;666case IROp::ShrImm:667mips->r[inst->dest] = mips->r[inst->src1] >> (int)inst->src2;668break;669case IROp::SarImm:670mips->r[inst->dest] = (s32)mips->r[inst->src1] >> (int)inst->src2;671break;672case IROp::RorImm:673{674u32 x = mips->r[inst->src1];675int sa = inst->src2;676mips->r[inst->dest] = (x >> sa) | (x << (32 - sa));677}678break;679680case IROp::Shl:681mips->r[inst->dest] = mips->r[inst->src1] << (mips->r[inst->src2] & 31);682break;683case IROp::Shr:684mips->r[inst->dest] = mips->r[inst->src1] >> (mips->r[inst->src2] & 31);685break;686case IROp::Sar:687mips->r[inst->dest] = (s32)mips->r[inst->src1] >> (mips->r[inst->src2] & 31);688break;689case IROp::Ror:690{691u32 x = mips->r[inst->src1];692int sa = mips->r[inst->src2] & 31;693mips->r[inst->dest] = (x >> sa) | (x << (32 - sa));694break;695}696697case IROp::Clz:698{699mips->r[inst->dest] = clz32(mips->r[inst->src1]);700break;701}702703case IROp::Slt:704mips->r[inst->dest] = (s32)mips->r[inst->src1] < (s32)mips->r[inst->src2];705break;706707case IROp::SltU:708mips->r[inst->dest] = mips->r[inst->src1] < mips->r[inst->src2];709break;710711case IROp::SltConst:712mips->r[inst->dest] = (s32)mips->r[inst->src1] < (s32)inst->constant;713break;714715case IROp::SltUConst:716mips->r[inst->dest] = mips->r[inst->src1] < inst->constant;717break;718719case IROp::MovZ:720if (mips->r[inst->src1] == 0)721mips->r[inst->dest] = mips->r[inst->src2];722break;723case IROp::MovNZ:724if (mips->r[inst->src1] != 0)725mips->r[inst->dest] = mips->r[inst->src2];726break;727728case IROp::Max:729mips->r[inst->dest] = (s32)mips->r[inst->src1] > (s32)mips->r[inst->src2] ? mips->r[inst->src1] : mips->r[inst->src2];730break;731case IROp::Min:732mips->r[inst->dest] = (s32)mips->r[inst->src1] < (s32)mips->r[inst->src2] ? mips->r[inst->src1] : mips->r[inst->src2];733break;734735case IROp::MtLo:736mips->lo = mips->r[inst->src1];737break;738case IROp::MtHi:739mips->hi = mips->r[inst->src1];740break;741case IROp::MfLo:742mips->r[inst->dest] = mips->lo;743break;744case IROp::MfHi:745mips->r[inst->dest] = mips->hi;746break;747748case IROp::Mult:749{750s64 result = (s64)(s32)mips->r[inst->src1] * (s64)(s32)mips->r[inst->src2];751memcpy(&mips->lo, &result, 8);752break;753}754case IROp::MultU:755{756u64 result = (u64)mips->r[inst->src1] * (u64)mips->r[inst->src2];757memcpy(&mips->lo, &result, 8);758break;759}760case IROp::Madd:761{762s64 result;763memcpy(&result, &mips->lo, 8);764result += (s64)(s32)mips->r[inst->src1] * (s64)(s32)mips->r[inst->src2];765memcpy(&mips->lo, &result, 8);766break;767}768case IROp::MaddU:769{770s64 result;771memcpy(&result, &mips->lo, 8);772result += (u64)mips->r[inst->src1] * (u64)mips->r[inst->src2];773memcpy(&mips->lo, &result, 8);774break;775}776case IROp::Msub:777{778s64 result;779memcpy(&result, &mips->lo, 8);780result -= (s64)(s32)mips->r[inst->src1] * (s64)(s32)mips->r[inst->src2];781memcpy(&mips->lo, &result, 8);782break;783}784case IROp::MsubU:785{786s64 result;787memcpy(&result, &mips->lo, 8);788result -= (u64)mips->r[inst->src1] * (u64)mips->r[inst->src2];789memcpy(&mips->lo, &result, 8);790break;791}792793case IROp::Div:794{795s32 numerator = (s32)mips->r[inst->src1];796s32 denominator = (s32)mips->r[inst->src2];797if (numerator == (s32)0x80000000 && denominator == -1) {798mips->lo = 0x80000000;799mips->hi = -1;800} else if (denominator != 0) {801mips->lo = (u32)(numerator / denominator);802mips->hi = (u32)(numerator % denominator);803} else {804mips->lo = numerator < 0 ? 1 : -1;805mips->hi = numerator;806}807break;808}809case IROp::DivU:810{811u32 numerator = mips->r[inst->src1];812u32 denominator = mips->r[inst->src2];813if (denominator != 0) {814mips->lo = numerator / denominator;815mips->hi = numerator % denominator;816} else {817mips->lo = numerator <= 0xFFFF ? 0xFFFF : -1;818mips->hi = numerator;819}820break;821}822823case IROp::BSwap16:824{825u32 x = mips->r[inst->src1];826// Don't think we can beat this with intrinsics.827mips->r[inst->dest] = ((x & 0xFF00FF00) >> 8) | ((x & 0x00FF00FF) << 8);828break;829}830case IROp::BSwap32:831{832mips->r[inst->dest] = swap32(mips->r[inst->src1]);833break;834}835836case IROp::FAdd:837mips->f[inst->dest] = mips->f[inst->src1] + mips->f[inst->src2];838break;839case IROp::FSub:840mips->f[inst->dest] = mips->f[inst->src1] - mips->f[inst->src2];841break;842case IROp::FMul:843#if 1844{845float a = mips->f[inst->src1];846float b = mips->f[inst->src2];847if ((b == 0.0f && my_isinf(a)) || (a == 0.0f && my_isinf(b))) {848mips->fi[inst->dest] = 0x7fc00000;849} else {850mips->f[inst->dest] = a * b;851}852}853break;854#else855// Not sure if faster since it needs to load the operands twice? But the code is simpler.856{857// Takes care of negative zero by masking away the top bit, which also makes the inf check shorter.858u32 a = mips->fi[inst->src1] & 0x7FFFFFFF;859u32 b = mips->fi[inst->src2] & 0x7FFFFFFF;860if ((a == 0 && b == 0x7F800000) || (b == 0 && a == 0x7F800000)) {861mips->fi[inst->dest] = 0x7fc00000;862} else {863mips->f[inst->dest] = mips->f[inst->src1] * mips->f[inst->src2];864}865break;866}867#endif868case IROp::FDiv:869mips->f[inst->dest] = mips->f[inst->src1] / mips->f[inst->src2];870break;871case IROp::FMin:872if (my_isnan(mips->f[inst->src1]) || my_isnan(mips->f[inst->src2])) {873// See interpreter for this logic: this is for vmin, we're comparing mantissa+exp.874if (mips->fs[inst->src1] < 0 && mips->fs[inst->src2] < 0) {875mips->fs[inst->dest] = std::max(mips->fs[inst->src1], mips->fs[inst->src2]);876} else {877mips->fs[inst->dest] = std::min(mips->fs[inst->src1], mips->fs[inst->src2]);878}879} else {880mips->f[inst->dest] = std::min(mips->f[inst->src1], mips->f[inst->src2]);881}882break;883case IROp::FMax:884if (my_isnan(mips->f[inst->src1]) || my_isnan(mips->f[inst->src2])) {885// See interpreter for this logic: this is for vmax, we're comparing mantissa+exp.886if (mips->fs[inst->src1] < 0 && mips->fs[inst->src2] < 0) {887mips->fs[inst->dest] = std::min(mips->fs[inst->src1], mips->fs[inst->src2]);888} else {889mips->fs[inst->dest] = std::max(mips->fs[inst->src1], mips->fs[inst->src2]);890}891} else {892mips->f[inst->dest] = std::max(mips->f[inst->src1], mips->f[inst->src2]);893}894break;895896case IROp::FMov:897mips->f[inst->dest] = mips->f[inst->src1];898break;899case IROp::FAbs:900mips->f[inst->dest] = fabsf(mips->f[inst->src1]);901break;902case IROp::FSqrt:903mips->f[inst->dest] = sqrtf(mips->f[inst->src1]);904break;905case IROp::FNeg:906mips->f[inst->dest] = -mips->f[inst->src1];907break;908case IROp::FSat0_1:909// We have to do this carefully to handle NAN and -0.0f.910mips->f[inst->dest] = vfpu_clamp(mips->f[inst->src1], 0.0f, 1.0f);911break;912case IROp::FSatMinus1_1:913mips->f[inst->dest] = vfpu_clamp(mips->f[inst->src1], -1.0f, 1.0f);914break;915916case IROp::FSign:917{918// Bitwise trickery919u32 val;920memcpy(&val, &mips->f[inst->src1], sizeof(u32));921if (val == 0 || val == 0x80000000)922mips->f[inst->dest] = 0.0f;923else if ((val >> 31) == 0)924mips->f[inst->dest] = 1.0f;925else926mips->f[inst->dest] = -1.0f;927break;928}929930case IROp::FpCondFromReg:931mips->fpcond = mips->r[inst->dest];932break;933case IROp::FpCondToReg:934mips->r[inst->dest] = mips->fpcond;935break;936case IROp::FpCtrlFromReg:937mips->fcr31 = mips->r[inst->src1] & 0x0181FFFF;938// Extract the new fpcond value.939// TODO: Is it really helping us to keep it separate?940mips->fpcond = (mips->fcr31 >> 23) & 1;941break;942case IROp::FpCtrlToReg:943// Update the fpcond bit first.944mips->fcr31 = (mips->fcr31 & ~(1 << 23)) | ((mips->fpcond & 1) << 23);945mips->r[inst->dest] = mips->fcr31;946break;947case IROp::VfpuCtrlToReg:948mips->r[inst->dest] = mips->vfpuCtrl[inst->src1];949break;950case IROp::FRound:951{952float value = mips->f[inst->src1];953if (my_isnanorinf(value)) {954mips->fi[inst->dest] = my_isinf(value) && value < 0.0f ? -2147483648LL : 2147483647LL;955break;956} else {957mips->fs[inst->dest] = (int)round_ieee_754(value);958}959break;960}961case IROp::FTrunc:962{963float value = mips->f[inst->src1];964if (my_isnanorinf(value)) {965mips->fi[inst->dest] = my_isinf(value) && value < 0.0f ? -2147483648LL : 2147483647LL;966break;967} else {968if (value >= 0.0f) {969mips->fs[inst->dest] = (int)floorf(value);970// Overflow, but it was positive.971if (mips->fs[inst->dest] == -2147483648LL) {972mips->fs[inst->dest] = 2147483647LL;973}974} else {975// Overflow happens to be the right value anyway.976mips->fs[inst->dest] = (int)ceilf(value);977}978break;979}980}981case IROp::FCeil:982{983float value = mips->f[inst->src1];984if (my_isnanorinf(value)) {985mips->fi[inst->dest] = my_isinf(value) && value < 0.0f ? -2147483648LL : 2147483647LL;986break;987} else {988mips->fs[inst->dest] = (int)ceilf(value);989}990break;991}992case IROp::FFloor:993{994float value = mips->f[inst->src1];995if (my_isnanorinf(value)) {996mips->fi[inst->dest] = my_isinf(value) && value < 0.0f ? -2147483648LL : 2147483647LL;997break;998} else {999mips->fs[inst->dest] = (int)floorf(value);1000}1001break;1002}1003case IROp::FCmp:1004switch (inst->dest) {1005case IRFpCompareMode::False:1006mips->fpcond = 0;1007break;1008case IRFpCompareMode::EitherUnordered:1009{1010float a = mips->f[inst->src1];1011float b = mips->f[inst->src2];1012mips->fpcond = !(a > b || a < b || a == b);1013break;1014}1015case IRFpCompareMode::EqualOrdered:1016mips->fpcond = mips->f[inst->src1] == mips->f[inst->src2];1017break;1018case IRFpCompareMode::EqualUnordered:1019mips->fpcond = mips->f[inst->src1] == mips->f[inst->src2] || my_isnan(mips->f[inst->src1]) || my_isnan(mips->f[inst->src2]);1020break;1021case IRFpCompareMode::LessEqualOrdered:1022mips->fpcond = mips->f[inst->src1] <= mips->f[inst->src2];1023break;1024case IRFpCompareMode::LessEqualUnordered:1025mips->fpcond = !(mips->f[inst->src1] > mips->f[inst->src2]);1026break;1027case IRFpCompareMode::LessOrdered:1028mips->fpcond = mips->f[inst->src1] < mips->f[inst->src2];1029break;1030case IRFpCompareMode::LessUnordered:1031mips->fpcond = !(mips->f[inst->src1] >= mips->f[inst->src2]);1032break;1033}1034break;10351036case IROp::FCvtSW:1037mips->f[inst->dest] = (float)mips->fs[inst->src1];1038break;1039case IROp::FCvtWS:1040{1041float src = mips->f[inst->src1];1042if (my_isnanorinf(src)) {1043mips->fs[inst->dest] = my_isinf(src) && src < 0.0f ? -2147483648LL : 2147483647LL;1044break;1045}1046// TODO: Inline assembly to use here would be better.1047switch (IRRoundMode(mips->fcr31 & 3)) {1048case IRRoundMode::RINT_0: mips->fs[inst->dest] = (int)round_ieee_754(src); break;1049case IRRoundMode::CAST_1: mips->fs[inst->dest] = (int)src; break;1050case IRRoundMode::CEIL_2: mips->fs[inst->dest] = (int)ceilf(src); break;1051case IRRoundMode::FLOOR_3: mips->fs[inst->dest] = (int)floorf(src); break;1052}1053break; //cvt.w.s1054}1055case IROp::FCvtScaledSW:1056mips->f[inst->dest] = (float)mips->fs[inst->src1] * (1.0f / (1UL << (inst->src2 & 0x1F)));1057break;1058case IROp::FCvtScaledWS:1059{1060float src = mips->f[inst->src1];1061if (my_isnan(src)) {1062// TODO: True for negatives too?1063mips->fs[inst->dest] = 2147483647L;1064break;1065}10661067float mult = (float)(1UL << (inst->src2 & 0x1F));1068double sv = src * mult; // (float)0x7fffffff == (float)0x800000001069// Cap/floor it to 0x7fffffff / 0x800000001070if (sv > (double)0x7fffffff) {1071mips->fs[inst->dest] = 0x7fffffff;1072} else if (sv <= (double)(int)0x80000000) {1073mips->fs[inst->dest] = 0x80000000;1074} else {1075switch (IRRoundMode(inst->src2 >> 6)) {1076case IRRoundMode::RINT_0: mips->fs[inst->dest] = (int)round_ieee_754(sv); break;1077case IRRoundMode::CAST_1: mips->fs[inst->dest] = src >= 0 ? (int)floor(sv) : (int)ceil(sv); break;1078case IRRoundMode::CEIL_2: mips->fs[inst->dest] = (int)ceil(sv); break;1079case IRRoundMode::FLOOR_3: mips->fs[inst->dest] = (int)floor(sv); break;1080}1081}1082break;1083}10841085case IROp::FMovFromGPR:1086memcpy(&mips->f[inst->dest], &mips->r[inst->src1], 4);1087break;1088case IROp::OptFCvtSWFromGPR:1089mips->f[inst->dest] = (float)(int)mips->r[inst->src1];1090break;1091case IROp::FMovToGPR:1092memcpy(&mips->r[inst->dest], &mips->f[inst->src1], 4);1093break;1094case IROp::OptFMovToGPRShr8:1095{1096u32 temp;1097memcpy(&temp, &mips->f[inst->src1], 4);1098mips->r[inst->dest] = temp >> 8;1099break;1100}11011102case IROp::ExitToConst:1103return inst->constant;11041105case IROp::ExitToReg:1106return mips->r[inst->src1];11071108case IROp::ExitToConstIfEq:1109if (mips->r[inst->src1] == mips->r[inst->src2])1110return inst->constant;1111break;1112case IROp::ExitToConstIfNeq:1113if (mips->r[inst->src1] != mips->r[inst->src2])1114return inst->constant;1115break;1116case IROp::ExitToConstIfGtZ:1117if ((s32)mips->r[inst->src1] > 0)1118return inst->constant;1119break;1120case IROp::ExitToConstIfGeZ:1121if ((s32)mips->r[inst->src1] >= 0)1122return inst->constant;1123break;1124case IROp::ExitToConstIfLtZ:1125if ((s32)mips->r[inst->src1] < 0)1126return inst->constant;1127break;1128case IROp::ExitToConstIfLeZ:1129if ((s32)mips->r[inst->src1] <= 0)1130return inst->constant;1131break;11321133case IROp::Downcount:1134mips->downcount -= (int)inst->constant;1135break;11361137case IROp::SetPC:1138mips->pc = mips->r[inst->src1];1139break;11401141case IROp::SetPCConst:1142mips->pc = inst->constant;1143break;11441145case IROp::Syscall:1146// IROp::SetPC was (hopefully) executed before.1147{1148MIPSOpcode op(inst->constant);1149CallSyscall(op);1150if (coreState != CORE_RUNNING)1151CoreTiming::ForceCheck();1152break;1153}11541155case IROp::ExitToPC:1156return mips->pc;11571158case IROp::Interpret: // SLOW fallback. Can be made faster. Ideally should be removed but may be useful for debugging.1159{1160MIPSOpcode op(inst->constant);1161MIPSInterpret(op);1162break;1163}11641165case IROp::CallReplacement:1166{1167int funcIndex = inst->constant;1168const ReplacementTableEntry *f = GetReplacementFunc(funcIndex);1169int cycles = f->replaceFunc();1170mips->r[inst->dest] = cycles < 0 ? -1 : 0;1171mips->downcount -= cycles < 0 ? -cycles : cycles;1172break;1173}11741175case IROp::SetCtrlVFPU:1176mips->vfpuCtrl[inst->dest] = inst->constant;1177break;11781179case IROp::SetCtrlVFPUReg:1180mips->vfpuCtrl[inst->dest] = mips->r[inst->src1];1181break;11821183case IROp::SetCtrlVFPUFReg:1184memcpy(&mips->vfpuCtrl[inst->dest], &mips->f[inst->src1], 4);1185break;11861187case IROp::ApplyRoundingMode:1188IRApplyRounding(mips);1189break;1190case IROp::RestoreRoundingMode:1191IRRestoreRounding();1192break;1193case IROp::UpdateRoundingMode:1194// TODO: Implement1195break;11961197case IROp::Break:1198Core_Break(mips->pc);1199return mips->pc + 4;12001201case IROp::Breakpoint:1202if (IRRunBreakpoint(inst->constant)) {1203CoreTiming::ForceCheck();1204return mips->pc;1205}1206break;12071208case IROp::MemoryCheck:1209if (IRRunMemCheck(mips->pc + inst->dest, mips->r[inst->src1] + inst->constant)) {1210CoreTiming::ForceCheck();1211return mips->pc;1212}1213break;12141215case IROp::ValidateAddress8:1216if (RunValidateAddress<1>(mips->pc, mips->r[inst->src1] + inst->constant, inst->src2)) {1217CoreTiming::ForceCheck();1218return mips->pc;1219}1220break;1221case IROp::ValidateAddress16:1222if (RunValidateAddress<2>(mips->pc, mips->r[inst->src1] + inst->constant, inst->src2)) {1223CoreTiming::ForceCheck();1224return mips->pc;1225}1226break;1227case IROp::ValidateAddress32:1228if (RunValidateAddress<4>(mips->pc, mips->r[inst->src1] + inst->constant, inst->src2)) {1229CoreTiming::ForceCheck();1230return mips->pc;1231}1232break;1233case IROp::ValidateAddress128:1234if (RunValidateAddress<16>(mips->pc, mips->r[inst->src1] + inst->constant, inst->src2)) {1235CoreTiming::ForceCheck();1236return mips->pc;1237}1238break;1239case IROp::LogIRBlock:1240if (mipsTracer.tracing_enabled) {1241mipsTracer.executed_blocks.push_back(inst->constant);1242}1243break;12441245case IROp::Nop: // TODO: This shouldn't crash, but for now we should not emit nops, so...1246case IROp::Bad:1247default:1248Crash();1249break;1250// Unimplemented IR op. Bad.1251}12521253#ifdef _DEBUG1254if (mips->r[0] != 0)1255Crash();1256#endif1257inst++;1258}12591260// We should not reach here anymore.1261return 0;1262}126312641265