// Copyright (C) 2003 Dolphin Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/

#pragma once

#include <vector>
#include <cstdint>

#include "Common/CommonTypes.h"
#include "Common/Log.h"
#include "Common/ArmCommon.h"
#include "Common/CodeBlock.h"

// VCVT flags
#define TO_FLOAT      0
#define TO_INT        (1 << 0)
#define IS_SIGNED     (1 << 1)
#define ROUND_TO_ZERO (1 << 2)
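
// Illustrative only (not part of the original header): the flags above are
// meant to be OR'd together and passed to the VCVT() method declared further
// down. Assuming `emitter` is an ARMXEmitter, a truncating float-to-signed-int
// conversion might look like:
//
//   emitter.VCVT(S0, S1, TO_INT | IS_SIGNED | ROUND_TO_ZERO);  // S0 = (s32)S1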

namespace ArmGen
{
enum ARMReg
{
	// GPRs
	R0 = 0, R1, R2, R3, R4, R5,
	R6, R7, R8, R9, R10, R11,

	// SPRs
	// R13 - R15 are SP, LR, and PC.
	// Almost always referred to by name instead of register number.
	R12 = 12, R13 = 13, R14 = 14, R15 = 15,
	R_IP = 12, R_SP = 13, R_LR = 14, R_PC = 15,

	// VFP single precision registers
	S0, S1, S2, S3, S4, S5, S6,
	S7, S8, S9, S10, S11, S12, S13,
	S14, S15, S16, S17, S18, S19, S20,
	S21, S22, S23, S24, S25, S26, S27,
	S28, S29, S30, S31,

	// VFP double precision registers
	D0, D1, D2, D3, D4, D5, D6, D7,
	D8, D9, D10, D11, D12, D13, D14, D15,
	D16, D17, D18, D19, D20, D21, D22, D23,
	D24, D25, D26, D27, D28, D29, D30, D31,

	// ASIMD quad-word registers
	Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7,
	Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15,

	// For NEON VLD/VST instructions
	REG_UPDATE = R13,
	INVALID_REG = 0xFFFFFFFF
};

enum ShiftType
{
	ST_LSL = 0,
	ST_ASL = 0,
	ST_LSR = 1,
	ST_ASR = 2,
	ST_ROR = 3,
	ST_RRX = 4
};

enum IntegerSize
{
	I_I8 = 0,
	I_I16,
	I_I32,
	I_I64
};

enum
{
	NUMGPRs = 13,
};

class ARMXEmitter;

enum OpType
{
	TYPE_IMM = 0,
	TYPE_REG,
	TYPE_IMMSREG,
	TYPE_RSR,
	TYPE_MEM
};

// This is no longer a proper Operand2 class. Needs to be split up.
class Operand2
{
	friend class ARMXEmitter;
protected:
	u32 Value;

private:
	OpType Type;

	// IMM types
	u8 Rotation = 0; // Only for u8 values

	// Register types
	u8 IndexOrShift = 0;
	ShiftType Shift = ST_LSL;
public:
	OpType GetType() const {
		return Type;
	}
	Operand2() {
		Type = TYPE_IMM;
		Value = 0;
	}
	Operand2(u32 imm, OpType type = TYPE_IMM) {
		Type = type;
		Value = imm;
	}
	Operand2(ARMReg Reg) {
		Type = TYPE_REG;
		Value = Reg;
	}
	Operand2(u8 imm, u8 rotation) {
		Type = TYPE_IMM;
		Value = imm;
		Rotation = rotation;
	}
	Operand2(ARMReg base, ShiftType type, ARMReg shift) // RSR
	{
		Type = TYPE_RSR;
		_assert_msg_(type != ST_RRX, "Invalid Operand2: RRX does not take a register shift amount");
		IndexOrShift = shift;
		Shift = type;
		Value = base;
	}

	Operand2(ARMReg base, ShiftType type, u8 shift) // For IMM shifted register
	{
		if (shift == 32) shift = 0;
		switch (type)
		{
		case ST_LSL:
			_assert_msg_(shift < 32, "Invalid Operand2: LSL %u", shift);
			break;
		case ST_LSR:
			_assert_msg_(shift <= 32, "Invalid Operand2: LSR %u", shift);
			if (!shift)
				type = ST_LSL;
			if (shift == 32)
				shift = 0;
			break;
		case ST_ASR:
			_assert_msg_(shift < 32, "Invalid Operand2: ASR %u", shift);
			if (!shift)
				type = ST_LSL;
			if (shift == 32)
				shift = 0;
			break;
		case ST_ROR:
			_assert_msg_(shift < 32, "Invalid Operand2: ROR %u", shift);
			if (!shift)
				type = ST_LSL;
			break;
		case ST_RRX:
			_assert_msg_(shift == 0, "Invalid Operand2: RRX does not take an immediate shift amount");
			type = ST_ROR;
			break;
		}
		IndexOrShift = shift;
		Shift = type;
		Value = base;
		Type = TYPE_IMMSREG;
	}
	u32 GetData()
	{
		switch (Type)
		{
		case TYPE_IMM:
			return Imm12Mod(); // This'll need to be changed later
		case TYPE_REG:
			return Rm();
		case TYPE_IMMSREG:
			return IMMSR();
		case TYPE_RSR:
			return RSR();
		default:
			_assert_msg_(false, "GetData with invalid type");
			return 0;
		}
	}
	u32 IMMSR() // IMM shifted register
	{
		_assert_msg_(Type == TYPE_IMMSREG, "IMMSR must be an imm shifted register");
		return ((IndexOrShift & 0x1F) << 7) | (Shift << 5) | Value;
	}
	u32 RSR() // Register shifted register
	{
		_assert_msg_(Type == TYPE_RSR, "RSR must be a register shifted register");
		return (IndexOrShift << 8) | (Shift << 5) | 0x10 | Value;
	}
	u32 Rm() const
	{
		_assert_msg_(Type == TYPE_REG, "Rm must be a register");
		return Value;
	}

	u32 Imm5() const
	{
		_assert_msg_((Type == TYPE_IMM), "Imm5 not IMM value");
		return (Value & 0x0000001F) << 7;
	}
	u32 Imm8() const
	{
		_assert_msg_((Type == TYPE_IMM), "Imm8 not IMM value");
		return Value & 0xFF;
	}
	u32 Imm8Rot() const // IMM8 with rotation
	{
		_assert_msg_((Type == TYPE_IMM), "Imm8Rot not IMM value");
		// Rotation must be even and below 32 for a valid encoding.
		_assert_msg_((Rotation & 0xE1) == 0, "Invalid Operand2: immediate rotation %u", Rotation);
		return (1 << 25) | (Rotation << 7) | (Value & 0x000000FF);
	}
	u32 Imm12() const
	{
		_assert_msg_((Type == TYPE_IMM), "Imm12 not IMM");
		return Value & 0x00000FFF;
	}

	u32 Imm12Mod() const
	{
		// This is an IMM12 with the top four bits being rotation and the
		// bottom eight being an IMM. This is for instructions that need to
		// expand an 8-bit IMM to a 32-bit value and gives you some rotation
		// as well. Each rotation step rotates to the right by 2 bits
		// (e.g. rotation 4 turns 0xFF into 0xFF000000, i.e. ROR by 8).
		_assert_msg_((Type == TYPE_IMM), "Imm12Mod not IMM");
		return ((Rotation & 0xF) << 8) | (Value & 0xFF);
	}
	u32 Imm16() const
	{
		_assert_msg_((Type == TYPE_IMM), "Imm16 not IMM");
		return ((Value & 0xF000) << 4) | (Value & 0x0FFF);
	}
	u32 Imm16Low() const
	{
		return Imm16();
	}
	u32 Imm16High() const // Returns the high 16 bits
	{
		_assert_msg_((Type == TYPE_IMM), "Imm16High not IMM");
		return (((Value >> 16) & 0xF000) << 4) | ((Value >> 16) & 0x0FFF);
	}
	u32 Imm24() const
	{
		_assert_msg_((Type == TYPE_IMM), "Imm24 not IMM");
		return Value & 0x0FFFFFFF;
	}
	// NEON and ASIMD specific
	u32 Imm8ASIMD() const
	{
		_assert_msg_((Type == TYPE_IMM), "Imm8ASIMD not IMM");
		return ((Value & 0x80) << 17) | ((Value & 0x70) << 12) | (Value & 0xF);
	}
	u32 Imm8VFP() const
	{
		_assert_msg_((Type == TYPE_IMM), "Imm8VFP not IMM");
		return ((Value & 0xF0) << 12) | (Value & 0xF);
	}
};
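
// Illustrative only (not part of the original header): the overloaded
// constructors above cover the common ARM operand forms. For example:
//
//   Operand2 a(R1);                  // plain register:            R1
//   Operand2 b(R1, ST_LSL, 4);       // imm-shifted register:      R1, LSL #4
//   Operand2 c(R1, ST_LSR, R2);      // register-shifted register: R1, LSR R2
//   Operand2 d((u8)0xFF, 4);         // rotated immediate:         0xFF ROR (2*4) = 0xFF000000
//
// and, assuming `emitter` is an ARMXEmitter:
//
//   emitter.ADD(R0, R0, b);          // R0 = R0 + (R1 << 4)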

// Use these when you don't know whether an imm can be represented as an Operand2.
// This lets you generate both an optimal and a fallback solution by checking
// the return value, which will be false if these fail to find an Operand2 that
// represents your 32-bit imm value.
bool TryMakeOperand2(u32 imm, Operand2 &op2);
bool TryMakeOperand2_AllowInverse(u32 imm, Operand2 &op2, bool *inverse);
bool TryMakeOperand2_AllowNegation(s32 imm, Operand2 &op2, bool *negated);

// Use this only when you know imm can be made into an Operand2.
Operand2 AssumeMakeOperand2(u32 imm);
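
// Illustrative only (not part of the original header): the intended
// optimal/fallback pattern, assuming `emitter` is an ARMXEmitter and R12
// happens to be free as a scratch register:
//
//   Operand2 op2;
//   if (TryMakeOperand2(imm, op2)) {
//     emitter.ADD(R0, R1, op2);      // optimal: a single instruction
//   } else {
//     emitter.MOVI2R(R12, imm);      // fallback: materialize the constant
//     emitter.ADD(R0, R1, R12);
//   }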

inline Operand2 R(ARMReg Reg) { return Operand2(Reg, TYPE_REG); }
inline Operand2 IMM(u32 Imm) { return Operand2(Imm, TYPE_IMM); }
inline Operand2 Mem(void *ptr) { return Operand2((u32)(uintptr_t)ptr, TYPE_IMM); }
// Usage: struct {int e;} s; STRUCT_OFF(s, e)
#define STRUCT_OFF(str,elem) ((u32)((u32)&(str).elem-(u32)&(str)))

struct FixupBranch
{
	u8 *ptr;
	u32 condition; // Remembers our condition at the time
	int type; // 0 = B, 1 = BL
};

struct LiteralPool
{
	intptr_t loc;
	u8 *ldr_address;
	u32 val;
};

typedef const u8 *JumpTarget;

// XXX: Stop polluting the global namespace
const u32 I_8 = (1 << 0);
const u32 I_16 = (1 << 1);
const u32 I_32 = (1 << 2);
const u32 I_64 = (1 << 3);
const u32 I_SIGNED = (1 << 4);
const u32 I_UNSIGNED = (1 << 5);
const u32 F_32 = (1 << 6);
const u32 I_POLYNOMIAL = (1 << 7); // Only used in VMUL/VMULL

enum VIMMMode {
	VIMM___x___x = 0x0, // 0000 VMOV
	VIMM__x___x_ = 0x2, // 0010
	VIMM_x___x__ = 0x4, // 0100
	VIMMx___x___ = 0x6, // 0110
	VIMM_x_x_x_x = 0x8, // 1000
	VIMMx_x_x_x_ = 0xA, // 1010
	VIMM__x1__x1 = 0xC, // 1100
	VIMM_x11_x11 = 0xD, // 1101
	VIMMxxxxxxxx = 0xE, // 1110 // op == 0
	VIMMf000f000 = 0xF, // 1111 // op == 0 (really aBbbbbbc defgh 00000000 00000000, where B = NOT b)
	VIMMbits2bytes = 0x1E, // Bit replication into bytes! Easily creates 11111111 00000000 masks!
};

u32 EncodeVd(ARMReg Vd);
u32 EncodeVn(ARMReg Vn);
u32 EncodeVm(ARMReg Vm);

u32 encodedSize(u32 value);

// Subtracts the base from the register to give us the real one
ARMReg SubBase(ARMReg Reg);

inline bool IsQ(ARMReg r) {
	return r >= Q0 && r <= Q15;
}

inline bool IsD(ARMReg r) {
	return r >= D0 && r <= D31;
}

// See A7.1 in the ARMv7-A Architecture Reference Manual.
// VMUL F32 scalars can only go up to D15[0], D15[1] - higher scalars cannot be individually addressed.
ARMReg DScalar(ARMReg dreg, int subScalar);
ARMReg QScalar(ARMReg qreg, int subScalar);
inline ARMReg XScalar(ARMReg reg, int subScalar) {
	if (IsQ(reg))
		return QScalar(reg, subScalar);
	else
		return DScalar(reg, subScalar);
}

const char *ARMRegAsString(ARMReg reg);

// Get the two halves of a Q register.
inline ARMReg D_0(ARMReg q) {
	if (q >= Q0 && q <= Q15) {
		return ARMReg(D0 + (q - Q0) * 2);
	} else if (q >= D0 && q <= D31) {
		return q;
	} else {
		return INVALID_REG;
	}
}
inline ARMReg D_1(ARMReg q) {
	return ARMReg(D0 + (q - Q0) * 2 + 1);
}

enum NEONAlignment {
	ALIGN_NONE = 0,
	ALIGN_64 = 1,
	ALIGN_128 = 2,
	ALIGN_256 = 3
};

class NEONXEmitter;

class ARMXEmitter
{
	friend struct OpArg; // for Write8 etc
	friend class NEONXEmitter;
private:
	u8 *code, *startcode;
	u8 *lastCacheFlushEnd;
	u32 condition;
	std::vector<LiteralPool> currentLitPool;

	void WriteStoreOp(u32 Op, ARMReg Rt, ARMReg Rn, Operand2 op2, bool RegAdd);
	void WriteRegStoreOp(u32 op, ARMReg dest, bool WriteBack, u16 RegList);
	void WriteVRegStoreOp(u32 op, ARMReg dest, bool Double, bool WriteBack, ARMReg firstreg, u8 numregs);
	void WriteShiftedDataOp(u32 op, bool SetFlags, ARMReg dest, ARMReg src, ARMReg op2);
	void WriteShiftedDataOp(u32 op, bool SetFlags, ARMReg dest, ARMReg src, Operand2 op2);
	void WriteSignedMultiply(u32 Op, u32 Op2, u32 Op3, ARMReg dest, ARMReg r1, ARMReg r2);

	void WriteVFPDataOp(u32 Op, ARMReg Vd, ARMReg Vn, ARMReg Vm);

	void Write4OpMultiply(u32 op, ARMReg destLo, ARMReg destHi, ARMReg rn, ARMReg rm);

	// New Ops
	void WriteInstruction(u32 op, ARMReg Rd, ARMReg Rn, Operand2 Rm, bool SetFlags = false);

	void WriteVLDST1(bool load, u32 Size, ARMReg Vd, ARMReg Rn, int regCount, NEONAlignment align, ARMReg Rm);
	void WriteVLDST1_lane(bool load, u32 Size, ARMReg Vd, ARMReg Rn, int lane, bool aligned, ARMReg Rm);

	void WriteVimm(ARMReg Vd, int cmode, u8 imm, int op);

	void EncodeShiftByImm(u32 Size, ARMReg Vd, ARMReg Vm, int shiftAmount, u8 opcode, bool quad, bool inverse, bool halve);

protected:
	inline void Write32(u32 value) { *(u32 *)code = value; code += 4; }

public:
	ARMXEmitter() : code(0), startcode(0), lastCacheFlushEnd(0) {
		condition = CC_AL << 28;
	}
	ARMXEmitter(u8 *code_ptr) {
		code = code_ptr;
		lastCacheFlushEnd = code_ptr;
		startcode = code_ptr;
		condition = CC_AL << 28;
	}
	virtual ~ARMXEmitter() {}

	void SetCodePointer(u8 *ptr, u8 *writePtr);
	const u8 *GetCodePointer() const;

	void ReserveCodeSpace(u32 bytes);
	const u8 *AlignCode16();
	const u8 *AlignCodePage();
	const u8 *NopAlignCode16();

	void FlushIcache();
	void FlushIcacheSection(u8 *start, u8 *end);
	u8 *GetWritableCodePtr();

	void FlushLitPool();
	void AddNewLit(u32 val);
	bool TrySetValue_TwoOp(ARMReg reg, u32 val);

	CCFlags GetCC() const { return CCFlags(condition >> 28); }
	void SetCC(CCFlags cond = CC_AL);

	// Special purpose instructions

	// Dynamic endian switching
	void SETEND(bool BE);
	// Debug breakpoint
	void BKPT(u16 arg);

	// Hint instruction
	void YIELD();

	// Do nothing
	void NOP(int count = 1); // nop padding - TODO: fast nop slides

#ifdef CALL
#undef CALL
#endif

	// Branching
	FixupBranch B();
	FixupBranch B_CC(CCFlags Cond);
	void B_CC(CCFlags Cond, const void *fnptr);
	FixupBranch BL();
	FixupBranch BL_CC(CCFlags Cond);
	void SetJumpTarget(FixupBranch const &branch);

	void B (const void *fnptr);
	void B (ARMReg src);
	void BL(const void *fnptr);
	void BL(ARMReg src);
	bool BLInRange(const void *fnptr) const;
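
	// Illustrative only (not part of the original header): forward branches
	// are patched via the FixupBranch mechanism above. Skipping over a block
	// of code conditionally might look like:
	//
	//   FixupBranch skip = B_CC(CC_EQ);  // placeholder branch, target unknown yet
	//   ...                              // emit the code to be skipped
	//   SetJumpTarget(skip);             // patch the branch to land here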

	void PUSH(const int num, ...);
	void POP(const int num, ...);

	// New Data Ops
	void AND (ARMReg Rd, ARMReg Rn, Operand2 Rm);
	void ANDS(ARMReg Rd, ARMReg Rn, Operand2 Rm);
	void EOR (ARMReg dest, ARMReg src, Operand2 op2);
	void EORS(ARMReg dest, ARMReg src, Operand2 op2);
	void SUB (ARMReg dest, ARMReg src, Operand2 op2);
	void SUBS(ARMReg dest, ARMReg src, Operand2 op2);
	void RSB (ARMReg dest, ARMReg src, Operand2 op2);
	void RSBS(ARMReg dest, ARMReg src, Operand2 op2);
	void ADD (ARMReg dest, ARMReg src, Operand2 op2);
	void ADDS(ARMReg dest, ARMReg src, Operand2 op2);
	void ADC (ARMReg dest, ARMReg src, Operand2 op2);
	void ADCS(ARMReg dest, ARMReg src, Operand2 op2);
	void LSL (ARMReg dest, ARMReg src, Operand2 op2);
	void LSL (ARMReg dest, ARMReg src, ARMReg op2);
	void LSLS(ARMReg dest, ARMReg src, Operand2 op2);
	void LSLS(ARMReg dest, ARMReg src, ARMReg op2);
	void LSR (ARMReg dest, ARMReg src, Operand2 op2);
	void LSRS(ARMReg dest, ARMReg src, Operand2 op2);
	void LSR (ARMReg dest, ARMReg src, ARMReg op2);
	void LSRS(ARMReg dest, ARMReg src, ARMReg op2);
	void ASR (ARMReg dest, ARMReg src, Operand2 op2);
	void ASRS(ARMReg dest, ARMReg src, Operand2 op2);
	void ASR (ARMReg dest, ARMReg src, ARMReg op2);
	void ASRS(ARMReg dest, ARMReg src, ARMReg op2);

	void SBC (ARMReg dest, ARMReg src, Operand2 op2);
	void SBCS(ARMReg dest, ARMReg src, Operand2 op2);
	void RBIT(ARMReg dest, ARMReg src);
	void REV (ARMReg dest, ARMReg src);
	void REV16(ARMReg dest, ARMReg src);
	void RSC (ARMReg dest, ARMReg src, Operand2 op2);
	void RSCS(ARMReg dest, ARMReg src, Operand2 op2);
	void TST (ARMReg src, Operand2 op2);
	void TEQ (ARMReg src, Operand2 op2);
	void CMP (ARMReg src, Operand2 op2);
	void CMN (ARMReg src, Operand2 op2);
	void ORR (ARMReg dest, ARMReg src, Operand2 op2);
	void ORRS(ARMReg dest, ARMReg src, Operand2 op2);
	void MOV (ARMReg dest, Operand2 op2);
	void MOVS(ARMReg dest, Operand2 op2);
	void BIC (ARMReg dest, ARMReg src, Operand2 op2); // BIC = ANDN
	void BICS(ARMReg dest, ARMReg src, Operand2 op2);
	void MVN (ARMReg dest, Operand2 op2);
	void MVNS(ARMReg dest, Operand2 op2);
	void MOVW(ARMReg dest, Operand2 op2);
	void MOVT(ARMReg dest, Operand2 op2, bool TopBits = false);

	// UDIV and SDIV are only available on CPUs that have
	// the idiva hardware capability.
	void UDIV(ARMReg dest, ARMReg dividend, ARMReg divisor);
	void SDIV(ARMReg dest, ARMReg dividend, ARMReg divisor);

	void MUL (ARMReg dest, ARMReg src, ARMReg op2);
	void MULS(ARMReg dest, ARMReg src, ARMReg op2);

	void UMULL(ARMReg destLo, ARMReg destHi, ARMReg rn, ARMReg rm);
	void SMULL(ARMReg destLo, ARMReg destHi, ARMReg rn, ARMReg rm);

	void UMLAL(ARMReg destLo, ARMReg destHi, ARMReg rn, ARMReg rm);
	void SMLAL(ARMReg destLo, ARMReg destHi, ARMReg rn, ARMReg rm);

	void SXTB(ARMReg dest, ARMReg op2);
	void SXTH(ARMReg dest, ARMReg op2, u8 rotation = 0);
	void SXTAH(ARMReg dest, ARMReg src, ARMReg op2, u8 rotation = 0);
	void BFI(ARMReg rd, ARMReg rn, u8 lsb, u8 width);
	void BFC(ARMReg rd, u8 lsb, u8 width);
	void UBFX(ARMReg dest, ARMReg op2, u8 lsb, u8 width);
	void SBFX(ARMReg dest, ARMReg op2, u8 lsb, u8 width);
	void CLZ(ARMReg rd, ARMReg rm);
	void PLD(ARMReg rd, int offset, bool forWrite = false);

	// Using plain MSR clashed with defines on the PPC side of things back when
	// this code lived in Dolphin, hence the leading underscore. A bit annoying.
	void _MSR(bool nzcvq, bool g, Operand2 op2);
	void _MSR(bool nzcvq, bool g, ARMReg src);
	void MRS(ARMReg dest);

	// Memory load/store operations
	void LDR  (ARMReg dest, ARMReg base, Operand2 op2 = 0, bool RegAdd = true);
	void LDRB (ARMReg dest, ARMReg base, Operand2 op2 = 0, bool RegAdd = true);
	void LDRH (ARMReg dest, ARMReg base, Operand2 op2 = 0, bool RegAdd = true);
	void LDRSB(ARMReg dest, ARMReg base, Operand2 op2 = 0, bool RegAdd = true);
	void LDRSH(ARMReg dest, ARMReg base, Operand2 op2 = 0, bool RegAdd = true);
	void STR  (ARMReg result, ARMReg base, Operand2 op2 = 0, bool RegAdd = true);
	void STRB (ARMReg result, ARMReg base, Operand2 op2 = 0, bool RegAdd = true);
	void STRH (ARMReg result, ARMReg base, Operand2 op2 = 0, bool RegAdd = true);
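
	// Illustrative only (not part of the original header): op2 is the offset
	// (immediate or register), and RegAdd is assumed to select whether a
	// register offset is added (true) or subtracted (false). For example:
	//
	//   LDR(R0, R1, 16);                          // R0 = [R1 + 16]
	//   LDR(R0, R1, R2);                          // R0 = [R1 + R2]
	//   STR(R0, R1, Operand2(R2, ST_LSL, 2));     // [R1 + (R2 << 2)] = R0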

	void STMFD(ARMReg dest, bool WriteBack, const int Regnum, ...);
	void LDMFD(ARMReg dest, bool WriteBack, const int Regnum, ...);
	void STMIA(ARMReg dest, bool WriteBack, const int Regnum, ...);
	void LDMIA(ARMReg dest, bool WriteBack, const int Regnum, ...);
	void STM(ARMReg dest, bool Add, bool Before, bool WriteBack, const int Regnum, ...);
	void LDM(ARMReg dest, bool Add, bool Before, bool WriteBack, const int Regnum, ...);
	void STMBitmask(ARMReg dest, bool Add, bool Before, bool WriteBack, const u16 RegList);
	void LDMBitmask(ARMReg dest, bool Add, bool Before, bool WriteBack, const u16 RegList);

	// Exclusive access operations
	void LDREX(ARMReg dest, ARMReg base);
	// 'result' receives 0 if the instruction managed to store the value, 1 otherwise.
	void STREX(ARMReg result, ARMReg base, ARMReg op);
	void DMB();
	void SVC(Operand2 op);
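
	// Illustrative only (not part of the original header): the usual
	// load-exclusive/store-exclusive retry loop, here atomically incrementing
	// the word at [R1]. Assumes R0/R2 are free and CC_NEQ comes from ArmCommon.h:
	//
	//   const u8 *retry = GetCodePointer();
	//   LDREX(R0, R1);            // R0 = [R1], mark the address exclusive
	//   ADD(R0, R0, 1);
	//   STREX(R2, R1, R0);        // try [R1] = R0; R2 = 0 on success
	//   CMP(R2, 0);
	//   B_CC(CC_NEQ, retry);      // lost exclusivity - try again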

	// NEON and ASIMD instructions
	// None of these are emitted with a condition code, since ARM
	// deprecates conditional execution of ASIMD instructions.
	// ASIMD instructions don't even have a conditional encoding.

	// NEON only
	void VABD(IntegerSize size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VADD(IntegerSize size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VSUB(IntegerSize size, ARMReg Vd, ARMReg Vn, ARMReg Vm);

	// VFP only
	void VLDMIA(ARMReg dest, bool WriteBack, ARMReg firstreg, int numregs);
	void VSTMIA(ARMReg dest, bool WriteBack, ARMReg firstreg, int numregs);
	void VLDMDB(ARMReg dest, bool WriteBack, ARMReg firstreg, int numregs);
	void VSTMDB(ARMReg dest, bool WriteBack, ARMReg firstreg, int numregs);
	void VPUSH(ARMReg firstvreg, int numvregs) {
		VSTMDB(R_SP, true, firstvreg, numvregs);
	}
	void VPOP(ARMReg firstvreg, int numvregs) {
		VLDMIA(R_SP, true, firstvreg, numvregs);
	}
	void VLDR(ARMReg Dest, ARMReg Base, s16 offset);
	void VSTR(ARMReg Src, ARMReg Base, s16 offset);
	void VCMP(ARMReg Vd, ARMReg Vm);
	void VCMPE(ARMReg Vd, ARMReg Vm);
	// Compares against zero
	void VCMP(ARMReg Vd);
	void VCMPE(ARMReg Vd);

	void VNMLA(ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VNMLS(ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VNMUL(ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VDIV(ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VSQRT(ARMReg Vd, ARMReg Vm);

	// NEON and VFP
	void VADD(ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VSUB(ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VABS(ARMReg Vd, ARMReg Vm);
	void VNEG(ARMReg Vd, ARMReg Vm);
	void VMUL(ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VMLA(ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VMLS(ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VMOV(ARMReg Dest, Operand2 op2);
	void VMOV(ARMReg Dest, ARMReg Src, bool high);
	void VMOV(ARMReg Dest, ARMReg Src);
	// Either Vd, Rt, Rt2 or Rt, Rt2, Vd.
	void VMOV(ARMReg Dest, ARMReg Src1, ARMReg Src2);
	void VCVT(ARMReg Dest, ARMReg Src, int flags);

	// NEON; needs a runtime check (supported if VFP4 is supported)
	void VCVTF32F16(ARMReg Dest, ARMReg Src);
	void VCVTF16F32(ARMReg Dest, ARMReg Src);

	void VABA(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VABAL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VABD(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VABDL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VABS(u32 Size, ARMReg Vd, ARMReg Vm);
	void VACGE(ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VACGT(ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VACLE(ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VACLT(ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VADD(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VADDHN(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VADDL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VADDW(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VBIF(ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VBIT(ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VBSL(ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VCEQ(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VCEQ(u32 Size, ARMReg Vd, ARMReg Vm);
	void VCGE(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VCGE(u32 Size, ARMReg Vd, ARMReg Vm);
	void VCGT(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VCGT(u32 Size, ARMReg Vd, ARMReg Vm);
	void VCLE(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VCLE(u32 Size, ARMReg Vd, ARMReg Vm);
	void VCLS(u32 Size, ARMReg Vd, ARMReg Vm);
	void VCLT(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VCLT(u32 Size, ARMReg Vd, ARMReg Vm);
	void VCLZ(u32 Size, ARMReg Vd, ARMReg Vm);
	void VCNT(u32 Size, ARMReg Vd, ARMReg Vm);
	void VDUP(u32 Size, ARMReg Vd, ARMReg Vm, u8 index);
	void VDUP(u32 Size, ARMReg Vd, ARMReg Rt);
	void VEXT(ARMReg Vd, ARMReg Vn, ARMReg Vm, u8 index);
	void VFMA(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VFMS(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VHADD(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VHSUB(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VMAX(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VMIN(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);

	// Three registers
	void VMLA(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VMLS(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VMLAL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VMLSL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VMUL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VMULL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VQDMLAL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VQDMLSL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VQDMULH(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VQDMULL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VQRDMULH(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);

	// Two registers and a scalar
	// These two are super useful for matrix multiplication
	void VMUL_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VMLA_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
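
	// Illustrative only (not part of the original header): a 4x4 matrix *
	// vector product, with the matrix columns in Q1-Q4 and the vector in Q5
	// (QScalar's lane mapping is assumed, and Q5's scalars stay within the
	// addressable D0-D15 range noted above):
	//
	//   VMUL_scalar(F_32, Q0, Q1, QScalar(Q5, 0));   // Q0  = col0 * v.x
	//   VMLA_scalar(F_32, Q0, Q2, QScalar(Q5, 1));   // Q0 += col1 * v.y
	//   VMLA_scalar(F_32, Q0, Q3, QScalar(Q5, 2));   // Q0 += col2 * v.z
	//   VMLA_scalar(F_32, Q0, Q4, QScalar(Q5, 3));   // Q0 += col3 * v.w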

	// TODO:
	/*
	void VMLS_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VMLAL_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VMLSL_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VMULL_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VQDMLAL_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VQDMLSL_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VQDMULH_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VQDMULL_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VQRDMULH_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	*/

	// Vector bitwise. These don't have an element size for obvious reasons.
	void VAND(ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VBIC(ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VEOR(ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VORN(ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VORR(ARMReg Vd, ARMReg Vn, ARMReg Vm);
	inline void VMOV_neon(ARMReg Dest, ARMReg Src) {
		VORR(Dest, Src, Src);
	}
	void VMOV_neon(u32 Size, ARMReg Vd, u32 imm);
	void VMOV_neon(u32 Size, ARMReg Vd, float imm) {
		_dbg_assert_msg_(Size == F_32, "Expecting F_32 immediate for VMOV_neon float arg.");
		union {
			float f;
			u32 u;
		} val;
		val.f = imm;
		VMOV_neon(I_32, Vd, val.u);
	}
	void VMOV_neon(u32 Size, ARMReg Vd, ARMReg Rt, int lane);

	void VNEG(u32 Size, ARMReg Vd, ARMReg Vm);
	void VMVN(ARMReg Vd, ARMReg Vm);
	void VPADAL(u32 Size, ARMReg Vd, ARMReg Vm);
	void VPADD(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VPADDL(u32 Size, ARMReg Vd, ARMReg Vm);
	void VPMAX(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VPMIN(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VQABS(u32 Size, ARMReg Vd, ARMReg Vm);
	void VQADD(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VQNEG(u32 Size, ARMReg Vd, ARMReg Vm);
	void VQRSHL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VQSHL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VQSUB(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VRADDHN(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VRECPE(u32 Size, ARMReg Vd, ARMReg Vm);
	void VRECPS(ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VRHADD(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VRSHL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VRSQRTE(u32 Size, ARMReg Vd, ARMReg Vm);
	void VRSQRTS(ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VRSUBHN(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VSHL(u32 Size, ARMReg Vd, ARMReg Vm, ARMReg Vn); // Register shift
	void VSUB(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VSUBHN(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VSUBL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VSUBW(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VSWP(ARMReg Vd, ARMReg Vm);
	void VTRN(u32 Size, ARMReg Vd, ARMReg Vm);
	void VTST(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
	void VUZP(u32 Size, ARMReg Vd, ARMReg Vm);
	void VZIP(u32 Size, ARMReg Vd, ARMReg Vm);
	void VREVX(u32 size, u32 Size, ARMReg Vd, ARMReg Vm);
	void VREV64(u32 Size, ARMReg Vd, ARMReg Vm);
	void VREV32(u32 Size, ARMReg Vd, ARMReg Vm);
	void VREV16(u32 Size, ARMReg Vd, ARMReg Vm);

	// NEON immediate instructions

	void VMOV_imm(u32 Size, ARMReg Vd, VIMMMode type, int imm);
	void VMOV_immf(ARMReg Vd, float value); // This only works with a select few values (1.0f and -1.0f).
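
	// Illustrative only (not part of the original header): two ways of loading
	// float constants into a NEON register. How far the u32-immediate overload
	// can go beyond directly encodable patterns is an assumption here:
	//
	//   VMOV_immf(D0, 1.0f);          // one of the few directly encodable values
	//   VMOV_neon(F_32, D1, 1.5f);    // routes the raw bit pattern through VMOV_neon(I_32, ...)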

	void VORR_imm(u32 Size, ARMReg Vd, VIMMMode type, int imm);
	void VMVN_imm(u32 Size, ARMReg Vd, VIMMMode type, int imm);
	void VBIC_imm(u32 Size, ARMReg Vd, VIMMMode type, int imm);

	// Widening and narrowing moves
	void VMOVL(u32 Size, ARMReg Vd, ARMReg Vm);
	void VMOVN(u32 Size, ARMReg Vd, ARMReg Vm);
	void VQMOVN(u32 Size, ARMReg Vd, ARMReg Vm);
	void VQMOVUN(u32 Size, ARMReg Vd, ARMReg Vm);

	// Shifts by immediate
	void VSHL(u32 Size, ARMReg Vd, ARMReg Vm, int shiftAmount);
	void VSHLL(u32 Size, ARMReg Vd, ARMReg Vm, int shiftAmount); // widening
	void VSHR(u32 Size, ARMReg Vd, ARMReg Vm, int shiftAmount);
	void VSHRN(u32 Size, ARMReg Vd, ARMReg Vm, int shiftAmount); // narrowing

	// Vector VCVT
	void VCVT(u32 DestSize, ARMReg Dest, ARMReg Src);

	// Notes:
	// Rm == R_PC is interpreted as no offset; otherwise, the effective address is the sum of Rn and Rm.
	// Rm == R13 is interpreted as VLD1, .... [Rn]! (hence the REG_UPDATE pseudo register).

	// Load/store multiple registers full of elements (a register is a D register).
	// Specifying alignment when it can be guaranteed is documented to improve load/store performance.
	// For example, when loading a set of four 64-bit registers that we know is 32-byte aligned, we should specify ALIGN_256.
	void VLD1(u32 Size, ARMReg Vd, ARMReg Rn, int regCount, NEONAlignment align = ALIGN_NONE, ARMReg Rm = R_PC);
	void VST1(u32 Size, ARMReg Vd, ARMReg Rn, int regCount, NEONAlignment align = ALIGN_NONE, ARMReg Rm = R_PC);

	// Load/store single lanes of D registers
	void VLD1_lane(u32 Size, ARMReg Vd, ARMReg Rn, int lane, bool aligned, ARMReg Rm = R_PC);
	void VST1_lane(u32 Size, ARMReg Vd, ARMReg Rn, int lane, bool aligned, ARMReg Rm = R_PC);

	// Load one value into all lanes of a D or a Q register (either is supported; all formats should work).
	void VLD1_all_lanes(u32 Size, ARMReg Vd, ARMReg Rn, bool aligned, ARMReg Rm = R_PC);
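
	// Illustrative only (not part of the original header): per the notes above,
	// passing REG_UPDATE as Rm requests post-increment writeback:
	//
	//   VLD1(F_32, D0, R0, 2, ALIGN_128, REG_UPDATE);  // D0,D1 = [R0 @128]; R0 += 16
	//   VST1(F_32, D0, R1, 2);                         // [R1] = D0,D1 (Rm defaults to R_PC: no writeback)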

	/*
	// Deinterleaving loads/stores... or something. TODO
	void VLD2(u32 Size, ARMReg Vd, ARMReg Rn, int regCount, NEONAlignment align = ALIGN_NONE, ARMReg Rm = R_PC);
	void VST2(u32 Size, ARMReg Vd, ARMReg Rn, int regCount, NEONAlignment align = ALIGN_NONE, ARMReg Rm = R_PC);

	void VLD2_lane(u32 Size, ARMReg Vd, ARMReg Rn, int lane, ARMReg Rm = R_PC);
	void VST2_lane(u32 Size, ARMReg Vd, ARMReg Rn, int lane, ARMReg Rm = R_PC);

	void VLD3(u32 Size, ARMReg Vd, ARMReg Rn, int regCount, NEONAlignment align = ALIGN_NONE, ARMReg Rm = R_PC);
	void VST3(u32 Size, ARMReg Vd, ARMReg Rn, int regCount, NEONAlignment align = ALIGN_NONE, ARMReg Rm = R_PC);

	void VLD3_lane(u32 Size, ARMReg Vd, ARMReg Rn, int lane, ARMReg Rm = R_PC);
	void VST3_lane(u32 Size, ARMReg Vd, ARMReg Rn, int lane, ARMReg Rm = R_PC);

	void VLD4(u32 Size, ARMReg Vd, ARMReg Rn, int regCount, NEONAlignment align = ALIGN_NONE, ARMReg Rm = R_PC);
	void VST4(u32 Size, ARMReg Vd, ARMReg Rn, int regCount, NEONAlignment align = ALIGN_NONE, ARMReg Rm = R_PC);

	void VLD4_lane(u32 Size, ARMReg Vd, ARMReg Rn, int lane, ARMReg Rm = R_PC);
	void VST4_lane(u32 Size, ARMReg Vd, ARMReg Rn, int lane, ARMReg Rm = R_PC);
	*/

	void VMRS_APSR();
	void VMRS(ARMReg Rt);
	void VMSR(ARMReg Rt);

	void QuickCallFunction(ARMReg scratchreg, const void *func);
	template <typename T> void QuickCallFunction(ARMReg scratchreg, T func) {
		QuickCallFunction(scratchreg, (const void *)func);
	}

	// Wrappers around MOVT/MOVW with fallbacks.
	void MOVI2R(ARMReg reg, u32 val, bool optimize = true);
	void MOVI2FR(ARMReg dest, float val, bool negate = false);
	void MOVI2F(ARMReg dest, float val, ARMReg tempReg, bool negate = false);
	void MOVI2F_neon(ARMReg dest, float val, ARMReg tempReg, bool negate = false);

	// Load pointers without casting
	template <class T> void MOVP2R(ARMReg reg, T *val) {
		MOVI2R(reg, (u32)(uintptr_t)(void *)val);
	}

	void MOVIU2F(ARMReg dest, u32 val, ARMReg tempReg, bool negate = false) {
		union {
			u32 u;
			float f;
		} v = {val};
		MOVI2F(dest, v.f, tempReg, negate);
	}

	void ADDI2R(ARMReg rd, ARMReg rs, u32 val, ARMReg scratch);
	bool TryADDI2R(ARMReg rd, ARMReg rs, u32 val);
	void SUBI2R(ARMReg rd, ARMReg rs, u32 val, ARMReg scratch);
	bool TrySUBI2R(ARMReg rd, ARMReg rs, u32 val);
	void ANDI2R(ARMReg rd, ARMReg rs, u32 val, ARMReg scratch);
	bool TryANDI2R(ARMReg rd, ARMReg rs, u32 val);
	void CMPI2R(ARMReg rs, u32 val, ARMReg scratch);
	bool TryCMPI2R(ARMReg rs, u32 val);
	void TSTI2R(ARMReg rs, u32 val, ARMReg scratch);
	bool TryTSTI2R(ARMReg rs, u32 val);
	void ORI2R(ARMReg rd, ARMReg rs, u32 val, ARMReg scratch);
	bool TryORI2R(ARMReg rd, ARMReg rs, u32 val);
	void EORI2R(ARMReg rd, ARMReg rs, u32 val, ARMReg scratch);
	bool TryEORI2R(ARMReg rd, ARMReg rs, u32 val);
}; // class ARMXEmitter

// Everything that needs to generate machine code should inherit from this.
// You get memory management for free, plus you can use all the MOV etc. functions
// without having to prefix them with gen-> or something similar.
class ARMXCodeBlock : public CodeBlock<ARMXEmitter> {
public:
	void PoisonMemory(int offset) override;
};

// VFP specific
struct VFPEnc {
	s16 opc1;
	s16 opc2;
};
extern const VFPEnc VFPOps[16][2];
extern const char *VFPOpNames[16];

} // namespace ArmGen
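
// Illustrative only (not part of the original header): a typical consumer
// inherits from ARMXCodeBlock and emits code directly. AllocCodeSpace() is
// assumed to come from the CodeBlock<> base in Common/CodeBlock.h.
//
//   class MyJit : public ArmGen::ARMXCodeBlock {
//   public:
//     const u8 *EmitAddOne() {
//       AllocCodeSpace(4096);                    // reserve an executable buffer
//       const u8 *fn = GetCodePointer();
//       ADD(ArmGen::R0, ArmGen::R0, 1);          // AAPCS: argument/result in R0
//       B(ArmGen::R_LR);                         // return to caller
//       FlushIcache();                           // make the new code visible
//       return fn;
//     }
//   };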