Path: blob/main/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
//===- AArch64InstrInfo.cpp - AArch64 Instruction Information ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the AArch64 implementation of the TargetInstrInfo class.
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "AArch64ExpandImm.h"
#include "AArch64FrameLowering.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64PointerAuth.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineCombinerPattern.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Module.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <utility>

using namespace llvm;

#define GET_INSTRINFO_CTOR_DTOR
#include "AArch64GenInstrInfo.inc"

static cl::opt<unsigned> TBZDisplacementBits(
    "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
    cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));

static cl::opt<unsigned> CBZDisplacementBits(
    "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
    cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));

static cl::opt<unsigned>
    BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
                        cl::desc("Restrict range of Bcc instructions (DEBUG)"));

static cl::opt<unsigned>
    BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26),
                      cl::desc("Restrict range of B instructions (DEBUG)"));

AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
    : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
                          AArch64::CATCHRET),
      RI(STI.getTargetTriple()), Subtarget(STI) {}

/// GetInstSize - Return the number of bytes of code the specified
/// instruction may be.  This returns the maximum number of bytes.
unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
  const MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction *MF = MBB.getParent();
  const Function &F = MF->getFunction();
  const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();

  {
    auto Op = MI.getOpcode();
    if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
      return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
  }

  // Meta-instructions emit no code.
  if (MI.isMetaInstruction())
    return 0;

  // FIXME: We currently only handle pseudoinstructions that don't get expanded
  //        before the assembly printer.
  unsigned NumBytes = 0;
  const MCInstrDesc &Desc = MI.getDesc();

  // Size should be preferably set in
  // llvm/lib/Target/AArch64/AArch64InstrInfo.td (default case).
  // Specific cases handle instructions of variable sizes
  switch (Desc.getOpcode()) {
  default:
    if (Desc.getSize())
      return Desc.getSize();

    // Anything not explicitly designated otherwise (i.e. pseudo-instructions
    // with fixed constant size but not specified in .td file) is a normal
    // 4-byte insn.
    NumBytes = 4;
    break;
  case TargetOpcode::STACKMAP:
    // The upper bound for a stackmap intrinsic is the full length of its shadow
    NumBytes = StackMapOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    break;
  case TargetOpcode::PATCHPOINT:
    // The size of the patchpoint intrinsic is the number of bytes requested
    NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    break;
  case TargetOpcode::STATEPOINT:
    NumBytes = StatepointOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    // No patch bytes means a normal call inst is emitted
    if (NumBytes == 0)
      NumBytes = 4;
    break;
  case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
    // If `patchable-function-entry` is set, PATCHABLE_FUNCTION_ENTER
    // instructions are expanded to the specified number of NOPs. Otherwise,
    // they are expanded to 36-byte XRay sleds.
    NumBytes =
        F.getFnAttributeAsParsedInteger("patchable-function-entry", 9) * 4;
    break;
  case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
  case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
    // An XRay sled can be 4 bytes of alignment plus a 32-byte block.
    NumBytes = 36;
    break;
  case TargetOpcode::PATCHABLE_EVENT_CALL:
    // EVENT_CALL XRay sleds are exactly 6 instructions long (no alignment).
    NumBytes = 24;
    break;

  case AArch64::SPACE:
    NumBytes = MI.getOperand(1).getImm();
    break;
  case TargetOpcode::BUNDLE:
    NumBytes = getInstBundleLength(MI);
    break;
  }

  return NumBytes;
}

unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
  unsigned Size = 0;
  MachineBasicBlock::const_instr_iterator I = MI.getIterator();
  MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
  while (++I != E && I->isInsideBundle()) {
    assert(!I->isBundle() && "No nested bundle!");
    Size += getInstSizeInBytes(*I);
  }
  return Size;
}

static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
                            SmallVectorImpl<MachineOperand> &Cond) {
  // Block ends with fall-through condbranch.
  switch (LastInst->getOpcode()) {
  default:
    llvm_unreachable("Unknown branch instruction?");
  case AArch64::Bcc:
    Target = LastInst->getOperand(1).getMBB();
    Cond.push_back(LastInst->getOperand(0));
    break;
  case AArch64::CBZW:
  case AArch64::CBZX:
  case AArch64::CBNZW:
  case AArch64::CBNZX:
    Target = LastInst->getOperand(1).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    break;
  case AArch64::TBZW:
  case AArch64::TBZX:
  case AArch64::TBNZW:
  case AArch64::TBNZX:
    Target = LastInst->getOperand(2).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    Cond.push_back(LastInst->getOperand(1));
  }
}

static unsigned getBranchDisplacementBits(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("unexpected opcode!");
  case AArch64::B:
    return BDisplacementBits;
  case AArch64::TBNZW:
  case AArch64::TBZW:
  case AArch64::TBNZX:
  case AArch64::TBZX:
    return TBZDisplacementBits;
  case AArch64::CBNZW:
  case AArch64::CBZW:
  case AArch64::CBNZX:
  case AArch64::CBZX:
    return CBZDisplacementBits;
  case AArch64::Bcc:
    return BCCDisplacementBits;
  }
}

bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
                                             int64_t BrOffset) const {
  unsigned Bits = getBranchDisplacementBits(BranchOp);
  assert(Bits >= 3 && "max branch displacement must be enough to jump"
                      "over conditional branch expansion");
  return isIntN(Bits, BrOffset / 4);
}

MachineBasicBlock *
AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected opcode!");
  case AArch64::B:
    return MI.getOperand(0).getMBB();
  case AArch64::TBZW:
  case AArch64::TBNZW:
  case AArch64::TBZX:
  case AArch64::TBNZX:
    return MI.getOperand(2).getMBB();
  case AArch64::CBZW:
  case AArch64::CBNZW:
  case AArch64::CBZX:
  case AArch64::CBNZX:
  case AArch64::Bcc:
    return MI.getOperand(1).getMBB();
  }
}

void AArch64InstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
                                            MachineBasicBlock &NewDestBB,
                                            MachineBasicBlock &RestoreBB,
                                            const DebugLoc &DL,
                                            int64_t BrOffset,
                                            RegScavenger *RS) const {
  assert(RS && "RegScavenger required for long branching");
  assert(MBB.empty() &&
         "new block should be inserted for expanding unconditional branch");
  assert(MBB.pred_size() == 1);
  assert(RestoreBB.empty() &&
         "restore block should be inserted for restoring clobbered registers");

  auto buildIndirectBranch = [&](Register Reg, MachineBasicBlock &DestBB) {
    // Offsets outside of the signed 33-bit range are not supported for ADRP +
    // ADD.
    if (!isInt<33>(BrOffset))
      report_fatal_error(
          "Branch offsets outside of the signed 33-bit range not supported");

    BuildMI(MBB, MBB.end(), DL, get(AArch64::ADRP), Reg)
        .addSym(DestBB.getSymbol(), AArch64II::MO_PAGE);
    BuildMI(MBB, MBB.end(), DL, get(AArch64::ADDXri), Reg)
        .addReg(Reg)
        .addSym(DestBB.getSymbol(), AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
        .addImm(0);
    BuildMI(MBB, MBB.end(), DL, get(AArch64::BR)).addReg(Reg);
  };

  RS->enterBasicBlockEnd(MBB);
  // If X16 is unused, we can rely on the linker to insert a range extension
  // thunk if NewDestBB is out of range of a single B instruction.
  constexpr Register Reg = AArch64::X16;
  if (!RS->isRegUsed(Reg)) {
    insertUnconditionalBranch(MBB, &NewDestBB, DL);
    RS->setRegUsed(Reg);
    return;
  }

  // If there's a free register and it's worth inflating the code size,
  // manually insert the indirect branch.
  Register Scavenged = RS->FindUnusedReg(&AArch64::GPR64RegClass);
  if (Scavenged != AArch64::NoRegister &&
      MBB.getSectionID() == MBBSectionID::ColdSectionID) {
    buildIndirectBranch(Scavenged, NewDestBB);
    RS->setRegUsed(Scavenged);
    return;
  }

  // Note: Spilling X16 briefly moves the stack pointer, making it incompatible
  // with red zones.
  AArch64FunctionInfo *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>();
  if (!AFI || AFI->hasRedZone().value_or(true))
    report_fatal_error(
        "Unable to insert indirect branch inside function that has red zone");

  // Otherwise, spill X16 and defer range extension to the linker.
  BuildMI(MBB, MBB.end(), DL, get(AArch64::STRXpre))
      .addReg(AArch64::SP, RegState::Define)
      .addReg(Reg)
      .addReg(AArch64::SP)
      .addImm(-16);

  BuildMI(MBB, MBB.end(), DL, get(AArch64::B)).addMBB(&RestoreBB);

  BuildMI(RestoreBB, RestoreBB.end(), DL, get(AArch64::LDRXpost))
      .addReg(AArch64::SP, RegState::Define)
      .addReg(Reg, RegState::Define)
      .addReg(AArch64::SP)
      .addImm(16);
}

// Branch analysis.
bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
                                     MachineBasicBlock *&TBB,
                                     MachineBasicBlock *&FBB,
                                     SmallVectorImpl<MachineOperand> &Cond,
                                     bool AllowModify) const {
  // If the block has no terminators, it just falls into the block after it.
  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return false;

  // Skip over SpeculationBarrierEndBB terminators
  if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
      I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
    --I;
  }

  if (!isUnpredicatedTerminator(*I))
    return false;

  // Get the last instruction in the block.
  MachineInstr *LastInst = &*I;

  // If there is only one terminator instruction, process it.
  unsigned LastOpc = LastInst->getOpcode();
  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
    if (isUncondBranchOpcode(LastOpc)) {
      TBB = LastInst->getOperand(0).getMBB();
      return false;
    }
    if (isCondBranchOpcode(LastOpc)) {
      // Block ends with fall-through condbranch.
      parseCondBranch(LastInst, TBB, Cond);
      return false;
    }
    return true; // Can't handle indirect branch.
  }

  // Get the instruction before it if it is a terminator.
  MachineInstr *SecondLastInst = &*I;
  unsigned SecondLastOpc = SecondLastInst->getOpcode();

  // If AllowModify is true and the block ends with two or more unconditional
  // branches, delete all but the first unconditional branch.
  if (AllowModify && isUncondBranchOpcode(LastOpc)) {
    while (isUncondBranchOpcode(SecondLastOpc)) {
      LastInst->eraseFromParent();
      LastInst = SecondLastInst;
      LastOpc = LastInst->getOpcode();
      if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
        // Return now; the only terminator is an unconditional branch.
        TBB = LastInst->getOperand(0).getMBB();
        return false;
      }
      SecondLastInst = &*I;
      SecondLastOpc = SecondLastInst->getOpcode();
    }
  }

  // If we're allowed to modify and the block ends in an unconditional branch
  // which could simply fallthrough, remove the branch. (Note: This case only
  // matters when we can't understand the whole sequence, otherwise it's also
  // handled by BranchFolding.cpp.)
  if (AllowModify && isUncondBranchOpcode(LastOpc) &&
      MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) {
    LastInst->eraseFromParent();
    LastInst = SecondLastInst;
    LastOpc = LastInst->getOpcode();
    if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
      assert(!isUncondBranchOpcode(LastOpc) &&
             "unreachable unconditional branches removed above");

      if (isCondBranchOpcode(LastOpc)) {
        // Block ends with fall-through condbranch.
        parseCondBranch(LastInst, TBB, Cond);
        return false;
      }
      return true; // Can't handle indirect branch.
    }
    SecondLastInst = &*I;
    SecondLastOpc = SecondLastInst->getOpcode();
  }

  // If there are three terminators, we don't know what sort of block this is.
  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
    return true;

  // If the block ends with a B and a Bcc, handle it.
  if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    parseCondBranch(SecondLastInst, TBB, Cond);
    FBB = LastInst->getOperand(0).getMBB();
    return false;
  }

  // If the block ends with two unconditional branches, handle it. The second
  // one is not executed, so remove it.
  if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    TBB = SecondLastInst->getOperand(0).getMBB();
    I = LastInst;
    if (AllowModify)
      I->eraseFromParent();
    return false;
  }

  // ...likewise if it ends with an indirect branch followed by an unconditional
  // branch.
  if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    I = LastInst;
    if (AllowModify)
      I->eraseFromParent();
    return true;
  }

  // Otherwise, can't handle this.
  return true;
}

bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
                                              MachineBranchPredicate &MBP,
                                              bool AllowModify) const {
  // For the moment, handle only a block which ends with a cb(n)zx followed by
  // a fallthrough. Why this? Because it is a common form.
  // TODO: Should we handle b.cc?

  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return true;

  // Skip over SpeculationBarrierEndBB terminators
  if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
      I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
    --I;
  }

  if (!isUnpredicatedTerminator(*I))
    return true;

  // Get the last instruction in the block.
  MachineInstr *LastInst = &*I;
  unsigned LastOpc = LastInst->getOpcode();
  if (!isCondBranchOpcode(LastOpc))
    return true;

  switch (LastOpc) {
  default:
    return true;
  case AArch64::CBZW:
  case AArch64::CBZX:
  case AArch64::CBNZW:
  case AArch64::CBNZX:
    break;
  };

  MBP.TrueDest = LastInst->getOperand(1).getMBB();
  assert(MBP.TrueDest && "expected!");
  MBP.FalseDest = MBB.getNextNode();

  MBP.ConditionDef = nullptr;
  MBP.SingleUseCondition = false;

  MBP.LHS = LastInst->getOperand(0);
  MBP.RHS = MachineOperand::CreateImm(0);
  MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE
                                            : MachineBranchPredicate::PRED_EQ;
  return false;
}

bool AArch64InstrInfo::reverseBranchCondition(
    SmallVectorImpl<MachineOperand> &Cond) const {
  if (Cond[0].getImm() != -1) {
    // Regular Bcc
    AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
    Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
  } else {
    // Folded compare-and-branch
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown conditional branch!");
    case AArch64::CBZW:
      Cond[1].setImm(AArch64::CBNZW);
      break;
    case AArch64::CBNZW:
      Cond[1].setImm(AArch64::CBZW);
      break;
    case AArch64::CBZX:
      Cond[1].setImm(AArch64::CBNZX);
      break;
    case AArch64::CBNZX:
      Cond[1].setImm(AArch64::CBZX);
      break;
    case AArch64::TBZW:
      Cond[1].setImm(AArch64::TBNZW);
      break;
    case AArch64::TBNZW:
      Cond[1].setImm(AArch64::TBZW);
      break;
    case AArch64::TBZX:
      Cond[1].setImm(AArch64::TBNZX);
      break;
    case AArch64::TBNZX:
      Cond[1].setImm(AArch64::TBZX);
      break;
    }
  }

  return false;
}

unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
                                        int *BytesRemoved) const {
  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return 0;

  if (!isUncondBranchOpcode(I->getOpcode()) &&
      !isCondBranchOpcode(I->getOpcode()))
    return 0;

  // Remove the branch.
  I->eraseFromParent();

  I = MBB.end();

  if (I == MBB.begin()) {
    if (BytesRemoved)
      *BytesRemoved = 4;
    return 1;
  }
  --I;
  if (!isCondBranchOpcode(I->getOpcode())) {
    if (BytesRemoved)
      *BytesRemoved = 4;
    return 1;
  }

  // Remove the branch.
  I->eraseFromParent();
  if (BytesRemoved)
    *BytesRemoved = 8;

  return 2;
}

void AArch64InstrInfo::instantiateCondBranch(
    MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
    ArrayRef<MachineOperand> Cond) const {
  if (Cond[0].getImm() != -1) {
    // Regular Bcc
    BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
  } else {
    // Folded compare-and-branch
    // Note that we use addOperand instead of addReg to keep the flags.
    const MachineInstrBuilder MIB =
        BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
    if (Cond.size() > 3)
      MIB.addImm(Cond[3].getImm());
    MIB.addMBB(TBB);
  }
}

unsigned AArch64InstrInfo::insertBranch(
    MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
    ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
  // Shouldn't be a fall through.
  assert(TBB && "insertBranch must not be told to insert a fallthrough");

  if (!FBB) {
    if (Cond.empty()) // Unconditional branch?
      BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
    else
      instantiateCondBranch(MBB, DL, TBB, Cond);

    if (BytesAdded)
      *BytesAdded = 4;

    return 1;
  }

  // Two-way conditional branch.
  instantiateCondBranch(MBB, DL, TBB, Cond);
  BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);

  if (BytesAdded)
    *BytesAdded = 8;

  return 2;
}

// Find the original register that VReg is copied from.
static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
  while (Register::isVirtualRegister(VReg)) {
    const MachineInstr *DefMI = MRI.getVRegDef(VReg);
    if (!DefMI->isFullCopy())
      return VReg;
    VReg = DefMI->getOperand(1).getReg();
  }
  return VReg;
}

// Determine if VReg is defined by an instruction that can be folded into a
// csel instruction. If so, return the folded opcode, and the replacement
// register.
static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
                                unsigned *NewVReg = nullptr) {
  VReg = removeCopies(MRI, VReg);
  if (!Register::isVirtualRegister(VReg))
    return 0;

  bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
  const MachineInstr *DefMI = MRI.getVRegDef(VReg);
  unsigned Opc = 0;
  unsigned SrcOpNum = 0;
  switch (DefMI->getOpcode()) {
  case AArch64::ADDSXri:
  case AArch64::ADDSWri:
    // if NZCV is used, do not fold.
    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
                                         true) == -1)
      return 0;
    // fall-through to ADDXri and ADDWri.
    [[fallthrough]];
  case AArch64::ADDXri:
  case AArch64::ADDWri:
    // add x, 1 -> csinc.
    if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
        DefMI->getOperand(3).getImm() != 0)
      return 0;
    SrcOpNum = 1;
    Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
    break;

  case AArch64::ORNXrr:
  case AArch64::ORNWrr: {
    // not x -> csinv, represented as orn dst, xzr, src.
    unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
    if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
      return 0;
    SrcOpNum = 2;
    Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
    break;
  }

  case AArch64::SUBSXrr:
  case AArch64::SUBSWrr:
    // if NZCV is used, do not fold.
    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
                                         true) == -1)
      return 0;
    // fall-through to SUBXrr and SUBWrr.
    [[fallthrough]];
  case AArch64::SUBXrr:
  case AArch64::SUBWrr: {
    // neg x -> csneg, represented as sub dst, xzr, src.
    unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
    if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
      return 0;
    SrcOpNum = 2;
    Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
    break;
  }
  default:
    return 0;
  }
  assert(Opc && SrcOpNum && "Missing parameters");

  if (NewVReg)
    *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
  return Opc;
}

bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
                                       ArrayRef<MachineOperand> Cond,
                                       Register DstReg, Register TrueReg,
                                       Register FalseReg, int &CondCycles,
                                       int &TrueCycles,
                                       int &FalseCycles) const {
  // Check register classes.
  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetRegisterClass *RC =
      RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
  if (!RC)
    return false;

  // Also need to check the dest regclass, in case we're trying to optimize
  // something like:
  // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
  if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
    return false;

  // Expanding cbz/tbz requires an extra cycle of latency on the condition.
  unsigned ExtraCondLat = Cond.size() != 1;

  // GPRs are handled by csel.
  // FIXME: Fold in x+1, -x, and ~x when applicable.
  if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
      AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
    // Single-cycle csel, csinc, csinv, and csneg.
    CondCycles = 1 + ExtraCondLat;
    TrueCycles = FalseCycles = 1;
    if (canFoldIntoCSel(MRI, TrueReg))
      TrueCycles = 0;
    else if (canFoldIntoCSel(MRI, FalseReg))
      FalseCycles = 0;
    return true;
  }

  // Scalar floating point is handled by fcsel.
  // FIXME: Form fabs, fmin, and fmax when applicable.
  if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
      AArch64::FPR32RegClass.hasSubClassEq(RC)) {
    CondCycles = 5 + ExtraCondLat;
    TrueCycles = FalseCycles = 2;
    return true;
  }

  // Can't do vectors.
  return false;
}

void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I,
                                    const DebugLoc &DL, Register DstReg,
                                    ArrayRef<MachineOperand> Cond,
                                    Register TrueReg, Register FalseReg) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  // Parse the condition code, see parseCondBranch() above.
  AArch64CC::CondCode CC;
  switch (Cond.size()) {
  default:
    llvm_unreachable("Unknown condition opcode in Cond");
  case 1: // b.cc
    CC = AArch64CC::CondCode(Cond[0].getImm());
    break;
  case 3: { // cbz/cbnz
    // We must insert a compare against 0.
    bool Is64Bit;
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::CBZW:
      Is64Bit = false;
      CC = AArch64CC::EQ;
      break;
    case AArch64::CBZX:
      Is64Bit = true;
      CC = AArch64CC::EQ;
      break;
    case AArch64::CBNZW:
      Is64Bit = false;
      CC = AArch64CC::NE;
      break;
    case AArch64::CBNZX:
      Is64Bit = true;
      CC = AArch64CC::NE;
      break;
    }
    Register SrcReg = Cond[2].getReg();
    if (Is64Bit) {
      // cmp reg, #0 is actually subs xzr, reg, #0.
      MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
          .addReg(SrcReg)
          .addImm(0)
          .addImm(0);
    } else {
      MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
          .addReg(SrcReg)
          .addImm(0)
          .addImm(0);
    }
    break;
  }
  case 4: { // tbz/tbnz
    // We must insert a tst instruction.
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::TBZW:
    case AArch64::TBZX:
      CC = AArch64CC::EQ;
      break;
    case AArch64::TBNZW:
    case AArch64::TBNZX:
      CC = AArch64CC::NE;
      break;
    }
    // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
    if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
      BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
    else
      BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
    break;
  }
  }

  unsigned Opc = 0;
  const TargetRegisterClass *RC = nullptr;
  bool TryFold = false;
  if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
    RC = &AArch64::GPR64RegClass;
    Opc = AArch64::CSELXr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
    RC = &AArch64::GPR32RegClass;
    Opc = AArch64::CSELWr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FCSELDrrr;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
    RC = &AArch64::FPR32RegClass;
    Opc = AArch64::FCSELSrrr;
  }
  assert(RC && "Unsupported regclass");

  // Try folding simple instructions into the csel.
  if (TryFold) {
    unsigned NewVReg = 0;
    unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
    if (FoldedOpc) {
      // The folded opcodes csinc, csinv, and csneg apply the operation to
      // FalseReg, so we need to invert the condition.
      CC = AArch64CC::getInvertedCondCode(CC);
      TrueReg = FalseReg;
    } else
      FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);

    // Fold the operation. Leave any dead instructions for DCE to clean up.
    if (FoldedOpc) {
      FalseReg = NewVReg;
      Opc = FoldedOpc;
      // This extends the live range of NewVReg.
      MRI.clearKillFlags(NewVReg);
    }
  }

  // Pull all virtual registers into the appropriate class.
  MRI.constrainRegClass(TrueReg, RC);
  MRI.constrainRegClass(FalseReg, RC);

  // Insert the csel.
  BuildMI(MBB, I, DL, get(Opc), DstReg)
      .addReg(TrueReg)
      .addReg(FalseReg)
      .addImm(CC);
}

// Return true if Imm can be loaded into a register by a "cheap" sequence of
// instructions. For now, "cheap" means at most two instructions.
static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
  if (BitSize == 32)
    return true;

  assert(BitSize == 64 && "Only bit sizes of 32 or 64 allowed");
  uint64_t Imm = static_cast<uint64_t>(MI.getOperand(1).getImm());
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Is;
  AArch64_IMM::expandMOVImm(Imm, BitSize, Is);

  return Is.size() <= 2;
}

// FIXME: this implementation should be micro-architecture dependent, so a
// micro-architecture target hook should be introduced here in future.
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
  if (Subtarget.hasExynosCheapAsMoveHandling()) {
    if (isExynosCheapAsMove(MI))
      return true;
    return MI.isAsCheapAsAMove();
  }

  switch (MI.getOpcode()) {
  default:
    return MI.isAsCheapAsAMove();

  case AArch64::ADDWrs:
  case AArch64::ADDXrs:
  case AArch64::SUBWrs:
  case AArch64::SUBXrs:
    return Subtarget.hasALULSLFast() && MI.getOperand(3).getImm() <= 4;

  // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
  // ORRXri, it is as cheap as MOV.
  // Likewise if it can be expanded to MOVZ/MOVN/MOVK.
  case AArch64::MOVi32imm:
    return isCheapImmediate(MI, 32);
  case AArch64::MOVi64imm:
    return isCheapImmediate(MI, 64);
  }
}

bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;

  case AArch64::ADDWrs:
  case AArch64::ADDXrs:
  case AArch64::ADDSWrs:
  case AArch64::ADDSXrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    if (ShiftVal == 0)
      return true;
    return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
  }

  case AArch64::ADDWrx:
  case AArch64::ADDXrx:
  case AArch64::ADDXrx64:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrx:
  case AArch64::ADDSXrx64: {
    unsigned Imm = MI.getOperand(3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) <= 4;
    }
  }

  case AArch64::SUBWrs:
  case AArch64::SUBSWrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
  }

  case AArch64::SUBXrs:
  case AArch64::SUBSXrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
  }

  case AArch64::SUBWrx:
  case AArch64::SUBXrx:
  case AArch64::SUBXrx64:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrx:
  case AArch64::SUBSXrx64: {
    unsigned Imm = MI.getOperand(3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) == 0;
    }
  }

  case AArch64::LDRBBroW:
  case AArch64::LDRBBroX:
  case AArch64::LDRBroW:
  case AArch64::LDRBroX:
  case AArch64::LDRDroW:
  case AArch64::LDRDroX:
  case AArch64::LDRHHroW:
  case AArch64::LDRHHroX:
  case AArch64::LDRHroW:
  case AArch64::LDRHroX:
  case AArch64::LDRQroW:
  case AArch64::LDRQroX:
  case AArch64::LDRSBWroW:
  case AArch64::LDRSBWroX:
  case AArch64::LDRSBXroW:
  case AArch64::LDRSBXroX:
  case AArch64::LDRSHWroW:
  case AArch64::LDRSHWroX:
  case AArch64::LDRSHXroW:
  case AArch64::LDRSHXroX:
  case AArch64::LDRSWroW:
  case AArch64::LDRSWroX:
  case AArch64::LDRSroW:
  case AArch64::LDRSroX:
  case AArch64::LDRWroW:
  case AArch64::LDRWroX:
  case AArch64::LDRXroW:
  case AArch64::LDRXroX:
  case AArch64::PRFMroW:
  case AArch64::PRFMroX:
  case AArch64::STRBBroW:
  case AArch64::STRBBroX:
  case AArch64::STRBroW:
  case AArch64::STRBroX:
  case AArch64::STRDroW:
  case AArch64::STRDroX:
  case AArch64::STRHHroW:
  case AArch64::STRHHroX:
  case AArch64::STRHroW:
  case AArch64::STRHroX:
  case AArch64::STRQroW:
  case AArch64::STRQroX:
  case AArch64::STRSroW:
  case AArch64::STRSroX:
  case AArch64::STRWroW:
  case AArch64::STRWroX:
  case AArch64::STRXroW:
  case AArch64::STRXroX: {
    unsigned IsSigned = MI.getOperand(3).getImm();
    return !IsSigned;
  }
  }
}

bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  default:
    return false;
  case AArch64::SEH_StackAlloc:
  case AArch64::SEH_SaveFPLR:
  case AArch64::SEH_SaveFPLR_X:
  case AArch64::SEH_SaveReg:
  case AArch64::SEH_SaveReg_X:
  case AArch64::SEH_SaveRegP:
  case AArch64::SEH_SaveRegP_X:
  case AArch64::SEH_SaveFReg:
  case AArch64::SEH_SaveFReg_X:
  case AArch64::SEH_SaveFRegP:
  case AArch64::SEH_SaveFRegP_X:
  case AArch64::SEH_SetFP:
  case AArch64::SEH_AddFP:
  case AArch64::SEH_Nop:
  case AArch64::SEH_PrologEnd:
  case AArch64::SEH_EpilogStart:
  case AArch64::SEH_EpilogEnd:
  case AArch64::SEH_PACSignLR:
  case AArch64::SEH_SaveAnyRegQP:
  case AArch64::SEH_SaveAnyRegQPX:
    return true;
  }
}

bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
                                             Register &SrcReg, Register &DstReg,
                                             unsigned &SubIdx) const {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::SBFMXri: // aka sxtw
  case AArch64::UBFMXri: // aka uxtw
    // Check for the 32 -> 64 bit extension case, these instructions can do
    // much more.
    if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
      return false;
    // This is a signed or unsigned 32 -> 64 bit extension.
    SrcReg = MI.getOperand(1).getReg();
    DstReg = MI.getOperand(0).getReg();
    SubIdx = AArch64::sub_32;
    return true;
  }
}

bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
    const MachineInstr &MIa, const MachineInstr &MIb) const {
  const TargetRegisterInfo *TRI = &getRegisterInfo();
  const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
  int64_t OffsetA = 0, OffsetB = 0;
  TypeSize WidthA(0, false), WidthB(0, false);
  bool OffsetAIsScalable = false, OffsetBIsScalable = false;

  assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
  assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");

  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
      MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
    return false;

  // Retrieve the base, offset from the base and width. Width
  // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8).  If
  // the bases are identical, and the offset of a lower memory access +
  // the width doesn't overlap the offset of a higher memory access,
  // then the memory accesses are different.
  // If OffsetAIsScalable and OffsetBIsScalable are both true, they
  // are assumed to have the same scale (vscale).
  if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
                                   WidthA, TRI) &&
      getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
                                   WidthB, TRI)) {
    if (BaseOpA->isIdenticalTo(*BaseOpB) &&
        OffsetAIsScalable == OffsetBIsScalable) {
      int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
      int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
      TypeSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
      if (LowWidth.isScalable() == OffsetAIsScalable &&
          LowOffset + (int)LowWidth.getKnownMinValue() <= HighOffset)
        return true;
    }
  }
  return false;
}

bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
                                            const MachineBasicBlock *MBB,
                                            const MachineFunction &MF) const {
  if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
    return true;

  // Do not move an instruction that can be recognized as a branch target.
  if (hasBTISemantics(MI))
    return true;

  switch (MI.getOpcode()) {
  case AArch64::HINT:
    // CSDB hints are scheduling barriers.
    if (MI.getOperand(0).getImm() == 0x14)
      return true;
    break;
  case AArch64::DSB:
  case AArch64::ISB:
    // DSB and ISB also are scheduling barriers.
    return true;
  case AArch64::MSRpstatesvcrImm1:
    // SMSTART and SMSTOP are also scheduling barriers.
    return true;
  default:;
  }
  if (isSEHInstruction(MI))
    return true;
  auto Next = std::next(MI.getIterator());
  return Next != MBB->end() && Next->isCFIInstruction();
}

/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
/// Return true if the comparison instruction can be analyzed.
bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
                                      Register &SrcReg2, int64_t &CmpMask,
                                      int64_t &CmpValue) const {
  // The first operand can be a frame index where we'd normally expect a
  // register.
  assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
  if (!MI.getOperand(1).isReg())
    return false;

  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::PTEST_PP:
  case AArch64::PTEST_PP_ANY:
    SrcReg = MI.getOperand(0).getReg();
    SrcReg2 = MI.getOperand(1).getReg();
    // Not sure about the mask and value for now...
    CmpMask = ~0;
    CmpValue = 0;
    return true;
  case AArch64::SUBSWrr:
  case AArch64::SUBSWrs:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrr:
  case AArch64::SUBSXrs:
  case AArch64::SUBSXrx:
  case AArch64::ADDSWrr:
  case AArch64::ADDSWrs:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrr:
  case AArch64::ADDSXrs:
  case AArch64::ADDSXrx:
    // Replace SUBSWrr with SUBWrr if NZCV is not used.
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = MI.getOperand(2).getReg();
    CmpMask = ~0;
    CmpValue = 0;
    return true;
  case AArch64::SUBSWri:
  case AArch64::ADDSWri:
  case AArch64::SUBSXri:
  case AArch64::ADDSXri:
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = 0;
    CmpMask = ~0;
    CmpValue = MI.getOperand(2).getImm();
    return true;
  case AArch64::ANDSWri:
  case AArch64::ANDSXri:
    // ANDS does not use the same encoding scheme as the other xxxS
    // instructions.
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = 0;
    CmpMask = ~0;
    CmpValue = AArch64_AM::decodeLogicalImmediate(
        MI.getOperand(2).getImm(),
        MI.getOpcode() == AArch64::ANDSWri ? 32 : 64);
    return true;
  }

  return false;
}

static bool UpdateOperandRegClass(MachineInstr &Instr) {
  MachineBasicBlock *MBB = Instr.getParent();
  assert(MBB && "Can't get MachineBasicBlock here");
  MachineFunction *MF = MBB->getParent();
  assert(MF && "Can't get MachineFunction here");
  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
  MachineRegisterInfo *MRI = &MF->getRegInfo();

  for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
       ++OpIdx) {
    MachineOperand &MO = Instr.getOperand(OpIdx);
    const TargetRegisterClass *OpRegCstraints =
        Instr.getRegClassConstraint(OpIdx, TII, TRI);

    // If there's no constraint, there's nothing to do.
    if (!OpRegCstraints)
      continue;
    // If the operand is a frame index, there's nothing to do here.
    // A frame index operand will resolve correctly during PEI.
    if (MO.isFI())
      continue;

    assert(MO.isReg() &&
           "Operand has register constraints without being a register!");

    Register Reg = MO.getReg();
    if (Reg.isPhysical()) {
      if (!OpRegCstraints->contains(Reg))
        return false;
    } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
               !MRI->constrainRegClass(Reg, OpRegCstraints))
      return false;
  }

  return true;
}

/// Return the opcode that does not set flags when possible - otherwise
/// return the original opcode. The caller is responsible to do the actual
/// substitution and legality checking.
static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
  // Don't convert all compare instructions, because for some the zero register
  // encoding becomes the sp register.
  bool MIDefinesZeroReg = false;
  if (MI.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
      MI.definesRegister(AArch64::XZR, /*TRI=*/nullptr))
    MIDefinesZeroReg = true;

  switch (MI.getOpcode()) {
  default:
    return MI.getOpcode();
  case AArch64::ADDSWrr:
    return AArch64::ADDWrr;
  case AArch64::ADDSWri:
    return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
  case AArch64::ADDSWrs:
    return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
  case AArch64::ADDSWrx:
    return AArch64::ADDWrx;
  case AArch64::ADDSXrr:
    return AArch64::ADDXrr;
  case AArch64::ADDSXri:
    return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
  case AArch64::ADDSXrs:
    return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
  case AArch64::ADDSXrx:
    return AArch64::ADDXrx;
  case AArch64::SUBSWrr:
    return AArch64::SUBWrr;
  case AArch64::SUBSWri:
    return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
  case AArch64::SUBSWrs:
    return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
  case AArch64::SUBSWrx:
    return AArch64::SUBWrx;
  case AArch64::SUBSXrr:
    return AArch64::SUBXrr;
  case AArch64::SUBSXri:
    return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
  case AArch64::SUBSXrs:
    return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
  case AArch64::SUBSXrx:
    return AArch64::SUBXrx;
  }
}

enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };

/// True when condition flags are accessed (either by writing or reading)
/// on the instruction trace starting at From and ending at To.
///
/// Note: If From and To are from different blocks it's assumed CC are accessed
/// on the path.
static bool areCFlagsAccessedBetweenInstrs(
    MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
    const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
  // Early exit if To is at the beginning of the BB.
  if (To == To->getParent()->begin())
    return true;

  // Check whether the instructions are in the same basic block
  // If not, assume the condition flags might get modified somewhere.
  if (To->getParent() != From->getParent())
    return true;

  // From must be above To.
  assert(std::any_of(
      ++To.getReverse(), To->getParent()->rend(),
      [From](MachineInstr &MI) { return MI.getIterator() == From; }));

  // We iterate backward starting at \p To until we hit \p From.
  for (const MachineInstr &Instr :
       instructionsWithoutDebug(++To.getReverse(), From.getReverse())) {
    if (((AccessToCheck & AK_Write) &&
         Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
        ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
      return true;
  }
  return false;
}

std::optional<unsigned>
AArch64InstrInfo::canRemovePTestInstr(MachineInstr *PTest, MachineInstr *Mask,
                                      MachineInstr *Pred,
                                      const MachineRegisterInfo *MRI) const {
  unsigned MaskOpcode = Mask->getOpcode();
  unsigned PredOpcode = Pred->getOpcode();
  bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
  bool PredIsWhileLike = isWhileOpcode(PredOpcode);

  if (PredIsWhileLike) {
    // For PTEST(PG, PG), PTEST is redundant when PG is the result of a WHILEcc
    // instruction and the condition is "any" since WHILEcc does an implicit
    // PTEST(ALL, PG) check and PG is always a subset of ALL.
    if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
      return PredOpcode;

    // For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is
    // redundant since WHILE performs an implicit PTEST with an all active
    // mask.
    if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
        getElementSizeForOpcode(MaskOpcode) ==
            getElementSizeForOpcode(PredOpcode))
      return PredOpcode;

    return {};
  }

  if (PredIsPTestLike) {
    // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
    // instruction that sets the flags as PTEST would and the condition is
    // "any" since PG is always a subset of the governing predicate of the
    // ptest-like instruction.
    if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
      return PredOpcode;

    // For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if the
    // element size matches and either the PTEST_LIKE instruction uses
    // the same all active mask or the condition is "any".
    if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
        getElementSizeForOpcode(MaskOpcode) ==
            getElementSizeForOpcode(PredOpcode)) {
      auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
      if (Mask == PTestLikeMask || PTest->getOpcode() == AArch64::PTEST_PP_ANY)
        return PredOpcode;
    }

    // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the
since the1405// flags are set based on the same mask 'PG', but PTEST_LIKE must operate1406// on 8-bit predicates like the PTEST. Otherwise, for instructions like1407// compare that also support 16/32/64-bit predicates, the implicit PTEST1408// performed by the compare could consider fewer lanes for these element1409// sizes.1410//1411// For example, consider1412//1413// ptrue p0.b ; P0=1111-1111-1111-11111414// index z0.s, #0, #1 ; Z0=<0,1,2,3>1415// index z1.s, #1, #1 ; Z1=<1,2,3,4>1416// cmphi p1.s, p0/z, z1.s, z0.s ; P1=0001-0001-0001-00011417// ; ^ last active1418// ptest p0, p1.b ; P1=0001-0001-0001-00011419// ; ^ last active1420//1421// where the compare generates a canonical all active 32-bit predicate1422// (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last1423// active flag, whereas the PTEST instruction with the same mask doesn't.1424// For PTEST_ANY this doesn't apply as the flags in this case would be1425// identical regardless of element size.1426auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());1427uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);1428if (Mask == PTestLikeMask && (PredElementSize == AArch64::ElementSizeB ||1429PTest->getOpcode() == AArch64::PTEST_PP_ANY))1430return PredOpcode;14311432return {};1433}14341435// If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the1436// opcode so the PTEST becomes redundant.1437switch (PredOpcode) {1438case AArch64::AND_PPzPP:1439case AArch64::BIC_PPzPP:1440case AArch64::EOR_PPzPP:1441case AArch64::NAND_PPzPP:1442case AArch64::NOR_PPzPP:1443case AArch64::ORN_PPzPP:1444case AArch64::ORR_PPzPP:1445case AArch64::BRKA_PPzP:1446case AArch64::BRKPA_PPzPP:1447case AArch64::BRKB_PPzP:1448case AArch64::BRKPB_PPzPP:1449case AArch64::RDFFR_PPz: {1450// Check to see if our mask is the same. If not the resulting flag bits1451// may be different and we can't remove the ptest.1452auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());1453if (Mask != PredMask)1454return {};1455break;1456}1457case AArch64::BRKN_PPzP: {1458// BRKN uses an all active implicit mask to set flags unlike the other1459// flag-setting instructions.1460// PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B).1461if ((MaskOpcode != AArch64::PTRUE_B) ||1462(Mask->getOperand(1).getImm() != 31))1463return {};1464break;1465}1466case AArch64::PTRUE_B:1467// PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A)1468break;1469default:1470// Bail out if we don't recognize the input1471return {};1472}14731474return convertToFlagSettingOpc(PredOpcode);1475}14761477/// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating1478/// operation which could set the flags in an identical manner1479bool AArch64InstrInfo::optimizePTestInstr(1480MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,1481const MachineRegisterInfo *MRI) const {1482auto *Mask = MRI->getUniqueVRegDef(MaskReg);1483auto *Pred = MRI->getUniqueVRegDef(PredReg);1484unsigned PredOpcode = Pred->getOpcode();1485auto NewOp = canRemovePTestInstr(PTest, Mask, Pred, MRI);1486if (!NewOp)1487return false;14881489const TargetRegisterInfo *TRI = &getRegisterInfo();14901491// If another instruction between Pred and PTest accesses flags, don't remove1492// the ptest or update the earlier instruction to modify them.1493if (areCFlagsAccessedBetweenInstrs(Pred, PTest, TRI))1494return false;14951496// If we pass all the checks, it's safe to remove the PTEST and use the flags1497// as they are prior to PTEST. 
  // operand to be replaced with an equivalent instruction that also sets the
  // flags.
  PTest->eraseFromParent();
  if (*NewOp != PredOpcode) {
    Pred->setDesc(get(*NewOp));
    bool succeeded = UpdateOperandRegClass(*Pred);
    (void)succeeded;
    assert(succeeded && "Operands have incompatible register classes!");
    Pred->addRegisterDefined(AArch64::NZCV, TRI);
  }

  // Ensure that the flags def is live.
  if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) {
    unsigned i = 0, e = Pred->getNumOperands();
    for (; i != e; ++i) {
      MachineOperand &MO = Pred->getOperand(i);
      if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
        MO.setIsDead(false);
        break;
      }
    }
  }
  return true;
}

/// Try to optimize a compare instruction. A compare instruction is an
/// instruction which produces AArch64::NZCV. It can be truly a compare
/// instruction when there are no uses of its destination register.
///
/// The following steps are tried in order:
/// 1. Convert CmpInstr into an unconditional version.
/// 2. Remove CmpInstr if above there is an instruction producing a needed
///    condition code or an instruction which can be converted into such an
///    instruction.
///    Only comparison with zero is supported.
bool AArch64InstrInfo::optimizeCompareInstr(
    MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask,
    int64_t CmpValue, const MachineRegisterInfo *MRI) const {
  assert(CmpInstr.getParent());
  assert(MRI);

  // Replace SUBSWrr with SUBWrr if NZCV is not used.
  int DeadNZCVIdx =
      CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
  if (DeadNZCVIdx != -1) {
    if (CmpInstr.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
        CmpInstr.definesRegister(AArch64::XZR, /*TRI=*/nullptr)) {
      CmpInstr.eraseFromParent();
      return true;
    }
    unsigned Opc = CmpInstr.getOpcode();
    unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
    if (NewOpc == Opc)
      return false;
    const MCInstrDesc &MCID = get(NewOpc);
    CmpInstr.setDesc(MCID);
    CmpInstr.removeOperand(DeadNZCVIdx);
    bool succeeded = UpdateOperandRegClass(CmpInstr);
    (void)succeeded;
    assert(succeeded && "Some operands reg class are incompatible!");
    return true;
  }

  if (CmpInstr.getOpcode() == AArch64::PTEST_PP ||
      CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY)
    return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);

  if (SrcReg2 != 0)
    return false;

  // CmpInstr is a Compare instruction if destination register is not used.
  if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
    return false;

  if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI))
    return true;
  return (CmpValue == 0 || CmpValue == 1) &&
         removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI);
}

/// Get opcode of S version of Instr.
/// If Instr is S version its opcode is returned.
/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have S version
/// or we are not interested in it.
static unsigned sForm(MachineInstr &Instr) {
  switch (Instr.getOpcode()) {
  default:
    return AArch64::INSTRUCTION_LIST_END;

  case AArch64::ADDSWrr:
  case AArch64::ADDSWri:
  case AArch64::ADDSXrr:
  case AArch64::ADDSXri:
  case AArch64::SUBSWrr:
  case AArch64::SUBSWri:
  case AArch64::SUBSXrr:
  case AArch64::SUBSXri:
    return Instr.getOpcode();

  case AArch64::ADDWrr:
    return AArch64::ADDSWrr;
  case AArch64::ADDWri:
    return AArch64::ADDSWri;
  case AArch64::ADDXrr:
    return AArch64::ADDSXrr;
  case AArch64::ADDXri:
    return AArch64::ADDSXri;
  case AArch64::ADCWr:
    return AArch64::ADCSWr;
  case AArch64::ADCXr:
    return AArch64::ADCSXr;
  case AArch64::SUBWrr:
    return AArch64::SUBSWrr;
  case AArch64::SUBWri:
    return AArch64::SUBSWri;
  case AArch64::SUBXrr:
    return AArch64::SUBSXrr;
  case AArch64::SUBXri:
    return AArch64::SUBSXri;
  case AArch64::SBCWr:
    return AArch64::SBCSWr;
  case AArch64::SBCXr:
    return AArch64::SBCSXr;
  case AArch64::ANDWri:
    return AArch64::ANDSWri;
  case AArch64::ANDXri:
    return AArch64::ANDSXri;
  }
}

/// Check if AArch64::NZCV should be alive in successors of MBB.
static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB) {
  for (auto *BB : MBB->successors())
    if (BB->isLiveIn(AArch64::NZCV))
      return true;
  return false;
}

/// \returns The condition code operand index for \p Instr if it is a branch
/// or select and -1 otherwise.
static int
findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr) {
  switch (Instr.getOpcode()) {
  default:
    return -1;

  case AArch64::Bcc: {
    int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
    assert(Idx >= 2);
    return Idx - 2;
  }

  case AArch64::CSINVWr:
  case AArch64::CSINVXr:
  case AArch64::CSINCWr:
  case AArch64::CSINCXr:
  case AArch64::CSELWr:
  case AArch64::CSELXr:
  case AArch64::CSNEGWr:
  case AArch64::CSNEGXr:
  case AArch64::FCSELSrrr:
  case AArch64::FCSELDrrr: {
    int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
    assert(Idx >= 1);
    return Idx - 1;
  }
  }
}

/// Find a condition code used by the instruction.
/// Returns AArch64CC::Invalid if either the instruction does not use condition
/// codes or we don't optimize CmpInstr in the presence of such instructions.
static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
  int CCIdx = findCondCodeUseOperandIdxForBranchOrSelect(Instr);
  return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
                          Instr.getOperand(CCIdx).getImm())
                    : AArch64CC::Invalid;
}

static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
  assert(CC != AArch64CC::Invalid);
  UsedNZCV UsedFlags;
  switch (CC) {
  default:
    break;

  case AArch64CC::EQ: // Z set
  case AArch64CC::NE: // Z clear
    UsedFlags.Z = true;
    break;

  case AArch64CC::HI: // Z clear and C set
  case AArch64CC::LS: // Z set or C clear
    UsedFlags.Z = true;
    [[fallthrough]];
  case AArch64CC::HS: // C set
  case AArch64CC::LO: // C clear
    UsedFlags.C = true;
    break;

  case AArch64CC::MI: // N set
  case AArch64CC::PL: // N clear
    UsedFlags.N = true;
    break;

  case AArch64CC::VS: // V set
  case AArch64CC::VC: // V clear
    UsedFlags.V = true;
    break;

  case AArch64CC::GT: // Z clear, N and V the same
  case AArch64CC::LE: // Z set, N and V differ
    UsedFlags.Z = true;
    [[fallthrough]];
  case AArch64CC::GE: // N and V the same
  case AArch64CC::LT: // N and V differ
    UsedFlags.N = true;
    UsedFlags.V = true;
    break;
  }
  return UsedFlags;
}

/// \returns Condition flags used after \p CmpInstr in its MachineBB if NZCV
/// flags are not alive in successors of the same \p CmpInstr and \p MI parent.
/// \returns std::nullopt otherwise.
///
/// Collect instructions using those flags in \p CCUseInstrs if provided.
std::optional<UsedNZCV>
llvm::examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr,
                       const TargetRegisterInfo &TRI,
                       SmallVectorImpl<MachineInstr *> *CCUseInstrs) {
  MachineBasicBlock *CmpParent = CmpInstr.getParent();
  if (MI.getParent() != CmpParent)
    return std::nullopt;

  if (areCFlagsAliveInSuccessors(CmpParent))
    return std::nullopt;

  UsedNZCV NZCVUsedAfterCmp;
  for (MachineInstr &Instr : instructionsWithoutDebug(
           std::next(CmpInstr.getIterator()), CmpParent->instr_end())) {
    if (Instr.readsRegister(AArch64::NZCV, &TRI)) {
      AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
      if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
        return std::nullopt;
      NZCVUsedAfterCmp |= getUsedNZCV(CC);
      if (CCUseInstrs)
        CCUseInstrs->push_back(&Instr);
    }
    if (Instr.modifiesRegister(AArch64::NZCV, &TRI))
      break;
  }
  return NZCVUsedAfterCmp;
}

static bool isADDSRegImm(unsigned Opcode) {
  return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
}

static bool isSUBSRegImm(unsigned Opcode) {
  return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
}

/// Check if CmpInstr can be substituted by MI.
///
/// CmpInstr can be substituted:
/// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
/// - and, MI and CmpInstr are from the same MachineBB
/// - and, condition flags are not alive in successors of the CmpInstr parent
/// - and, if MI opcode is the S form there must be no defs of flags between
///        MI and CmpInstr
///        or if MI opcode is not the S form there must be neither defs of flags
///        nor uses of flags between MI and CmpInstr.
/// - and, if C/V flags are not used after CmpInstr
///        or if N flag is used but MI produces poison value if signed overflow
///        occurs.
static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr,
                                       const TargetRegisterInfo &TRI) {
  // NOTE this assertion guarantees that MI.getOpcode() is add or subtraction
  // that may or may not set flags.
  assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END);
AArch64::INSTRUCTION_LIST_END);

  const unsigned CmpOpcode = CmpInstr.getOpcode();
  if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
    return false;

  assert((CmpInstr.getOperand(2).isImm() &&
          CmpInstr.getOperand(2).getImm() == 0) &&
         "Caller guarantees that CmpInstr compares with constant 0");

  std::optional<UsedNZCV> NZCVUsed = examineCFlagsUse(MI, CmpInstr, TRI);
  if (!NZCVUsed || NZCVUsed->C)
    return false;

  // CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0', and MI is either
  // '%vreg = add ...' or '%vreg = sub ...'.
  // Condition flag V is used to indicate signed overflow.
  // 1) MI and CmpInstr set N and V to the same value.
  // 2) If MI is add/sub with no-signed-wrap, it produces a poison value when
  //    signed overflow occurs, so CmpInstr could still be simplified away.
  if (NZCVUsed->V && !MI.getFlag(MachineInstr::NoSWrap))
    return false;

  AccessKind AccessToCheck = AK_Write;
  if (sForm(MI) != MI.getOpcode())
    AccessToCheck = AK_All;
  return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck);
}

/// Substitute an instruction comparing to zero with another instruction
/// which produces the needed condition flags.
///
/// Return true on success.
bool AArch64InstrInfo::substituteCmpToZero(
    MachineInstr &CmpInstr, unsigned SrcReg,
    const MachineRegisterInfo &MRI) const {
  // Get the unique definition of SrcReg.
  MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
  if (!MI)
    return false;

  const TargetRegisterInfo &TRI = getRegisterInfo();

  unsigned NewOpc = sForm(*MI);
  if (NewOpc == AArch64::INSTRUCTION_LIST_END)
    return false;

  if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI))
    return false;

  // Update the instruction to set NZCV.
  MI->setDesc(get(NewOpc));
  CmpInstr.eraseFromParent();
  bool succeeded = UpdateOperandRegClass(*MI);
  (void)succeeded;
  assert(succeeded && "Some operands reg class are incompatible!");
  MI->addRegisterDefined(AArch64::NZCV, &TRI);
  return true;
}

/// \returns True if \p CmpInstr can be removed.
///
/// \p IsInvertCC is true if, after removing \p CmpInstr, condition
/// codes used in \p CCUseInstrs must be inverted.
static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr,
                                 int CmpValue, const TargetRegisterInfo &TRI,
                                 SmallVectorImpl<MachineInstr *> &CCUseInstrs,
                                 bool &IsInvertCC) {
  assert((CmpValue == 0 || CmpValue == 1) &&
         "Only comparisons to 0 or 1 considered for removal!");

  // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>'
  unsigned MIOpc = MI.getOpcode();
  if (MIOpc == AArch64::CSINCWr) {
    if (MI.getOperand(1).getReg() != AArch64::WZR ||
        MI.getOperand(2).getReg() != AArch64::WZR)
      return false;
  } else if (MIOpc == AArch64::CSINCXr) {
    if (MI.getOperand(1).getReg() != AArch64::XZR ||
        MI.getOperand(2).getReg() != AArch64::XZR)
      return false;
  } else {
    return false;
  }
  AArch64CC::CondCode MICC = findCondCodeUsedByInstr(MI);
  if (MICC == AArch64CC::Invalid)
    return false;

  // NZCV needs to be defined
  if (MI.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) != -1)
    return false;

  // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1'
  const unsigned CmpOpcode = CmpInstr.getOpcode();
  bool IsSubsRegImm = isSUBSRegImm(CmpOpcode);
  if (CmpValue && !IsSubsRegImm)
    return false;
  if (!CmpValue && !IsSubsRegImm &&
      !isADDSRegImm(CmpOpcode))
    return false;

  // MI conditions allowed: eq, ne, mi, pl
  UsedNZCV MIUsedNZCV = getUsedNZCV(MICC);
  if (MIUsedNZCV.C || MIUsedNZCV.V)
    return false;

  std::optional<UsedNZCV> NZCVUsedAfterCmp =
      examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs);
  // Condition flags must not be used in CmpInstr basic block successors, and
  // only the Z or N flags are allowed to be used after CmpInstr within its
  // basic block.
  if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V)
    return false;
  // The Z or N flag used after CmpInstr must correspond to the flag used in MI.
  if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) ||
      (MIUsedNZCV.N && NZCVUsedAfterCmp->Z))
    return false;
  // If CmpInstr is a comparison to zero, MI conditions are limited to eq, ne.
  if (MIUsedNZCV.N && !CmpValue)
    return false;

  // There must be no defs of flags between MI and CmpInstr.
  if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write))
    return false;

  // The condition code is inverted in the following cases:
  // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
  // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1'
  IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) ||
               (!CmpValue && MICC == AArch64CC::NE);
  return true;
}

/// Remove comparison in csinc-cmp sequence
///
/// Examples:
/// 1. \code
///    csinc w9, wzr, wzr, ne
///    cmp   w9, #0
///    b.eq
///    \endcode
///    to
///    \code
///    csinc w9, wzr, wzr, ne
///    b.ne
///    \endcode
///
/// 2. \code
///    csinc x2, xzr, xzr, mi
///    cmp   x2, #1
///    b.pl
///    \endcode
///    to
///    \code
///    csinc x2, xzr, xzr, mi
///    b.pl
///    \endcode
///
/// \param CmpInstr comparison instruction
/// \return True when comparison removed
bool AArch64InstrInfo::removeCmpToZeroOrOne(
    MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue,
    const MachineRegisterInfo &MRI) const {
  MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
  if (!MI)
    return false;
  const TargetRegisterInfo &TRI = getRegisterInfo();
  SmallVector<MachineInstr *, 4> CCUseInstrs;
  bool IsInvertCC = false;
  if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs,
                            IsInvertCC))
    return false;
  // Make the transformation.
  CmpInstr.eraseFromParent();
  if (IsInvertCC) {
    // Invert condition codes in CmpInstr CC users.
    for (MachineInstr *CCUseInstr : CCUseInstrs) {
      int Idx = findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr);
      assert(Idx >= 0 && "Unexpected instruction using CC.");
      MachineOperand &CCOperand = CCUseInstr->getOperand(Idx);
      AArch64CC::CondCode CCUse = AArch64CC::getInvertedCondCode(
          static_cast<AArch64CC::CondCode>(CCOperand.getImm()));
      CCOperand.setImm(CCUse);
    }
  }
  return true;
}

bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
  if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
      MI.getOpcode() != AArch64::CATCHRET)
    return false;

  MachineBasicBlock &MBB = *MI.getParent();
  auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
  auto TRI = Subtarget.getRegisterInfo();
  DebugLoc DL = MI.getDebugLoc();

  if (MI.getOpcode() == AArch64::CATCHRET) {
    // Skip to the first instruction before the epilog.
    const TargetInstrInfo *TII =
        MBB.getParent()->getSubtarget().getInstrInfo();
    MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
    auto MBBI =
MachineBasicBlock::iterator(MI);
    MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
    while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
           FirstEpilogSEH != MBB.begin())
      FirstEpilogSEH = std::prev(FirstEpilogSEH);
    if (FirstEpilogSEH != MBB.begin())
      FirstEpilogSEH = std::next(FirstEpilogSEH);
    BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
        .addReg(AArch64::X0, RegState::Define)
        .addMBB(TargetMBB);
    BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
        .addReg(AArch64::X0, RegState::Define)
        .addReg(AArch64::X0)
        .addMBB(TargetMBB)
        .addImm(0);
    return true;
  }

  Register Reg = MI.getOperand(0).getReg();
  Module &M = *MBB.getParent()->getFunction().getParent();
  if (M.getStackProtectorGuard() == "sysreg") {
    const AArch64SysReg::SysReg *SrcReg =
        AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
    if (!SrcReg)
      report_fatal_error("Unknown SysReg for Stack Protector Guard Register");

    // mrs xN, sysreg
    BuildMI(MBB, MI, DL, get(AArch64::MRS))
        .addDef(Reg, RegState::Renamable)
        .addImm(SrcReg->Encoding);
    int Offset = M.getStackProtectorGuardOffset();
    if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
      // ldr xN, [xN, #offset]
      BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
          .addDef(Reg)
          .addUse(Reg, RegState::Kill)
          .addImm(Offset / 8);
    } else if (Offset >= -256 && Offset <= 255) {
      // ldur xN, [xN, #offset]
      BuildMI(MBB, MI, DL, get(AArch64::LDURXi))
          .addDef(Reg)
          .addUse(Reg, RegState::Kill)
          .addImm(Offset);
    } else if (Offset >= -4095 && Offset <= 4095) {
      if (Offset > 0) {
        // add xN, xN, #offset
        BuildMI(MBB, MI, DL, get(AArch64::ADDXri))
            .addDef(Reg)
            .addUse(Reg, RegState::Kill)
            .addImm(Offset)
            .addImm(0);
      } else {
        // sub xN, xN, #offset
        BuildMI(MBB, MI, DL, get(AArch64::SUBXri))
            .addDef(Reg)
            .addUse(Reg, RegState::Kill)
            .addImm(-Offset)
            .addImm(0);
      }
      // ldr xN, [xN]
      BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
          .addDef(Reg)
          .addUse(Reg, RegState::Kill)
          .addImm(0);
    } else {
      // Cases that are larger than +/- 4095 and not a multiple of 8, or larger
      // than 32760.
      // It might be nice to use AArch64::MOVi32imm here, which would get
      // expanded in PreSched2 after PostRA, but our lone scratch Reg already
      // contains the MRS result. findScratchNonCalleeSaveRegister() in
      // AArch64FrameLowering might help us find such a scratch register
      // though. If we failed to find a scratch register, we could emit a
      // stream of add instructions to build up the immediate.
      // Or, we could try to insert an AArch64::MOVi32imm before register
      // allocation so that we didn't need to scavenge for a scratch register.
      report_fatal_error("Unable to encode Stack Protector Guard Offset");
    }
    MBB.erase(MI);
    return true;
  }

  const GlobalValue *GV =
      cast<GlobalValue>((*MI.memoperands_begin())->getValue());
  const TargetMachine &TM = MBB.getParent()->getTarget();
  unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
  const unsigned char MO_NC = AArch64II::MO_NC;

  if ((OpFlags & AArch64II::MO_GOT) != 0) {
    BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
        .addGlobalAddress(GV, 0, OpFlags);
    if (Subtarget.isTargetILP32()) {
      unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
      BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
          .addDef(Reg32, RegState::Dead)
          .addUse(Reg, RegState::Kill)
          .addImm(0)
          .addMemOperand(*MI.memoperands_begin())
          .addDef(Reg, RegState::Implicit);
    } else {
      BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
          .addReg(Reg, RegState::Kill)
          .addImm(0)
          .addMemOperand(*MI.memoperands_begin());
    }
  } else if (TM.getCodeModel() == CodeModel::Large) {
    assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
    BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
        .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
        .addImm(0);
    BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
        .addReg(Reg, RegState::Kill)
        .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
        .addImm(16);
    BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
        .addReg(Reg, RegState::Kill)
        .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
        .addImm(32);
    BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
        .addReg(Reg, RegState::Kill)
        .addGlobalAddress(GV, 0, AArch64II::MO_G3)
        .addImm(48);
    BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
        .addReg(Reg, RegState::Kill)
        .addImm(0)
        .addMemOperand(*MI.memoperands_begin());
  } else if (TM.getCodeModel() == CodeModel::Tiny) {
    BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
        .addGlobalAddress(GV, 0, OpFlags);
  } else {
    BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
        .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
    unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
    if (Subtarget.isTargetILP32()) {
      unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
      BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
          .addDef(Reg32, RegState::Dead)
          .addUse(Reg, RegState::Kill)
          .addGlobalAddress(GV, 0, LoFlags)
          .addMemOperand(*MI.memoperands_begin())
          .addDef(Reg, RegState::Implicit);
    } else {
      BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
          .addReg(Reg, RegState::Kill)
          .addGlobalAddress(GV, 0, LoFlags)
          .addMemOperand(*MI.memoperands_begin());
    }
  }

  MBB.erase(MI);

  return true;
}

// Return true if this instruction simply sets its single destination register
// to zero.
This is equivalent to a register rename of the zero-register.2136bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {2137switch (MI.getOpcode()) {2138default:2139break;2140case AArch64::MOVZWi:2141case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)2142if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {2143assert(MI.getDesc().getNumOperands() == 3 &&2144MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");2145return true;2146}2147break;2148case AArch64::ANDWri: // and Rd, Rzr, #imm2149return MI.getOperand(1).getReg() == AArch64::WZR;2150case AArch64::ANDXri:2151return MI.getOperand(1).getReg() == AArch64::XZR;2152case TargetOpcode::COPY:2153return MI.getOperand(1).getReg() == AArch64::WZR;2154}2155return false;2156}21572158// Return true if this instruction simply renames a general register without2159// modifying bits.2160bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {2161switch (MI.getOpcode()) {2162default:2163break;2164case TargetOpcode::COPY: {2165// GPR32 copies will by lowered to ORRXrs2166Register DstReg = MI.getOperand(0).getReg();2167return (AArch64::GPR32RegClass.contains(DstReg) ||2168AArch64::GPR64RegClass.contains(DstReg));2169}2170case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)2171if (MI.getOperand(1).getReg() == AArch64::XZR) {2172assert(MI.getDesc().getNumOperands() == 4 &&2173MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");2174return true;2175}2176break;2177case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)2178if (MI.getOperand(2).getImm() == 0) {2179assert(MI.getDesc().getNumOperands() == 4 &&2180MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");2181return true;2182}2183break;2184}2185return false;2186}21872188// Return true if this instruction simply renames a general register without2189// modifying bits.2190bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {2191switch (MI.getOpcode()) {2192default:2193break;2194case TargetOpcode::COPY: {2195Register DstReg = MI.getOperand(0).getReg();2196return AArch64::FPR128RegClass.contains(DstReg);2197}2198case AArch64::ORRv16i8:2199if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {2200assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&2201"invalid ORRv16i8 operands");2202return true;2203}2204break;2205}2206return false;2207}22082209Register AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,2210int &FrameIndex) const {2211switch (MI.getOpcode()) {2212default:2213break;2214case AArch64::LDRWui:2215case AArch64::LDRXui:2216case AArch64::LDRBui:2217case AArch64::LDRHui:2218case AArch64::LDRSui:2219case AArch64::LDRDui:2220case AArch64::LDRQui:2221case AArch64::LDR_PXI:2222if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&2223MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {2224FrameIndex = MI.getOperand(1).getIndex();2225return MI.getOperand(0).getReg();2226}2227break;2228}22292230return 0;2231}22322233Register AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,2234int &FrameIndex) const {2235switch (MI.getOpcode()) {2236default:2237break;2238case AArch64::STRWui:2239case AArch64::STRXui:2240case AArch64::STRBui:2241case AArch64::STRHui:2242case AArch64::STRSui:2243case AArch64::STRDui:2244case AArch64::STRQui:2245case AArch64::STR_PXI:2246if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&2247MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {2248FrameIndex = MI.getOperand(1).getIndex();2249return MI.getOperand(0).getReg();2250}2251break;2252}2253return 
0;2254}22552256/// Check all MachineMemOperands for a hint to suppress pairing.2257bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {2258return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {2259return MMO->getFlags() & MOSuppressPair;2260});2261}22622263/// Set a flag on the first MachineMemOperand to suppress pairing.2264void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {2265if (MI.memoperands_empty())2266return;2267(*MI.memoperands_begin())->setFlags(MOSuppressPair);2268}22692270/// Check all MachineMemOperands for a hint that the load/store is strided.2271bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {2272return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {2273return MMO->getFlags() & MOStridedAccess;2274});2275}22762277bool AArch64InstrInfo::hasUnscaledLdStOffset(unsigned Opc) {2278switch (Opc) {2279default:2280return false;2281case AArch64::STURSi:2282case AArch64::STRSpre:2283case AArch64::STURDi:2284case AArch64::STRDpre:2285case AArch64::STURQi:2286case AArch64::STRQpre:2287case AArch64::STURBBi:2288case AArch64::STURHHi:2289case AArch64::STURWi:2290case AArch64::STRWpre:2291case AArch64::STURXi:2292case AArch64::STRXpre:2293case AArch64::LDURSi:2294case AArch64::LDRSpre:2295case AArch64::LDURDi:2296case AArch64::LDRDpre:2297case AArch64::LDURQi:2298case AArch64::LDRQpre:2299case AArch64::LDURWi:2300case AArch64::LDRWpre:2301case AArch64::LDURXi:2302case AArch64::LDRXpre:2303case AArch64::LDRSWpre:2304case AArch64::LDURSWi:2305case AArch64::LDURHHi:2306case AArch64::LDURBBi:2307case AArch64::LDURSBWi:2308case AArch64::LDURSHWi:2309return true;2310}2311}23122313std::optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {2314switch (Opc) {2315default: return {};2316case AArch64::PRFMui: return AArch64::PRFUMi;2317case AArch64::LDRXui: return AArch64::LDURXi;2318case AArch64::LDRWui: return AArch64::LDURWi;2319case AArch64::LDRBui: return AArch64::LDURBi;2320case AArch64::LDRHui: return AArch64::LDURHi;2321case AArch64::LDRSui: return AArch64::LDURSi;2322case AArch64::LDRDui: return AArch64::LDURDi;2323case AArch64::LDRQui: return AArch64::LDURQi;2324case AArch64::LDRBBui: return AArch64::LDURBBi;2325case AArch64::LDRHHui: return AArch64::LDURHHi;2326case AArch64::LDRSBXui: return AArch64::LDURSBXi;2327case AArch64::LDRSBWui: return AArch64::LDURSBWi;2328case AArch64::LDRSHXui: return AArch64::LDURSHXi;2329case AArch64::LDRSHWui: return AArch64::LDURSHWi;2330case AArch64::LDRSWui: return AArch64::LDURSWi;2331case AArch64::STRXui: return AArch64::STURXi;2332case AArch64::STRWui: return AArch64::STURWi;2333case AArch64::STRBui: return AArch64::STURBi;2334case AArch64::STRHui: return AArch64::STURHi;2335case AArch64::STRSui: return AArch64::STURSi;2336case AArch64::STRDui: return AArch64::STURDi;2337case AArch64::STRQui: return AArch64::STURQi;2338case AArch64::STRBBui: return AArch64::STURBBi;2339case AArch64::STRHHui: return AArch64::STURHHi;2340}2341}23422343unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {2344switch (Opc) {2345default:2346return 2;2347case AArch64::LDPXi:2348case AArch64::LDPDi:2349case AArch64::STPXi:2350case AArch64::STPDi:2351case AArch64::LDNPXi:2352case AArch64::LDNPDi:2353case AArch64::STNPXi:2354case AArch64::STNPDi:2355case AArch64::LDPQi:2356case AArch64::STPQi:2357case AArch64::LDNPQi:2358case AArch64::STNPQi:2359case AArch64::LDPWi:2360case AArch64::LDPSi:2361case AArch64::STPWi:2362case AArch64::STPSi:2363case AArch64::LDNPWi:2364case AArch64::LDNPSi:2365case 
AArch64::STNPWi:2366case AArch64::STNPSi:2367case AArch64::LDG:2368case AArch64::STGPi:23692370case AArch64::LD1B_IMM:2371case AArch64::LD1B_H_IMM:2372case AArch64::LD1B_S_IMM:2373case AArch64::LD1B_D_IMM:2374case AArch64::LD1SB_H_IMM:2375case AArch64::LD1SB_S_IMM:2376case AArch64::LD1SB_D_IMM:2377case AArch64::LD1H_IMM:2378case AArch64::LD1H_S_IMM:2379case AArch64::LD1H_D_IMM:2380case AArch64::LD1SH_S_IMM:2381case AArch64::LD1SH_D_IMM:2382case AArch64::LD1W_IMM:2383case AArch64::LD1W_D_IMM:2384case AArch64::LD1SW_D_IMM:2385case AArch64::LD1D_IMM:23862387case AArch64::LD2B_IMM:2388case AArch64::LD2H_IMM:2389case AArch64::LD2W_IMM:2390case AArch64::LD2D_IMM:2391case AArch64::LD3B_IMM:2392case AArch64::LD3H_IMM:2393case AArch64::LD3W_IMM:2394case AArch64::LD3D_IMM:2395case AArch64::LD4B_IMM:2396case AArch64::LD4H_IMM:2397case AArch64::LD4W_IMM:2398case AArch64::LD4D_IMM:23992400case AArch64::ST1B_IMM:2401case AArch64::ST1B_H_IMM:2402case AArch64::ST1B_S_IMM:2403case AArch64::ST1B_D_IMM:2404case AArch64::ST1H_IMM:2405case AArch64::ST1H_S_IMM:2406case AArch64::ST1H_D_IMM:2407case AArch64::ST1W_IMM:2408case AArch64::ST1W_D_IMM:2409case AArch64::ST1D_IMM:24102411case AArch64::ST2B_IMM:2412case AArch64::ST2H_IMM:2413case AArch64::ST2W_IMM:2414case AArch64::ST2D_IMM:2415case AArch64::ST3B_IMM:2416case AArch64::ST3H_IMM:2417case AArch64::ST3W_IMM:2418case AArch64::ST3D_IMM:2419case AArch64::ST4B_IMM:2420case AArch64::ST4H_IMM:2421case AArch64::ST4W_IMM:2422case AArch64::ST4D_IMM:24232424case AArch64::LD1RB_IMM:2425case AArch64::LD1RB_H_IMM:2426case AArch64::LD1RB_S_IMM:2427case AArch64::LD1RB_D_IMM:2428case AArch64::LD1RSB_H_IMM:2429case AArch64::LD1RSB_S_IMM:2430case AArch64::LD1RSB_D_IMM:2431case AArch64::LD1RH_IMM:2432case AArch64::LD1RH_S_IMM:2433case AArch64::LD1RH_D_IMM:2434case AArch64::LD1RSH_S_IMM:2435case AArch64::LD1RSH_D_IMM:2436case AArch64::LD1RW_IMM:2437case AArch64::LD1RW_D_IMM:2438case AArch64::LD1RSW_IMM:2439case AArch64::LD1RD_IMM:24402441case AArch64::LDNT1B_ZRI:2442case AArch64::LDNT1H_ZRI:2443case AArch64::LDNT1W_ZRI:2444case AArch64::LDNT1D_ZRI:2445case AArch64::STNT1B_ZRI:2446case AArch64::STNT1H_ZRI:2447case AArch64::STNT1W_ZRI:2448case AArch64::STNT1D_ZRI:24492450case AArch64::LDNF1B_IMM:2451case AArch64::LDNF1B_H_IMM:2452case AArch64::LDNF1B_S_IMM:2453case AArch64::LDNF1B_D_IMM:2454case AArch64::LDNF1SB_H_IMM:2455case AArch64::LDNF1SB_S_IMM:2456case AArch64::LDNF1SB_D_IMM:2457case AArch64::LDNF1H_IMM:2458case AArch64::LDNF1H_S_IMM:2459case AArch64::LDNF1H_D_IMM:2460case AArch64::LDNF1SH_S_IMM:2461case AArch64::LDNF1SH_D_IMM:2462case AArch64::LDNF1W_IMM:2463case AArch64::LDNF1W_D_IMM:2464case AArch64::LDNF1SW_D_IMM:2465case AArch64::LDNF1D_IMM:2466return 3;2467case AArch64::ADDG:2468case AArch64::STGi:2469case AArch64::LDR_PXI:2470case AArch64::STR_PXI:2471return 2;2472}2473}24742475bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {2476switch (MI.getOpcode()) {2477default:2478return false;2479// Scaled instructions.2480case AArch64::STRSui:2481case AArch64::STRDui:2482case AArch64::STRQui:2483case AArch64::STRXui:2484case AArch64::STRWui:2485case AArch64::LDRSui:2486case AArch64::LDRDui:2487case AArch64::LDRQui:2488case AArch64::LDRXui:2489case AArch64::LDRWui:2490case AArch64::LDRSWui:2491// Unscaled instructions.2492case AArch64::STURSi:2493case AArch64::STRSpre:2494case AArch64::STURDi:2495case AArch64::STRDpre:2496case AArch64::STURQi:2497case AArch64::STRQpre:2498case AArch64::STURWi:2499case AArch64::STRWpre:2500case AArch64::STURXi:2501case 
AArch64::STRXpre:2502case AArch64::LDURSi:2503case AArch64::LDRSpre:2504case AArch64::LDURDi:2505case AArch64::LDRDpre:2506case AArch64::LDURQi:2507case AArch64::LDRQpre:2508case AArch64::LDURWi:2509case AArch64::LDRWpre:2510case AArch64::LDURXi:2511case AArch64::LDRXpre:2512case AArch64::LDURSWi:2513case AArch64::LDRSWpre:2514return true;2515}2516}25172518bool AArch64InstrInfo::isTailCallReturnInst(const MachineInstr &MI) {2519switch (MI.getOpcode()) {2520default:2521assert((!MI.isCall() || !MI.isReturn()) &&2522"Unexpected instruction - was a new tail call opcode introduced?");2523return false;2524case AArch64::TCRETURNdi:2525case AArch64::TCRETURNri:2526case AArch64::TCRETURNrix16x17:2527case AArch64::TCRETURNrix17:2528case AArch64::TCRETURNrinotx16:2529case AArch64::TCRETURNriALL:2530case AArch64::AUTH_TCRETURN:2531case AArch64::AUTH_TCRETURN_BTI:2532return true;2533}2534}25352536unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc) {2537switch (Opc) {2538default:2539llvm_unreachable("Opcode has no flag setting equivalent!");2540// 32-bit cases:2541case AArch64::ADDWri:2542return AArch64::ADDSWri;2543case AArch64::ADDWrr:2544return AArch64::ADDSWrr;2545case AArch64::ADDWrs:2546return AArch64::ADDSWrs;2547case AArch64::ADDWrx:2548return AArch64::ADDSWrx;2549case AArch64::ANDWri:2550return AArch64::ANDSWri;2551case AArch64::ANDWrr:2552return AArch64::ANDSWrr;2553case AArch64::ANDWrs:2554return AArch64::ANDSWrs;2555case AArch64::BICWrr:2556return AArch64::BICSWrr;2557case AArch64::BICWrs:2558return AArch64::BICSWrs;2559case AArch64::SUBWri:2560return AArch64::SUBSWri;2561case AArch64::SUBWrr:2562return AArch64::SUBSWrr;2563case AArch64::SUBWrs:2564return AArch64::SUBSWrs;2565case AArch64::SUBWrx:2566return AArch64::SUBSWrx;2567// 64-bit cases:2568case AArch64::ADDXri:2569return AArch64::ADDSXri;2570case AArch64::ADDXrr:2571return AArch64::ADDSXrr;2572case AArch64::ADDXrs:2573return AArch64::ADDSXrs;2574case AArch64::ADDXrx:2575return AArch64::ADDSXrx;2576case AArch64::ANDXri:2577return AArch64::ANDSXri;2578case AArch64::ANDXrr:2579return AArch64::ANDSXrr;2580case AArch64::ANDXrs:2581return AArch64::ANDSXrs;2582case AArch64::BICXrr:2583return AArch64::BICSXrr;2584case AArch64::BICXrs:2585return AArch64::BICSXrs;2586case AArch64::SUBXri:2587return AArch64::SUBSXri;2588case AArch64::SUBXrr:2589return AArch64::SUBSXrr;2590case AArch64::SUBXrs:2591return AArch64::SUBSXrs;2592case AArch64::SUBXrx:2593return AArch64::SUBSXrx;2594// SVE instructions:2595case AArch64::AND_PPzPP:2596return AArch64::ANDS_PPzPP;2597case AArch64::BIC_PPzPP:2598return AArch64::BICS_PPzPP;2599case AArch64::EOR_PPzPP:2600return AArch64::EORS_PPzPP;2601case AArch64::NAND_PPzPP:2602return AArch64::NANDS_PPzPP;2603case AArch64::NOR_PPzPP:2604return AArch64::NORS_PPzPP;2605case AArch64::ORN_PPzPP:2606return AArch64::ORNS_PPzPP;2607case AArch64::ORR_PPzPP:2608return AArch64::ORRS_PPzPP;2609case AArch64::BRKA_PPzP:2610return AArch64::BRKAS_PPzP;2611case AArch64::BRKPA_PPzPP:2612return AArch64::BRKPAS_PPzPP;2613case AArch64::BRKB_PPzP:2614return AArch64::BRKBS_PPzP;2615case AArch64::BRKPB_PPzPP:2616return AArch64::BRKPBS_PPzPP;2617case AArch64::BRKN_PPzP:2618return AArch64::BRKNS_PPzP;2619case AArch64::RDFFR_PPz:2620return AArch64::RDFFRS_PPz;2621case AArch64::PTRUE_B:2622return AArch64::PTRUES_B;2623}2624}26252626// Is this a candidate for ld/st merging or pairing? 
For example, we don't2627// touch volatiles or load/stores that have a hint to avoid pair formation.2628bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {26292630bool IsPreLdSt = isPreLdSt(MI);26312632// If this is a volatile load/store, don't mess with it.2633if (MI.hasOrderedMemoryRef())2634return false;26352636// Make sure this is a reg/fi+imm (as opposed to an address reloc).2637// For Pre-inc LD/ST, the operand is shifted by one.2638assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() ||2639MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) &&2640"Expected a reg or frame index operand.");26412642// For Pre-indexed addressing quadword instructions, the third operand is the2643// immediate value.2644bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm();26452646if (!MI.getOperand(2).isImm() && !IsImmPreLdSt)2647return false;26482649// Can't merge/pair if the instruction modifies the base register.2650// e.g., ldr x0, [x0]2651// This case will never occur with an FI base.2652// However, if the instruction is an LDR<S,D,Q,W,X,SW>pre or2653// STR<S,D,Q,W,X>pre, it can be merged.2654// For example:2655// ldr q0, [x11, #32]!2656// ldr q1, [x11, #16]2657// to2658// ldp q0, q1, [x11, #32]!2659if (MI.getOperand(1).isReg() && !IsPreLdSt) {2660Register BaseReg = MI.getOperand(1).getReg();2661const TargetRegisterInfo *TRI = &getRegisterInfo();2662if (MI.modifiesRegister(BaseReg, TRI))2663return false;2664}26652666// Check if this load/store has a hint to avoid pair formation.2667// MachineMemOperands hints are set by the AArch64StorePairSuppress pass.2668if (isLdStPairSuppressed(MI))2669return false;26702671// Do not pair any callee-save store/reload instructions in the2672// prologue/epilogue if the CFI information encoded the operations as separate2673// instructions, as that will cause the size of the actual prologue to mismatch2674// with the prologue size recorded in the Windows CFI.2675const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();2676bool NeedsWinCFI = MAI->usesWindowsCFI() &&2677MI.getMF()->getFunction().needsUnwindTableEntry();2678if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||2679MI.getFlag(MachineInstr::FrameDestroy)))2680return false;26812682// On some CPUs quad load/store pairs are slower than two single load/stores.2683if (Subtarget.isPaired128Slow()) {2684switch (MI.getOpcode()) {2685default:2686break;2687case AArch64::LDURQi:2688case AArch64::STURQi:2689case AArch64::LDRQui:2690case AArch64::STRQui:2691return false;2692}2693}26942695return true;2696}26972698bool AArch64InstrInfo::getMemOperandsWithOffsetWidth(2699const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,2700int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,2701const TargetRegisterInfo *TRI) const {2702if (!LdSt.mayLoadOrStore())2703return false;27042705const MachineOperand *BaseOp;2706TypeSize WidthN(0, false);2707if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,2708WidthN, TRI))2709return false;2710// The maximum vscale is 16 under AArch64, return the maximal extent for the2711// vector.2712Width = LocationSize::precise(WidthN);2713BaseOps.push_back(BaseOp);2714return true;2715}27162717std::optional<ExtAddrMode>2718AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,2719const TargetRegisterInfo *TRI) const {2720const MachineOperand *Base; // Filled with the base operand of MI.2721int64_t Offset; // Filled with the offset of MI.2722bool OffsetIsScalable;2723if (!getMemOperandWithOffset(MemI, Base, 
Offset, OffsetIsScalable, TRI))2724return std::nullopt;27252726if (!Base->isReg())2727return std::nullopt;2728ExtAddrMode AM;2729AM.BaseReg = Base->getReg();2730AM.Displacement = Offset;2731AM.ScaledReg = 0;2732AM.Scale = 0;2733return AM;2734}27352736bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,2737Register Reg,2738const MachineInstr &AddrI,2739ExtAddrMode &AM) const {2740// Filter out instructions into which we cannot fold.2741unsigned NumBytes;2742int64_t OffsetScale = 1;2743switch (MemI.getOpcode()) {2744default:2745return false;27462747case AArch64::LDURQi:2748case AArch64::STURQi:2749NumBytes = 16;2750break;27512752case AArch64::LDURDi:2753case AArch64::STURDi:2754case AArch64::LDURXi:2755case AArch64::STURXi:2756NumBytes = 8;2757break;27582759case AArch64::LDURWi:2760case AArch64::LDURSWi:2761case AArch64::STURWi:2762NumBytes = 4;2763break;27642765case AArch64::LDURHi:2766case AArch64::STURHi:2767case AArch64::LDURHHi:2768case AArch64::STURHHi:2769case AArch64::LDURSHXi:2770case AArch64::LDURSHWi:2771NumBytes = 2;2772break;27732774case AArch64::LDRBroX:2775case AArch64::LDRBBroX:2776case AArch64::LDRSBXroX:2777case AArch64::LDRSBWroX:2778case AArch64::STRBroX:2779case AArch64::STRBBroX:2780case AArch64::LDURBi:2781case AArch64::LDURBBi:2782case AArch64::LDURSBXi:2783case AArch64::LDURSBWi:2784case AArch64::STURBi:2785case AArch64::STURBBi:2786case AArch64::LDRBui:2787case AArch64::LDRBBui:2788case AArch64::LDRSBXui:2789case AArch64::LDRSBWui:2790case AArch64::STRBui:2791case AArch64::STRBBui:2792NumBytes = 1;2793break;27942795case AArch64::LDRQroX:2796case AArch64::STRQroX:2797case AArch64::LDRQui:2798case AArch64::STRQui:2799NumBytes = 16;2800OffsetScale = 16;2801break;28022803case AArch64::LDRDroX:2804case AArch64::STRDroX:2805case AArch64::LDRXroX:2806case AArch64::STRXroX:2807case AArch64::LDRDui:2808case AArch64::STRDui:2809case AArch64::LDRXui:2810case AArch64::STRXui:2811NumBytes = 8;2812OffsetScale = 8;2813break;28142815case AArch64::LDRWroX:2816case AArch64::LDRSWroX:2817case AArch64::STRWroX:2818case AArch64::LDRWui:2819case AArch64::LDRSWui:2820case AArch64::STRWui:2821NumBytes = 4;2822OffsetScale = 4;2823break;28242825case AArch64::LDRHroX:2826case AArch64::STRHroX:2827case AArch64::LDRHHroX:2828case AArch64::STRHHroX:2829case AArch64::LDRSHXroX:2830case AArch64::LDRSHWroX:2831case AArch64::LDRHui:2832case AArch64::STRHui:2833case AArch64::LDRHHui:2834case AArch64::STRHHui:2835case AArch64::LDRSHXui:2836case AArch64::LDRSHWui:2837NumBytes = 2;2838OffsetScale = 2;2839break;2840}28412842// Check the fold operand is not the loaded/stored value.2843const MachineOperand &BaseRegOp = MemI.getOperand(0);2844if (BaseRegOp.isReg() && BaseRegOp.getReg() == Reg)2845return false;28462847// Handle memory instructions with a [Reg, Reg] addressing mode.2848if (MemI.getOperand(2).isReg()) {2849// Bail if the addressing mode already includes extension of the offset2850// register.2851if (MemI.getOperand(3).getImm())2852return false;28532854// Check if we actually have a scaled offset.2855if (MemI.getOperand(4).getImm() == 0)2856OffsetScale = 1;28572858// If the address instructions is folded into the base register, then the2859// addressing mode must not have a scale. 
Then we can swap the base and the2860// scaled registers.2861if (MemI.getOperand(1).getReg() == Reg && OffsetScale != 1)2862return false;28632864switch (AddrI.getOpcode()) {2865default:2866return false;28672868case AArch64::SBFMXri:2869// sxtw Xa, Wm2870// ldr Xd, [Xn, Xa, lsl #N]2871// ->2872// ldr Xd, [Xn, Wm, sxtw #N]2873if (AddrI.getOperand(2).getImm() != 0 ||2874AddrI.getOperand(3).getImm() != 31)2875return false;28762877AM.BaseReg = MemI.getOperand(1).getReg();2878if (AM.BaseReg == Reg)2879AM.BaseReg = MemI.getOperand(2).getReg();2880AM.ScaledReg = AddrI.getOperand(1).getReg();2881AM.Scale = OffsetScale;2882AM.Displacement = 0;2883AM.Form = ExtAddrMode::Formula::SExtScaledReg;2884return true;28852886case TargetOpcode::SUBREG_TO_REG: {2887// mov Wa, Wm2888// ldr Xd, [Xn, Xa, lsl #N]2889// ->2890// ldr Xd, [Xn, Wm, uxtw #N]28912892// Zero-extension looks like an ORRWrs followed by a SUBREG_TO_REG.2893if (AddrI.getOperand(1).getImm() != 0 ||2894AddrI.getOperand(3).getImm() != AArch64::sub_32)2895return false;28962897const MachineRegisterInfo &MRI = AddrI.getMF()->getRegInfo();2898Register OffsetReg = AddrI.getOperand(2).getReg();2899if (!OffsetReg.isVirtual() || !MRI.hasOneNonDBGUse(OffsetReg))2900return false;29012902const MachineInstr &DefMI = *MRI.getVRegDef(OffsetReg);2903if (DefMI.getOpcode() != AArch64::ORRWrs ||2904DefMI.getOperand(1).getReg() != AArch64::WZR ||2905DefMI.getOperand(3).getImm() != 0)2906return false;29072908AM.BaseReg = MemI.getOperand(1).getReg();2909if (AM.BaseReg == Reg)2910AM.BaseReg = MemI.getOperand(2).getReg();2911AM.ScaledReg = DefMI.getOperand(2).getReg();2912AM.Scale = OffsetScale;2913AM.Displacement = 0;2914AM.Form = ExtAddrMode::Formula::ZExtScaledReg;2915return true;2916}2917}2918}29192920// Handle memory instructions with a [Reg, #Imm] addressing mode.29212922// Check we are not breaking a potential conversion to an LDP.2923auto validateOffsetForLDP = [](unsigned NumBytes, int64_t OldOffset,2924int64_t NewOffset) -> bool {2925int64_t MinOffset, MaxOffset;2926switch (NumBytes) {2927default:2928return true;2929case 4:2930MinOffset = -256;2931MaxOffset = 252;2932break;2933case 8:2934MinOffset = -512;2935MaxOffset = 504;2936break;2937case 16:2938MinOffset = -1024;2939MaxOffset = 1008;2940break;2941}2942return OldOffset < MinOffset || OldOffset > MaxOffset ||2943(NewOffset >= MinOffset && NewOffset <= MaxOffset);2944};2945auto canFoldAddSubImmIntoAddrMode = [&](int64_t Disp) -> bool {2946int64_t OldOffset = MemI.getOperand(2).getImm() * OffsetScale;2947int64_t NewOffset = OldOffset + Disp;2948if (!isLegalAddressingMode(NumBytes, NewOffset, /* Scale */ 0))2949return false;2950// If the old offset would fit into an LDP, but the new offset wouldn't,2951// bail out.2952if (!validateOffsetForLDP(NumBytes, OldOffset, NewOffset))2953return false;2954AM.BaseReg = AddrI.getOperand(1).getReg();2955AM.ScaledReg = 0;2956AM.Scale = 0;2957AM.Displacement = NewOffset;2958AM.Form = ExtAddrMode::Formula::Basic;2959return true;2960};29612962auto canFoldAddRegIntoAddrMode =2963[&](int64_t Scale,2964ExtAddrMode::Formula Form = ExtAddrMode::Formula::Basic) -> bool {2965if (MemI.getOperand(2).getImm() != 0)2966return false;2967if (!isLegalAddressingMode(NumBytes, /* Offset */ 0, Scale))2968return false;2969AM.BaseReg = AddrI.getOperand(1).getReg();2970AM.ScaledReg = AddrI.getOperand(2).getReg();2971AM.Scale = Scale;2972AM.Displacement = 0;2973AM.Form = Form;2974return true;2975};29762977auto avoidSlowSTRQ = [&](const MachineInstr &MemI) {2978unsigned Opcode = 
MemI.getOpcode();2979return (Opcode == AArch64::STURQi || Opcode == AArch64::STRQui) &&2980Subtarget.isSTRQroSlow();2981};29822983int64_t Disp = 0;2984const bool OptSize = MemI.getMF()->getFunction().hasOptSize();2985switch (AddrI.getOpcode()) {2986default:2987return false;29882989case AArch64::ADDXri:2990// add Xa, Xn, #N2991// ldr Xd, [Xa, #M]2992// ->2993// ldr Xd, [Xn, #N'+M]2994Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();2995return canFoldAddSubImmIntoAddrMode(Disp);29962997case AArch64::SUBXri:2998// sub Xa, Xn, #N2999// ldr Xd, [Xa, #M]3000// ->3001// ldr Xd, [Xn, #N'+M]3002Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();3003return canFoldAddSubImmIntoAddrMode(-Disp);30043005case AArch64::ADDXrs: {3006// add Xa, Xn, Xm, lsl #N3007// ldr Xd, [Xa]3008// ->3009// ldr Xd, [Xn, Xm, lsl #N]30103011// Don't fold the add if the result would be slower, unless optimising for3012// size.3013unsigned Shift = static_cast<unsigned>(AddrI.getOperand(3).getImm());3014if (AArch64_AM::getShiftType(Shift) != AArch64_AM::ShiftExtendType::LSL)3015return false;3016Shift = AArch64_AM::getShiftValue(Shift);3017if (!OptSize) {3018if (Shift != 2 && Shift != 3 && Subtarget.hasAddrLSLSlow14())3019return false;3020if (avoidSlowSTRQ(MemI))3021return false;3022}3023return canFoldAddRegIntoAddrMode(1ULL << Shift);3024}30253026case AArch64::ADDXrr:3027// add Xa, Xn, Xm3028// ldr Xd, [Xa]3029// ->3030// ldr Xd, [Xn, Xm, lsl #0]30313032// Don't fold the add if the result would be slower, unless optimising for3033// size.3034if (!OptSize && avoidSlowSTRQ(MemI))3035return false;3036return canFoldAddRegIntoAddrMode(1);30373038case AArch64::ADDXrx:3039// add Xa, Xn, Wm, {s,u}xtw #N3040// ldr Xd, [Xa]3041// ->3042// ldr Xd, [Xn, Wm, {s,u}xtw #N]30433044// Don't fold the add if the result would be slower, unless optimising for3045// size.3046if (!OptSize && avoidSlowSTRQ(MemI))3047return false;30483049// Can fold only sign-/zero-extend of a word.3050unsigned Imm = static_cast<unsigned>(AddrI.getOperand(3).getImm());3051AArch64_AM::ShiftExtendType Extend = AArch64_AM::getArithExtendType(Imm);3052if (Extend != AArch64_AM::UXTW && Extend != AArch64_AM::SXTW)3053return false;30543055return canFoldAddRegIntoAddrMode(30561ULL << AArch64_AM::getArithShiftValue(Imm),3057(Extend == AArch64_AM::SXTW) ? 
ExtAddrMode::Formula::SExtScaledReg3058: ExtAddrMode::Formula::ZExtScaledReg);3059}3060}30613062// Given an opcode for an instruction with a [Reg, #Imm] addressing mode,3063// return the opcode of an instruction performing the same operation, but using3064// the [Reg, Reg] addressing mode.3065static unsigned regOffsetOpcode(unsigned Opcode) {3066switch (Opcode) {3067default:3068llvm_unreachable("Address folding not implemented for instruction");30693070case AArch64::LDURQi:3071case AArch64::LDRQui:3072return AArch64::LDRQroX;3073case AArch64::STURQi:3074case AArch64::STRQui:3075return AArch64::STRQroX;3076case AArch64::LDURDi:3077case AArch64::LDRDui:3078return AArch64::LDRDroX;3079case AArch64::STURDi:3080case AArch64::STRDui:3081return AArch64::STRDroX;3082case AArch64::LDURXi:3083case AArch64::LDRXui:3084return AArch64::LDRXroX;3085case AArch64::STURXi:3086case AArch64::STRXui:3087return AArch64::STRXroX;3088case AArch64::LDURWi:3089case AArch64::LDRWui:3090return AArch64::LDRWroX;3091case AArch64::LDURSWi:3092case AArch64::LDRSWui:3093return AArch64::LDRSWroX;3094case AArch64::STURWi:3095case AArch64::STRWui:3096return AArch64::STRWroX;3097case AArch64::LDURHi:3098case AArch64::LDRHui:3099return AArch64::LDRHroX;3100case AArch64::STURHi:3101case AArch64::STRHui:3102return AArch64::STRHroX;3103case AArch64::LDURHHi:3104case AArch64::LDRHHui:3105return AArch64::LDRHHroX;3106case AArch64::STURHHi:3107case AArch64::STRHHui:3108return AArch64::STRHHroX;3109case AArch64::LDURSHXi:3110case AArch64::LDRSHXui:3111return AArch64::LDRSHXroX;3112case AArch64::LDURSHWi:3113case AArch64::LDRSHWui:3114return AArch64::LDRSHWroX;3115case AArch64::LDURBi:3116case AArch64::LDRBui:3117return AArch64::LDRBroX;3118case AArch64::LDURBBi:3119case AArch64::LDRBBui:3120return AArch64::LDRBBroX;3121case AArch64::LDURSBXi:3122case AArch64::LDRSBXui:3123return AArch64::LDRSBXroX;3124case AArch64::LDURSBWi:3125case AArch64::LDRSBWui:3126return AArch64::LDRSBWroX;3127case AArch64::STURBi:3128case AArch64::STRBui:3129return AArch64::STRBroX;3130case AArch64::STURBBi:3131case AArch64::STRBBui:3132return AArch64::STRBBroX;3133}3134}31353136// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return3137// the opcode of an instruction performing the same operation, but using the3138// [Reg, #Imm] addressing mode with scaled offset.3139unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale) {3140switch (Opcode) {3141default:3142llvm_unreachable("Address folding not implemented for instruction");31433144case AArch64::LDURQi:3145Scale = 16;3146return AArch64::LDRQui;3147case AArch64::STURQi:3148Scale = 16;3149return AArch64::STRQui;3150case AArch64::LDURDi:3151Scale = 8;3152return AArch64::LDRDui;3153case AArch64::STURDi:3154Scale = 8;3155return AArch64::STRDui;3156case AArch64::LDURXi:3157Scale = 8;3158return AArch64::LDRXui;3159case AArch64::STURXi:3160Scale = 8;3161return AArch64::STRXui;3162case AArch64::LDURWi:3163Scale = 4;3164return AArch64::LDRWui;3165case AArch64::LDURSWi:3166Scale = 4;3167return AArch64::LDRSWui;3168case AArch64::STURWi:3169Scale = 4;3170return AArch64::STRWui;3171case AArch64::LDURHi:3172Scale = 2;3173return AArch64::LDRHui;3174case AArch64::STURHi:3175Scale = 2;3176return AArch64::STRHui;3177case AArch64::LDURHHi:3178Scale = 2;3179return AArch64::LDRHHui;3180case AArch64::STURHHi:3181Scale = 2;3182return AArch64::STRHHui;3183case AArch64::LDURSHXi:3184Scale = 2;3185return AArch64::LDRSHXui;3186case AArch64::LDURSHWi:3187Scale = 2;3188return AArch64::LDRSHWui;3189case 
AArch64::LDURBi:3190Scale = 1;3191return AArch64::LDRBui;3192case AArch64::LDURBBi:3193Scale = 1;3194return AArch64::LDRBBui;3195case AArch64::LDURSBXi:3196Scale = 1;3197return AArch64::LDRSBXui;3198case AArch64::LDURSBWi:3199Scale = 1;3200return AArch64::LDRSBWui;3201case AArch64::STURBi:3202Scale = 1;3203return AArch64::STRBui;3204case AArch64::STURBBi:3205Scale = 1;3206return AArch64::STRBBui;3207case AArch64::LDRQui:3208case AArch64::STRQui:3209Scale = 16;3210return Opcode;3211case AArch64::LDRDui:3212case AArch64::STRDui:3213case AArch64::LDRXui:3214case AArch64::STRXui:3215Scale = 8;3216return Opcode;3217case AArch64::LDRWui:3218case AArch64::LDRSWui:3219case AArch64::STRWui:3220Scale = 4;3221return Opcode;3222case AArch64::LDRHui:3223case AArch64::STRHui:3224case AArch64::LDRHHui:3225case AArch64::STRHHui:3226case AArch64::LDRSHXui:3227case AArch64::LDRSHWui:3228Scale = 2;3229return Opcode;3230case AArch64::LDRBui:3231case AArch64::LDRBBui:3232case AArch64::LDRSBXui:3233case AArch64::LDRSBWui:3234case AArch64::STRBui:3235case AArch64::STRBBui:3236Scale = 1;3237return Opcode;3238}3239}32403241// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return3242// the opcode of an instruction performing the same operation, but using the3243// [Reg, #Imm] addressing mode with unscaled offset.3244unsigned unscaledOffsetOpcode(unsigned Opcode) {3245switch (Opcode) {3246default:3247llvm_unreachable("Address folding not implemented for instruction");32483249case AArch64::LDURQi:3250case AArch64::STURQi:3251case AArch64::LDURDi:3252case AArch64::STURDi:3253case AArch64::LDURXi:3254case AArch64::STURXi:3255case AArch64::LDURWi:3256case AArch64::LDURSWi:3257case AArch64::STURWi:3258case AArch64::LDURHi:3259case AArch64::STURHi:3260case AArch64::LDURHHi:3261case AArch64::STURHHi:3262case AArch64::LDURSHXi:3263case AArch64::LDURSHWi:3264case AArch64::LDURBi:3265case AArch64::STURBi:3266case AArch64::LDURBBi:3267case AArch64::STURBBi:3268case AArch64::LDURSBWi:3269case AArch64::LDURSBXi:3270return Opcode;3271case AArch64::LDRQui:3272return AArch64::LDURQi;3273case AArch64::STRQui:3274return AArch64::STURQi;3275case AArch64::LDRDui:3276return AArch64::LDURDi;3277case AArch64::STRDui:3278return AArch64::STURDi;3279case AArch64::LDRXui:3280return AArch64::LDURXi;3281case AArch64::STRXui:3282return AArch64::STURXi;3283case AArch64::LDRWui:3284return AArch64::LDURWi;3285case AArch64::LDRSWui:3286return AArch64::LDURSWi;3287case AArch64::STRWui:3288return AArch64::STURWi;3289case AArch64::LDRHui:3290return AArch64::LDURHi;3291case AArch64::STRHui:3292return AArch64::STURHi;3293case AArch64::LDRHHui:3294return AArch64::LDURHHi;3295case AArch64::STRHHui:3296return AArch64::STURHHi;3297case AArch64::LDRSHXui:3298return AArch64::LDURSHXi;3299case AArch64::LDRSHWui:3300return AArch64::LDURSHWi;3301case AArch64::LDRBBui:3302return AArch64::LDURBBi;3303case AArch64::LDRBui:3304return AArch64::LDURBi;3305case AArch64::STRBBui:3306return AArch64::STURBBi;3307case AArch64::STRBui:3308return AArch64::STURBi;3309case AArch64::LDRSBWui:3310return AArch64::LDURSBWi;3311case AArch64::LDRSBXui:3312return AArch64::LDURSBXi;3313}3314}33153316// Given the opcode of a memory load/store instruction, return the opcode of an3317// instruction performing the same operation, but using3318// the [Reg, Reg, {s,u}xtw #N] addressing mode with sign-/zero-extend of the3319// offset register.3320static unsigned offsetExtendOpcode(unsigned Opcode) {3321switch (Opcode) {3322default:3323llvm_unreachable("Address folding 
not implemented for instruction");33243325case AArch64::LDRQroX:3326case AArch64::LDURQi:3327case AArch64::LDRQui:3328return AArch64::LDRQroW;3329case AArch64::STRQroX:3330case AArch64::STURQi:3331case AArch64::STRQui:3332return AArch64::STRQroW;3333case AArch64::LDRDroX:3334case AArch64::LDURDi:3335case AArch64::LDRDui:3336return AArch64::LDRDroW;3337case AArch64::STRDroX:3338case AArch64::STURDi:3339case AArch64::STRDui:3340return AArch64::STRDroW;3341case AArch64::LDRXroX:3342case AArch64::LDURXi:3343case AArch64::LDRXui:3344return AArch64::LDRXroW;3345case AArch64::STRXroX:3346case AArch64::STURXi:3347case AArch64::STRXui:3348return AArch64::STRXroW;3349case AArch64::LDRWroX:3350case AArch64::LDURWi:3351case AArch64::LDRWui:3352return AArch64::LDRWroW;3353case AArch64::LDRSWroX:3354case AArch64::LDURSWi:3355case AArch64::LDRSWui:3356return AArch64::LDRSWroW;3357case AArch64::STRWroX:3358case AArch64::STURWi:3359case AArch64::STRWui:3360return AArch64::STRWroW;3361case AArch64::LDRHroX:3362case AArch64::LDURHi:3363case AArch64::LDRHui:3364return AArch64::LDRHroW;3365case AArch64::STRHroX:3366case AArch64::STURHi:3367case AArch64::STRHui:3368return AArch64::STRHroW;3369case AArch64::LDRHHroX:3370case AArch64::LDURHHi:3371case AArch64::LDRHHui:3372return AArch64::LDRHHroW;3373case AArch64::STRHHroX:3374case AArch64::STURHHi:3375case AArch64::STRHHui:3376return AArch64::STRHHroW;3377case AArch64::LDRSHXroX:3378case AArch64::LDURSHXi:3379case AArch64::LDRSHXui:3380return AArch64::LDRSHXroW;3381case AArch64::LDRSHWroX:3382case AArch64::LDURSHWi:3383case AArch64::LDRSHWui:3384return AArch64::LDRSHWroW;3385case AArch64::LDRBroX:3386case AArch64::LDURBi:3387case AArch64::LDRBui:3388return AArch64::LDRBroW;3389case AArch64::LDRBBroX:3390case AArch64::LDURBBi:3391case AArch64::LDRBBui:3392return AArch64::LDRBBroW;3393case AArch64::LDRSBXroX:3394case AArch64::LDURSBXi:3395case AArch64::LDRSBXui:3396return AArch64::LDRSBXroW;3397case AArch64::LDRSBWroX:3398case AArch64::LDURSBWi:3399case AArch64::LDRSBWui:3400return AArch64::LDRSBWroW;3401case AArch64::STRBroX:3402case AArch64::STURBi:3403case AArch64::STRBui:3404return AArch64::STRBroW;3405case AArch64::STRBBroX:3406case AArch64::STURBBi:3407case AArch64::STRBBui:3408return AArch64::STRBBroW;3409}3410}34113412MachineInstr *AArch64InstrInfo::emitLdStWithAddr(MachineInstr &MemI,3413const ExtAddrMode &AM) const {34143415const DebugLoc &DL = MemI.getDebugLoc();3416MachineBasicBlock &MBB = *MemI.getParent();3417MachineRegisterInfo &MRI = MemI.getMF()->getRegInfo();34183419if (AM.Form == ExtAddrMode::Formula::Basic) {3420if (AM.ScaledReg) {3421// The new instruction will be in the form `ldr Rt, [Xn, Xm, lsl #imm]`.3422unsigned Opcode = regOffsetOpcode(MemI.getOpcode());3423MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);3424auto B = BuildMI(MBB, MemI, DL, get(Opcode))3425.addReg(MemI.getOperand(0).getReg(),3426MemI.mayLoad() ? 
RegState::Define : 0)3427.addReg(AM.BaseReg)3428.addReg(AM.ScaledReg)3429.addImm(0)3430.addImm(AM.Scale > 1)3431.setMemRefs(MemI.memoperands())3432.setMIFlags(MemI.getFlags());3433return B.getInstr();3434}34353436assert(AM.ScaledReg == 0 && AM.Scale == 0 &&3437"Addressing mode not supported for folding");34383439// The new instruction will be in the form `ld[u]r Rt, [Xn, #imm]`.3440unsigned Scale = 1;3441unsigned Opcode = MemI.getOpcode();3442if (isInt<9>(AM.Displacement))3443Opcode = unscaledOffsetOpcode(Opcode);3444else3445Opcode = scaledOffsetOpcode(Opcode, Scale);34463447auto B = BuildMI(MBB, MemI, DL, get(Opcode))3448.addReg(MemI.getOperand(0).getReg(),3449MemI.mayLoad() ? RegState::Define : 0)3450.addReg(AM.BaseReg)3451.addImm(AM.Displacement / Scale)3452.setMemRefs(MemI.memoperands())3453.setMIFlags(MemI.getFlags());3454return B.getInstr();3455}34563457if (AM.Form == ExtAddrMode::Formula::SExtScaledReg ||3458AM.Form == ExtAddrMode::Formula::ZExtScaledReg) {3459// The new instruction will be in the form `ldr Rt, [Xn, Wm, {s,u}xtw #N]`.3460assert(AM.ScaledReg && !AM.Displacement &&3461"Address offset can be a register or an immediate, but not both");3462unsigned Opcode = offsetExtendOpcode(MemI.getOpcode());3463MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);3464// Make sure the offset register is in the correct register class.3465Register OffsetReg = AM.ScaledReg;3466const TargetRegisterClass *RC = MRI.getRegClass(OffsetReg);3467if (RC->hasSuperClassEq(&AArch64::GPR64RegClass)) {3468OffsetReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);3469BuildMI(MBB, MemI, DL, get(TargetOpcode::COPY), OffsetReg)3470.addReg(AM.ScaledReg, 0, AArch64::sub_32);3471}3472auto B = BuildMI(MBB, MemI, DL, get(Opcode))3473.addReg(MemI.getOperand(0).getReg(),3474MemI.mayLoad() ? RegState::Define : 0)3475.addReg(AM.BaseReg)3476.addReg(OffsetReg)3477.addImm(AM.Form == ExtAddrMode::Formula::SExtScaledReg)3478.addImm(AM.Scale != 1)3479.setMemRefs(MemI.memoperands())3480.setMIFlags(MemI.getFlags());34813482return B.getInstr();3483}34843485llvm_unreachable(3486"Function must not be called with an addressing mode it can't handle");3487}34883489bool AArch64InstrInfo::getMemOperandWithOffsetWidth(3490const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,3491bool &OffsetIsScalable, TypeSize &Width,3492const TargetRegisterInfo *TRI) const {3493assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");3494// Handle only loads/stores with base register followed by immediate offset.3495if (LdSt.getNumExplicitOperands() == 3) {3496// Non-paired instruction (e.g., ldr x1, [x0, #8]).3497if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||3498!LdSt.getOperand(2).isImm())3499return false;3500} else if (LdSt.getNumExplicitOperands() == 4) {3501// Paired instruction (e.g., ldp x1, x2, [x0, #8]).3502if (!LdSt.getOperand(1).isReg() ||3503(!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||3504!LdSt.getOperand(3).isImm())3505return false;3506} else3507return false;35083509// Get the scaling factor for the instruction and set the width for the3510// instruction.3511TypeSize Scale(0U, false);3512int64_t Dummy1, Dummy2;35133514// If this returns false, then it's an instruction we don't want to handle.3515if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))3516return false;35173518// Compute the offset. Offset is calculated as the immediate operand3519// multiplied by the scaling factor. 
Unscaled instructions have scaling factor3520// set to 1.3521if (LdSt.getNumExplicitOperands() == 3) {3522BaseOp = &LdSt.getOperand(1);3523Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinValue();3524} else {3525assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");3526BaseOp = &LdSt.getOperand(2);3527Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinValue();3528}3529OffsetIsScalable = Scale.isScalable();35303531if (!BaseOp->isReg() && !BaseOp->isFI())3532return false;35333534return true;3535}35363537MachineOperand &3538AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {3539assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");3540MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);3541assert(OfsOp.isImm() && "Offset operand wasn't immediate.");3542return OfsOp;3543}35443545bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,3546TypeSize &Width, int64_t &MinOffset,3547int64_t &MaxOffset) {3548switch (Opcode) {3549// Not a memory operation or something we want to handle.3550default:3551Scale = TypeSize::getFixed(0);3552Width = TypeSize::getFixed(0);3553MinOffset = MaxOffset = 0;3554return false;3555// LDR / STR3556case AArch64::LDRQui:3557case AArch64::STRQui:3558Scale = TypeSize::getFixed(16);3559Width = TypeSize::getFixed(16);3560MinOffset = 0;3561MaxOffset = 4095;3562break;3563case AArch64::LDRXui:3564case AArch64::LDRDui:3565case AArch64::STRXui:3566case AArch64::STRDui:3567case AArch64::PRFMui:3568Scale = TypeSize::getFixed(8);3569Width = TypeSize::getFixed(8);3570MinOffset = 0;3571MaxOffset = 4095;3572break;3573case AArch64::LDRWui:3574case AArch64::LDRSui:3575case AArch64::LDRSWui:3576case AArch64::STRWui:3577case AArch64::STRSui:3578Scale = TypeSize::getFixed(4);3579Width = TypeSize::getFixed(4);3580MinOffset = 0;3581MaxOffset = 4095;3582break;3583case AArch64::LDRHui:3584case AArch64::LDRHHui:3585case AArch64::LDRSHWui:3586case AArch64::LDRSHXui:3587case AArch64::STRHui:3588case AArch64::STRHHui:3589Scale = TypeSize::getFixed(2);3590Width = TypeSize::getFixed(2);3591MinOffset = 0;3592MaxOffset = 4095;3593break;3594case AArch64::LDRBui:3595case AArch64::LDRBBui:3596case AArch64::LDRSBWui:3597case AArch64::LDRSBXui:3598case AArch64::STRBui:3599case AArch64::STRBBui:3600Scale = TypeSize::getFixed(1);3601Width = TypeSize::getFixed(1);3602MinOffset = 0;3603MaxOffset = 4095;3604break;3605// post/pre inc3606case AArch64::STRQpre:3607case AArch64::LDRQpost:3608Scale = TypeSize::getFixed(1);3609Width = TypeSize::getFixed(16);3610MinOffset = -256;3611MaxOffset = 255;3612break;3613case AArch64::STRXpre:3614case AArch64::STRDpre:3615case AArch64::LDRXpost:3616case AArch64::LDRDpost:3617Scale = TypeSize::getFixed(1);3618Width = TypeSize::getFixed(8);3619MinOffset = -256;3620MaxOffset = 255;3621break;3622case AArch64::STRWpost:3623case AArch64::LDRWpost:3624Scale = TypeSize::getFixed(4);3625Width = TypeSize::getFixed(32);3626MinOffset = -256;3627MaxOffset = 255;3628break;3629// Unscaled3630case AArch64::LDURQi:3631case AArch64::STURQi:3632Scale = TypeSize::getFixed(1);3633Width = TypeSize::getFixed(16);3634MinOffset = -256;3635MaxOffset = 255;3636break;3637case AArch64::LDURXi:3638case AArch64::LDURDi:3639case AArch64::LDAPURXi:3640case AArch64::STURXi:3641case AArch64::STURDi:3642case AArch64::STLURXi:3643case AArch64::PRFUMi:3644Scale = TypeSize::getFixed(1);3645Width = TypeSize::getFixed(8);3646MinOffset = -256;3647MaxOffset = 255;3648break;3649case AArch64::LDURWi:3650case 
AArch64::LDURSi:3651case AArch64::LDURSWi:3652case AArch64::LDAPURi:3653case AArch64::LDAPURSWi:3654case AArch64::STURWi:3655case AArch64::STURSi:3656case AArch64::STLURWi:3657Scale = TypeSize::getFixed(1);3658Width = TypeSize::getFixed(4);3659MinOffset = -256;3660MaxOffset = 255;3661break;3662case AArch64::LDURHi:3663case AArch64::LDURHHi:3664case AArch64::LDURSHXi:3665case AArch64::LDURSHWi:3666case AArch64::LDAPURHi:3667case AArch64::LDAPURSHWi:3668case AArch64::LDAPURSHXi:3669case AArch64::STURHi:3670case AArch64::STURHHi:3671case AArch64::STLURHi:3672Scale = TypeSize::getFixed(1);3673Width = TypeSize::getFixed(2);3674MinOffset = -256;3675MaxOffset = 255;3676break;3677case AArch64::LDURBi:3678case AArch64::LDURBBi:3679case AArch64::LDURSBXi:3680case AArch64::LDURSBWi:3681case AArch64::LDAPURBi:3682case AArch64::LDAPURSBWi:3683case AArch64::LDAPURSBXi:3684case AArch64::STURBi:3685case AArch64::STURBBi:3686case AArch64::STLURBi:3687Scale = TypeSize::getFixed(1);3688Width = TypeSize::getFixed(1);3689MinOffset = -256;3690MaxOffset = 255;3691break;3692// LDP / STP3693case AArch64::LDPQi:3694case AArch64::LDNPQi:3695case AArch64::STPQi:3696case AArch64::STNPQi:3697Scale = TypeSize::getFixed(16);3698Width = TypeSize::getFixed(32);3699MinOffset = -64;3700MaxOffset = 63;3701break;3702case AArch64::LDPXi:3703case AArch64::LDPDi:3704case AArch64::LDNPXi:3705case AArch64::LDNPDi:3706case AArch64::STPXi:3707case AArch64::STPDi:3708case AArch64::STNPXi:3709case AArch64::STNPDi:3710Scale = TypeSize::getFixed(8);3711Width = TypeSize::getFixed(16);3712MinOffset = -64;3713MaxOffset = 63;3714break;3715case AArch64::LDPWi:3716case AArch64::LDPSi:3717case AArch64::LDNPWi:3718case AArch64::LDNPSi:3719case AArch64::STPWi:3720case AArch64::STPSi:3721case AArch64::STNPWi:3722case AArch64::STNPSi:3723Scale = TypeSize::getFixed(4);3724Width = TypeSize::getFixed(8);3725MinOffset = -64;3726MaxOffset = 63;3727break;3728// pre/post inc3729case AArch64::STPQpre:3730case AArch64::LDPQpost:3731Scale = TypeSize::getFixed(16);3732Width = TypeSize::getFixed(16);3733MinOffset = -1024;3734MaxOffset = 1008;3735break;3736case AArch64::STPXpre:3737case AArch64::LDPXpost:3738case AArch64::STPDpre:3739case AArch64::LDPDpost:3740Scale = TypeSize::getFixed(8);3741Width = TypeSize::getFixed(8);3742MinOffset = -512;3743MaxOffset = 504;3744break;3745case AArch64::StoreSwiftAsyncContext:3746// Store is an STRXui, but there might be an ADDXri in the expansion too.3747Scale = TypeSize::getFixed(1);3748Width = TypeSize::getFixed(8);3749MinOffset = 0;3750MaxOffset = 4095;3751break;3752case AArch64::ADDG:3753Scale = TypeSize::getFixed(16);3754Width = TypeSize::getFixed(0);3755MinOffset = 0;3756MaxOffset = 63;3757break;3758case AArch64::TAGPstack:3759Scale = TypeSize::getFixed(16);3760Width = TypeSize::getFixed(0);3761// TAGP with a negative offset turns into SUBP, which has a maximum offset3762// of 63 (not 64!).3763MinOffset = -63;3764MaxOffset = 63;3765break;3766case AArch64::LDG:3767case AArch64::STGi:3768case AArch64::STZGi:3769Scale = TypeSize::getFixed(16);3770Width = TypeSize::getFixed(16);3771MinOffset = -256;3772MaxOffset = 255;3773break;3774// SVE3775case AArch64::STR_ZZZZXI:3776case AArch64::LDR_ZZZZXI:3777Scale = TypeSize::getScalable(16);3778Width = TypeSize::getScalable(16 * 4);3779MinOffset = -256;3780MaxOffset = 252;3781break;3782case AArch64::STR_ZZZXI:3783case AArch64::LDR_ZZZXI:3784Scale = TypeSize::getScalable(16);3785Width = TypeSize::getScalable(16 * 3);3786MinOffset = -256;3787MaxOffset = 253;3788break;3789case 
AArch64::STR_ZZXI:3790case AArch64::LDR_ZZXI:3791Scale = TypeSize::getScalable(16);3792Width = TypeSize::getScalable(16 * 2);3793MinOffset = -256;3794MaxOffset = 254;3795break;3796case AArch64::LDR_PXI:3797case AArch64::STR_PXI:3798Scale = TypeSize::getScalable(2);3799Width = TypeSize::getScalable(2);3800MinOffset = -256;3801MaxOffset = 255;3802break;3803case AArch64::LDR_PPXI:3804case AArch64::STR_PPXI:3805Scale = TypeSize::getScalable(2);3806Width = TypeSize::getScalable(2 * 2);3807MinOffset = -256;3808MaxOffset = 254;3809break;3810case AArch64::LDR_ZXI:3811case AArch64::STR_ZXI:3812Scale = TypeSize::getScalable(16);3813Width = TypeSize::getScalable(16);3814MinOffset = -256;3815MaxOffset = 255;3816break;3817case AArch64::LD1B_IMM:3818case AArch64::LD1H_IMM:3819case AArch64::LD1W_IMM:3820case AArch64::LD1D_IMM:3821case AArch64::LDNT1B_ZRI:3822case AArch64::LDNT1H_ZRI:3823case AArch64::LDNT1W_ZRI:3824case AArch64::LDNT1D_ZRI:3825case AArch64::ST1B_IMM:3826case AArch64::ST1H_IMM:3827case AArch64::ST1W_IMM:3828case AArch64::ST1D_IMM:3829case AArch64::STNT1B_ZRI:3830case AArch64::STNT1H_ZRI:3831case AArch64::STNT1W_ZRI:3832case AArch64::STNT1D_ZRI:3833case AArch64::LDNF1B_IMM:3834case AArch64::LDNF1H_IMM:3835case AArch64::LDNF1W_IMM:3836case AArch64::LDNF1D_IMM:3837// A full vectors worth of data3838// Width = mbytes * elements3839Scale = TypeSize::getScalable(16);3840Width = TypeSize::getScalable(16);3841MinOffset = -8;3842MaxOffset = 7;3843break;3844case AArch64::LD2B_IMM:3845case AArch64::LD2H_IMM:3846case AArch64::LD2W_IMM:3847case AArch64::LD2D_IMM:3848case AArch64::ST2B_IMM:3849case AArch64::ST2H_IMM:3850case AArch64::ST2W_IMM:3851case AArch64::ST2D_IMM:3852Scale = TypeSize::getScalable(32);3853Width = TypeSize::getScalable(16 * 2);3854MinOffset = -8;3855MaxOffset = 7;3856break;3857case AArch64::LD3B_IMM:3858case AArch64::LD3H_IMM:3859case AArch64::LD3W_IMM:3860case AArch64::LD3D_IMM:3861case AArch64::ST3B_IMM:3862case AArch64::ST3H_IMM:3863case AArch64::ST3W_IMM:3864case AArch64::ST3D_IMM:3865Scale = TypeSize::getScalable(48);3866Width = TypeSize::getScalable(16 * 3);3867MinOffset = -8;3868MaxOffset = 7;3869break;3870case AArch64::LD4B_IMM:3871case AArch64::LD4H_IMM:3872case AArch64::LD4W_IMM:3873case AArch64::LD4D_IMM:3874case AArch64::ST4B_IMM:3875case AArch64::ST4H_IMM:3876case AArch64::ST4W_IMM:3877case AArch64::ST4D_IMM:3878Scale = TypeSize::getScalable(64);3879Width = TypeSize::getScalable(16 * 4);3880MinOffset = -8;3881MaxOffset = 7;3882break;3883case AArch64::LD1B_H_IMM:3884case AArch64::LD1SB_H_IMM:3885case AArch64::LD1H_S_IMM:3886case AArch64::LD1SH_S_IMM:3887case AArch64::LD1W_D_IMM:3888case AArch64::LD1SW_D_IMM:3889case AArch64::ST1B_H_IMM:3890case AArch64::ST1H_S_IMM:3891case AArch64::ST1W_D_IMM:3892case AArch64::LDNF1B_H_IMM:3893case AArch64::LDNF1SB_H_IMM:3894case AArch64::LDNF1H_S_IMM:3895case AArch64::LDNF1SH_S_IMM:3896case AArch64::LDNF1W_D_IMM:3897case AArch64::LDNF1SW_D_IMM:3898// A half vector worth of data3899// Width = mbytes * elements3900Scale = TypeSize::getScalable(8);3901Width = TypeSize::getScalable(8);3902MinOffset = -8;3903MaxOffset = 7;3904break;3905case AArch64::LD1B_S_IMM:3906case AArch64::LD1SB_S_IMM:3907case AArch64::LD1H_D_IMM:3908case AArch64::LD1SH_D_IMM:3909case AArch64::ST1B_S_IMM:3910case AArch64::ST1H_D_IMM:3911case AArch64::LDNF1B_S_IMM:3912case AArch64::LDNF1SB_S_IMM:3913case AArch64::LDNF1H_D_IMM:3914case AArch64::LDNF1SH_D_IMM:3915// A quarter vector worth of data3916// Width = mbytes * elements3917Scale = 
TypeSize::getScalable(4);3918Width = TypeSize::getScalable(4);3919MinOffset = -8;3920MaxOffset = 7;3921break;3922case AArch64::LD1B_D_IMM:3923case AArch64::LD1SB_D_IMM:3924case AArch64::ST1B_D_IMM:3925case AArch64::LDNF1B_D_IMM:3926case AArch64::LDNF1SB_D_IMM:3927// A eighth vector worth of data3928// Width = mbytes * elements3929Scale = TypeSize::getScalable(2);3930Width = TypeSize::getScalable(2);3931MinOffset = -8;3932MaxOffset = 7;3933break;3934case AArch64::ST2Gi:3935case AArch64::STZ2Gi:3936Scale = TypeSize::getFixed(16);3937Width = TypeSize::getFixed(32);3938MinOffset = -256;3939MaxOffset = 255;3940break;3941case AArch64::STGPi:3942Scale = TypeSize::getFixed(16);3943Width = TypeSize::getFixed(16);3944MinOffset = -64;3945MaxOffset = 63;3946break;3947case AArch64::LD1RB_IMM:3948case AArch64::LD1RB_H_IMM:3949case AArch64::LD1RB_S_IMM:3950case AArch64::LD1RB_D_IMM:3951case AArch64::LD1RSB_H_IMM:3952case AArch64::LD1RSB_S_IMM:3953case AArch64::LD1RSB_D_IMM:3954Scale = TypeSize::getFixed(1);3955Width = TypeSize::getFixed(1);3956MinOffset = 0;3957MaxOffset = 63;3958break;3959case AArch64::LD1RH_IMM:3960case AArch64::LD1RH_S_IMM:3961case AArch64::LD1RH_D_IMM:3962case AArch64::LD1RSH_S_IMM:3963case AArch64::LD1RSH_D_IMM:3964Scale = TypeSize::getFixed(2);3965Width = TypeSize::getFixed(2);3966MinOffset = 0;3967MaxOffset = 63;3968break;3969case AArch64::LD1RW_IMM:3970case AArch64::LD1RW_D_IMM:3971case AArch64::LD1RSW_IMM:3972Scale = TypeSize::getFixed(4);3973Width = TypeSize::getFixed(4);3974MinOffset = 0;3975MaxOffset = 63;3976break;3977case AArch64::LD1RD_IMM:3978Scale = TypeSize::getFixed(8);3979Width = TypeSize::getFixed(8);3980MinOffset = 0;3981MaxOffset = 63;3982break;3983}39843985return true;3986}39873988// Scaling factor for unscaled load or store.3989int AArch64InstrInfo::getMemScale(unsigned Opc) {3990switch (Opc) {3991default:3992llvm_unreachable("Opcode has unknown scale!");3993case AArch64::LDRBBui:3994case AArch64::LDURBBi:3995case AArch64::LDRSBWui:3996case AArch64::LDURSBWi:3997case AArch64::STRBBui:3998case AArch64::STURBBi:3999return 1;4000case AArch64::LDRHHui:4001case AArch64::LDURHHi:4002case AArch64::LDRSHWui:4003case AArch64::LDURSHWi:4004case AArch64::STRHHui:4005case AArch64::STURHHi:4006return 2;4007case AArch64::LDRSui:4008case AArch64::LDURSi:4009case AArch64::LDRSpre:4010case AArch64::LDRSWui:4011case AArch64::LDURSWi:4012case AArch64::LDRSWpre:4013case AArch64::LDRWpre:4014case AArch64::LDRWui:4015case AArch64::LDURWi:4016case AArch64::STRSui:4017case AArch64::STURSi:4018case AArch64::STRSpre:4019case AArch64::STRWui:4020case AArch64::STURWi:4021case AArch64::STRWpre:4022case AArch64::LDPSi:4023case AArch64::LDPSWi:4024case AArch64::LDPWi:4025case AArch64::STPSi:4026case AArch64::STPWi:4027return 4;4028case AArch64::LDRDui:4029case AArch64::LDURDi:4030case AArch64::LDRDpre:4031case AArch64::LDRXui:4032case AArch64::LDURXi:4033case AArch64::LDRXpre:4034case AArch64::STRDui:4035case AArch64::STURDi:4036case AArch64::STRDpre:4037case AArch64::STRXui:4038case AArch64::STURXi:4039case AArch64::STRXpre:4040case AArch64::LDPDi:4041case AArch64::LDPXi:4042case AArch64::STPDi:4043case AArch64::STPXi:4044return 8;4045case AArch64::LDRQui:4046case AArch64::LDURQi:4047case AArch64::STRQui:4048case AArch64::STURQi:4049case AArch64::STRQpre:4050case AArch64::LDPQi:4051case AArch64::LDRQpre:4052case AArch64::STPQi:4053case AArch64::STGi:4054case AArch64::STZGi:4055case AArch64::ST2Gi:4056case AArch64::STZ2Gi:4057case AArch64::STGPi:4058return 16;4059}4060}40614062bool 
AArch64InstrInfo::isPreLd(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::LDRWpre:
  case AArch64::LDRXpre:
  case AArch64::LDRSWpre:
  case AArch64::LDRSpre:
  case AArch64::LDRDpre:
  case AArch64::LDRQpre:
    return true;
  }
}

bool AArch64InstrInfo::isPreSt(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::STRWpre:
  case AArch64::STRXpre:
  case AArch64::STRSpre:
  case AArch64::STRDpre:
  case AArch64::STRQpre:
    return true;
  }
}

bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) {
  return isPreLd(MI) || isPreSt(MI);
}

bool AArch64InstrInfo::isPairedLdSt(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPDi:
  case AArch64::LDPQi:
  case AArch64::LDPWi:
  case AArch64::LDPXi:
  case AArch64::STPSi:
  case AArch64::STPDi:
  case AArch64::STPQi:
  case AArch64::STPWi:
  case AArch64::STPXi:
  case AArch64::STGPi:
    return true;
  }
}

const MachineOperand &AArch64InstrInfo::getLdStBaseOp(const MachineInstr &MI) {
  unsigned Idx =
      AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2
                                                                            : 1;
  return MI.getOperand(Idx);
}

const MachineOperand &
AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) {
  unsigned Idx =
      AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3
                                                                            : 2;
  return MI.getOperand(Idx);
}
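// Illustrative note (not part of the upstream source): for a plain
// unsigned-offset form such as "LDRXui %x0, %x1, 1" the base register is
// operand 1 and the immediate offset operand 2, whereas paired and
// pre/post-indexed forms carry an extra result operand first (e.g.
// "LDPXi %x0, %x1, %sp, 2"), which is why the helpers above select operand
// indices 2 and 3 for those cases.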
static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
                                              Register Reg) {
  if (MI.getParent() == nullptr)
    return nullptr;
  const MachineFunction *MF = MI.getParent()->getParent();
  return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr;
}

bool AArch64InstrInfo::isHForm(const MachineInstr &MI) {
  auto IsHFPR = [&](const MachineOperand &Op) {
    if (!Op.isReg())
      return false;
    auto Reg = Op.getReg();
    if (Reg.isPhysical())
      return AArch64::FPR16RegClass.contains(Reg);
    const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
    return TRC == &AArch64::FPR16RegClass ||
           TRC == &AArch64::FPR16_loRegClass;
  };
  return llvm::any_of(MI.operands(), IsHFPR);
}

bool AArch64InstrInfo::isQForm(const MachineInstr &MI) {
  auto IsQFPR = [&](const MachineOperand &Op) {
    if (!Op.isReg())
      return false;
    auto Reg = Op.getReg();
    if (Reg.isPhysical())
      return AArch64::FPR128RegClass.contains(Reg);
    const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
    return TRC == &AArch64::FPR128RegClass ||
           TRC == &AArch64::FPR128_loRegClass;
  };
  return llvm::any_of(MI.operands(), IsQFPR);
}

bool AArch64InstrInfo::hasBTISemantics(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AArch64::BRK:
  case AArch64::HLT:
  case AArch64::PACIASP:
  case AArch64::PACIBSP:
    // Implicit BTI behavior.
    return true;
  case AArch64::PAUTH_PROLOGUE:
    // PAUTH_PROLOGUE expands to PACI(A|B)SP.
    return true;
  case AArch64::HINT: {
    unsigned Imm = MI.getOperand(0).getImm();
    // Explicit BTI instruction.
    if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
      return true;
    // PACI(A|B)SP instructions.
    if (Imm == 25 || Imm == 27)
      return true;
    return false;
  }
  default:
    return false;
  }
}

bool AArch64InstrInfo::isFpOrNEON(Register Reg) {
  if (Reg == 0)
    return false;
  assert(Reg.isPhysical() && "Expected physical register in isFpOrNEON");
  return AArch64::FPR128RegClass.contains(Reg) ||
         AArch64::FPR64RegClass.contains(Reg) ||
         AArch64::FPR32RegClass.contains(Reg) ||
         AArch64::FPR16RegClass.contains(Reg) ||
         AArch64::FPR8RegClass.contains(Reg);
}

bool AArch64InstrInfo::isFpOrNEON(const MachineInstr &MI) {
  auto IsFPR = [&](const MachineOperand &Op) {
    if (!Op.isReg())
      return false;
    auto Reg = Op.getReg();
    if (Reg.isPhysical())
      return isFpOrNEON(Reg);

    const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
    return TRC == &AArch64::FPR128RegClass ||
           TRC == &AArch64::FPR128_loRegClass ||
           TRC == &AArch64::FPR64RegClass ||
           TRC == &AArch64::FPR64_loRegClass ||
           TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass ||
           TRC == &AArch64::FPR8RegClass;
  };
  return llvm::any_of(MI.operands(), IsFPR);
}
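// Background note on the hint immediates accepted by hasBTISemantics() above
// (illustrative, not from the upstream source): HINT #32/#34/#36/#38 encode
// BTI, BTI c, BTI j and BTI jc, and HINT #25/#27 encode PACIASP/PACIBSP,
// which act as implicit landing pads.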
// Scale the unscaled offsets. Returns false if the unscaled offset can't be
// scaled.
static bool scaleOffset(unsigned Opc, int64_t &Offset) {
  int Scale = AArch64InstrInfo::getMemScale(Opc);

  // If the byte-offset isn't a multiple of the stride, we can't scale this
  // offset.
  if (Offset % Scale != 0)
    return false;

  // Convert the byte-offset used by unscaled into an "element" offset used
  // by the scaled pair load/store instructions.
  Offset /= Scale;
  return true;
}

static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
  if (FirstOpc == SecondOpc)
    return true;
  // We can also pair sign-ext and zero-ext instructions.
  switch (FirstOpc) {
  default:
    return false;
  case AArch64::STRSui:
  case AArch64::STURSi:
    return SecondOpc == AArch64::STRSui || SecondOpc == AArch64::STURSi;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return SecondOpc == AArch64::STRDui || SecondOpc == AArch64::STURDi;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return SecondOpc == AArch64::STRQui || SecondOpc == AArch64::STURQi;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return SecondOpc == AArch64::STRWui || SecondOpc == AArch64::STURWi;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return SecondOpc == AArch64::STRXui || SecondOpc == AArch64::STURXi;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return SecondOpc == AArch64::LDRSui || SecondOpc == AArch64::LDURSi;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return SecondOpc == AArch64::LDRDui || SecondOpc == AArch64::LDURDi;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
    return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return SecondOpc == AArch64::LDRXui || SecondOpc == AArch64::LDURXi;
  }
  // These instructions can't be paired based on their opcodes.
  return false;
}
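// Worked example (illustrative, not from the upstream source): two unscaled
// stores "STURXi %x0, %sp, 16" and "STURXi %x1, %sp, 24" have a memory scale
// of 8, so scaleOffset() turns the byte offsets 16 and 24 into element
// offsets 2 and 3; the opcodes pair via canPairLdStOpc(), and because the
// scaled offsets are consecutive the two accesses are a candidate for a
// single STP.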
static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
                            int64_t Offset1, unsigned Opcode1, int FI2,
                            int64_t Offset2, unsigned Opcode2) {
  // Accesses through fixed stack object frame indices may access a different
  // fixed stack slot. Check that the object offsets + offsets match.
  if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
    int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
    int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
    assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
    // Convert to scaled object offsets.
    int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
    if (ObjectOffset1 % Scale1 != 0)
      return false;
    ObjectOffset1 /= Scale1;
    int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
    if (ObjectOffset2 % Scale2 != 0)
      return false;
    ObjectOffset2 /= Scale2;
    ObjectOffset1 += Offset1;
    ObjectOffset2 += Offset2;
    return ObjectOffset1 + 1 == ObjectOffset2;
  }

  return FI1 == FI2;
}

/// Detect opportunities for ldp/stp formation.
///
/// Only called for LdSt for which getMemOperandWithOffset returns true.
bool AArch64InstrInfo::shouldClusterMemOps(
    ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1,
    bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
    int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize,
    unsigned NumBytes) const {
  assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
  const MachineOperand &BaseOp1 = *BaseOps1.front();
  const MachineOperand &BaseOp2 = *BaseOps2.front();
  const MachineInstr &FirstLdSt = *BaseOp1.getParent();
  const MachineInstr &SecondLdSt = *BaseOp2.getParent();
  if (BaseOp1.getType() != BaseOp2.getType())
    return false;

  assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
         "Only base registers and frame indices are supported.");

  // Check for both base regs and base FI.
  if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
    return false;

  // Only cluster up to a single pair.
  if (ClusterSize > 2)
    return false;

  if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
    return false;

  // Can we pair these instructions based on their opcodes?
  unsigned FirstOpc = FirstLdSt.getOpcode();
  unsigned SecondOpc = SecondLdSt.getOpcode();
  if (!canPairLdStOpc(FirstOpc, SecondOpc))
    return false;

  // Can't merge volatiles or load/stores that have a hint to avoid pair
  // formation, for example.
  if (!isCandidateToMergeOrPair(FirstLdSt) ||
      !isCandidateToMergeOrPair(SecondLdSt))
    return false;

  // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
  int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
  if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
    return false;

  int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
  if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
    return false;

  // Pairwise instructions have a 7-bit signed offset field.
  if (Offset1 > 63 || Offset1 < -64)
    return false;

  // The caller should already have ordered First/SecondLdSt by offset.
  // Note: except for non-equal frame index bases
  if (BaseOp1.isFI()) {
    assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
           "Caller should have ordered offsets.");

    const MachineFrameInfo &MFI =
        FirstLdSt.getParent()->getParent()->getFrameInfo();
    return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
                           BaseOp2.getIndex(), Offset2, SecondOpc);
  }

  assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");

  return Offset1 + 1 == Offset2;
}

static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder
&MIB,4380unsigned Reg, unsigned SubIdx,4381unsigned State,4382const TargetRegisterInfo *TRI) {4383if (!SubIdx)4384return MIB.addReg(Reg, State);43854386if (Register::isPhysicalRegister(Reg))4387return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);4388return MIB.addReg(Reg, State, SubIdx);4389}43904391static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,4392unsigned NumRegs) {4393// We really want the positive remainder mod 32 here, that happens to be4394// easily obtainable with a mask.4395return ((DestReg - SrcReg) & 0x1f) < NumRegs;4396}43974398void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,4399MachineBasicBlock::iterator I,4400const DebugLoc &DL, MCRegister DestReg,4401MCRegister SrcReg, bool KillSrc,4402unsigned Opcode,4403ArrayRef<unsigned> Indices) const {4404assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");4405const TargetRegisterInfo *TRI = &getRegisterInfo();4406uint16_t DestEncoding = TRI->getEncodingValue(DestReg);4407uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);4408unsigned NumRegs = Indices.size();44094410int SubReg = 0, End = NumRegs, Incr = 1;4411if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {4412SubReg = NumRegs - 1;4413End = -1;4414Incr = -1;4415}44164417for (; SubReg != End; SubReg += Incr) {4418const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));4419AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);4420AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);4421AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);4422}4423}44244425void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,4426MachineBasicBlock::iterator I,4427DebugLoc DL, unsigned DestReg,4428unsigned SrcReg, bool KillSrc,4429unsigned Opcode, unsigned ZeroReg,4430llvm::ArrayRef<unsigned> Indices) const {4431const TargetRegisterInfo *TRI = &getRegisterInfo();4432unsigned NumRegs = Indices.size();44334434#ifndef NDEBUG4435uint16_t DestEncoding = TRI->getEncodingValue(DestReg);4436uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);4437assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&4438"GPR reg sequences should not be able to overlap");4439#endif44404441for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {4442const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));4443AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);4444MIB.addReg(ZeroReg);4445AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);4446MIB.addImm(0);4447}4448}44494450void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,4451MachineBasicBlock::iterator I,4452const DebugLoc &DL, MCRegister DestReg,4453MCRegister SrcReg, bool KillSrc) const {4454if (AArch64::GPR32spRegClass.contains(DestReg) &&4455(AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {4456const TargetRegisterInfo *TRI = &getRegisterInfo();44574458if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {4459// If either operand is WSP, expand to ADD #0.4460if (Subtarget.hasZeroCycleRegMove()) {4461// Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.4462MCRegister DestRegX = TRI->getMatchingSuperReg(4463DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);4464MCRegister SrcRegX = TRI->getMatchingSuperReg(4465SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);4466// This instruction is reading and writing X registers. 
This may upset4467// the register scavenger and machine verifier, so we need to indicate4468// that we are reading an undefined value from SrcRegX, but a proper4469// value from SrcReg.4470BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)4471.addReg(SrcRegX, RegState::Undef)4472.addImm(0)4473.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))4474.addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));4475} else {4476BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)4477.addReg(SrcReg, getKillRegState(KillSrc))4478.addImm(0)4479.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));4480}4481} else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {4482BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)4483.addImm(0)4484.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));4485} else {4486if (Subtarget.hasZeroCycleRegMove()) {4487// Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.4488MCRegister DestRegX = TRI->getMatchingSuperReg(4489DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);4490MCRegister SrcRegX = TRI->getMatchingSuperReg(4491SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);4492// This instruction is reading and writing X registers. This may upset4493// the register scavenger and machine verifier, so we need to indicate4494// that we are reading an undefined value from SrcRegX, but a proper4495// value from SrcReg.4496BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)4497.addReg(AArch64::XZR)4498.addReg(SrcRegX, RegState::Undef)4499.addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));4500} else {4501// Otherwise, expand to ORR WZR.4502BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)4503.addReg(AArch64::WZR)4504.addReg(SrcReg, getKillRegState(KillSrc));4505}4506}4507return;4508}45094510// Copy a Predicate register by ORRing with itself.4511if (AArch64::PPRRegClass.contains(DestReg) &&4512AArch64::PPRRegClass.contains(SrcReg)) {4513assert(Subtarget.isSVEorStreamingSVEAvailable() &&4514"Unexpected SVE register.");4515BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)4516.addReg(SrcReg) // Pg4517.addReg(SrcReg)4518.addReg(SrcReg, getKillRegState(KillSrc));4519return;4520}45214522// Copy a predicate-as-counter register by ORRing with itself as if it4523// were a regular predicate (mask) register.4524bool DestIsPNR = AArch64::PNRRegClass.contains(DestReg);4525bool SrcIsPNR = AArch64::PNRRegClass.contains(SrcReg);4526if (DestIsPNR || SrcIsPNR) {4527auto ToPPR = [](MCRegister R) -> MCRegister {4528return (R - AArch64::PN0) + AArch64::P0;4529};4530MCRegister PPRSrcReg = SrcIsPNR ? ToPPR(SrcReg) : SrcReg;4531MCRegister PPRDestReg = DestIsPNR ? 
ToPPR(DestReg) : DestReg;45324533if (PPRSrcReg != PPRDestReg) {4534auto NewMI = BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), PPRDestReg)4535.addReg(PPRSrcReg) // Pg4536.addReg(PPRSrcReg)4537.addReg(PPRSrcReg, getKillRegState(KillSrc));4538if (DestIsPNR)4539NewMI.addDef(DestReg, RegState::Implicit);4540}4541return;4542}45434544// Copy a Z register by ORRing with itself.4545if (AArch64::ZPRRegClass.contains(DestReg) &&4546AArch64::ZPRRegClass.contains(SrcReg)) {4547assert(Subtarget.isSVEorStreamingSVEAvailable() &&4548"Unexpected SVE register.");4549BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)4550.addReg(SrcReg)4551.addReg(SrcReg, getKillRegState(KillSrc));4552return;4553}45544555// Copy a Z register pair by copying the individual sub-registers.4556if ((AArch64::ZPR2RegClass.contains(DestReg) ||4557AArch64::ZPR2StridedOrContiguousRegClass.contains(DestReg)) &&4558(AArch64::ZPR2RegClass.contains(SrcReg) ||4559AArch64::ZPR2StridedOrContiguousRegClass.contains(SrcReg))) {4560assert(Subtarget.isSVEorStreamingSVEAvailable() &&4561"Unexpected SVE register.");4562static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};4563copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,4564Indices);4565return;4566}45674568// Copy a Z register triple by copying the individual sub-registers.4569if (AArch64::ZPR3RegClass.contains(DestReg) &&4570AArch64::ZPR3RegClass.contains(SrcReg)) {4571assert(Subtarget.isSVEorStreamingSVEAvailable() &&4572"Unexpected SVE register.");4573static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,4574AArch64::zsub2};4575copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,4576Indices);4577return;4578}45794580// Copy a Z register quad by copying the individual sub-registers.4581if ((AArch64::ZPR4RegClass.contains(DestReg) ||4582AArch64::ZPR4StridedOrContiguousRegClass.contains(DestReg)) &&4583(AArch64::ZPR4RegClass.contains(SrcReg) ||4584AArch64::ZPR4StridedOrContiguousRegClass.contains(SrcReg))) {4585assert(Subtarget.isSVEorStreamingSVEAvailable() &&4586"Unexpected SVE register.");4587static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,4588AArch64::zsub2, AArch64::zsub3};4589copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,4590Indices);4591return;4592}45934594if (AArch64::GPR64spRegClass.contains(DestReg) &&4595(AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {4596if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {4597// If either operand is SP, expand to ADD #0.4598BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)4599.addReg(SrcReg, getKillRegState(KillSrc))4600.addImm(0)4601.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));4602} else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {4603BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)4604.addImm(0)4605.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));4606} else {4607// Otherwise, expand to ORR XZR.4608BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)4609.addReg(AArch64::XZR)4610.addReg(SrcReg, getKillRegState(KillSrc));4611}4612return;4613}46144615// Copy a DDDD register quad by copying the individual sub-registers.4616if (AArch64::DDDDRegClass.contains(DestReg) &&4617AArch64::DDDDRegClass.contains(SrcReg)) {4618static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,4619AArch64::dsub2, AArch64::dsub3};4620copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,4621Indices);4622return;4623}46244625// Copy a DDD register triple by copying the 
individual sub-registers.4626if (AArch64::DDDRegClass.contains(DestReg) &&4627AArch64::DDDRegClass.contains(SrcReg)) {4628static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,4629AArch64::dsub2};4630copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,4631Indices);4632return;4633}46344635// Copy a DD register pair by copying the individual sub-registers.4636if (AArch64::DDRegClass.contains(DestReg) &&4637AArch64::DDRegClass.contains(SrcReg)) {4638static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};4639copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,4640Indices);4641return;4642}46434644// Copy a QQQQ register quad by copying the individual sub-registers.4645if (AArch64::QQQQRegClass.contains(DestReg) &&4646AArch64::QQQQRegClass.contains(SrcReg)) {4647static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,4648AArch64::qsub2, AArch64::qsub3};4649copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,4650Indices);4651return;4652}46534654// Copy a QQQ register triple by copying the individual sub-registers.4655if (AArch64::QQQRegClass.contains(DestReg) &&4656AArch64::QQQRegClass.contains(SrcReg)) {4657static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,4658AArch64::qsub2};4659copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,4660Indices);4661return;4662}46634664// Copy a QQ register pair by copying the individual sub-registers.4665if (AArch64::QQRegClass.contains(DestReg) &&4666AArch64::QQRegClass.contains(SrcReg)) {4667static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};4668copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,4669Indices);4670return;4671}46724673if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&4674AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {4675static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};4676copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,4677AArch64::XZR, Indices);4678return;4679}46804681if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&4682AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {4683static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};4684copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,4685AArch64::WZR, Indices);4686return;4687}46884689if (AArch64::FPR128RegClass.contains(DestReg) &&4690AArch64::FPR128RegClass.contains(SrcReg)) {4691if (Subtarget.isSVEorStreamingSVEAvailable() &&4692!Subtarget.isNeonAvailable())4693BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ))4694.addReg(AArch64::Z0 + (DestReg - AArch64::Q0), RegState::Define)4695.addReg(AArch64::Z0 + (SrcReg - AArch64::Q0))4696.addReg(AArch64::Z0 + (SrcReg - AArch64::Q0));4697else if (Subtarget.isNeonAvailable())4698BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)4699.addReg(SrcReg)4700.addReg(SrcReg, getKillRegState(KillSrc));4701else {4702BuildMI(MBB, I, DL, get(AArch64::STRQpre))4703.addReg(AArch64::SP, RegState::Define)4704.addReg(SrcReg, getKillRegState(KillSrc))4705.addReg(AArch64::SP)4706.addImm(-16);4707BuildMI(MBB, I, DL, get(AArch64::LDRQpost))4708.addReg(AArch64::SP, RegState::Define)4709.addReg(DestReg, RegState::Define)4710.addReg(AArch64::SP)4711.addImm(16);4712}4713return;4714}47154716if (AArch64::FPR64RegClass.contains(DestReg) &&4717AArch64::FPR64RegClass.contains(SrcReg)) {4718BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)4719.addReg(SrcReg, getKillRegState(KillSrc));4720return;4721}47224723if 
(AArch64::FPR32RegClass.contains(DestReg) &&4724AArch64::FPR32RegClass.contains(SrcReg)) {4725BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)4726.addReg(SrcReg, getKillRegState(KillSrc));4727return;4728}47294730if (AArch64::FPR16RegClass.contains(DestReg) &&4731AArch64::FPR16RegClass.contains(SrcReg)) {4732DestReg =4733RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR32RegClass);4734SrcReg =4735RI.getMatchingSuperReg(SrcReg, AArch64::hsub, &AArch64::FPR32RegClass);4736BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)4737.addReg(SrcReg, getKillRegState(KillSrc));4738return;4739}47404741if (AArch64::FPR8RegClass.contains(DestReg) &&4742AArch64::FPR8RegClass.contains(SrcReg)) {4743DestReg =4744RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR32RegClass);4745SrcReg =4746RI.getMatchingSuperReg(SrcReg, AArch64::bsub, &AArch64::FPR32RegClass);4747BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)4748.addReg(SrcReg, getKillRegState(KillSrc));4749return;4750}47514752// Copies between GPR64 and FPR64.4753if (AArch64::FPR64RegClass.contains(DestReg) &&4754AArch64::GPR64RegClass.contains(SrcReg)) {4755BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)4756.addReg(SrcReg, getKillRegState(KillSrc));4757return;4758}4759if (AArch64::GPR64RegClass.contains(DestReg) &&4760AArch64::FPR64RegClass.contains(SrcReg)) {4761BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)4762.addReg(SrcReg, getKillRegState(KillSrc));4763return;4764}4765// Copies between GPR32 and FPR32.4766if (AArch64::FPR32RegClass.contains(DestReg) &&4767AArch64::GPR32RegClass.contains(SrcReg)) {4768BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)4769.addReg(SrcReg, getKillRegState(KillSrc));4770return;4771}4772if (AArch64::GPR32RegClass.contains(DestReg) &&4773AArch64::FPR32RegClass.contains(SrcReg)) {4774BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)4775.addReg(SrcReg, getKillRegState(KillSrc));4776return;4777}47784779if (DestReg == AArch64::NZCV) {4780assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");4781BuildMI(MBB, I, DL, get(AArch64::MSR))4782.addImm(AArch64SysReg::NZCV)4783.addReg(SrcReg, getKillRegState(KillSrc))4784.addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);4785return;4786}47874788if (SrcReg == AArch64::NZCV) {4789assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");4790BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)4791.addImm(AArch64SysReg::NZCV)4792.addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));4793return;4794}47954796#ifndef NDEBUG4797const TargetRegisterInfo &TRI = getRegisterInfo();4798errs() << TRI.getRegAsmName(DestReg) << " = COPY "4799<< TRI.getRegAsmName(SrcReg) << "\n";4800#endif4801llvm_unreachable("unimplemented reg-to-reg copy");4802}48034804static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,4805MachineBasicBlock &MBB,4806MachineBasicBlock::iterator InsertBefore,4807const MCInstrDesc &MCID,4808Register SrcReg, bool IsKill,4809unsigned SubIdx0, unsigned SubIdx1, int FI,4810MachineMemOperand *MMO) {4811Register SrcReg0 = SrcReg;4812Register SrcReg1 = SrcReg;4813if (SrcReg.isPhysical()) {4814SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);4815SubIdx0 = 0;4816SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);4817SubIdx1 = 0;4818}4819BuildMI(MBB, InsertBefore, DebugLoc(), MCID)4820.addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)4821.addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)4822.addFrameIndex(FI)4823.addImm(0)4824.addMemOperand(MMO);4825}48264827void 
AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,4828MachineBasicBlock::iterator MBBI,4829Register SrcReg, bool isKill, int FI,4830const TargetRegisterClass *RC,4831const TargetRegisterInfo *TRI,4832Register VReg) const {4833MachineFunction &MF = *MBB.getParent();4834MachineFrameInfo &MFI = MF.getFrameInfo();48354836MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);4837MachineMemOperand *MMO =4838MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,4839MFI.getObjectSize(FI), MFI.getObjectAlign(FI));4840unsigned Opc = 0;4841bool Offset = true;4842MCRegister PNRReg = MCRegister::NoRegister;4843unsigned StackID = TargetStackID::Default;4844switch (TRI->getSpillSize(*RC)) {4845case 1:4846if (AArch64::FPR8RegClass.hasSubClassEq(RC))4847Opc = AArch64::STRBui;4848break;4849case 2: {4850if (AArch64::FPR16RegClass.hasSubClassEq(RC))4851Opc = AArch64::STRHui;4852else if (AArch64::PNRRegClass.hasSubClassEq(RC) ||4853AArch64::PPRRegClass.hasSubClassEq(RC)) {4854assert(Subtarget.isSVEorStreamingSVEAvailable() &&4855"Unexpected register store without SVE store instructions");4856Opc = AArch64::STR_PXI;4857StackID = TargetStackID::ScalableVector;4858}4859break;4860}4861case 4:4862if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {4863Opc = AArch64::STRWui;4864if (SrcReg.isVirtual())4865MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);4866else4867assert(SrcReg != AArch64::WSP);4868} else if (AArch64::FPR32RegClass.hasSubClassEq(RC))4869Opc = AArch64::STRSui;4870else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {4871Opc = AArch64::STR_PPXI;4872StackID = TargetStackID::ScalableVector;4873}4874break;4875case 8:4876if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {4877Opc = AArch64::STRXui;4878if (SrcReg.isVirtual())4879MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);4880else4881assert(SrcReg != AArch64::SP);4882} else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {4883Opc = AArch64::STRDui;4884} else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {4885storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,4886get(AArch64::STPWi), SrcReg, isKill,4887AArch64::sube32, AArch64::subo32, FI, MMO);4888return;4889}4890break;4891case 16:4892if (AArch64::FPR128RegClass.hasSubClassEq(RC))4893Opc = AArch64::STRQui;4894else if (AArch64::DDRegClass.hasSubClassEq(RC)) {4895assert(Subtarget.hasNEON() && "Unexpected register store without NEON");4896Opc = AArch64::ST1Twov1d;4897Offset = false;4898} else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {4899storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,4900get(AArch64::STPXi), SrcReg, isKill,4901AArch64::sube64, AArch64::subo64, FI, MMO);4902return;4903} else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {4904assert(Subtarget.isSVEorStreamingSVEAvailable() &&4905"Unexpected register store without SVE store instructions");4906Opc = AArch64::STR_ZXI;4907StackID = TargetStackID::ScalableVector;4908}4909break;4910case 24:4911if (AArch64::DDDRegClass.hasSubClassEq(RC)) {4912assert(Subtarget.hasNEON() && "Unexpected register store without NEON");4913Opc = AArch64::ST1Threev1d;4914Offset = false;4915}4916break;4917case 32:4918if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {4919assert(Subtarget.hasNEON() && "Unexpected register store without NEON");4920Opc = AArch64::ST1Fourv1d;4921Offset = false;4922} else if (AArch64::QQRegClass.hasSubClassEq(RC)) {4923assert(Subtarget.hasNEON() && "Unexpected register store without NEON");4924Opc = AArch64::ST1Twov2d;4925Offset = false;4926} else if 
(AArch64::ZPR2RegClass.hasSubClassEq(RC) ||4927AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {4928assert(Subtarget.isSVEorStreamingSVEAvailable() &&4929"Unexpected register store without SVE store instructions");4930Opc = AArch64::STR_ZZXI;4931StackID = TargetStackID::ScalableVector;4932}4933break;4934case 48:4935if (AArch64::QQQRegClass.hasSubClassEq(RC)) {4936assert(Subtarget.hasNEON() && "Unexpected register store without NEON");4937Opc = AArch64::ST1Threev2d;4938Offset = false;4939} else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {4940assert(Subtarget.isSVEorStreamingSVEAvailable() &&4941"Unexpected register store without SVE store instructions");4942Opc = AArch64::STR_ZZZXI;4943StackID = TargetStackID::ScalableVector;4944}4945break;4946case 64:4947if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {4948assert(Subtarget.hasNEON() && "Unexpected register store without NEON");4949Opc = AArch64::ST1Fourv2d;4950Offset = false;4951} else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||4952AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {4953assert(Subtarget.isSVEorStreamingSVEAvailable() &&4954"Unexpected register store without SVE store instructions");4955Opc = AArch64::STR_ZZZZXI;4956StackID = TargetStackID::ScalableVector;4957}4958break;4959}4960assert(Opc && "Unknown register class");4961MFI.setStackID(FI, StackID);49624963const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))4964.addReg(SrcReg, getKillRegState(isKill))4965.addFrameIndex(FI);49664967if (Offset)4968MI.addImm(0);4969if (PNRReg.isValid())4970MI.addDef(PNRReg, RegState::Implicit);4971MI.addMemOperand(MMO);4972}49734974static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,4975MachineBasicBlock &MBB,4976MachineBasicBlock::iterator InsertBefore,4977const MCInstrDesc &MCID,4978Register DestReg, unsigned SubIdx0,4979unsigned SubIdx1, int FI,4980MachineMemOperand *MMO) {4981Register DestReg0 = DestReg;4982Register DestReg1 = DestReg;4983bool IsUndef = true;4984if (DestReg.isPhysical()) {4985DestReg0 = TRI.getSubReg(DestReg, SubIdx0);4986SubIdx0 = 0;4987DestReg1 = TRI.getSubReg(DestReg, SubIdx1);4988SubIdx1 = 0;4989IsUndef = false;4990}4991BuildMI(MBB, InsertBefore, DebugLoc(), MCID)4992.addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)4993.addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)4994.addFrameIndex(FI)4995.addImm(0)4996.addMemOperand(MMO);4997}49984999void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,5000MachineBasicBlock::iterator MBBI,5001Register DestReg, int FI,5002const TargetRegisterClass *RC,5003const TargetRegisterInfo *TRI,5004Register VReg) const {5005MachineFunction &MF = *MBB.getParent();5006MachineFrameInfo &MFI = MF.getFrameInfo();5007MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);5008MachineMemOperand *MMO =5009MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,5010MFI.getObjectSize(FI), MFI.getObjectAlign(FI));50115012unsigned Opc = 0;5013bool Offset = true;5014unsigned StackID = TargetStackID::Default;5015Register PNRReg = MCRegister::NoRegister;5016switch (TRI->getSpillSize(*RC)) {5017case 1:5018if (AArch64::FPR8RegClass.hasSubClassEq(RC))5019Opc = AArch64::LDRBui;5020break;5021case 2: {5022bool IsPNR = AArch64::PNRRegClass.hasSubClassEq(RC);5023if (AArch64::FPR16RegClass.hasSubClassEq(RC))5024Opc = AArch64::LDRHui;5025else if (IsPNR || AArch64::PPRRegClass.hasSubClassEq(RC)) {5026assert(Subtarget.isSVEorStreamingSVEAvailable() &&5027"Unexpected register load 
without SVE load instructions");5028if (IsPNR)5029PNRReg = DestReg;5030Opc = AArch64::LDR_PXI;5031StackID = TargetStackID::ScalableVector;5032}5033break;5034}5035case 4:5036if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {5037Opc = AArch64::LDRWui;5038if (DestReg.isVirtual())5039MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);5040else5041assert(DestReg != AArch64::WSP);5042} else if (AArch64::FPR32RegClass.hasSubClassEq(RC))5043Opc = AArch64::LDRSui;5044else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {5045Opc = AArch64::LDR_PPXI;5046StackID = TargetStackID::ScalableVector;5047}5048break;5049case 8:5050if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {5051Opc = AArch64::LDRXui;5052if (DestReg.isVirtual())5053MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);5054else5055assert(DestReg != AArch64::SP);5056} else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {5057Opc = AArch64::LDRDui;5058} else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {5059loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,5060get(AArch64::LDPWi), DestReg, AArch64::sube32,5061AArch64::subo32, FI, MMO);5062return;5063}5064break;5065case 16:5066if (AArch64::FPR128RegClass.hasSubClassEq(RC))5067Opc = AArch64::LDRQui;5068else if (AArch64::DDRegClass.hasSubClassEq(RC)) {5069assert(Subtarget.hasNEON() && "Unexpected register load without NEON");5070Opc = AArch64::LD1Twov1d;5071Offset = false;5072} else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {5073loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,5074get(AArch64::LDPXi), DestReg, AArch64::sube64,5075AArch64::subo64, FI, MMO);5076return;5077} else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {5078assert(Subtarget.isSVEorStreamingSVEAvailable() &&5079"Unexpected register load without SVE load instructions");5080Opc = AArch64::LDR_ZXI;5081StackID = TargetStackID::ScalableVector;5082}5083break;5084case 24:5085if (AArch64::DDDRegClass.hasSubClassEq(RC)) {5086assert(Subtarget.hasNEON() && "Unexpected register load without NEON");5087Opc = AArch64::LD1Threev1d;5088Offset = false;5089}5090break;5091case 32:5092if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {5093assert(Subtarget.hasNEON() && "Unexpected register load without NEON");5094Opc = AArch64::LD1Fourv1d;5095Offset = false;5096} else if (AArch64::QQRegClass.hasSubClassEq(RC)) {5097assert(Subtarget.hasNEON() && "Unexpected register load without NEON");5098Opc = AArch64::LD1Twov2d;5099Offset = false;5100} else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) ||5101AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {5102assert(Subtarget.isSVEorStreamingSVEAvailable() &&5103"Unexpected register load without SVE load instructions");5104Opc = AArch64::LDR_ZZXI;5105StackID = TargetStackID::ScalableVector;5106}5107break;5108case 48:5109if (AArch64::QQQRegClass.hasSubClassEq(RC)) {5110assert(Subtarget.hasNEON() && "Unexpected register load without NEON");5111Opc = AArch64::LD1Threev2d;5112Offset = false;5113} else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {5114assert(Subtarget.isSVEorStreamingSVEAvailable() &&5115"Unexpected register load without SVE load instructions");5116Opc = AArch64::LDR_ZZZXI;5117StackID = TargetStackID::ScalableVector;5118}5119break;5120case 64:5121if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {5122assert(Subtarget.hasNEON() && "Unexpected register load without NEON");5123Opc = AArch64::LD1Fourv2d;5124Offset = false;5125} else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||5126AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) 
{
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register load without SVE load instructions");
      Opc = AArch64::LDR_ZZZZXI;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  }

  assert(Opc && "Unknown register class");
  MFI.setStackID(FI, StackID);

  const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
                                     .addReg(DestReg, getDefRegState(true))
                                     .addFrameIndex(FI);
  if (Offset)
    MI.addImm(0);
  if (PNRReg.isValid() && !PNRReg.isVirtual())
    MI.addDef(PNRReg, RegState::Implicit);
  MI.addMemOperand(MMO);
}

bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
                                           const MachineInstr &UseMI,
                                           const TargetRegisterInfo *TRI) {
  return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
                                         UseMI.getIterator()),
                [TRI](const MachineInstr &I) {
                  return I.modifiesRegister(AArch64::NZCV, TRI) ||
                         I.readsRegister(AArch64::NZCV, TRI);
                });
}

void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
    const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
  // The smallest scalable element supported by scaled SVE addressing
  // modes is a predicate, which is 2 scalable bytes in size. So the scalable
  // byte offset must always be a multiple of 2.
  assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");

  // VGSized offsets are divided by '2', because the VG register is the
  // number of 64-bit granules as opposed to 128-bit vector chunks,
  // which is how the 'n' in e.g. MVT::nxv1i8 is modelled.
  // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
  // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
  ByteSized = Offset.getFixed();
  VGSized = Offset.getScalable() / 2;
}
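// Worked example (illustrative, not from the upstream source): a StackOffset
// with 16 fixed bytes and 34 scalable bytes yields ByteSized = 16 and
// VGSized = 17, i.e. the described location is 16 + 17 * VG bytes away, with
// VG being the runtime number of 64-bit granules in an SVE vector.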
/// Returns the offset in parts to which this frame offset can be
/// decomposed for the purpose of describing a frame offset.
/// For non-scalable offsets this is simply its byte size.
void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
    const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
    int64_t &NumDataVectors) {
  // The smallest scalable element supported by scaled SVE addressing
  // modes is a predicate, which is 2 scalable bytes in size. So the scalable
  // byte offset must always be a multiple of 2.
  assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");

  NumBytes = Offset.getFixed();
  NumDataVectors = 0;
  NumPredicateVectors = Offset.getScalable() / 2;
  // This method is used to get the offsets to adjust the frame offset.
  // If the function requires ADDPL to be used and needs more than two ADDPL
  // instructions, part of the offset is folded into NumDataVectors so that it
  // uses ADDVL for part of it, reducing the number of ADDPL instructions.
  if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
      NumPredicateVectors > 62) {
    NumDataVectors = NumPredicateVectors / 8;
    NumPredicateVectors -= NumDataVectors * 8;
  }
}

// Convenience function to create a DWARF expression for
//   Expr + NumBytes + NumVGScaledBytes * AArch64::VG
static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr, int NumBytes,
                                     int NumVGScaledBytes, unsigned VG,
                                     llvm::raw_string_ostream &Comment) {
  uint8_t buffer[16];

  if (NumBytes) {
    Expr.push_back(dwarf::DW_OP_consts);
    Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer));
    Expr.push_back((uint8_t)dwarf::DW_OP_plus);
    Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
  }

  if (NumVGScaledBytes) {
    Expr.push_back((uint8_t)dwarf::DW_OP_consts);
    Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer));

    Expr.push_back((uint8_t)dwarf::DW_OP_bregx);
    Expr.append(buffer, buffer + encodeULEB128(VG, buffer));
    Expr.push_back(0);

    Expr.push_back((uint8_t)dwarf::DW_OP_mul);
    Expr.push_back((uint8_t)dwarf::DW_OP_plus);

    Comment << (NumVGScaledBytes < 0 ? " - " : " + ")
            << std::abs(NumVGScaledBytes) << " * VG";
  }
}

// Creates an MCCFIInstruction:
//   { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI,
                                               unsigned Reg,
                                               const StackOffset &Offset) {
  int64_t NumBytes, NumVGScaledBytes;
  AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, NumBytes,
                                                        NumVGScaledBytes);
  std::string CommentBuffer;
  llvm::raw_string_ostream Comment(CommentBuffer);

  if (Reg == AArch64::SP)
    Comment << "sp";
  else if (Reg == AArch64::FP)
    Comment << "fp";
  else
    Comment << printReg(Reg, &TRI);

  // Build up the expression (Reg + NumBytes + NumVGScaledBytes * AArch64::VG)
  SmallString<64> Expr;
  unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
  Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfReg));
  Expr.push_back(0);
  appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes,
                           TRI.getDwarfRegNum(AArch64::VG, true), Comment);

  // Wrap this into DW_CFA_def_cfa.
  SmallString<64> DefCfaExpr;
  DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
  uint8_t buffer[16];
  DefCfaExpr.append(buffer, buffer + encodeULEB128(Expr.size(), buffer));
  DefCfaExpr.append(Expr.str());
  return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), SMLoc(),
                                        Comment.str());
}
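// For instance (illustrative, not from the upstream source): with Reg == SP
// and a StackOffset of 16 fixed + 16 scalable bytes, the helper above emits a
// DW_CFA_def_cfa_expression that computes "sp + 16 + 8 * VG", where the
// "+ 8 * VG" term is encoded as DW_OP_consts 8, DW_OP_bregx VG 0, DW_OP_mul,
// DW_OP_plus.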
MCCFIInstruction llvm::createDefCFA(const TargetRegisterInfo &TRI,
                                    unsigned FrameReg, unsigned Reg,
                                    const StackOffset &Offset,
                                    bool LastAdjustmentWasScalable) {
  if (Offset.getScalable())
    return createDefCFAExpression(TRI, Reg, Offset);

  if (FrameReg == Reg && !LastAdjustmentWasScalable)
    return MCCFIInstruction::cfiDefCfaOffset(nullptr, int(Offset.getFixed()));

  unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
  return MCCFIInstruction::cfiDefCfa(nullptr, DwarfReg, (int)Offset.getFixed());
}

MCCFIInstruction llvm::createCFAOffset(const TargetRegisterInfo &TRI,
                                       unsigned Reg,
                                       const StackOffset &OffsetFromDefCFA) {
  int64_t NumBytes, NumVGScaledBytes;
  AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
      OffsetFromDefCFA, NumBytes, NumVGScaledBytes);

  unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);

  // Non-scalable offsets can use DW_CFA_offset directly.
  if (!NumVGScaledBytes)
    return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes);

  std::string CommentBuffer;
  llvm::raw_string_ostream Comment(CommentBuffer);
  Comment << printReg(Reg, &TRI) << " @ cfa";

  // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG)
  SmallString<64> OffsetExpr;
  appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes,
                           TRI.getDwarfRegNum(AArch64::VG, true), Comment);

  // Wrap this into DW_CFA_expression
  SmallString<64> CfaExpr;
  CfaExpr.push_back(dwarf::DW_CFA_expression);
  uint8_t buffer[16];
  CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer));
  CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer));
  CfaExpr.append(OffsetExpr.str());

  return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), SMLoc(),
                                        Comment.str());
}

// Helper function to emit a frame offset adjustment from a given
// pointer (SrcReg), stored into DestReg. This function is explicit
// in that it requires the opcode.
static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MBBI,
                               const DebugLoc &DL, unsigned DestReg,
                               unsigned SrcReg, int64_t Offset, unsigned Opc,
                               const TargetInstrInfo *TII,
                               MachineInstr::MIFlag Flag, bool NeedsWinCFI,
                               bool *HasWinCFI, bool EmitCFAOffset,
                               StackOffset CFAOffset, unsigned FrameReg) {
  int Sign = 1;
  unsigned MaxEncoding, ShiftSize;
  switch (Opc) {
  case AArch64::ADDXri:
  case AArch64::ADDSXri:
  case AArch64::SUBXri:
  case AArch64::SUBSXri:
    MaxEncoding = 0xfff;
    ShiftSize = 12;
    break;
  case AArch64::ADDVL_XXI:
  case AArch64::ADDPL_XXI:
  case AArch64::ADDSVL_XXI:
  case AArch64::ADDSPL_XXI:
    MaxEncoding = 31;
    ShiftSize = 0;
    if (Offset < 0) {
      MaxEncoding = 32;
      Sign = -1;
      Offset = -Offset;
    }
    break;
  default:
    llvm_unreachable("Unsupported opcode");
  }

  // `Offset` can be in bytes or in "scalable bytes".
  int VScale = 1;
  if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI)
    VScale = 16;
  else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI)
    VScale = 2;

  // FIXME: If the offset won't fit in 24-bits, compute the offset into a
  // scratch register. If DestReg is a virtual register, use it as the
  // scratch register; otherwise, create a new virtual register (to be
  // replaced by the scavenger at the end of PEI).
  // That case can be optimized
  // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
  // register can be loaded with offset%8 and the add/sub can use an extending
  // instruction with LSL#3.
  // Currently the function handles any offsets but generates a poor sequence
  // of code.
  //   assert(Offset < (1 << 24) && "unimplemented reg plus immediate");

  const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
  Register TmpReg = DestReg;
  if (TmpReg == AArch64::XZR)
    TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
        &AArch64::GPR64RegClass);
  do {
    uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
    unsigned LocalShiftSize = 0;
    if (ThisVal > MaxEncoding) {
      ThisVal = ThisVal >> ShiftSize;
      LocalShiftSize = ShiftSize;
    }
    assert((ThisVal >> ShiftSize) <= MaxEncoding &&
           "Encoding cannot handle value that big");

    Offset -= ThisVal << LocalShiftSize;
    if (Offset == 0)
      TmpReg = DestReg;
    auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg)
                   .addReg(SrcReg)
                   .addImm(Sign * (int)ThisVal);
    if (ShiftSize)
      MBI = MBI.addImm(
          AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize));
    MBI = MBI.setMIFlag(Flag);

    auto Change =
        VScale == 1
            ? StackOffset::getFixed(ThisVal << LocalShiftSize)
            : StackOffset::getScalable(VScale * (ThisVal << LocalShiftSize));
    if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri)
      CFAOffset += Change;
    else
      CFAOffset -= Change;
    if (EmitCFAOffset && DestReg == TmpReg) {
      MachineFunction &MF = *MBB.getParent();
      const TargetSubtargetInfo &STI = MF.getSubtarget();
      const TargetRegisterInfo &TRI = *STI.getRegisterInfo();

      unsigned CFIIndex = MF.addFrameInst(
          createDefCFA(TRI, FrameReg, DestReg, CFAOffset, VScale != 1));
      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
          .addCFIIndex(CFIIndex)
          .setMIFlags(Flag);
    }

    if (NeedsWinCFI) {
      assert(Sign == 1 && "SEH directives should always have a positive sign");
      int Imm = (int)(ThisVal << LocalShiftSize);
      if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
          (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
        if (HasWinCFI)
          *HasWinCFI = true;
        if (Imm == 0)
          BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
        else
          BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
              .addImm(Imm)
              .setMIFlag(Flag);
        assert(Offset == 0 && "Expected remaining offset to be zero to "
                              "emit a single SEH directive");
      } else if (DestReg == AArch64::SP) {
        if (HasWinCFI)
          *HasWinCFI = true;
        assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
            .addImm(Imm)
            .setMIFlag(Flag);
      }
    }

    SrcReg = TmpReg;
  } while (Offset);
}
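// Worked example (illustrative, not from the upstream source): asked to add
// an immediate of 4100 with ADDXri, the loop above cannot encode it in one
// instruction (the field holds 0..4095, optionally shifted by 12), so it
// emits
//   add xD, xN, #1, lsl #12   // adds 4096
//   add xD, xD, #4            // adds the remaining 4
// updating the CFA and emitting SEH directives for each chunk as required.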
To avoid having to consider multiple vscales,5455// we can use `addsvl` to allocate any scalable stack-slots, which under5456// most circumstances will be only locals, not callee-save slots.5457const Function &F = MBB.getParent()->getFunction();5458bool UseSVL = F.hasFnAttribute("aarch64_pstate_sm_body");54595460int64_t Bytes, NumPredicateVectors, NumDataVectors;5461AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(5462Offset, Bytes, NumPredicateVectors, NumDataVectors);54635464// First emit non-scalable frame offsets, or a simple 'mov'.5465if (Bytes || (!Offset && SrcReg != DestReg)) {5466assert((DestReg != AArch64::SP || Bytes % 8 == 0) &&5467"SP increment/decrement not 8-byte aligned");5468unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;5469if (Bytes < 0) {5470Bytes = -Bytes;5471Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;5472}5473emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,5474NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,5475FrameReg);5476CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri)5477? StackOffset::getFixed(-Bytes)5478: StackOffset::getFixed(Bytes);5479SrcReg = DestReg;5480FrameReg = DestReg;5481}54825483assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) &&5484"SetNZCV not supported with SVE vectors");5485assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) &&5486"WinCFI not supported with SVE vectors");54875488if (NumDataVectors) {5489emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,5490UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI,5491TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset,5492CFAOffset, FrameReg);5493CFAOffset += StackOffset::getScalable(-NumDataVectors * 16);5494SrcReg = DestReg;5495}54965497if (NumPredicateVectors) {5498assert(DestReg != AArch64::SP && "Unaligned access to SP");5499emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,5500UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI,5501TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset,5502CFAOffset, FrameReg);5503}5504}55055506MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(5507MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,5508MachineBasicBlock::iterator InsertPt, int FrameIndex,5509LiveIntervals *LIS, VirtRegMap *VRM) const {5510// This is a bit of a hack. Consider this instruction:5511//5512// %0 = COPY %sp; GPR64all:%05513//5514// We explicitly chose GPR64all for the virtual register so such a copy might5515// be eliminated by RegisterCoalescer. However, that may not be possible, and5516// %0 may even spill. We can't spill %sp, and since it is in the GPR64all5517// register class, TargetInstrInfo::foldMemoryOperand() is going to try.5518//5519// To prevent that, we are going to constrain the %0 register class here.5520if (MI.isFullCopy()) {5521Register DstReg = MI.getOperand(0).getReg();5522Register SrcReg = MI.getOperand(1).getReg();5523if (SrcReg == AArch64::SP && DstReg.isVirtual()) {5524MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);5525return nullptr;5526}5527if (DstReg == AArch64::SP && SrcReg.isVirtual()) {5528MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);5529return nullptr;5530}5531// Nothing can be folded with a copy from/to NZCV.5532if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV)5533return nullptr;5534}55355536// Handle the case where a copy is being spilled or filled but the source5537// and destination register classes don't match. 
For example:5538//5539// %0 = COPY %xzr; GPR64common:%05540//5541// In this case we can still safely fold away the COPY and generate the5542// following spill code:5543//5544// STRXui %xzr, %stack.05545//5546// This also eliminates spilled cross register class COPYs (e.g. between x and5547// d regs) of the same size. For example:5548//5549// %0 = COPY %1; GPR64:%0, FPR64:%15550//5551// will be filled as5552//5553// LDRDui %0, fi<#0>5554//5555// instead of5556//5557// LDRXui %Temp, fi<#0>5558// %0 = FMOV %Temp5559//5560if (MI.isCopy() && Ops.size() == 1 &&5561// Make sure we're only folding the explicit COPY defs/uses.5562(Ops[0] == 0 || Ops[0] == 1)) {5563bool IsSpill = Ops[0] == 0;5564bool IsFill = !IsSpill;5565const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();5566const MachineRegisterInfo &MRI = MF.getRegInfo();5567MachineBasicBlock &MBB = *MI.getParent();5568const MachineOperand &DstMO = MI.getOperand(0);5569const MachineOperand &SrcMO = MI.getOperand(1);5570Register DstReg = DstMO.getReg();5571Register SrcReg = SrcMO.getReg();5572// This is slightly expensive to compute for physical regs since5573// getMinimalPhysRegClass is slow.5574auto getRegClass = [&](unsigned Reg) {5575return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)5576: TRI.getMinimalPhysRegClass(Reg);5577};55785579if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {5580assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==5581TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&5582"Mismatched register size in non subreg COPY");5583if (IsSpill)5584storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,5585getRegClass(SrcReg), &TRI, Register());5586else5587loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,5588getRegClass(DstReg), &TRI, Register());5589return &*--InsertPt;5590}55915592// Handle cases like spilling def of:5593//5594// %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%05595//5596// where the physical register source can be widened and stored to the full5597// virtual reg destination stack slot, in this case producing:5598//5599// STRXui %xzr, %stack.05600//5601if (IsSpill && DstMO.isUndef() && SrcReg == AArch64::WZR &&5602TRI.getRegSizeInBits(*getRegClass(DstReg)) == 64) {5603assert(SrcMO.getSubReg() == 0 &&5604"Unexpected subreg on physical register");5605storeRegToStackSlot(MBB, InsertPt, AArch64::XZR, SrcMO.isKill(),5606FrameIndex, &AArch64::GPR64RegClass, &TRI,5607Register());5608return &*--InsertPt;5609}56105611// Handle cases like filling use of:5612//5613// %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%15614//5615// where we can load the full virtual reg source stack slot, into the subreg5616// destination, in this case producing:5617//5618// LDRWui %0:sub_32<def,read-undef>, %stack.05619//5620if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {5621const TargetRegisterClass *FillRC;5622switch (DstMO.getSubReg()) {5623default:5624FillRC = nullptr;5625break;5626case AArch64::sub_32:5627FillRC = &AArch64::GPR32RegClass;5628break;5629case AArch64::ssub:5630FillRC = &AArch64::FPR32RegClass;5631break;5632case AArch64::dsub:5633FillRC = &AArch64::FPR64RegClass;5634break;5635}56365637if (FillRC) {5638assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==5639TRI.getRegSizeInBits(*FillRC) &&5640"Mismatched regclass size on folded subreg COPY");5641loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI,5642Register());5643MachineInstr &LoadMI = *--InsertPt;5644MachineOperand &LoadDst = LoadMI.getOperand(0);5645assert(LoadDst.getSubReg() == 0 && 
"unexpected subreg on fill load");5646LoadDst.setSubReg(DstMO.getSubReg());5647LoadDst.setIsUndef();5648return &LoadMI;5649}5650}5651}56525653// Cannot fold.5654return nullptr;5655}56565657int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,5658StackOffset &SOffset,5659bool *OutUseUnscaledOp,5660unsigned *OutUnscaledOp,5661int64_t *EmittableOffset) {5662// Set output values in case of early exit.5663if (EmittableOffset)5664*EmittableOffset = 0;5665if (OutUseUnscaledOp)5666*OutUseUnscaledOp = false;5667if (OutUnscaledOp)5668*OutUnscaledOp = 0;56695670// Exit early for structured vector spills/fills as they can't take an5671// immediate offset.5672switch (MI.getOpcode()) {5673default:5674break;5675case AArch64::LD1Rv1d:5676case AArch64::LD1Rv2s:5677case AArch64::LD1Rv2d:5678case AArch64::LD1Rv4h:5679case AArch64::LD1Rv4s:5680case AArch64::LD1Rv8b:5681case AArch64::LD1Rv8h:5682case AArch64::LD1Rv16b:5683case AArch64::LD1Twov2d:5684case AArch64::LD1Threev2d:5685case AArch64::LD1Fourv2d:5686case AArch64::LD1Twov1d:5687case AArch64::LD1Threev1d:5688case AArch64::LD1Fourv1d:5689case AArch64::ST1Twov2d:5690case AArch64::ST1Threev2d:5691case AArch64::ST1Fourv2d:5692case AArch64::ST1Twov1d:5693case AArch64::ST1Threev1d:5694case AArch64::ST1Fourv1d:5695case AArch64::ST1i8:5696case AArch64::ST1i16:5697case AArch64::ST1i32:5698case AArch64::ST1i64:5699case AArch64::IRG:5700case AArch64::IRGstack:5701case AArch64::STGloop:5702case AArch64::STZGloop:5703return AArch64FrameOffsetCannotUpdate;5704}57055706// Get the min/max offset and the scale.5707TypeSize ScaleValue(0U, false), Width(0U, false);5708int64_t MinOff, MaxOff;5709if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff,5710MaxOff))5711llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");57125713// Construct the complete offset.5714bool IsMulVL = ScaleValue.isScalable();5715unsigned Scale = ScaleValue.getKnownMinValue();5716int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed();57175718const MachineOperand &ImmOpnd =5719MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));5720Offset += ImmOpnd.getImm() * Scale;57215722// If the offset doesn't match the scale, we rewrite the instruction to5723// use the unscaled instruction instead. Likewise, if we have a negative5724// offset and there is an unscaled op to use.5725std::optional<unsigned> UnscaledOp =5726AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode());5727bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);5728if (useUnscaledOp &&5729!AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff,5730MaxOff))5731llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");57325733Scale = ScaleValue.getKnownMinValue();5734assert(IsMulVL == ScaleValue.isScalable() &&5735"Unscaled opcode has different value for scalable");57365737int64_t Remainder = Offset % Scale;5738assert(!(Remainder && useUnscaledOp) &&5739"Cannot have remainder when using unscaled op");57405741assert(MinOff < MaxOff && "Unexpected Min/Max offsets");5742int64_t NewOffset = Offset / Scale;5743if (MinOff <= NewOffset && NewOffset <= MaxOff)5744Offset = Remainder;5745else {5746NewOffset = NewOffset < 0 ? 
MinOff : MaxOff;5747Offset = Offset - (NewOffset * Scale);5748}57495750if (EmittableOffset)5751*EmittableOffset = NewOffset;5752if (OutUseUnscaledOp)5753*OutUseUnscaledOp = useUnscaledOp;5754if (OutUnscaledOp && UnscaledOp)5755*OutUnscaledOp = *UnscaledOp;57565757if (IsMulVL)5758SOffset = StackOffset::get(SOffset.getFixed(), Offset);5759else5760SOffset = StackOffset::get(Offset, SOffset.getScalable());5761return AArch64FrameOffsetCanUpdate |5762(SOffset ? 0 : AArch64FrameOffsetIsLegal);5763}57645765bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,5766unsigned FrameReg, StackOffset &Offset,5767const AArch64InstrInfo *TII) {5768unsigned Opcode = MI.getOpcode();5769unsigned ImmIdx = FrameRegIdx + 1;57705771if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {5772Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm());5773emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),5774MI.getOperand(0).getReg(), FrameReg, Offset, TII,5775MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));5776MI.eraseFromParent();5777Offset = StackOffset();5778return true;5779}57805781int64_t NewOffset;5782unsigned UnscaledOp;5783bool UseUnscaledOp;5784int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,5785&UnscaledOp, &NewOffset);5786if (Status & AArch64FrameOffsetCanUpdate) {5787if (Status & AArch64FrameOffsetIsLegal)5788// Replace the FrameIndex with FrameReg.5789MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);5790if (UseUnscaledOp)5791MI.setDesc(TII->get(UnscaledOp));57925793MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);5794return !Offset;5795}57965797return false;5798}57995800void AArch64InstrInfo::insertNoop(MachineBasicBlock &MBB,5801MachineBasicBlock::iterator MI) const {5802DebugLoc DL;5803BuildMI(MBB, MI, DL, get(AArch64::HINT)).addImm(0);5804}58055806MCInst AArch64InstrInfo::getNop() const {5807return MCInstBuilder(AArch64::HINT).addImm(0);5808}58095810// AArch64 supports MachineCombiner.5811bool AArch64InstrInfo::useMachineCombiner() const { return true; }58125813// True when Opc sets flag5814static bool isCombineInstrSettingFlag(unsigned Opc) {5815switch (Opc) {5816case AArch64::ADDSWrr:5817case AArch64::ADDSWri:5818case AArch64::ADDSXrr:5819case AArch64::ADDSXri:5820case AArch64::SUBSWrr:5821case AArch64::SUBSXrr:5822// Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.5823case AArch64::SUBSWri:5824case AArch64::SUBSXri:5825return true;5826default:5827break;5828}5829return false;5830}58315832// 32b Opcodes that can be combined with a MUL5833static bool isCombineInstrCandidate32(unsigned Opc) {5834switch (Opc) {5835case AArch64::ADDWrr:5836case AArch64::ADDWri:5837case AArch64::SUBWrr:5838case AArch64::ADDSWrr:5839case AArch64::ADDSWri:5840case AArch64::SUBSWrr:5841// Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.5842case AArch64::SUBWri:5843case AArch64::SUBSWri:5844return true;5845default:5846break;5847}5848return false;5849}58505851// 64b Opcodes that can be combined with a MUL5852static bool isCombineInstrCandidate64(unsigned Opc) {5853switch (Opc) {5854case AArch64::ADDXrr:5855case AArch64::ADDXri:5856case AArch64::SUBXrr:5857case AArch64::ADDSXrr:5858case AArch64::ADDSXri:5859case AArch64::SUBSXrr:5860// Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.5861case AArch64::SUBXri:5862case AArch64::SUBSXri:5863case AArch64::ADDv8i8:5864case AArch64::ADDv16i8:5865case AArch64::ADDv4i16:5866case AArch64::ADDv8i16:5867case AArch64::ADDv2i32:5868case AArch64::ADDv4i32:5869case 
AArch64::SUBv8i8:5870case AArch64::SUBv16i8:5871case AArch64::SUBv4i16:5872case AArch64::SUBv8i16:5873case AArch64::SUBv2i32:5874case AArch64::SUBv4i32:5875return true;5876default:5877break;5878}5879return false;5880}58815882// FP Opcodes that can be combined with an FMUL.5883static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {5884switch (Inst.getOpcode()) {5885default:5886break;5887case AArch64::FADDHrr:5888case AArch64::FADDSrr:5889case AArch64::FADDDrr:5890case AArch64::FADDv4f16:5891case AArch64::FADDv8f16:5892case AArch64::FADDv2f32:5893case AArch64::FADDv2f64:5894case AArch64::FADDv4f32:5895case AArch64::FSUBHrr:5896case AArch64::FSUBSrr:5897case AArch64::FSUBDrr:5898case AArch64::FSUBv4f16:5899case AArch64::FSUBv8f16:5900case AArch64::FSUBv2f32:5901case AArch64::FSUBv2f64:5902case AArch64::FSUBv4f32:5903TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;5904// We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by5905// the target options or if FADD/FSUB has the contract fast-math flag.5906return Options.UnsafeFPMath ||5907Options.AllowFPOpFusion == FPOpFusion::Fast ||5908Inst.getFlag(MachineInstr::FmContract);5909return true;5910}5911return false;5912}59135914// Opcodes that can be combined with a MUL5915static bool isCombineInstrCandidate(unsigned Opc) {5916return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));5917}59185919//5920// Utility routine that checks if \param MO is defined by an5921// \param CombineOpc instruction in the basic block \param MBB5922static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,5923unsigned CombineOpc, unsigned ZeroReg = 0,5924bool CheckZeroReg = false) {5925MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();5926MachineInstr *MI = nullptr;59275928if (MO.isReg() && MO.getReg().isVirtual())5929MI = MRI.getUniqueVRegDef(MO.getReg());5930// And it needs to be in the trace (otherwise, it won't have a depth).5931if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)5932return false;5933// Must only be used by the user we combine with.5934if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))5935return false;59365937if (CheckZeroReg) {5938assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&5939MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&5940MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");5941// The third input reg must be zero.5942if (MI->getOperand(3).getReg() != ZeroReg)5943return false;5944}59455946if (isCombineInstrSettingFlag(CombineOpc) &&5947MI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) == -1)5948return false;59495950return true;5951}59525953//5954// Is \param MO defined by an integer multiply and can be combined?5955static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,5956unsigned MulOpc, unsigned ZeroReg) {5957return canCombine(MBB, MO, MulOpc, ZeroReg, true);5958}59595960//5961// Is \param MO defined by a floating-point multiply and can be combined?5962static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,5963unsigned MulOpc) {5964return canCombine(MBB, MO, MulOpc);5965}59665967// TODO: There are many more machine instruction opcodes to match:5968// 1. Other data types (integer, vectors)5969// 2. Other math / logic operations (xor, or)5970// 3. 
Other forms of the same operation (intrinsics and other variants)5971bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,5972bool Invert) const {5973if (Invert)5974return false;5975switch (Inst.getOpcode()) {5976// == Floating-point types ==5977// -- Floating-point instructions --5978case AArch64::FADDHrr:5979case AArch64::FADDSrr:5980case AArch64::FADDDrr:5981case AArch64::FMULHrr:5982case AArch64::FMULSrr:5983case AArch64::FMULDrr:5984case AArch64::FMULX16:5985case AArch64::FMULX32:5986case AArch64::FMULX64:5987// -- Advanced SIMD instructions --5988case AArch64::FADDv4f16:5989case AArch64::FADDv8f16:5990case AArch64::FADDv2f32:5991case AArch64::FADDv4f32:5992case AArch64::FADDv2f64:5993case AArch64::FMULv4f16:5994case AArch64::FMULv8f16:5995case AArch64::FMULv2f32:5996case AArch64::FMULv4f32:5997case AArch64::FMULv2f64:5998case AArch64::FMULXv4f16:5999case AArch64::FMULXv8f16:6000case AArch64::FMULXv2f32:6001case AArch64::FMULXv4f32:6002case AArch64::FMULXv2f64:6003// -- SVE instructions --6004// Opcodes FMULX_ZZZ_? don't exist because there is no unpredicated FMULX6005// in the SVE instruction set (though there are predicated ones).6006case AArch64::FADD_ZZZ_H:6007case AArch64::FADD_ZZZ_S:6008case AArch64::FADD_ZZZ_D:6009case AArch64::FMUL_ZZZ_H:6010case AArch64::FMUL_ZZZ_S:6011case AArch64::FMUL_ZZZ_D:6012return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath ||6013(Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&6014Inst.getFlag(MachineInstr::MIFlag::FmNsz));60156016// == Integer types ==6017// -- Base instructions --6018// Opcodes MULWrr and MULXrr don't exist because6019// `MUL <Wd>, <Wn>, <Wm>` and `MUL <Xd>, <Xn>, <Xm>` are aliases of6020// `MADD <Wd>, <Wn>, <Wm>, WZR` and `MADD <Xd>, <Xn>, <Xm>, XZR` respectively.6021// The machine-combiner does not support three-source-operand machine6022// instructions. 
So we cannot reassociate MULs.6023case AArch64::ADDWrr:6024case AArch64::ADDXrr:6025case AArch64::ANDWrr:6026case AArch64::ANDXrr:6027case AArch64::ORRWrr:6028case AArch64::ORRXrr:6029case AArch64::EORWrr:6030case AArch64::EORXrr:6031case AArch64::EONWrr:6032case AArch64::EONXrr:6033// -- Advanced SIMD instructions --6034// Opcodes MULv1i64 and MULv2i64 don't exist because there is no 64-bit MUL6035// in the Advanced SIMD instruction set.6036case AArch64::ADDv8i8:6037case AArch64::ADDv16i8:6038case AArch64::ADDv4i16:6039case AArch64::ADDv8i16:6040case AArch64::ADDv2i32:6041case AArch64::ADDv4i32:6042case AArch64::ADDv1i64:6043case AArch64::ADDv2i64:6044case AArch64::MULv8i8:6045case AArch64::MULv16i8:6046case AArch64::MULv4i16:6047case AArch64::MULv8i16:6048case AArch64::MULv2i32:6049case AArch64::MULv4i32:6050case AArch64::ANDv8i8:6051case AArch64::ANDv16i8:6052case AArch64::ORRv8i8:6053case AArch64::ORRv16i8:6054case AArch64::EORv8i8:6055case AArch64::EORv16i8:6056// -- SVE instructions --6057case AArch64::ADD_ZZZ_B:6058case AArch64::ADD_ZZZ_H:6059case AArch64::ADD_ZZZ_S:6060case AArch64::ADD_ZZZ_D:6061case AArch64::MUL_ZZZ_B:6062case AArch64::MUL_ZZZ_H:6063case AArch64::MUL_ZZZ_S:6064case AArch64::MUL_ZZZ_D:6065case AArch64::AND_ZZZ:6066case AArch64::ORR_ZZZ:6067case AArch64::EOR_ZZZ:6068return true;60696070default:6071return false;6072}6073}60746075/// Find instructions that can be turned into madd.6076static bool getMaddPatterns(MachineInstr &Root,6077SmallVectorImpl<unsigned> &Patterns) {6078unsigned Opc = Root.getOpcode();6079MachineBasicBlock &MBB = *Root.getParent();6080bool Found = false;60816082if (!isCombineInstrCandidate(Opc))6083return false;6084if (isCombineInstrSettingFlag(Opc)) {6085int Cmp_NZCV =6086Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);6087// When NZCV is live bail out.6088if (Cmp_NZCV == -1)6089return false;6090unsigned NewOpc = convertToNonFlagSettingOpc(Root);6091// When opcode can't change bail out.6092// CHECKME: do we miss any cases for opcode conversion?6093if (NewOpc == Opc)6094return false;6095Opc = NewOpc;6096}60976098auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,6099unsigned Pattern) {6100if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {6101Patterns.push_back(Pattern);6102Found = true;6103}6104};61056106auto setVFound = [&](int Opcode, int Operand, unsigned Pattern) {6107if (canCombine(MBB, Root.getOperand(Operand), Opcode)) {6108Patterns.push_back(Pattern);6109Found = true;6110}6111};61126113typedef AArch64MachineCombinerPattern MCP;61146115switch (Opc) {6116default:6117break;6118case AArch64::ADDWrr:6119assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&6120"ADDWrr does not have register operands");6121setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);6122setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);6123break;6124case AArch64::ADDXrr:6125setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);6126setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);6127break;6128case AArch64::SUBWrr:6129setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);6130setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);6131break;6132case AArch64::SUBXrr:6133setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);6134setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);6135break;6136case AArch64::ADDWri:6137setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);6138break;6139case 
AArch64::ADDXri:6140setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);6141break;6142case AArch64::SUBWri:6143setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);6144break;6145case AArch64::SUBXri:6146setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);6147break;6148case AArch64::ADDv8i8:6149setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);6150setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);6151break;6152case AArch64::ADDv16i8:6153setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);6154setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);6155break;6156case AArch64::ADDv4i16:6157setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);6158setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);6159setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);6160setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);6161break;6162case AArch64::ADDv8i16:6163setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);6164setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);6165setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);6166setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);6167break;6168case AArch64::ADDv2i32:6169setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);6170setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);6171setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);6172setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);6173break;6174case AArch64::ADDv4i32:6175setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);6176setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);6177setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);6178setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);6179break;6180case AArch64::SUBv8i8:6181setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);6182setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);6183break;6184case AArch64::SUBv16i8:6185setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);6186setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);6187break;6188case AArch64::SUBv4i16:6189setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);6190setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);6191setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);6192setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);6193break;6194case AArch64::SUBv8i16:6195setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);6196setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);6197setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);6198setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);6199break;6200case AArch64::SUBv2i32:6201setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);6202setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);6203setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);6204setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);6205break;6206case AArch64::SUBv4i32:6207setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);6208setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);6209setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);6210setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);6211break;6212}6213return Found;6214}6215/// Floating-Point Support62166217/// Find instructions that can be turned into madd.6218static bool getFMAPatterns(MachineInstr &Root,6219SmallVectorImpl<unsigned> &Patterns) {62206221if 
(!isCombineInstrCandidateFP(Root))6222return false;62236224MachineBasicBlock &MBB = *Root.getParent();6225bool Found = false;62266227auto Match = [&](int Opcode, int Operand, unsigned Pattern) -> bool {6228if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {6229Patterns.push_back(Pattern);6230return true;6231}6232return false;6233};62346235typedef AArch64MachineCombinerPattern MCP;62366237switch (Root.getOpcode()) {6238default:6239assert(false && "Unsupported FP instruction in combiner\n");6240break;6241case AArch64::FADDHrr:6242assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&6243"FADDHrr does not have register operands");62446245Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);6246Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);6247break;6248case AArch64::FADDSrr:6249assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&6250"FADDSrr does not have register operands");62516252Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||6253Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);62546255Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||6256Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);6257break;6258case AArch64::FADDDrr:6259Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||6260Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);62616262Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||6263Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);6264break;6265case AArch64::FADDv4f16:6266Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||6267Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);62686269Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||6270Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);6271break;6272case AArch64::FADDv8f16:6273Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||6274Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);62756276Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||6277Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);6278break;6279case AArch64::FADDv2f32:6280Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||6281Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);62826283Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||6284Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);6285break;6286case AArch64::FADDv2f64:6287Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||6288Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);62896290Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||6291Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);6292break;6293case AArch64::FADDv4f32:6294Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||6295Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);62966297Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||6298Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);6299break;6300case AArch64::FSUBHrr:6301Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);6302Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);6303Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);6304break;6305case AArch64::FSUBSrr:6306Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);63076308Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||6309Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);63106311Found |= Match(AArch64::FNMULSrr, 1, 
MCP::FNMULSUBS_OP1);6312break;6313case AArch64::FSUBDrr:6314Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);63156316Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||6317Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);63186319Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);6320break;6321case AArch64::FSUBv4f16:6322Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||6323Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);63246325Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||6326Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);6327break;6328case AArch64::FSUBv8f16:6329Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||6330Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);63316332Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||6333Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);6334break;6335case AArch64::FSUBv2f32:6336Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||6337Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);63386339Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||6340Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);6341break;6342case AArch64::FSUBv2f64:6343Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||6344Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);63456346Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||6347Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);6348break;6349case AArch64::FSUBv4f32:6350Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||6351Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);63526353Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||6354Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);6355break;6356}6357return Found;6358}63596360static bool getFMULPatterns(MachineInstr &Root,6361SmallVectorImpl<unsigned> &Patterns) {6362MachineBasicBlock &MBB = *Root.getParent();6363bool Found = false;63646365auto Match = [&](unsigned Opcode, int Operand, unsigned Pattern) -> bool {6366MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();6367MachineOperand &MO = Root.getOperand(Operand);6368MachineInstr *MI = nullptr;6369if (MO.isReg() && MO.getReg().isVirtual())6370MI = MRI.getUniqueVRegDef(MO.getReg());6371// Ignore No-op COPYs in FMUL(COPY(DUP(..)))6372if (MI && MI->getOpcode() == TargetOpcode::COPY &&6373MI->getOperand(1).getReg().isVirtual())6374MI = MRI.getUniqueVRegDef(MI->getOperand(1).getReg());6375if (MI && MI->getOpcode() == Opcode) {6376Patterns.push_back(Pattern);6377return true;6378}6379return false;6380};63816382typedef AArch64MachineCombinerPattern MCP;63836384switch (Root.getOpcode()) {6385default:6386return false;6387case AArch64::FMULv2f32:6388Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1);6389Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2);6390break;6391case AArch64::FMULv2f64:6392Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1);6393Found |= Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2);6394break;6395case AArch64::FMULv4f16:6396Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1);6397Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2);6398break;6399case AArch64::FMULv4f32:6400Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1);6401Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2);6402break;6403case 
AArch64::FMULv8f16:6404Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1);6405Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2);6406break;6407}64086409return Found;6410}64116412static bool getFNEGPatterns(MachineInstr &Root,6413SmallVectorImpl<unsigned> &Patterns) {6414unsigned Opc = Root.getOpcode();6415MachineBasicBlock &MBB = *Root.getParent();6416MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();64176418auto Match = [&](unsigned Opcode, unsigned Pattern) -> bool {6419MachineOperand &MO = Root.getOperand(1);6420MachineInstr *MI = MRI.getUniqueVRegDef(MO.getReg());6421if (MI != nullptr && (MI->getOpcode() == Opcode) &&6422MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()) &&6423Root.getFlag(MachineInstr::MIFlag::FmContract) &&6424Root.getFlag(MachineInstr::MIFlag::FmNsz) &&6425MI->getFlag(MachineInstr::MIFlag::FmContract) &&6426MI->getFlag(MachineInstr::MIFlag::FmNsz)) {6427Patterns.push_back(Pattern);6428return true;6429}6430return false;6431};64326433switch (Opc) {6434default:6435break;6436case AArch64::FNEGDr:6437return Match(AArch64::FMADDDrrr, AArch64MachineCombinerPattern::FNMADD);6438case AArch64::FNEGSr:6439return Match(AArch64::FMADDSrrr, AArch64MachineCombinerPattern::FNMADD);6440}64416442return false;6443}64446445/// Return true when a code sequence can improve throughput. It6446/// should be called only for instructions in loops.6447/// \param Pattern - combiner pattern6448bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const {6449switch (Pattern) {6450default:6451break;6452case AArch64MachineCombinerPattern::FMULADDH_OP1:6453case AArch64MachineCombinerPattern::FMULADDH_OP2:6454case AArch64MachineCombinerPattern::FMULSUBH_OP1:6455case AArch64MachineCombinerPattern::FMULSUBH_OP2:6456case AArch64MachineCombinerPattern::FMULADDS_OP1:6457case AArch64MachineCombinerPattern::FMULADDS_OP2:6458case AArch64MachineCombinerPattern::FMULSUBS_OP1:6459case AArch64MachineCombinerPattern::FMULSUBS_OP2:6460case AArch64MachineCombinerPattern::FMULADDD_OP1:6461case AArch64MachineCombinerPattern::FMULADDD_OP2:6462case AArch64MachineCombinerPattern::FMULSUBD_OP1:6463case AArch64MachineCombinerPattern::FMULSUBD_OP2:6464case AArch64MachineCombinerPattern::FNMULSUBH_OP1:6465case AArch64MachineCombinerPattern::FNMULSUBS_OP1:6466case AArch64MachineCombinerPattern::FNMULSUBD_OP1:6467case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP1:6468case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP2:6469case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP1:6470case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP2:6471case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP1:6472case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP2:6473case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP1:6474case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP2:6475case AArch64MachineCombinerPattern::FMLAv4f16_OP2:6476case AArch64MachineCombinerPattern::FMLAv4f16_OP1:6477case AArch64MachineCombinerPattern::FMLAv8f16_OP1:6478case AArch64MachineCombinerPattern::FMLAv8f16_OP2:6479case AArch64MachineCombinerPattern::FMLAv2f32_OP2:6480case AArch64MachineCombinerPattern::FMLAv2f32_OP1:6481case AArch64MachineCombinerPattern::FMLAv2f64_OP1:6482case AArch64MachineCombinerPattern::FMLAv2f64_OP2:6483case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1:6484case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2:6485case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1:6486case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2:6487case 
AArch64MachineCombinerPattern::FMLAv4f32_OP1:6488case AArch64MachineCombinerPattern::FMLAv4f32_OP2:6489case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1:6490case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2:6491case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP1:6492case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP2:6493case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP1:6494case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP2:6495case AArch64MachineCombinerPattern::FMLSv1i32_indexed_OP2:6496case AArch64MachineCombinerPattern::FMLSv1i64_indexed_OP2:6497case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2:6498case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2:6499case AArch64MachineCombinerPattern::FMLSv4f16_OP1:6500case AArch64MachineCombinerPattern::FMLSv4f16_OP2:6501case AArch64MachineCombinerPattern::FMLSv8f16_OP1:6502case AArch64MachineCombinerPattern::FMLSv8f16_OP2:6503case AArch64MachineCombinerPattern::FMLSv2f32_OP2:6504case AArch64MachineCombinerPattern::FMLSv2f64_OP2:6505case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2:6506case AArch64MachineCombinerPattern::FMLSv4f32_OP2:6507case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1:6508case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP2:6509case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1:6510case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP2:6511case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1:6512case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP2:6513case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1:6514case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP2:6515case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1:6516case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP2:6517case AArch64MachineCombinerPattern::MULADDv8i8_OP1:6518case AArch64MachineCombinerPattern::MULADDv8i8_OP2:6519case AArch64MachineCombinerPattern::MULADDv16i8_OP1:6520case AArch64MachineCombinerPattern::MULADDv16i8_OP2:6521case AArch64MachineCombinerPattern::MULADDv4i16_OP1:6522case AArch64MachineCombinerPattern::MULADDv4i16_OP2:6523case AArch64MachineCombinerPattern::MULADDv8i16_OP1:6524case AArch64MachineCombinerPattern::MULADDv8i16_OP2:6525case AArch64MachineCombinerPattern::MULADDv2i32_OP1:6526case AArch64MachineCombinerPattern::MULADDv2i32_OP2:6527case AArch64MachineCombinerPattern::MULADDv4i32_OP1:6528case AArch64MachineCombinerPattern::MULADDv4i32_OP2:6529case AArch64MachineCombinerPattern::MULSUBv8i8_OP1:6530case AArch64MachineCombinerPattern::MULSUBv8i8_OP2:6531case AArch64MachineCombinerPattern::MULSUBv16i8_OP1:6532case AArch64MachineCombinerPattern::MULSUBv16i8_OP2:6533case AArch64MachineCombinerPattern::MULSUBv4i16_OP1:6534case AArch64MachineCombinerPattern::MULSUBv4i16_OP2:6535case AArch64MachineCombinerPattern::MULSUBv8i16_OP1:6536case AArch64MachineCombinerPattern::MULSUBv8i16_OP2:6537case AArch64MachineCombinerPattern::MULSUBv2i32_OP1:6538case AArch64MachineCombinerPattern::MULSUBv2i32_OP2:6539case AArch64MachineCombinerPattern::MULSUBv4i32_OP1:6540case AArch64MachineCombinerPattern::MULSUBv4i32_OP2:6541case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP1:6542case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP2:6543case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP1:6544case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP2:6545case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP1:6546case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP2:6547case 
AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP1:6548case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP2:6549case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP1:6550case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP2:6551case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP1:6552case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP2:6553case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP1:6554case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:6555case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:6556case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:6557return true;6558} // end switch (Pattern)6559return false;6560}65616562/// Find other MI combine patterns.6563static bool getMiscPatterns(MachineInstr &Root,6564SmallVectorImpl<unsigned> &Patterns) {6565// A - (B + C) ==> (A - B) - C or (A - C) - B6566unsigned Opc = Root.getOpcode();6567MachineBasicBlock &MBB = *Root.getParent();65686569switch (Opc) {6570case AArch64::SUBWrr:6571case AArch64::SUBSWrr:6572case AArch64::SUBXrr:6573case AArch64::SUBSXrr:6574// Found candidate root.6575break;6576default:6577return false;6578}65796580if (isCombineInstrSettingFlag(Opc) &&6581Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) ==6582-1)6583return false;65846585if (canCombine(MBB, Root.getOperand(2), AArch64::ADDWrr) ||6586canCombine(MBB, Root.getOperand(2), AArch64::ADDSWrr) ||6587canCombine(MBB, Root.getOperand(2), AArch64::ADDXrr) ||6588canCombine(MBB, Root.getOperand(2), AArch64::ADDSXrr)) {6589Patterns.push_back(AArch64MachineCombinerPattern::SUBADD_OP1);6590Patterns.push_back(AArch64MachineCombinerPattern::SUBADD_OP2);6591return true;6592}65936594return false;6595}65966597CombinerObjective6598AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {6599switch (Pattern) {6600case AArch64MachineCombinerPattern::SUBADD_OP1:6601case AArch64MachineCombinerPattern::SUBADD_OP2:6602return CombinerObjective::MustReduceDepth;6603default:6604return TargetInstrInfo::getCombinerObjective(Pattern);6605}6606}66076608/// Return true when there is potentially a faster code sequence for an6609/// instruction chain ending in \p Root. All potential patterns are listed in6610/// the \p Pattern vector. 
Pattern should be sorted in priority order since the6611/// pattern evaluator stops checking as soon as it finds a faster sequence.66126613bool AArch64InstrInfo::getMachineCombinerPatterns(6614MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns,6615bool DoRegPressureReduce) const {6616// Integer patterns6617if (getMaddPatterns(Root, Patterns))6618return true;6619// Floating point patterns6620if (getFMULPatterns(Root, Patterns))6621return true;6622if (getFMAPatterns(Root, Patterns))6623return true;6624if (getFNEGPatterns(Root, Patterns))6625return true;66266627// Other patterns6628if (getMiscPatterns(Root, Patterns))6629return true;66306631return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,6632DoRegPressureReduce);6633}66346635enum class FMAInstKind { Default, Indexed, Accumulator };6636/// genFusedMultiply - Generate fused multiply instructions.6637/// This function supports both integer and floating point instructions.6638/// A typical example:6639/// F|MUL I=A,B,06640/// F|ADD R,I,C6641/// ==> F|MADD R,A,B,C6642/// \param MF Containing MachineFunction6643/// \param MRI Register information6644/// \param TII Target information6645/// \param Root is the F|ADD instruction6646/// \param [out] InsInstrs is a vector of machine instructions and will6647/// contain the generated madd instruction6648/// \param IdxMulOpd is index of operand in Root that is the result of6649/// the F|MUL. In the example above IdxMulOpd is 1.6650/// \param MaddOpc the opcode of the f|madd instruction6651/// \param RC Register class of operands6652/// \param kind of fma instruction (addressing mode) to be generated6653/// \param ReplacedAddend is the result register from the instruction6654/// replacing the non-combined operand, if any.6655static MachineInstr *6656genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,6657const TargetInstrInfo *TII, MachineInstr &Root,6658SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,6659unsigned MaddOpc, const TargetRegisterClass *RC,6660FMAInstKind kind = FMAInstKind::Default,6661const Register *ReplacedAddend = nullptr) {6662assert(IdxMulOpd == 1 || IdxMulOpd == 2);66636664unsigned IdxOtherOpd = IdxMulOpd == 1 ? 
2 : 1;6665MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());6666Register ResultReg = Root.getOperand(0).getReg();6667Register SrcReg0 = MUL->getOperand(1).getReg();6668bool Src0IsKill = MUL->getOperand(1).isKill();6669Register SrcReg1 = MUL->getOperand(2).getReg();6670bool Src1IsKill = MUL->getOperand(2).isKill();66716672Register SrcReg2;6673bool Src2IsKill;6674if (ReplacedAddend) {6675// If we just generated a new addend, we must be it's only use.6676SrcReg2 = *ReplacedAddend;6677Src2IsKill = true;6678} else {6679SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();6680Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();6681}66826683if (ResultReg.isVirtual())6684MRI.constrainRegClass(ResultReg, RC);6685if (SrcReg0.isVirtual())6686MRI.constrainRegClass(SrcReg0, RC);6687if (SrcReg1.isVirtual())6688MRI.constrainRegClass(SrcReg1, RC);6689if (SrcReg2.isVirtual())6690MRI.constrainRegClass(SrcReg2, RC);66916692MachineInstrBuilder MIB;6693if (kind == FMAInstKind::Default)6694MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)6695.addReg(SrcReg0, getKillRegState(Src0IsKill))6696.addReg(SrcReg1, getKillRegState(Src1IsKill))6697.addReg(SrcReg2, getKillRegState(Src2IsKill));6698else if (kind == FMAInstKind::Indexed)6699MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)6700.addReg(SrcReg2, getKillRegState(Src2IsKill))6701.addReg(SrcReg0, getKillRegState(Src0IsKill))6702.addReg(SrcReg1, getKillRegState(Src1IsKill))6703.addImm(MUL->getOperand(3).getImm());6704else if (kind == FMAInstKind::Accumulator)6705MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)6706.addReg(SrcReg2, getKillRegState(Src2IsKill))6707.addReg(SrcReg0, getKillRegState(Src0IsKill))6708.addReg(SrcReg1, getKillRegState(Src1IsKill));6709else6710assert(false && "Invalid FMA instruction kind \n");6711// Insert the MADD (MADD, FMA, FMS, FMLA, FMSL)6712InsInstrs.push_back(MIB);6713return MUL;6714}67156716static MachineInstr *6717genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI,6718const TargetInstrInfo *TII, MachineInstr &Root,6719SmallVectorImpl<MachineInstr *> &InsInstrs) {6720MachineInstr *MAD = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());67216722unsigned Opc = 0;6723const TargetRegisterClass *RC = MRI.getRegClass(MAD->getOperand(0).getReg());6724if (AArch64::FPR32RegClass.hasSubClassEq(RC))6725Opc = AArch64::FNMADDSrrr;6726else if (AArch64::FPR64RegClass.hasSubClassEq(RC))6727Opc = AArch64::FNMADDDrrr;6728else6729return nullptr;67306731Register ResultReg = Root.getOperand(0).getReg();6732Register SrcReg0 = MAD->getOperand(1).getReg();6733Register SrcReg1 = MAD->getOperand(2).getReg();6734Register SrcReg2 = MAD->getOperand(3).getReg();6735bool Src0IsKill = MAD->getOperand(1).isKill();6736bool Src1IsKill = MAD->getOperand(2).isKill();6737bool Src2IsKill = MAD->getOperand(3).isKill();6738if (ResultReg.isVirtual())6739MRI.constrainRegClass(ResultReg, RC);6740if (SrcReg0.isVirtual())6741MRI.constrainRegClass(SrcReg0, RC);6742if (SrcReg1.isVirtual())6743MRI.constrainRegClass(SrcReg1, RC);6744if (SrcReg2.isVirtual())6745MRI.constrainRegClass(SrcReg2, RC);67466747MachineInstrBuilder MIB =6748BuildMI(MF, MIMetadata(Root), TII->get(Opc), ResultReg)6749.addReg(SrcReg0, getKillRegState(Src0IsKill))6750.addReg(SrcReg1, getKillRegState(Src1IsKill))6751.addReg(SrcReg2, getKillRegState(Src2IsKill));6752InsInstrs.push_back(MIB);67536754return MAD;6755}67566757/// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)6758static MachineInstr 
*6759genIndexedMultiply(MachineInstr &Root,6760SmallVectorImpl<MachineInstr *> &InsInstrs,6761unsigned IdxDupOp, unsigned MulOpc,6762const TargetRegisterClass *RC, MachineRegisterInfo &MRI) {6763assert(((IdxDupOp == 1) || (IdxDupOp == 2)) &&6764"Invalid index of FMUL operand");67656766MachineFunction &MF = *Root.getMF();6767const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();67686769MachineInstr *Dup =6770MF.getRegInfo().getUniqueVRegDef(Root.getOperand(IdxDupOp).getReg());67716772if (Dup->getOpcode() == TargetOpcode::COPY)6773Dup = MRI.getUniqueVRegDef(Dup->getOperand(1).getReg());67746775Register DupSrcReg = Dup->getOperand(1).getReg();6776MRI.clearKillFlags(DupSrcReg);6777MRI.constrainRegClass(DupSrcReg, RC);67786779unsigned DupSrcLane = Dup->getOperand(2).getImm();67806781unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1;6782MachineOperand &MulOp = Root.getOperand(IdxMulOp);67836784Register ResultReg = Root.getOperand(0).getReg();67856786MachineInstrBuilder MIB;6787MIB = BuildMI(MF, MIMetadata(Root), TII->get(MulOpc), ResultReg)6788.add(MulOp)6789.addReg(DupSrcReg)6790.addImm(DupSrcLane);67916792InsInstrs.push_back(MIB);6793return &Root;6794}67956796/// genFusedMultiplyAcc - Helper to generate fused multiply accumulate6797/// instructions.6798///6799/// \see genFusedMultiply6800static MachineInstr *genFusedMultiplyAcc(6801MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,6802MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,6803unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {6804return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,6805FMAInstKind::Accumulator);6806}68076808/// genNeg - Helper to generate an intermediate negation of the second operand6809/// of Root6810static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI,6811const TargetInstrInfo *TII, MachineInstr &Root,6812SmallVectorImpl<MachineInstr *> &InsInstrs,6813DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,6814unsigned MnegOpc, const TargetRegisterClass *RC) {6815Register NewVR = MRI.createVirtualRegister(RC);6816MachineInstrBuilder MIB =6817BuildMI(MF, MIMetadata(Root), TII->get(MnegOpc), NewVR)6818.add(Root.getOperand(2));6819InsInstrs.push_back(MIB);68206821assert(InstrIdxForVirtReg.empty());6822InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));68236824return NewVR;6825}68266827/// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate6828/// instructions with an additional negation of the accumulator6829static MachineInstr *genFusedMultiplyAccNeg(6830MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,6831MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,6832DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,6833unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {6834assert(IdxMulOpd == 1);68356836Register NewVR =6837genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);6838return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,6839FMAInstKind::Accumulator, &NewVR);6840}68416842/// genFusedMultiplyIdx - Helper to generate fused multiply accumulate6843/// instructions.6844///6845/// \see genFusedMultiply6846static MachineInstr *genFusedMultiplyIdx(6847MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,6848MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,6849unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {6850return genFusedMultiply(MF, 
MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,6851FMAInstKind::Indexed);6852}68536854/// genFusedMultiplyIdxNeg - Helper to generate fused multiply accumulate6855/// instructions with an additional negation of the accumulator6856static MachineInstr *genFusedMultiplyIdxNeg(6857MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,6858MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,6859DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,6860unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {6861assert(IdxMulOpd == 1);68626863Register NewVR =6864genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);68656866return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,6867FMAInstKind::Indexed, &NewVR);6868}68696870/// genMaddR - Generate madd instruction and combine mul and add using6871/// an extra virtual register6872/// Example - an ADD intermediate needs to be stored in a register:6873/// MUL I=A,B,06874/// ADD R,I,Imm6875/// ==> ORR V, ZR, Imm6876/// ==> MADD R,A,B,V6877/// \param MF Containing MachineFunction6878/// \param MRI Register information6879/// \param TII Target information6880/// \param Root is the ADD instruction6881/// \param [out] InsInstrs is a vector of machine instructions and will6882/// contain the generated madd instruction6883/// \param IdxMulOpd is index of operand in Root that is the result of6884/// the MUL. In the example above IdxMulOpd is 1.6885/// \param MaddOpc the opcode of the madd instruction6886/// \param VR is a virtual register that holds the value of an ADD operand6887/// (V in the example above).6888/// \param RC Register class of operands6889static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,6890const TargetInstrInfo *TII, MachineInstr &Root,6891SmallVectorImpl<MachineInstr *> &InsInstrs,6892unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,6893const TargetRegisterClass *RC) {6894assert(IdxMulOpd == 1 || IdxMulOpd == 2);68956896MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());6897Register ResultReg = Root.getOperand(0).getReg();6898Register SrcReg0 = MUL->getOperand(1).getReg();6899bool Src0IsKill = MUL->getOperand(1).isKill();6900Register SrcReg1 = MUL->getOperand(2).getReg();6901bool Src1IsKill = MUL->getOperand(2).isKill();69026903if (ResultReg.isVirtual())6904MRI.constrainRegClass(ResultReg, RC);6905if (SrcReg0.isVirtual())6906MRI.constrainRegClass(SrcReg0, RC);6907if (SrcReg1.isVirtual())6908MRI.constrainRegClass(SrcReg1, RC);6909if (Register::isVirtualRegister(VR))6910MRI.constrainRegClass(VR, RC);69116912MachineInstrBuilder MIB =6913BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)6914.addReg(SrcReg0, getKillRegState(Src0IsKill))6915.addReg(SrcReg1, getKillRegState(Src1IsKill))6916.addReg(VR);6917// Insert the MADD6918InsInstrs.push_back(MIB);6919return MUL;6920}69216922/// Do the following transformation6923/// A - (B + C) ==> (A - B) - C6924/// A - (B + C) ==> (A - C) - B6925static void6926genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI,6927const TargetInstrInfo *TII, MachineInstr &Root,6928SmallVectorImpl<MachineInstr *> &InsInstrs,6929SmallVectorImpl<MachineInstr *> &DelInstrs,6930unsigned IdxOpd1,6931DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) {6932assert(IdxOpd1 == 1 || IdxOpd1 == 2);6933unsigned IdxOtherOpd = IdxOpd1 == 1 ? 
2 : 1;6934MachineInstr *AddMI = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());69356936Register ResultReg = Root.getOperand(0).getReg();6937Register RegA = Root.getOperand(1).getReg();6938bool RegAIsKill = Root.getOperand(1).isKill();6939Register RegB = AddMI->getOperand(IdxOpd1).getReg();6940bool RegBIsKill = AddMI->getOperand(IdxOpd1).isKill();6941Register RegC = AddMI->getOperand(IdxOtherOpd).getReg();6942bool RegCIsKill = AddMI->getOperand(IdxOtherOpd).isKill();6943Register NewVR = MRI.createVirtualRegister(MRI.getRegClass(RegA));69446945unsigned Opcode = Root.getOpcode();6946if (Opcode == AArch64::SUBSWrr)6947Opcode = AArch64::SUBWrr;6948else if (Opcode == AArch64::SUBSXrr)6949Opcode = AArch64::SUBXrr;6950else6951assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) &&6952"Unexpected instruction opcode.");69536954uint32_t Flags = Root.mergeFlagsWith(*AddMI);6955Flags &= ~MachineInstr::NoSWrap;6956Flags &= ~MachineInstr::NoUWrap;69576958MachineInstrBuilder MIB1 =6959BuildMI(MF, MIMetadata(Root), TII->get(Opcode), NewVR)6960.addReg(RegA, getKillRegState(RegAIsKill))6961.addReg(RegB, getKillRegState(RegBIsKill))6962.setMIFlags(Flags);6963MachineInstrBuilder MIB2 =6964BuildMI(MF, MIMetadata(Root), TII->get(Opcode), ResultReg)6965.addReg(NewVR, getKillRegState(true))6966.addReg(RegC, getKillRegState(RegCIsKill))6967.setMIFlags(Flags);69686969InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));6970InsInstrs.push_back(MIB1);6971InsInstrs.push_back(MIB2);6972DelInstrs.push_back(AddMI);6973DelInstrs.push_back(&Root);6974}69756976/// When getMachineCombinerPatterns() finds potential patterns,6977/// this function generates the instructions that could replace the6978/// original code sequence6979void AArch64InstrInfo::genAlternativeCodeSequence(6980MachineInstr &Root, unsigned Pattern,6981SmallVectorImpl<MachineInstr *> &InsInstrs,6982SmallVectorImpl<MachineInstr *> &DelInstrs,6983DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {6984MachineBasicBlock &MBB = *Root.getParent();6985MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();6986MachineFunction &MF = *MBB.getParent();6987const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();69886989MachineInstr *MUL = nullptr;6990const TargetRegisterClass *RC;6991unsigned Opc;6992switch (Pattern) {6993default:6994// Reassociate instructions.6995TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,6996DelInstrs, InstrIdxForVirtReg);6997return;6998case AArch64MachineCombinerPattern::SUBADD_OP1:6999// A - (B + C)7000// ==> (A - B) - C7001genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 1,7002InstrIdxForVirtReg);7003return;7004case AArch64MachineCombinerPattern::SUBADD_OP2:7005// A - (B + C)7006// ==> (A - C) - B7007genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 2,7008InstrIdxForVirtReg);7009return;7010case AArch64MachineCombinerPattern::MULADDW_OP1:7011case AArch64MachineCombinerPattern::MULADDX_OP1:7012// MUL I=A,B,07013// ADD R,I,C7014// ==> MADD R,A,B,C7015// --- Create(MADD);7016if (Pattern == AArch64MachineCombinerPattern::MULADDW_OP1) {7017Opc = AArch64::MADDWrrr;7018RC = &AArch64::GPR32RegClass;7019} else {7020Opc = AArch64::MADDXrrr;7021RC = &AArch64::GPR64RegClass;7022}7023MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);7024break;7025case AArch64MachineCombinerPattern::MULADDW_OP2:7026case AArch64MachineCombinerPattern::MULADDX_OP2:7027// MUL I=A,B,07028// ADD R,C,I7029// ==> MADD R,A,B,C7030// --- Create(MADD);7031if (Pattern == 
AArch64MachineCombinerPattern::MULADDW_OP2) {7032Opc = AArch64::MADDWrrr;7033RC = &AArch64::GPR32RegClass;7034} else {7035Opc = AArch64::MADDXrrr;7036RC = &AArch64::GPR64RegClass;7037}7038MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);7039break;7040case AArch64MachineCombinerPattern::MULADDWI_OP1:7041case AArch64MachineCombinerPattern::MULADDXI_OP1: {7042// MUL I=A,B,07043// ADD R,I,Imm7044// ==> MOV V, Imm7045// ==> MADD R,A,B,V7046// --- Create(MADD);7047const TargetRegisterClass *OrrRC;7048unsigned BitSize, OrrOpc, ZeroReg;7049if (Pattern == AArch64MachineCombinerPattern::MULADDWI_OP1) {7050OrrOpc = AArch64::ORRWri;7051OrrRC = &AArch64::GPR32spRegClass;7052BitSize = 32;7053ZeroReg = AArch64::WZR;7054Opc = AArch64::MADDWrrr;7055RC = &AArch64::GPR32RegClass;7056} else {7057OrrOpc = AArch64::ORRXri;7058OrrRC = &AArch64::GPR64spRegClass;7059BitSize = 64;7060ZeroReg = AArch64::XZR;7061Opc = AArch64::MADDXrrr;7062RC = &AArch64::GPR64RegClass;7063}7064Register NewVR = MRI.createVirtualRegister(OrrRC);7065uint64_t Imm = Root.getOperand(2).getImm();70667067if (Root.getOperand(3).isImm()) {7068unsigned Val = Root.getOperand(3).getImm();7069Imm = Imm << Val;7070}7071uint64_t UImm = SignExtend64(Imm, BitSize);7072// The immediate can be composed via a single instruction.7073SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;7074AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);7075if (Insn.size() != 1)7076return;7077auto MovI = Insn.begin();7078MachineInstrBuilder MIB1;7079// MOV is an alias for one of three instructions: movz, movn, and orr.7080if (MovI->Opcode == OrrOpc)7081MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR)7082.addReg(ZeroReg)7083.addImm(MovI->Op2);7084else {7085if (BitSize == 32)7086assert((MovI->Opcode == AArch64::MOVNWi ||7087MovI->Opcode == AArch64::MOVZWi) &&7088"Expected opcode");7089else7090assert((MovI->Opcode == AArch64::MOVNXi ||7091MovI->Opcode == AArch64::MOVZXi) &&7092"Expected opcode");7093MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR)7094.addImm(MovI->Op1)7095.addImm(MovI->Op2);7096}7097InsInstrs.push_back(MIB1);7098InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));7099MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);7100break;7101}7102case AArch64MachineCombinerPattern::MULSUBW_OP1:7103case AArch64MachineCombinerPattern::MULSUBX_OP1: {7104// MUL I=A,B,07105// SUB R,I, C7106// ==> SUB V, 0, C7107// ==> MADD R,A,B,V // = -C + A*B7108// --- Create(MADD);7109const TargetRegisterClass *SubRC;7110unsigned SubOpc, ZeroReg;7111if (Pattern == AArch64MachineCombinerPattern::MULSUBW_OP1) {7112SubOpc = AArch64::SUBWrr;7113SubRC = &AArch64::GPR32spRegClass;7114ZeroReg = AArch64::WZR;7115Opc = AArch64::MADDWrrr;7116RC = &AArch64::GPR32RegClass;7117} else {7118SubOpc = AArch64::SUBXrr;7119SubRC = &AArch64::GPR64spRegClass;7120ZeroReg = AArch64::XZR;7121Opc = AArch64::MADDXrrr;7122RC = &AArch64::GPR64RegClass;7123}7124Register NewVR = MRI.createVirtualRegister(SubRC);7125// SUB NewVR, 0, C7126MachineInstrBuilder MIB1 =7127BuildMI(MF, MIMetadata(Root), TII->get(SubOpc), NewVR)7128.addReg(ZeroReg)7129.add(Root.getOperand(2));7130InsInstrs.push_back(MIB1);7131InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));7132MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);7133break;7134}7135case AArch64MachineCombinerPattern::MULSUBW_OP2:7136case AArch64MachineCombinerPattern::MULSUBX_OP2:7137// MUL I=A,B,07138// SUB R,C,I7139// ==> MSUB R,A,B,C (computes C - A*B)7140// --- Create(MSUB);7141if (Pattern == 
AArch64MachineCombinerPattern::MULSUBW_OP2) {7142Opc = AArch64::MSUBWrrr;7143RC = &AArch64::GPR32RegClass;7144} else {7145Opc = AArch64::MSUBXrrr;7146RC = &AArch64::GPR64RegClass;7147}7148MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);7149break;7150case AArch64MachineCombinerPattern::MULSUBWI_OP1:7151case AArch64MachineCombinerPattern::MULSUBXI_OP1: {7152// MUL I=A,B,07153// SUB R,I, Imm7154// ==> MOV V, -Imm7155// ==> MADD R,A,B,V // = -Imm + A*B7156// --- Create(MADD);7157const TargetRegisterClass *OrrRC;7158unsigned BitSize, OrrOpc, ZeroReg;7159if (Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1) {7160OrrOpc = AArch64::ORRWri;7161OrrRC = &AArch64::GPR32spRegClass;7162BitSize = 32;7163ZeroReg = AArch64::WZR;7164Opc = AArch64::MADDWrrr;7165RC = &AArch64::GPR32RegClass;7166} else {7167OrrOpc = AArch64::ORRXri;7168OrrRC = &AArch64::GPR64spRegClass;7169BitSize = 64;7170ZeroReg = AArch64::XZR;7171Opc = AArch64::MADDXrrr;7172RC = &AArch64::GPR64RegClass;7173}7174Register NewVR = MRI.createVirtualRegister(OrrRC);7175uint64_t Imm = Root.getOperand(2).getImm();7176if (Root.getOperand(3).isImm()) {7177unsigned Val = Root.getOperand(3).getImm();7178Imm = Imm << Val;7179}7180uint64_t UImm = SignExtend64(-Imm, BitSize);7181// The immediate can be composed via a single instruction.7182SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;7183AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);7184if (Insn.size() != 1)7185return;7186auto MovI = Insn.begin();7187MachineInstrBuilder MIB1;7188// MOV is an alias for one of three instructions: movz, movn, and orr.7189if (MovI->Opcode == OrrOpc)7190MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR)7191.addReg(ZeroReg)7192.addImm(MovI->Op2);7193else {7194if (BitSize == 32)7195assert((MovI->Opcode == AArch64::MOVNWi ||7196MovI->Opcode == AArch64::MOVZWi) &&7197"Expected opcode");7198else7199assert((MovI->Opcode == AArch64::MOVNXi ||7200MovI->Opcode == AArch64::MOVZXi) &&7201"Expected opcode");7202MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR)7203.addImm(MovI->Op1)7204.addImm(MovI->Op2);7205}7206InsInstrs.push_back(MIB1);7207InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));7208MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);7209break;7210}72117212case AArch64MachineCombinerPattern::MULADDv8i8_OP1:7213Opc = AArch64::MLAv8i8;7214RC = &AArch64::FPR64RegClass;7215MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);7216break;7217case AArch64MachineCombinerPattern::MULADDv8i8_OP2:7218Opc = AArch64::MLAv8i8;7219RC = &AArch64::FPR64RegClass;7220MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);7221break;7222case AArch64MachineCombinerPattern::MULADDv16i8_OP1:7223Opc = AArch64::MLAv16i8;7224RC = &AArch64::FPR128RegClass;7225MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);7226break;7227case AArch64MachineCombinerPattern::MULADDv16i8_OP2:7228Opc = AArch64::MLAv16i8;7229RC = &AArch64::FPR128RegClass;7230MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);7231break;7232case AArch64MachineCombinerPattern::MULADDv4i16_OP1:7233Opc = AArch64::MLAv4i16;7234RC = &AArch64::FPR64RegClass;7235MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);7236break;7237case AArch64MachineCombinerPattern::MULADDv4i16_OP2:7238Opc = AArch64::MLAv4i16;7239RC = &AArch64::FPR64RegClass;7240MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);7241break;7242case AArch64MachineCombinerPattern::MULADDv8i16_OP1:7243Opc = 
AArch64::MLAv8i16;7244RC = &AArch64::FPR128RegClass;7245MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);7246break;7247case AArch64MachineCombinerPattern::MULADDv8i16_OP2:7248Opc = AArch64::MLAv8i16;7249RC = &AArch64::FPR128RegClass;7250MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);7251break;7252case AArch64MachineCombinerPattern::MULADDv2i32_OP1:7253Opc = AArch64::MLAv2i32;7254RC = &AArch64::FPR64RegClass;7255MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);7256break;7257case AArch64MachineCombinerPattern::MULADDv2i32_OP2:7258Opc = AArch64::MLAv2i32;7259RC = &AArch64::FPR64RegClass;7260MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);7261break;7262case AArch64MachineCombinerPattern::MULADDv4i32_OP1:7263Opc = AArch64::MLAv4i32;7264RC = &AArch64::FPR128RegClass;7265MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);7266break;7267case AArch64MachineCombinerPattern::MULADDv4i32_OP2:7268Opc = AArch64::MLAv4i32;7269RC = &AArch64::FPR128RegClass;7270MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);7271break;72727273case AArch64MachineCombinerPattern::MULSUBv8i8_OP1:7274Opc = AArch64::MLAv8i8;7275RC = &AArch64::FPR64RegClass;7276MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,7277InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8,7278RC);7279break;7280case AArch64MachineCombinerPattern::MULSUBv8i8_OP2:7281Opc = AArch64::MLSv8i8;7282RC = &AArch64::FPR64RegClass;7283MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);7284break;7285case AArch64MachineCombinerPattern::MULSUBv16i8_OP1:7286Opc = AArch64::MLAv16i8;7287RC = &AArch64::FPR128RegClass;7288MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,7289InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8,7290RC);7291break;7292case AArch64MachineCombinerPattern::MULSUBv16i8_OP2:7293Opc = AArch64::MLSv16i8;7294RC = &AArch64::FPR128RegClass;7295MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);7296break;7297case AArch64MachineCombinerPattern::MULSUBv4i16_OP1:7298Opc = AArch64::MLAv4i16;7299RC = &AArch64::FPR64RegClass;7300MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,7301InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,7302RC);7303break;7304case AArch64MachineCombinerPattern::MULSUBv4i16_OP2:7305Opc = AArch64::MLSv4i16;7306RC = &AArch64::FPR64RegClass;7307MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);7308break;7309case AArch64MachineCombinerPattern::MULSUBv8i16_OP1:7310Opc = AArch64::MLAv8i16;7311RC = &AArch64::FPR128RegClass;7312MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,7313InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,7314RC);7315break;7316case AArch64MachineCombinerPattern::MULSUBv8i16_OP2:7317Opc = AArch64::MLSv8i16;7318RC = &AArch64::FPR128RegClass;7319MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);7320break;7321case AArch64MachineCombinerPattern::MULSUBv2i32_OP1:7322Opc = AArch64::MLAv2i32;7323RC = &AArch64::FPR64RegClass;7324MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,7325InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,7326RC);7327break;7328case AArch64MachineCombinerPattern::MULSUBv2i32_OP2:7329Opc = AArch64::MLSv2i32;7330RC = &AArch64::FPR64RegClass;7331MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);7332break;7333case AArch64MachineCombinerPattern::MULSUBv4i32_OP1:7334Opc = AArch64::MLAv4i32;7335RC = &AArch64::FPR128RegClass;7336MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, 
InsInstrs,7337InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,7338RC);7339break;7340case AArch64MachineCombinerPattern::MULSUBv4i32_OP2:7341Opc = AArch64::MLSv4i32;7342RC = &AArch64::FPR128RegClass;7343MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);7344break;73457346case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP1:7347Opc = AArch64::MLAv4i16_indexed;7348RC = &AArch64::FPR64RegClass;7349MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);7350break;7351case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP2:7352Opc = AArch64::MLAv4i16_indexed;7353RC = &AArch64::FPR64RegClass;7354MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);7355break;7356case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP1:7357Opc = AArch64::MLAv8i16_indexed;7358RC = &AArch64::FPR128RegClass;7359MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);7360break;7361case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP2:7362Opc = AArch64::MLAv8i16_indexed;7363RC = &AArch64::FPR128RegClass;7364MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);7365break;7366case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP1:7367Opc = AArch64::MLAv2i32_indexed;7368RC = &AArch64::FPR64RegClass;7369MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);7370break;7371case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP2:7372Opc = AArch64::MLAv2i32_indexed;7373RC = &AArch64::FPR64RegClass;7374MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);7375break;7376case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP1:7377Opc = AArch64::MLAv4i32_indexed;7378RC = &AArch64::FPR128RegClass;7379MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);7380break;7381case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP2:7382Opc = AArch64::MLAv4i32_indexed;7383RC = &AArch64::FPR128RegClass;7384MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);7385break;73867387case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP1:7388Opc = AArch64::MLAv4i16_indexed;7389RC = &AArch64::FPR64RegClass;7390MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,7391InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,7392RC);7393break;7394case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP2:7395Opc = AArch64::MLSv4i16_indexed;7396RC = &AArch64::FPR64RegClass;7397MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);7398break;7399case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP1:7400Opc = AArch64::MLAv8i16_indexed;7401RC = &AArch64::FPR128RegClass;7402MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,7403InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,7404RC);7405break;7406case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP2:7407Opc = AArch64::MLSv8i16_indexed;7408RC = &AArch64::FPR128RegClass;7409MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);7410break;7411case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP1:7412Opc = AArch64::MLAv2i32_indexed;7413RC = &AArch64::FPR64RegClass;7414MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,7415InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,7416RC);7417break;7418case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:7419Opc = AArch64::MLSv2i32_indexed;7420RC = &AArch64::FPR64RegClass;7421MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);7422break;7423case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:7424Opc = 
AArch64::MLAv4i32_indexed;7425RC = &AArch64::FPR128RegClass;7426MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,7427InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,7428RC);7429break;7430case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:7431Opc = AArch64::MLSv4i32_indexed;7432RC = &AArch64::FPR128RegClass;7433MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);7434break;74357436// Floating Point Support7437case AArch64MachineCombinerPattern::FMULADDH_OP1:7438Opc = AArch64::FMADDHrrr;7439RC = &AArch64::FPR16RegClass;7440MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);7441break;7442case AArch64MachineCombinerPattern::FMULADDS_OP1:7443Opc = AArch64::FMADDSrrr;7444RC = &AArch64::FPR32RegClass;7445MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);7446break;7447case AArch64MachineCombinerPattern::FMULADDD_OP1:7448Opc = AArch64::FMADDDrrr;7449RC = &AArch64::FPR64RegClass;7450MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);7451break;74527453case AArch64MachineCombinerPattern::FMULADDH_OP2:7454Opc = AArch64::FMADDHrrr;7455RC = &AArch64::FPR16RegClass;7456MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);7457break;7458case AArch64MachineCombinerPattern::FMULADDS_OP2:7459Opc = AArch64::FMADDSrrr;7460RC = &AArch64::FPR32RegClass;7461MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);7462break;7463case AArch64MachineCombinerPattern::FMULADDD_OP2:7464Opc = AArch64::FMADDDrrr;7465RC = &AArch64::FPR64RegClass;7466MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);7467break;74687469case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP1:7470Opc = AArch64::FMLAv1i32_indexed;7471RC = &AArch64::FPR32RegClass;7472MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,7473FMAInstKind::Indexed);7474break;7475case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP2:7476Opc = AArch64::FMLAv1i32_indexed;7477RC = &AArch64::FPR32RegClass;7478MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,7479FMAInstKind::Indexed);7480break;74817482case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP1:7483Opc = AArch64::FMLAv1i64_indexed;7484RC = &AArch64::FPR64RegClass;7485MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,7486FMAInstKind::Indexed);7487break;7488case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP2:7489Opc = AArch64::FMLAv1i64_indexed;7490RC = &AArch64::FPR64RegClass;7491MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,7492FMAInstKind::Indexed);7493break;74947495case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP1:7496RC = &AArch64::FPR64RegClass;7497Opc = AArch64::FMLAv4i16_indexed;7498MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,7499FMAInstKind::Indexed);7500break;7501case AArch64MachineCombinerPattern::FMLAv4f16_OP1:7502RC = &AArch64::FPR64RegClass;7503Opc = AArch64::FMLAv4f16;7504MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,7505FMAInstKind::Accumulator);7506break;7507case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP2:7508RC = &AArch64::FPR64RegClass;7509Opc = AArch64::FMLAv4i16_indexed;7510MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,7511FMAInstKind::Indexed);7512break;7513case AArch64MachineCombinerPattern::FMLAv4f16_OP2:7514RC = &AArch64::FPR64RegClass;7515Opc = AArch64::FMLAv4f16;7516MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,7517FMAInstKind::Accumulator);7518break;75197520case 
AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1:7521case AArch64MachineCombinerPattern::FMLAv2f32_OP1:7522RC = &AArch64::FPR64RegClass;7523if (Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1) {7524Opc = AArch64::FMLAv2i32_indexed;7525MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,7526FMAInstKind::Indexed);7527} else {7528Opc = AArch64::FMLAv2f32;7529MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,7530FMAInstKind::Accumulator);7531}7532break;7533case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2:7534case AArch64MachineCombinerPattern::FMLAv2f32_OP2:7535RC = &AArch64::FPR64RegClass;7536if (Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2) {7537Opc = AArch64::FMLAv2i32_indexed;7538MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,7539FMAInstKind::Indexed);7540} else {7541Opc = AArch64::FMLAv2f32;7542MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,7543FMAInstKind::Accumulator);7544}7545break;75467547case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP1:7548RC = &AArch64::FPR128RegClass;7549Opc = AArch64::FMLAv8i16_indexed;7550MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,7551FMAInstKind::Indexed);7552break;7553case AArch64MachineCombinerPattern::FMLAv8f16_OP1:7554RC = &AArch64::FPR128RegClass;7555Opc = AArch64::FMLAv8f16;7556MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,7557FMAInstKind::Accumulator);7558break;7559case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP2:7560RC = &AArch64::FPR128RegClass;7561Opc = AArch64::FMLAv8i16_indexed;7562MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,7563FMAInstKind::Indexed);7564break;7565case AArch64MachineCombinerPattern::FMLAv8f16_OP2:7566RC = &AArch64::FPR128RegClass;7567Opc = AArch64::FMLAv8f16;7568MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,7569FMAInstKind::Accumulator);7570break;75717572case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1:7573case AArch64MachineCombinerPattern::FMLAv2f64_OP1:7574RC = &AArch64::FPR128RegClass;7575if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1) {7576Opc = AArch64::FMLAv2i64_indexed;7577MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,7578FMAInstKind::Indexed);7579} else {7580Opc = AArch64::FMLAv2f64;7581MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,7582FMAInstKind::Accumulator);7583}7584break;7585case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2:7586case AArch64MachineCombinerPattern::FMLAv2f64_OP2:7587RC = &AArch64::FPR128RegClass;7588if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2) {7589Opc = AArch64::FMLAv2i64_indexed;7590MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,7591FMAInstKind::Indexed);7592} else {7593Opc = AArch64::FMLAv2f64;7594MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,7595FMAInstKind::Accumulator);7596}7597break;75987599case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1:7600case AArch64MachineCombinerPattern::FMLAv4f32_OP1:7601RC = &AArch64::FPR128RegClass;7602if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1) {7603Opc = AArch64::FMLAv4i32_indexed;7604MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,7605FMAInstKind::Indexed);7606} else {7607Opc = AArch64::FMLAv4f32;7608MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,7609FMAInstKind::Accumulator);7610}7611break;76127613case 
AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2:7614case AArch64MachineCombinerPattern::FMLAv4f32_OP2:7615RC = &AArch64::FPR128RegClass;7616if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2) {7617Opc = AArch64::FMLAv4i32_indexed;7618MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,7619FMAInstKind::Indexed);7620} else {7621Opc = AArch64::FMLAv4f32;7622MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,7623FMAInstKind::Accumulator);7624}7625break;76267627case AArch64MachineCombinerPattern::FMULSUBH_OP1:7628Opc = AArch64::FNMSUBHrrr;7629RC = &AArch64::FPR16RegClass;7630MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);7631break;7632case AArch64MachineCombinerPattern::FMULSUBS_OP1:7633Opc = AArch64::FNMSUBSrrr;7634RC = &AArch64::FPR32RegClass;7635MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);7636break;7637case AArch64MachineCombinerPattern::FMULSUBD_OP1:7638Opc = AArch64::FNMSUBDrrr;7639RC = &AArch64::FPR64RegClass;7640MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);7641break;76427643case AArch64MachineCombinerPattern::FNMULSUBH_OP1:7644Opc = AArch64::FNMADDHrrr;7645RC = &AArch64::FPR16RegClass;7646MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);7647break;7648case AArch64MachineCombinerPattern::FNMULSUBS_OP1:7649Opc = AArch64::FNMADDSrrr;7650RC = &AArch64::FPR32RegClass;7651MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);7652break;7653case AArch64MachineCombinerPattern::FNMULSUBD_OP1:7654Opc = AArch64::FNMADDDrrr;7655RC = &AArch64::FPR64RegClass;7656MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);7657break;76587659case AArch64MachineCombinerPattern::FMULSUBH_OP2:7660Opc = AArch64::FMSUBHrrr;7661RC = &AArch64::FPR16RegClass;7662MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);7663break;7664case AArch64MachineCombinerPattern::FMULSUBS_OP2:7665Opc = AArch64::FMSUBSrrr;7666RC = &AArch64::FPR32RegClass;7667MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);7668break;7669case AArch64MachineCombinerPattern::FMULSUBD_OP2:7670Opc = AArch64::FMSUBDrrr;7671RC = &AArch64::FPR64RegClass;7672MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);7673break;76747675case AArch64MachineCombinerPattern::FMLSv1i32_indexed_OP2:7676Opc = AArch64::FMLSv1i32_indexed;7677RC = &AArch64::FPR32RegClass;7678MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,7679FMAInstKind::Indexed);7680break;76817682case AArch64MachineCombinerPattern::FMLSv1i64_indexed_OP2:7683Opc = AArch64::FMLSv1i64_indexed;7684RC = &AArch64::FPR64RegClass;7685MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,7686FMAInstKind::Indexed);7687break;76887689case AArch64MachineCombinerPattern::FMLSv4f16_OP1:7690case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP1: {7691RC = &AArch64::FPR64RegClass;7692Register NewVR = MRI.createVirtualRegister(RC);7693MachineInstrBuilder MIB1 =7694BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f16), NewVR)7695.add(Root.getOperand(2));7696InsInstrs.push_back(MIB1);7697InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));7698if (Pattern == AArch64MachineCombinerPattern::FMLSv4f16_OP1) {7699Opc = AArch64::FMLAv4f16;7700MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,7701FMAInstKind::Accumulator, &NewVR);7702} else {7703Opc = AArch64::FMLAv4i16_indexed;7704MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,7705FMAInstKind::Indexed, 
&NewVR);7706}7707break;7708}7709case AArch64MachineCombinerPattern::FMLSv4f16_OP2:7710RC = &AArch64::FPR64RegClass;7711Opc = AArch64::FMLSv4f16;7712MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,7713FMAInstKind::Accumulator);7714break;7715case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP2:7716RC = &AArch64::FPR64RegClass;7717Opc = AArch64::FMLSv4i16_indexed;7718MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,7719FMAInstKind::Indexed);7720break;77217722case AArch64MachineCombinerPattern::FMLSv2f32_OP2:7723case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2:7724RC = &AArch64::FPR64RegClass;7725if (Pattern == AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2) {7726Opc = AArch64::FMLSv2i32_indexed;7727MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,7728FMAInstKind::Indexed);7729} else {7730Opc = AArch64::FMLSv2f32;7731MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,7732FMAInstKind::Accumulator);7733}7734break;77357736case AArch64MachineCombinerPattern::FMLSv8f16_OP1:7737case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP1: {7738RC = &AArch64::FPR128RegClass;7739Register NewVR = MRI.createVirtualRegister(RC);7740MachineInstrBuilder MIB1 =7741BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv8f16), NewVR)7742.add(Root.getOperand(2));7743InsInstrs.push_back(MIB1);7744InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));7745if (Pattern == AArch64MachineCombinerPattern::FMLSv8f16_OP1) {7746Opc = AArch64::FMLAv8f16;7747MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,7748FMAInstKind::Accumulator, &NewVR);7749} else {7750Opc = AArch64::FMLAv8i16_indexed;7751MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,7752FMAInstKind::Indexed, &NewVR);7753}7754break;7755}7756case AArch64MachineCombinerPattern::FMLSv8f16_OP2:7757RC = &AArch64::FPR128RegClass;7758Opc = AArch64::FMLSv8f16;7759MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,7760FMAInstKind::Accumulator);7761break;7762case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP2:7763RC = &AArch64::FPR128RegClass;7764Opc = AArch64::FMLSv8i16_indexed;7765MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,7766FMAInstKind::Indexed);7767break;77687769case AArch64MachineCombinerPattern::FMLSv2f64_OP2:7770case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2:7771RC = &AArch64::FPR128RegClass;7772if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2) {7773Opc = AArch64::FMLSv2i64_indexed;7774MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,7775FMAInstKind::Indexed);7776} else {7777Opc = AArch64::FMLSv2f64;7778MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,7779FMAInstKind::Accumulator);7780}7781break;77827783case AArch64MachineCombinerPattern::FMLSv4f32_OP2:7784case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2:7785RC = &AArch64::FPR128RegClass;7786if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2) {7787Opc = AArch64::FMLSv4i32_indexed;7788MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,7789FMAInstKind::Indexed);7790} else {7791Opc = AArch64::FMLSv4f32;7792MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,7793FMAInstKind::Accumulator);7794}7795break;7796case AArch64MachineCombinerPattern::FMLSv2f32_OP1:7797case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1: {7798RC = &AArch64::FPR64RegClass;7799Register NewVR = MRI.createVirtualRegister(RC);7800MachineInstrBuilder MIB1 =7801BuildMI(MF, 
MIMetadata(Root), TII->get(AArch64::FNEGv2f32), NewVR)7802.add(Root.getOperand(2));7803InsInstrs.push_back(MIB1);7804InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));7805if (Pattern == AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1) {7806Opc = AArch64::FMLAv2i32_indexed;7807MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,7808FMAInstKind::Indexed, &NewVR);7809} else {7810Opc = AArch64::FMLAv2f32;7811MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,7812FMAInstKind::Accumulator, &NewVR);7813}7814break;7815}7816case AArch64MachineCombinerPattern::FMLSv4f32_OP1:7817case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1: {7818RC = &AArch64::FPR128RegClass;7819Register NewVR = MRI.createVirtualRegister(RC);7820MachineInstrBuilder MIB1 =7821BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f32), NewVR)7822.add(Root.getOperand(2));7823InsInstrs.push_back(MIB1);7824InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));7825if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1) {7826Opc = AArch64::FMLAv4i32_indexed;7827MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,7828FMAInstKind::Indexed, &NewVR);7829} else {7830Opc = AArch64::FMLAv4f32;7831MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,7832FMAInstKind::Accumulator, &NewVR);7833}7834break;7835}7836case AArch64MachineCombinerPattern::FMLSv2f64_OP1:7837case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1: {7838RC = &AArch64::FPR128RegClass;7839Register NewVR = MRI.createVirtualRegister(RC);7840MachineInstrBuilder MIB1 =7841BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f64), NewVR)7842.add(Root.getOperand(2));7843InsInstrs.push_back(MIB1);7844InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));7845if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1) {7846Opc = AArch64::FMLAv2i64_indexed;7847MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,7848FMAInstKind::Indexed, &NewVR);7849} else {7850Opc = AArch64::FMLAv2f64;7851MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,7852FMAInstKind::Accumulator, &NewVR);7853}7854break;7855}7856case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1:7857case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP2: {7858unsigned IdxDupOp =7859(Pattern == AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1) ? 17860: 2;7861genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i32_indexed,7862&AArch64::FPR128RegClass, MRI);7863break;7864}7865case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1:7866case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP2: {7867unsigned IdxDupOp =7868(Pattern == AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1) ? 17869: 2;7870genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i64_indexed,7871&AArch64::FPR128RegClass, MRI);7872break;7873}7874case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1:7875case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP2: {7876unsigned IdxDupOp =7877(Pattern == AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1) ? 17878: 2;7879genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i16_indexed,7880&AArch64::FPR128_loRegClass, MRI);7881break;7882}7883case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1:7884case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP2: {7885unsigned IdxDupOp =7886(Pattern == AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1) ? 
17887: 2;7888genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i32_indexed,7889&AArch64::FPR128RegClass, MRI);7890break;7891}7892case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1:7893case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP2: {7894unsigned IdxDupOp =7895(Pattern == AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1) ? 17896: 2;7897genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv8i16_indexed,7898&AArch64::FPR128_loRegClass, MRI);7899break;7900}7901case AArch64MachineCombinerPattern::FNMADD: {7902MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);7903break;7904}79057906} // end switch (Pattern)7907// Record MUL and ADD/SUB for deletion7908if (MUL)7909DelInstrs.push_back(MUL);7910DelInstrs.push_back(&Root);79117912// Set the flags on the inserted instructions to be the merged flags of the7913// instructions that we have combined.7914uint32_t Flags = Root.getFlags();7915if (MUL)7916Flags = Root.mergeFlagsWith(*MUL);7917for (auto *MI : InsInstrs)7918MI->setFlags(Flags);7919}79207921/// Replace csincr-branch sequence by simple conditional branch7922///7923/// Examples:7924/// 1. \code7925/// csinc w9, wzr, wzr, <condition code>7926/// tbnz w9, #0, 0x447927/// \endcode7928/// to7929/// \code7930/// b.<inverted condition code>7931/// \endcode7932///7933/// 2. \code7934/// csinc w9, wzr, wzr, <condition code>7935/// tbz w9, #0, 0x447936/// \endcode7937/// to7938/// \code7939/// b.<condition code>7940/// \endcode7941///7942/// Replace compare and branch sequence by TBZ/TBNZ instruction when the7943/// compare's constant operand is power of 2.7944///7945/// Examples:7946/// \code7947/// and w8, w8, #0x4007948/// cbnz w8, L17949/// \endcode7950/// to7951/// \code7952/// tbnz w8, #10, L17953/// \endcode7954///7955/// \param MI Conditional Branch7956/// \return True when the simple conditional branch is generated7957///7958bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {7959bool IsNegativeBranch = false;7960bool IsTestAndBranch = false;7961unsigned TargetBBInMI = 0;7962switch (MI.getOpcode()) {7963default:7964llvm_unreachable("Unknown branch instruction?");7965case AArch64::Bcc:7966return false;7967case AArch64::CBZW:7968case AArch64::CBZX:7969TargetBBInMI = 1;7970break;7971case AArch64::CBNZW:7972case AArch64::CBNZX:7973TargetBBInMI = 1;7974IsNegativeBranch = true;7975break;7976case AArch64::TBZW:7977case AArch64::TBZX:7978TargetBBInMI = 2;7979IsTestAndBranch = true;7980break;7981case AArch64::TBNZW:7982case AArch64::TBNZX:7983TargetBBInMI = 2;7984IsNegativeBranch = true;7985IsTestAndBranch = true;7986break;7987}7988// So we increment a zero register and test for bits other7989// than bit 0? 
Conservatively bail out in case the verifier
// missed this case.
  if (IsTestAndBranch && MI.getOperand(1).getImm())
    return false;

  // Find Definition.
  assert(MI.getParent() && "Incomplete machine instruction\n");
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo *MRI = &MF->getRegInfo();
  Register VReg = MI.getOperand(0).getReg();
  if (!VReg.isVirtual())
    return false;

  MachineInstr *DefMI = MRI->getVRegDef(VReg);

  // Look through COPY instructions to find the definition.
  while (DefMI->isCopy()) {
    Register CopyVReg = DefMI->getOperand(1).getReg();
    if (!MRI->hasOneNonDBGUse(CopyVReg))
      return false;
    if (!MRI->hasOneDef(CopyVReg))
      return false;
    DefMI = MRI->getVRegDef(CopyVReg);
  }

  switch (DefMI->getOpcode()) {
  default:
    return false;
  // Fold AND into a TBZ/TBNZ if the constant operand is a power of 2.
  case AArch64::ANDWri:
  case AArch64::ANDXri: {
    if (IsTestAndBranch)
      return false;
    if (DefMI->getParent() != MBB)
      return false;
    if (!MRI->hasOneNonDBGUse(VReg))
      return false;

    bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
    uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
        DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
    if (!isPowerOf2_64(Mask))
      return false;

    MachineOperand &MO = DefMI->getOperand(1);
    Register NewReg = MO.getReg();
    if (!NewReg.isVirtual())
      return false;

    assert(!MRI->def_empty(NewReg) && "Register must be defined.");

    MachineBasicBlock &RefToMBB = *MBB;
    MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
    DebugLoc DL = MI.getDebugLoc();
    unsigned Imm = Log2_64(Mask);
    unsigned Opc = (Imm < 32)
                       ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
                       : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
    MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
                              .addReg(NewReg)
                              .addImm(Imm)
                              .addMBB(TBB);
    // Register lives on to the TBZ/TBNZ now.
    MO.setIsKill(false);

    // For immediates smaller than 32, we need to use the 32-bit
    // variant (W) in all cases. Indeed, the 64-bit variant does not
    // allow encoding them.
    // Therefore, if the input register is 64-bit, we need to take the
    // 32-bit sub-part.
    if (!Is32Bit && Imm < 32)
      NewMI->getOperand(0).setSubReg(AArch64::sub_32);
    MI.eraseFromParent();
    return true;
  }
  // Look for CSINC
  case AArch64::CSINCWr:
  case AArch64::CSINCXr: {
    if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
          DefMI->getOperand(2).getReg() == AArch64::WZR) &&
        !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
          DefMI->getOperand(2).getReg() == AArch64::XZR))
      return false;

    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
                                         true) != -1)
      return false;

    AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
    // Convert only when the condition code is not modified between
    // the CSINC and the branch.
The CC may be used by other8081// instructions in between.8082if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write))8083return false;8084MachineBasicBlock &RefToMBB = *MBB;8085MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();8086DebugLoc DL = MI.getDebugLoc();8087if (IsNegativeBranch)8088CC = AArch64CC::getInvertedCondCode(CC);8089BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);8090MI.eraseFromParent();8091return true;8092}8093}8094}80958096std::pair<unsigned, unsigned>8097AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {8098const unsigned Mask = AArch64II::MO_FRAGMENT;8099return std::make_pair(TF & Mask, TF & ~Mask);8100}81018102ArrayRef<std::pair<unsigned, const char *>>8103AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {8104using namespace AArch64II;81058106static const std::pair<unsigned, const char *> TargetFlags[] = {8107{MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},8108{MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"},8109{MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"},8110{MO_HI12, "aarch64-hi12"}};8111return ArrayRef(TargetFlags);8112}81138114ArrayRef<std::pair<unsigned, const char *>>8115AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {8116using namespace AArch64II;81178118static const std::pair<unsigned, const char *> TargetFlags[] = {8119{MO_COFFSTUB, "aarch64-coffstub"},8120{MO_GOT, "aarch64-got"},8121{MO_NC, "aarch64-nc"},8122{MO_S, "aarch64-s"},8123{MO_TLS, "aarch64-tls"},8124{MO_DLLIMPORT, "aarch64-dllimport"},8125{MO_PREL, "aarch64-prel"},8126{MO_TAGGED, "aarch64-tagged"},8127{MO_ARM64EC_CALLMANGLE, "aarch64-arm64ec-callmangle"},8128};8129return ArrayRef(TargetFlags);8130}81318132ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>8133AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {8134static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =8135{{MOSuppressPair, "aarch64-suppress-pair"},8136{MOStridedAccess, "aarch64-strided-access"}};8137return ArrayRef(TargetFlags);8138}81398140/// Constants defining how certain sequences should be outlined.8141/// This encompasses how an outlined function should be called, and what kind of8142/// frame should be emitted for that outlined function.8143///8144/// \p MachineOutlinerDefault implies that the function should be called with8145/// a save and restore of LR to the stack.8146///8147/// That is,8148///8149/// I1 Save LR OUTLINED_FUNCTION:8150/// I2 --> BL OUTLINED_FUNCTION I18151/// I3 Restore LR I28152/// I38153/// RET8154///8155/// * Call construction overhead: 3 (save + BL + restore)8156/// * Frame construction overhead: 1 (ret)8157/// * Requires stack fixups? Yes8158///8159/// \p MachineOutlinerTailCall implies that the function is being created from8160/// a sequence of instructions ending in a return.8161///8162/// That is,8163///8164/// I1 OUTLINED_FUNCTION:8165/// I2 --> B OUTLINED_FUNCTION I18166/// RET I28167/// RET8168///8169/// * Call construction overhead: 1 (B)8170/// * Frame construction overhead: 0 (Return included in sequence)8171/// * Requires stack fixups? No8172///8173/// \p MachineOutlinerNoLRSave implies that the function should be called using8174/// a BL instruction, but doesn't require LR to be saved and restored. 
This8175/// happens when LR is known to be dead.8176///8177/// That is,8178///8179/// I1 OUTLINED_FUNCTION:8180/// I2 --> BL OUTLINED_FUNCTION I18181/// I3 I28182/// I38183/// RET8184///8185/// * Call construction overhead: 1 (BL)8186/// * Frame construction overhead: 1 (RET)8187/// * Requires stack fixups? No8188///8189/// \p MachineOutlinerThunk implies that the function is being created from8190/// a sequence of instructions ending in a call. The outlined function is8191/// called with a BL instruction, and the outlined function tail-calls the8192/// original call destination.8193///8194/// That is,8195///8196/// I1 OUTLINED_FUNCTION:8197/// I2 --> BL OUTLINED_FUNCTION I18198/// BL f I28199/// B f8200/// * Call construction overhead: 1 (BL)8201/// * Frame construction overhead: 08202/// * Requires stack fixups? No8203///8204/// \p MachineOutlinerRegSave implies that the function should be called with a8205/// save and restore of LR to an available register. This allows us to avoid8206/// stack fixups. Note that this outlining variant is compatible with the8207/// NoLRSave case.8208///8209/// That is,8210///8211/// I1 Save LR OUTLINED_FUNCTION:8212/// I2 --> BL OUTLINED_FUNCTION I18213/// I3 Restore LR I28214/// I38215/// RET8216///8217/// * Call construction overhead: 3 (save + BL + restore)8218/// * Frame construction overhead: 1 (ret)8219/// * Requires stack fixups? No8220enum MachineOutlinerClass {8221MachineOutlinerDefault, /// Emit a save, restore, call, and return.8222MachineOutlinerTailCall, /// Only emit a branch.8223MachineOutlinerNoLRSave, /// Emit a call and return.8224MachineOutlinerThunk, /// Emit a call and tail-call.8225MachineOutlinerRegSave /// Same as default, but save to a register.8226};82278228enum MachineOutlinerMBBFlags {8229LRUnavailableSomewhere = 0x2,8230HasCalls = 0x4,8231UnsafeRegsDead = 0x88232};82338234Register8235AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const {8236MachineFunction *MF = C.getMF();8237const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();8238const AArch64RegisterInfo *ARI =8239static_cast<const AArch64RegisterInfo *>(&TRI);8240// Check if there is an available register across the sequence that we can8241// use.8242for (unsigned Reg : AArch64::GPR64RegClass) {8243if (!ARI->isReservedReg(*MF, Reg) &&8244Reg != AArch64::LR && // LR is not reserved, but don't use it.8245Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.8246Reg != AArch64::X17 && // Ditto for X17.8247C.isAvailableAcrossAndOutOfSeq(Reg, TRI) &&8248C.isAvailableInsideSeq(Reg, TRI))8249return Reg;8250}8251return Register();8252}82538254static bool8255outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a,8256const outliner::Candidate &b) {8257const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();8258const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();82598260return MFIa->shouldSignReturnAddress(false) == MFIb->shouldSignReturnAddress(false) &&8261MFIa->shouldSignReturnAddress(true) == MFIb->shouldSignReturnAddress(true);8262}82638264static bool8265outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a,8266const outliner::Candidate &b) {8267const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();8268const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();82698270return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey();8271}82728273static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,8274const outliner::Candidate &b) {8275const AArch64Subtarget 
&SubtargetA =
      a.getMF()->getSubtarget<AArch64Subtarget>();
  const AArch64Subtarget &SubtargetB =
      b.getMF()->getSubtarget<AArch64Subtarget>();
  return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
}

std::optional<outliner::OutlinedFunction>
AArch64InstrInfo::getOutliningCandidateInfo(
    std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
  unsigned SequenceSize = 0;
  for (auto &MI : RepeatedSequenceLocs[0])
    SequenceSize += getInstSizeInBytes(MI);

  unsigned NumBytesToCreateFrame = 0;

  // We only allow outlining for functions having exactly matching return
  // address signing attributes, i.e., all share the same value for the
  // attribute "sign-return-address" and all share the same type of key they
  // are signed with.
  // Additionally, we require all functions to simultaneously either support
  // v8.3a features or not. Otherwise an outlined function could get signed
  // using dedicated v8.3 instructions and a call from a function that doesn't
  // support v8.3 instructions would therefore be invalid.
  if (std::adjacent_find(
          RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
          [](const outliner::Candidate &a, const outliner::Candidate &b) {
            // Return true if a and b are non-equal w.r.t. return address
            // signing or support of v8.3a features.
            if (outliningCandidatesSigningScopeConsensus(a, b) &&
                outliningCandidatesSigningKeyConsensus(a, b) &&
                outliningCandidatesV8_3OpsConsensus(a, b)) {
              return false;
            }
            return true;
          }) != RepeatedSequenceLocs.end()) {
    return std::nullopt;
  }

  // Since at this point all candidates agree on their return address signing,
  // picking just one is fine. If the candidate functions potentially sign their
  // return addresses, the outlined function should do the same. Note that in
  // the case of "sign-return-address"="non-leaf" this is an assumption: it is
  // not certainly true that the outlined function will have to sign its return
  // address, but this decision is made later, when the decision to outline
  // has already been made.
  // The same holds for the number of additional instructions we need: on
  // v8.3a, RET can be replaced by RETAA/RETAB and no AUT instruction is
  // necessary.
However, at this point we don't know if the outlined function8324// will have a RET instruction so we assume the worst.8325const TargetRegisterInfo &TRI = getRegisterInfo();8326// Performing a tail call may require extra checks when PAuth is enabled.8327// If PAuth is disabled, set it to zero for uniformity.8328unsigned NumBytesToCheckLRInTCEpilogue = 0;8329if (RepeatedSequenceLocs[0]8330.getMF()8331->getInfo<AArch64FunctionInfo>()8332->shouldSignReturnAddress(true)) {8333// One PAC and one AUT instructions8334NumBytesToCreateFrame += 8;83358336// PAuth is enabled - set extra tail call cost, if any.8337auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod(8338*RepeatedSequenceLocs[0].getMF());8339NumBytesToCheckLRInTCEpilogue =8340AArch64PAuth::getCheckerSizeInBytes(LRCheckMethod);8341// Checking the authenticated LR value may significantly impact8342// SequenceSize, so account for it for more precise results.8343if (isTailCallReturnInst(RepeatedSequenceLocs[0].back()))8344SequenceSize += NumBytesToCheckLRInTCEpilogue;83458346// We have to check if sp modifying instructions would get outlined.8347// If so we only allow outlining if sp is unchanged overall, so matching8348// sub and add instructions are okay to outline, all other sp modifications8349// are not8350auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {8351int SPValue = 0;8352for (auto &MI : C) {8353if (MI.modifiesRegister(AArch64::SP, &TRI)) {8354switch (MI.getOpcode()) {8355case AArch64::ADDXri:8356case AArch64::ADDWri:8357assert(MI.getNumOperands() == 4 && "Wrong number of operands");8358assert(MI.getOperand(2).isImm() &&8359"Expected operand to be immediate");8360assert(MI.getOperand(1).isReg() &&8361"Expected operand to be a register");8362// Check if the add just increments sp. If so, we search for8363// matching sub instructions that decrement sp. If not, the8364// modification is illegal8365if (MI.getOperand(1).getReg() == AArch64::SP)8366SPValue += MI.getOperand(2).getImm();8367else8368return true;8369break;8370case AArch64::SUBXri:8371case AArch64::SUBWri:8372assert(MI.getNumOperands() == 4 && "Wrong number of operands");8373assert(MI.getOperand(2).isImm() &&8374"Expected operand to be immediate");8375assert(MI.getOperand(1).isReg() &&8376"Expected operand to be a register");8377// Check if the sub just decrements sp. If so, we search for8378// matching add instructions that increment sp. 
If not, the8379// modification is illegal8380if (MI.getOperand(1).getReg() == AArch64::SP)8381SPValue -= MI.getOperand(2).getImm();8382else8383return true;8384break;8385default:8386return true;8387}8388}8389}8390if (SPValue)8391return true;8392return false;8393};8394// Remove candidates with illegal stack modifying instructions8395llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification);83968397// If the sequence doesn't have enough candidates left, then we're done.8398if (RepeatedSequenceLocs.size() < 2)8399return std::nullopt;8400}84018402// Properties about candidate MBBs that hold for all of them.8403unsigned FlagsSetInAll = 0xF;84048405// Compute liveness information for each candidate, and set FlagsSetInAll.8406for (outliner::Candidate &C : RepeatedSequenceLocs)8407FlagsSetInAll &= C.Flags;84088409unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back().getOpcode();84108411// Helper lambda which sets call information for every candidate.8412auto SetCandidateCallInfo =8413[&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {8414for (outliner::Candidate &C : RepeatedSequenceLocs)8415C.setCallInfo(CallID, NumBytesForCall);8416};84178418unsigned FrameID = MachineOutlinerDefault;8419NumBytesToCreateFrame += 4;84208421bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {8422return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement();8423});84248425// We check to see if CFI Instructions are present, and if they are8426// we find the number of CFI Instructions in the candidates.8427unsigned CFICount = 0;8428for (auto &I : RepeatedSequenceLocs[0]) {8429if (I.isCFIInstruction())8430CFICount++;8431}84328433// We compare the number of found CFI Instructions to the number of CFI8434// instructions in the parent function for each candidate. We must check this8435// since if we outline one of the CFI instructions in a function, we have to8436// outline them all for correctness. If we do not, the address offsets will be8437// incorrect between the two sections of the program.8438for (outliner::Candidate &C : RepeatedSequenceLocs) {8439std::vector<MCCFIInstruction> CFIInstructions =8440C.getMF()->getFrameInstructions();84418442if (CFICount > 0 && CFICount != CFIInstructions.size())8443return std::nullopt;8444}84458446// Returns true if an instructions is safe to fix up, false otherwise.8447auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {8448if (MI.isCall())8449return true;84508451if (!MI.modifiesRegister(AArch64::SP, &TRI) &&8452!MI.readsRegister(AArch64::SP, &TRI))8453return true;84548455// Any modification of SP will break our code to save/restore LR.8456// FIXME: We could handle some instructions which add a constant8457// offset to SP, with a bit more work.8458if (MI.modifiesRegister(AArch64::SP, &TRI))8459return false;84608461// At this point, we have a stack instruction that we might need to8462// fix up. 
We'll handle it if it's a load or store.8463if (MI.mayLoadOrStore()) {8464const MachineOperand *Base; // Filled with the base operand of MI.8465int64_t Offset; // Filled with the offset of MI.8466bool OffsetIsScalable;84678468// Does it allow us to offset the base operand and is the base the8469// register SP?8470if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) ||8471!Base->isReg() || Base->getReg() != AArch64::SP)8472return false;84738474// Fixe-up code below assumes bytes.8475if (OffsetIsScalable)8476return false;84778478// Find the minimum/maximum offset for this instruction and check8479// if fixing it up would be in range.8480int64_t MinOffset,8481MaxOffset; // Unscaled offsets for the instruction.8482// The scale to multiply the offsets by.8483TypeSize Scale(0U, false), DummyWidth(0U, false);8484getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);84858486Offset += 16; // Update the offset to what it would be if we outlined.8487if (Offset < MinOffset * (int64_t)Scale.getFixedValue() ||8488Offset > MaxOffset * (int64_t)Scale.getFixedValue())8489return false;84908491// It's in range, so we can outline it.8492return true;8493}84948495// FIXME: Add handling for instructions like "add x0, sp, #8".84968497// We can't fix it up, so don't outline it.8498return false;8499};85008501// True if it's possible to fix up each stack instruction in this sequence.8502// Important for frames/call variants that modify the stack.8503bool AllStackInstrsSafe =8504llvm::all_of(RepeatedSequenceLocs[0], IsSafeToFixup);85058506// If the last instruction in any candidate is a terminator, then we should8507// tail call all of the candidates.8508if (RepeatedSequenceLocs[0].back().isTerminator()) {8509FrameID = MachineOutlinerTailCall;8510NumBytesToCreateFrame = 0;8511unsigned NumBytesForCall = 4 + NumBytesToCheckLRInTCEpilogue;8512SetCandidateCallInfo(MachineOutlinerTailCall, NumBytesForCall);8513}85148515else if (LastInstrOpcode == AArch64::BL ||8516((LastInstrOpcode == AArch64::BLR ||8517LastInstrOpcode == AArch64::BLRNoIP) &&8518!HasBTI)) {8519// FIXME: Do we need to check if the code after this uses the value of LR?8520FrameID = MachineOutlinerThunk;8521NumBytesToCreateFrame = NumBytesToCheckLRInTCEpilogue;8522SetCandidateCallInfo(MachineOutlinerThunk, 4);8523}85248525else {8526// We need to decide how to emit calls + frames. We can always emit the same8527// frame if we don't need to save to the stack. If we have to save to the8528// stack, then we need a different frame.8529unsigned NumBytesNoStackCalls = 0;8530std::vector<outliner::Candidate> CandidatesWithoutStackFixups;85318532// Check if we have to save LR.8533for (outliner::Candidate &C : RepeatedSequenceLocs) {8534bool LRAvailable =8535(C.Flags & MachineOutlinerMBBFlags::LRUnavailableSomewhere)8536? C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI)8537: true;8538// If we have a noreturn caller, then we're going to be conservative and8539// say that we have to save LR. If we don't have a ret at the end of the8540// block, then we can't reason about liveness accurately.8541//8542// FIXME: We can probably do better than always disabling this in8543// noreturn functions by fixing up the liveness info.8544bool IsNoReturn =8545C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn);85468547// Is LR available? 
If so, we don't need a save.8548if (LRAvailable && !IsNoReturn) {8549NumBytesNoStackCalls += 4;8550C.setCallInfo(MachineOutlinerNoLRSave, 4);8551CandidatesWithoutStackFixups.push_back(C);8552}85538554// Is an unused register available? If so, we won't modify the stack, so8555// we can outline with the same frame type as those that don't save LR.8556else if (findRegisterToSaveLRTo(C)) {8557NumBytesNoStackCalls += 12;8558C.setCallInfo(MachineOutlinerRegSave, 12);8559CandidatesWithoutStackFixups.push_back(C);8560}85618562// Is SP used in the sequence at all? If not, we don't have to modify8563// the stack, so we are guaranteed to get the same frame.8564else if (C.isAvailableInsideSeq(AArch64::SP, TRI)) {8565NumBytesNoStackCalls += 12;8566C.setCallInfo(MachineOutlinerDefault, 12);8567CandidatesWithoutStackFixups.push_back(C);8568}85698570// If we outline this, we need to modify the stack. Pretend we don't8571// outline this by saving all of its bytes.8572else {8573NumBytesNoStackCalls += SequenceSize;8574}8575}85768577// If there are no places where we have to save LR, then note that we8578// don't have to update the stack. Otherwise, give every candidate the8579// default call type, as long as it's safe to do so.8580if (!AllStackInstrsSafe ||8581NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {8582RepeatedSequenceLocs = CandidatesWithoutStackFixups;8583FrameID = MachineOutlinerNoLRSave;8584if (RepeatedSequenceLocs.size() < 2)8585return std::nullopt;8586} else {8587SetCandidateCallInfo(MachineOutlinerDefault, 12);85888589// Bugzilla ID: 467678590// TODO: Check if fixing up the stack more than once is safe so we can8591// outline these.8592//8593// An outline resulting in a caller that requires stack fixups at the8594// callsite to a callee that also requires stack fixups can happen when8595// there are no available registers at the candidate callsite for a8596// candidate that itself also has calls.8597//8598// In other words if function_containing_sequence in the following pseudo8599// assembly requires that we save LR at the point of the call, but there8600// are no available registers: in this case we save using SP and as a8601// result the SP offsets requires stack fixups by multiples of 16.8602//8603// function_containing_sequence:8604// ...8605// save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N8606// call OUTLINED_FUNCTION_N8607// restore LR from SP8608// ...8609//8610// OUTLINED_FUNCTION_N:8611// save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N8612// ...8613// bl foo8614// restore LR from SP8615// ret8616//8617// Because the code to handle more than one stack fixup does not8618// currently have the proper checks for legality, these cases will assert8619// in the AArch64 MachineOutliner. This is because the code to do this8620// needs more hardening, testing, better checks that generated code is8621// legal, etc and because it is only verified to handle a single pass of8622// stack fixup.8623//8624// The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch8625// these cases until they are known to be handled. 
Bugzilla 46767 is8626// referenced in comments at the assert site.8627//8628// To avoid asserting (or generating non-legal code on noassert builds)8629// we remove all candidates which would need more than one stack fixup by8630// pruning the cases where the candidate has calls while also having no8631// available LR and having no available general purpose registers to copy8632// LR to (ie one extra stack save/restore).8633//8634if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {8635erase_if(RepeatedSequenceLocs, [this, &TRI](outliner::Candidate &C) {8636auto IsCall = [](const MachineInstr &MI) { return MI.isCall(); };8637return (llvm::any_of(C, IsCall)) &&8638(!C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) ||8639!findRegisterToSaveLRTo(C));8640});8641}8642}86438644// If we dropped all of the candidates, bail out here.8645if (RepeatedSequenceLocs.size() < 2) {8646RepeatedSequenceLocs.clear();8647return std::nullopt;8648}8649}86508651// Does every candidate's MBB contain a call? If so, then we might have a call8652// in the range.8653if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {8654// Check if the range contains a call. These require a save + restore of the8655// link register.8656outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];8657bool ModStackToSaveLR = false;8658if (std::any_of(FirstCand.begin(), std::prev(FirstCand.end()),8659[](const MachineInstr &MI) { return MI.isCall(); }))8660ModStackToSaveLR = true;86618662// Handle the last instruction separately. If this is a tail call, then the8663// last instruction is a call. We don't want to save + restore in this case.8664// However, it could be possible that the last instruction is a call without8665// it being valid to tail call this sequence. We should consider this as8666// well.8667else if (FrameID != MachineOutlinerThunk &&8668FrameID != MachineOutlinerTailCall && FirstCand.back().isCall())8669ModStackToSaveLR = true;86708671if (ModStackToSaveLR) {8672// We can't fix up the stack. Bail out.8673if (!AllStackInstrsSafe) {8674RepeatedSequenceLocs.clear();8675return std::nullopt;8676}86778678// Save + restore LR.8679NumBytesToCreateFrame += 8;8680}8681}86828683// If we have CFI instructions, we can only outline if the outlined section8684// can be a tail call8685if (FrameID != MachineOutlinerTailCall && CFICount > 0)8686return std::nullopt;86878688return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,8689NumBytesToCreateFrame, FrameID);8690}86918692void AArch64InstrInfo::mergeOutliningCandidateAttributes(8693Function &F, std::vector<outliner::Candidate> &Candidates) const {8694// If a bunch of candidates reach this point they must agree on their return8695// address signing. 
It is therefore enough to just consider the signing8696// behaviour of one of them8697const auto &CFn = Candidates.front().getMF()->getFunction();86988699if (CFn.hasFnAttribute("ptrauth-returns"))8700F.addFnAttr(CFn.getFnAttribute("ptrauth-returns"));8701if (CFn.hasFnAttribute("ptrauth-auth-traps"))8702F.addFnAttr(CFn.getFnAttribute("ptrauth-auth-traps"));8703// Since all candidates belong to the same module, just copy the8704// function-level attributes of an arbitrary function.8705if (CFn.hasFnAttribute("sign-return-address"))8706F.addFnAttr(CFn.getFnAttribute("sign-return-address"));8707if (CFn.hasFnAttribute("sign-return-address-key"))8708F.addFnAttr(CFn.getFnAttribute("sign-return-address-key"));87098710AArch64GenInstrInfo::mergeOutliningCandidateAttributes(F, Candidates);8711}87128713bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(8714MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {8715const Function &F = MF.getFunction();87168717// Can F be deduplicated by the linker? If it can, don't outline from it.8718if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())8719return false;87208721// Don't outline from functions with section markings; the program could8722// expect that all the code is in the named section.8723// FIXME: Allow outlining from multiple functions with the same section8724// marking.8725if (F.hasSection())8726return false;87278728// Outlining from functions with redzones is unsafe since the outliner may8729// modify the stack. Check if hasRedZone is true or unknown; if yes, don't8730// outline from it.8731AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();8732if (!AFI || AFI->hasRedZone().value_or(true))8733return false;87348735// FIXME: Determine whether it is safe to outline from functions which contain8736// streaming-mode changes. We may need to ensure any smstart/smstop pairs are8737// outlined together and ensure it is safe to outline with async unwind info,8738// required for saving & restoring VG around calls.8739if (AFI->hasStreamingModeChanges())8740return false;87418742// FIXME: Teach the outliner to generate/handle Windows unwind info.8743if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())8744return false;87458746// It's safe to outline from MF.8747return true;8748}87498750SmallVector<std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>8751AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB,8752unsigned &Flags) const {8753assert(MBB.getParent()->getRegInfo().tracksLiveness() &&8754"Must track liveness!");8755SmallVector<8756std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>8757Ranges;8758// According to the AArch64 Procedure Call Standard, the following are8759// undefined on entry/exit from a function call:8760//8761// * Registers x16, x17, (and thus w16, w17)8762// * Condition codes (and thus the NZCV register)8763//8764// If any of these registers are used inside or live across an outlined8765// function, then they may be modified later, either by the compiler or8766// some other tool (like the linker).8767//8768// To avoid outlining in these situations, partition each block into ranges8769// where these registers are dead. 
We will only outline from those ranges.8770LiveRegUnits LRU(getRegisterInfo());8771auto AreAllUnsafeRegsDead = [&LRU]() {8772return LRU.available(AArch64::W16) && LRU.available(AArch64::W17) &&8773LRU.available(AArch64::NZCV);8774};87758776// We need to know if LR is live across an outlining boundary later on in8777// order to decide how we'll create the outlined call, frame, etc.8778//8779// It's pretty expensive to check this for *every candidate* within a block.8780// That's some potentially n^2 behaviour, since in the worst case, we'd need8781// to compute liveness from the end of the block for O(n) candidates within8782// the block.8783//8784// So, to improve the average case, let's keep track of liveness from the end8785// of the block to the beginning of *every outlinable range*. If we know that8786// LR is available in every range we could outline from, then we know that8787// we don't need to check liveness for any candidate within that range.8788bool LRAvailableEverywhere = true;8789// Compute liveness bottom-up.8790LRU.addLiveOuts(MBB);8791// Update flags that require info about the entire MBB.8792auto UpdateWholeMBBFlags = [&Flags](const MachineInstr &MI) {8793if (MI.isCall() && !MI.isTerminator())8794Flags |= MachineOutlinerMBBFlags::HasCalls;8795};8796// Range: [RangeBegin, RangeEnd)8797MachineBasicBlock::instr_iterator RangeBegin, RangeEnd;8798unsigned RangeLen;8799auto CreateNewRangeStartingAt =8800[&RangeBegin, &RangeEnd,8801&RangeLen](MachineBasicBlock::instr_iterator NewBegin) {8802RangeBegin = NewBegin;8803RangeEnd = std::next(RangeBegin);8804RangeLen = 0;8805};8806auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() {8807// At least one unsafe register is not dead. We do not want to outline at8808// this point. If it is long enough to outline from, save the range8809// [RangeBegin, RangeEnd).8810if (RangeLen > 1)8811Ranges.push_back(std::make_pair(RangeBegin, RangeEnd));8812};8813// Find the first point where all unsafe registers are dead.8814// FIND: <safe instr> <-- end of first potential range8815// SKIP: <unsafe def>8816// SKIP: ... everything between ...8817// SKIP: <unsafe use>8818auto FirstPossibleEndPt = MBB.instr_rbegin();8819for (; FirstPossibleEndPt != MBB.instr_rend(); ++FirstPossibleEndPt) {8820LRU.stepBackward(*FirstPossibleEndPt);8821// Update flags that impact how we outline across the entire block,8822// regardless of safety.8823UpdateWholeMBBFlags(*FirstPossibleEndPt);8824if (AreAllUnsafeRegsDead())8825break;8826}8827// If we exhausted the entire block, we have no safe ranges to outline.8828if (FirstPossibleEndPt == MBB.instr_rend())8829return Ranges;8830// Current range.8831CreateNewRangeStartingAt(FirstPossibleEndPt->getIterator());8832// StartPt points to the first place where all unsafe registers8833// are dead (if there is any such point). Begin partitioning the MBB into8834// ranges.8835for (auto &MI : make_range(FirstPossibleEndPt, MBB.instr_rend())) {8836LRU.stepBackward(MI);8837UpdateWholeMBBFlags(MI);8838if (!AreAllUnsafeRegsDead()) {8839SaveRangeIfNonEmpty();8840CreateNewRangeStartingAt(MI.getIterator());8841continue;8842}8843LRAvailableEverywhere &= LRU.available(AArch64::LR);8844RangeBegin = MI.getIterator();8845++RangeLen;8846}8847// Above loop misses the last (or only) range. If we are still safe, then8848// let's save the range.8849if (AreAllUnsafeRegsDead())8850SaveRangeIfNonEmpty();8851if (Ranges.empty())8852return Ranges;8853// We found the ranges bottom-up. Mapping expects the top-down. 
Reverse8854// the order.8855std::reverse(Ranges.begin(), Ranges.end());8856// If there is at least one outlinable range where LR is unavailable8857// somewhere, remember that.8858if (!LRAvailableEverywhere)8859Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;8860return Ranges;8861}88628863outliner::InstrType8864AArch64InstrInfo::getOutliningTypeImpl(MachineBasicBlock::iterator &MIT,8865unsigned Flags) const {8866MachineInstr &MI = *MIT;8867MachineBasicBlock *MBB = MI.getParent();8868MachineFunction *MF = MBB->getParent();8869AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();88708871// Don't outline anything used for return address signing. The outlined8872// function will get signed later if needed8873switch (MI.getOpcode()) {8874case AArch64::PACM:8875case AArch64::PACIASP:8876case AArch64::PACIBSP:8877case AArch64::PACIASPPC:8878case AArch64::PACIBSPPC:8879case AArch64::AUTIASP:8880case AArch64::AUTIBSP:8881case AArch64::AUTIASPPCi:8882case AArch64::AUTIASPPCr:8883case AArch64::AUTIBSPPCi:8884case AArch64::AUTIBSPPCr:8885case AArch64::RETAA:8886case AArch64::RETAB:8887case AArch64::RETAASPPCi:8888case AArch64::RETAASPPCr:8889case AArch64::RETABSPPCi:8890case AArch64::RETABSPPCr:8891case AArch64::EMITBKEY:8892case AArch64::PAUTH_PROLOGUE:8893case AArch64::PAUTH_EPILOGUE:8894return outliner::InstrType::Illegal;8895}88968897// Don't outline LOHs.8898if (FuncInfo->getLOHRelated().count(&MI))8899return outliner::InstrType::Illegal;89008901// We can only outline these if we will tail call the outlined function, or8902// fix up the CFI offsets. Currently, CFI instructions are outlined only if8903// in a tail call.8904//8905// FIXME: If the proper fixups for the offset are implemented, this should be8906// possible.8907if (MI.isCFIInstruction())8908return outliner::InstrType::Legal;89098910// Is this a terminator for a basic block?8911if (MI.isTerminator())8912// TargetInstrInfo::getOutliningType has already filtered out anything8913// that would break this, so we can allow it here.8914return outliner::InstrType::Legal;89158916// Make sure none of the operands are un-outlinable.8917for (const MachineOperand &MOP : MI.operands()) {8918// A check preventing CFI indices was here before, but only CFI8919// instructions should have those.8920assert(!MOP.isCFIIndex());89218922// If it uses LR or W30 explicitly, then don't touch it.8923if (MOP.isReg() && !MOP.isImplicit() &&8924(MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))8925return outliner::InstrType::Illegal;8926}89278928// Special cases for instructions that can always be outlined, but will fail8929// the later tests. e.g, ADRPs, which are PC-relative use LR, but can always8930// be outlined because they don't require a *specific* value to be in LR.8931if (MI.getOpcode() == AArch64::ADRP)8932return outliner::InstrType::Legal;89338934// If MI is a call we might be able to outline it. We don't want to outline8935// any calls that rely on the position of items on the stack. When we outline8936// something containing a call, we have to emit a save and restore of LR in8937// the outlined function. Currently, this always happens by saving LR to the8938// stack. 
// Thus, if we outline, say, half the parameters for a function call
  // plus the call, then we'll break the callee's expectations for the layout
  // of the stack.
  //
  // FIXME: Allow calls to functions which construct a stack frame, as long
  // as they don't access arguments on the stack.
  // FIXME: Figure out some way to analyze functions defined in other modules.
  // We should be able to compute the memory usage based on the IR calling
  // convention, even if we can't see the definition.
  if (MI.isCall()) {
    // Get the function associated with the call. Look at each operand and find
    // the one that represents the callee and get its name.
    const Function *Callee = nullptr;
    for (const MachineOperand &MOP : MI.operands()) {
      if (MOP.isGlobal()) {
        Callee = dyn_cast<Function>(MOP.getGlobal());
        break;
      }
    }

    // Never outline calls to mcount. There isn't any rule that would require
    // this, but the Linux kernel's "ftrace" feature depends on it.
    if (Callee && Callee->getName() == "\01_mcount")
      return outliner::InstrType::Illegal;

    // If we don't know anything about the callee, assume it depends on the
    // stack layout of the caller. In that case, it's only legal to outline
    // as a tail-call. Explicitly list the call instructions we know about so we
    // don't get unexpected results with call pseudo-instructions.
    auto UnknownCallOutlineType = outliner::InstrType::Illegal;
    if (MI.getOpcode() == AArch64::BLR ||
        MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
      UnknownCallOutlineType = outliner::InstrType::LegalTerminator;

    if (!Callee)
      return UnknownCallOutlineType;

    // We have a function we have information about. Check if it's something we
    // can safely outline.
    MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee);

    // We don't know what's going on with the callee at all. Don't touch it.
    if (!CalleeMF)
      return UnknownCallOutlineType;

    // Check if we know anything about the callee saves on the function. If we
    // don't, then don't touch it, since that implies that we haven't
    // computed anything about its stack frame yet.
    MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
    if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
        MFI.getNumObjects() > 0)
      return UnknownCallOutlineType;

    // At this point, we can say that CalleeMF ought to not pass anything on the
    // stack.
Therefore, we can outline it.8993return outliner::InstrType::Legal;8994}89958996// Don't touch the link register or W30.8997if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||8998MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))8999return outliner::InstrType::Illegal;90009001// Don't outline BTI instructions, because that will prevent the outlining9002// site from being indirectly callable.9003if (hasBTISemantics(MI))9004return outliner::InstrType::Illegal;90059006return outliner::InstrType::Legal;9007}90089009void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {9010for (MachineInstr &MI : MBB) {9011const MachineOperand *Base;9012TypeSize Width(0, false);9013int64_t Offset;9014bool OffsetIsScalable;90159016// Is this a load or store with an immediate offset with SP as the base?9017if (!MI.mayLoadOrStore() ||9018!getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width,9019&RI) ||9020(Base->isReg() && Base->getReg() != AArch64::SP))9021continue;90229023// It is, so we have to fix it up.9024TypeSize Scale(0U, false);9025int64_t Dummy1, Dummy2;90269027MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);9028assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");9029getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);9030assert(Scale != 0 && "Unexpected opcode!");9031assert(!OffsetIsScalable && "Expected offset to be a byte offset");90329033// We've pushed the return address to the stack, so add 16 to the offset.9034// This is safe, since we already checked if it would overflow when we9035// checked if this instruction was legal to outline.9036int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedValue();9037StackOffsetOperand.setImm(NewImm);9038}9039}90409041static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,9042const AArch64InstrInfo *TII,9043bool ShouldSignReturnAddr) {9044if (!ShouldSignReturnAddr)9045return;90469047BuildMI(MBB, MBB.begin(), DebugLoc(), TII->get(AArch64::PAUTH_PROLOGUE))9048.setMIFlag(MachineInstr::FrameSetup);9049BuildMI(MBB, MBB.getFirstInstrTerminator(), DebugLoc(),9050TII->get(AArch64::PAUTH_EPILOGUE))9051.setMIFlag(MachineInstr::FrameDestroy);9052}90539054void AArch64InstrInfo::buildOutlinedFrame(9055MachineBasicBlock &MBB, MachineFunction &MF,9056const outliner::OutlinedFunction &OF) const {90579058AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();90599060if (OF.FrameConstructionID == MachineOutlinerTailCall)9061FI->setOutliningStyle("Tail Call");9062else if (OF.FrameConstructionID == MachineOutlinerThunk) {9063// For thunk outlining, rewrite the last instruction from a call to a9064// tail-call.9065MachineInstr *Call = &*--MBB.instr_end();9066unsigned TailOpcode;9067if (Call->getOpcode() == AArch64::BL) {9068TailOpcode = AArch64::TCRETURNdi;9069} else {9070assert(Call->getOpcode() == AArch64::BLR ||9071Call->getOpcode() == AArch64::BLRNoIP);9072TailOpcode = AArch64::TCRETURNriALL;9073}9074MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))9075.add(Call->getOperand(0))9076.addImm(0);9077MBB.insert(MBB.end(), TC);9078Call->eraseFromParent();90799080FI->setOutliningStyle("Thunk");9081}90829083bool IsLeafFunction = true;90849085// Is there a call in the outlined range?9086auto IsNonTailCall = [](const MachineInstr &MI) {9087return MI.isCall() && !MI.isReturn();9088};90899090if (llvm::any_of(MBB.instrs(), IsNonTailCall)) {9091// Fix up the instructions in the range, since we're going to modify the9092// stack.90939094// Bugzilla ID: 
467679095// TODO: Check if fixing up twice is safe so we can outline these.9096assert(OF.FrameConstructionID != MachineOutlinerDefault &&9097"Can only fix up stack references once");9098fixupPostOutline(MBB);90999100IsLeafFunction = false;91019102// LR has to be a live in so that we can save it.9103if (!MBB.isLiveIn(AArch64::LR))9104MBB.addLiveIn(AArch64::LR);91059106MachineBasicBlock::iterator It = MBB.begin();9107MachineBasicBlock::iterator Et = MBB.end();91089109if (OF.FrameConstructionID == MachineOutlinerTailCall ||9110OF.FrameConstructionID == MachineOutlinerThunk)9111Et = std::prev(MBB.end());91129113// Insert a save before the outlined region9114MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))9115.addReg(AArch64::SP, RegState::Define)9116.addReg(AArch64::LR)9117.addReg(AArch64::SP)9118.addImm(-16);9119It = MBB.insert(It, STRXpre);91209121if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) {9122const TargetSubtargetInfo &STI = MF.getSubtarget();9123const MCRegisterInfo *MRI = STI.getRegisterInfo();9124unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true);91259126// Add a CFI saying the stack was moved 16 B down.9127int64_t StackPosEntry =9128MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16));9129BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))9130.addCFIIndex(StackPosEntry)9131.setMIFlags(MachineInstr::FrameSetup);91329133// Add a CFI saying that the LR that we want to find is now 16 B higher9134// than before.9135int64_t LRPosEntry = MF.addFrameInst(9136MCCFIInstruction::createOffset(nullptr, DwarfReg, -16));9137BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))9138.addCFIIndex(LRPosEntry)9139.setMIFlags(MachineInstr::FrameSetup);9140}91419142// Insert a restore before the terminator for the function.9143MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))9144.addReg(AArch64::SP, RegState::Define)9145.addReg(AArch64::LR, RegState::Define)9146.addReg(AArch64::SP)9147.addImm(16);9148Et = MBB.insert(Et, LDRXpost);9149}91509151bool ShouldSignReturnAddr = FI->shouldSignReturnAddress(!IsLeafFunction);91529153// If this is a tail call outlined function, then there's already a return.9154if (OF.FrameConstructionID == MachineOutlinerTailCall ||9155OF.FrameConstructionID == MachineOutlinerThunk) {9156signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);9157return;9158}91599160// It's not a tail call, so we have to insert the return ourselves.91619162// LR has to be a live in so that we can return to it.9163if (!MBB.isLiveIn(AArch64::LR))9164MBB.addLiveIn(AArch64::LR);91659166MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))9167.addReg(AArch64::LR);9168MBB.insert(MBB.end(), ret);91699170signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);91719172FI->setOutliningStyle("Function");91739174// Did we have to modify the stack by saving the link register?9175if (OF.FrameConstructionID != MachineOutlinerDefault)9176return;91779178// We modified the stack.9179// Walk over the basic block and fix up all the stack accesses.9180fixupPostOutline(MBB);9181}91829183MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(9184Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,9185MachineFunction &MF, outliner::Candidate &C) const {91869187// Are we tail calling?9188if (C.CallConstructionID == MachineOutlinerTailCall) {9189// If yes, then we can just branch to the label.9190It = MBB.insert(It, BuildMI(MF, DebugLoc(), 
get(AArch64::TCRETURNdi))9191.addGlobalAddress(M.getNamedValue(MF.getName()))9192.addImm(0));9193return It;9194}91959196// Are we saving the link register?9197if (C.CallConstructionID == MachineOutlinerNoLRSave ||9198C.CallConstructionID == MachineOutlinerThunk) {9199// No, so just insert the call.9200It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))9201.addGlobalAddress(M.getNamedValue(MF.getName())));9202return It;9203}92049205// We want to return the spot where we inserted the call.9206MachineBasicBlock::iterator CallPt;92079208// Instructions for saving and restoring LR around the call instruction we're9209// going to insert.9210MachineInstr *Save;9211MachineInstr *Restore;9212// Can we save to a register?9213if (C.CallConstructionID == MachineOutlinerRegSave) {9214// FIXME: This logic should be sunk into a target-specific interface so that9215// we don't have to recompute the register.9216Register Reg = findRegisterToSaveLRTo(C);9217assert(Reg && "No callee-saved register available?");92189219// LR has to be a live in so that we can save it.9220if (!MBB.isLiveIn(AArch64::LR))9221MBB.addLiveIn(AArch64::LR);92229223// Save and restore LR from Reg.9224Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)9225.addReg(AArch64::XZR)9226.addReg(AArch64::LR)9227.addImm(0);9228Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)9229.addReg(AArch64::XZR)9230.addReg(Reg)9231.addImm(0);9232} else {9233// We have the default case. Save and restore from SP.9234Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))9235.addReg(AArch64::SP, RegState::Define)9236.addReg(AArch64::LR)9237.addReg(AArch64::SP)9238.addImm(-16);9239Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))9240.addReg(AArch64::SP, RegState::Define)9241.addReg(AArch64::LR, RegState::Define)9242.addReg(AArch64::SP)9243.addImm(16);9244}92459246It = MBB.insert(It, Save);9247It++;92489249// Insert the call.9250It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))9251.addGlobalAddress(M.getNamedValue(MF.getName())));9252CallPt = It;9253It++;92549255It = MBB.insert(It, Restore);9256return CallPt;9257}92589259bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(9260MachineFunction &MF) const {9261return MF.getFunction().hasMinSize();9262}92639264void AArch64InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,9265MachineBasicBlock::iterator Iter,9266DebugLoc &DL,9267bool AllowSideEffects) const {9268const MachineFunction &MF = *MBB.getParent();9269const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();9270const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();92719272if (TRI.isGeneralPurposeRegister(MF, Reg)) {9273BuildMI(MBB, Iter, DL, get(AArch64::MOVZXi), Reg).addImm(0).addImm(0);9274} else if (STI.hasSVE()) {9275BuildMI(MBB, Iter, DL, get(AArch64::DUP_ZI_D), Reg)9276.addImm(0)9277.addImm(0);9278} else {9279BuildMI(MBB, Iter, DL, get(AArch64::MOVIv2d_ns), Reg)9280.addImm(0);9281}9282}92839284std::optional<DestSourcePair>9285AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {92869287// AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR reg9288// and zero immediate operands used as an alias for mov instruction.9289if (MI.getOpcode() == AArch64::ORRWrs &&9290MI.getOperand(1).getReg() == AArch64::WZR &&9291MI.getOperand(3).getImm() == 0x0 &&9292// Check that the w->w move is not a zero-extending w->x mov.9293(!MI.getOperand(0).getReg().isVirtual() ||9294MI.getOperand(0).getSubReg() == 0) &&9295(!MI.getOperand(0).getReg().isPhysical() 
||9296MI.findRegisterDefOperandIdx(MI.getOperand(0).getReg() - AArch64::W0 +9297AArch64::X0,9298/*TRI=*/nullptr) == -1))9299return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};93009301if (MI.getOpcode() == AArch64::ORRXrs &&9302MI.getOperand(1).getReg() == AArch64::XZR &&9303MI.getOperand(3).getImm() == 0x0)9304return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};93059306return std::nullopt;9307}93089309std::optional<DestSourcePair>9310AArch64InstrInfo::isCopyLikeInstrImpl(const MachineInstr &MI) const {9311if (MI.getOpcode() == AArch64::ORRWrs &&9312MI.getOperand(1).getReg() == AArch64::WZR &&9313MI.getOperand(3).getImm() == 0x0)9314return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};9315return std::nullopt;9316}93179318std::optional<RegImmPair>9319AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, Register Reg) const {9320int Sign = 1;9321int64_t Offset = 0;93229323// TODO: Handle cases where Reg is a super- or sub-register of the9324// destination register.9325const MachineOperand &Op0 = MI.getOperand(0);9326if (!Op0.isReg() || Reg != Op0.getReg())9327return std::nullopt;93289329switch (MI.getOpcode()) {9330default:9331return std::nullopt;9332case AArch64::SUBWri:9333case AArch64::SUBXri:9334case AArch64::SUBSWri:9335case AArch64::SUBSXri:9336Sign *= -1;9337[[fallthrough]];9338case AArch64::ADDSWri:9339case AArch64::ADDSXri:9340case AArch64::ADDWri:9341case AArch64::ADDXri: {9342// TODO: Third operand can be global address (usually some string).9343if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||9344!MI.getOperand(2).isImm())9345return std::nullopt;9346int Shift = MI.getOperand(3).getImm();9347assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");9348Offset = Sign * (MI.getOperand(2).getImm() << Shift);9349}9350}9351return RegImmPair{MI.getOperand(1).getReg(), Offset};9352}93539354/// If the given ORR instruction is a copy, and \p DescribedReg overlaps with9355/// the destination register then, if possible, describe the value in terms of9356/// the source register.9357static std::optional<ParamLoadedValue>9358describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,9359const TargetInstrInfo *TII,9360const TargetRegisterInfo *TRI) {9361auto DestSrc = TII->isCopyLikeInstr(MI);9362if (!DestSrc)9363return std::nullopt;93649365Register DestReg = DestSrc->Destination->getReg();9366Register SrcReg = DestSrc->Source->getReg();93679368auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});93699370// If the described register is the destination, just return the source.9371if (DestReg == DescribedReg)9372return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);93739374// ORRWrs zero-extends to 64-bits, so we need to consider such cases.9375if (MI.getOpcode() == AArch64::ORRWrs &&9376TRI->isSuperRegister(DestReg, DescribedReg))9377return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);93789379// We may need to describe the lower part of a ORRXrs move.9380if (MI.getOpcode() == AArch64::ORRXrs &&9381TRI->isSubRegister(DestReg, DescribedReg)) {9382Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);9383return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);9384}93859386assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&9387"Unhandled ORR[XW]rs copy case");93889389return std::nullopt;9390}93919392bool AArch64InstrInfo::isFunctionSafeToSplit(const MachineFunction &MF) const {9393// Functions cannot be split to different sections on AArch64 if they 
have9394// a red zone. This is because relaxing a cross-section branch may require9395// incrementing the stack pointer to spill a register, which would overwrite9396// the red zone.9397if (MF.getInfo<AArch64FunctionInfo>()->hasRedZone().value_or(true))9398return false;93999400return TargetInstrInfo::isFunctionSafeToSplit(MF);9401}94029403bool AArch64InstrInfo::isMBBSafeToSplitToCold(9404const MachineBasicBlock &MBB) const {9405// Asm Goto blocks can contain conditional branches to goto labels, which can9406// get moved out of range of the branch instruction.9407auto isAsmGoto = [](const MachineInstr &MI) {9408return MI.getOpcode() == AArch64::INLINEASM_BR;9409};9410if (llvm::any_of(MBB, isAsmGoto) || MBB.isInlineAsmBrIndirectTarget())9411return false;94129413// Because jump tables are label-relative instead of table-relative, they all9414// must be in the same section or relocation fixup handling will fail.94159416// Check if MBB is a jump table target9417const MachineJumpTableInfo *MJTI = MBB.getParent()->getJumpTableInfo();9418auto containsMBB = [&MBB](const MachineJumpTableEntry &JTE) {9419return llvm::is_contained(JTE.MBBs, &MBB);9420};9421if (MJTI != nullptr && llvm::any_of(MJTI->getJumpTables(), containsMBB))9422return false;94239424// Check if MBB contains a jump table lookup9425for (const MachineInstr &MI : MBB) {9426switch (MI.getOpcode()) {9427case TargetOpcode::G_BRJT:9428case AArch64::JumpTableDest32:9429case AArch64::JumpTableDest16:9430case AArch64::JumpTableDest8:9431return false;9432default:9433continue;9434}9435}94369437// MBB isn't a special case, so it's safe to be split to the cold section.9438return true;9439}94409441std::optional<ParamLoadedValue>9442AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,9443Register Reg) const {9444const MachineFunction *MF = MI.getMF();9445const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();9446switch (MI.getOpcode()) {9447case AArch64::MOVZWi:9448case AArch64::MOVZXi: {9449// MOVZWi may be used for producing zero-extended 32-bit immediates in9450// 64-bit parameters, so we need to consider super-registers.9451if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))9452return std::nullopt;94539454if (!MI.getOperand(1).isImm())9455return std::nullopt;9456int64_t Immediate = MI.getOperand(1).getImm();9457int Shift = MI.getOperand(2).getImm();9458return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),9459nullptr);9460}9461case AArch64::ORRWrs:9462case AArch64::ORRXrs:9463return describeORRLoadedValue(MI, Reg, this, TRI);9464}94659466return TargetInstrInfo::describeLoadedValue(MI, Reg);9467}94689469bool AArch64InstrInfo::isExtendLikelyToBeFolded(9470MachineInstr &ExtMI, MachineRegisterInfo &MRI) const {9471assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT ||9472ExtMI.getOpcode() == TargetOpcode::G_ZEXT ||9473ExtMI.getOpcode() == TargetOpcode::G_ANYEXT);94749475// Anyexts are nops.9476if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT)9477return true;94789479Register DefReg = ExtMI.getOperand(0).getReg();9480if (!MRI.hasOneNonDBGUse(DefReg))9481return false;94829483// It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an9484// addressing mode.9485auto *UserMI = &*MRI.use_instr_nodbg_begin(DefReg);9486return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD;9487}94889489uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {9490return get(Opc).TSFlags & AArch64::ElementSizeMask;9491}94929493bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const {9494return 
get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike;
}

bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const {
  return get(Opc).TSFlags & AArch64::InstrFlagIsWhile;
}

unsigned int
AArch64InstrInfo::getTailDuplicateSize(CodeGenOptLevel OptLevel) const {
  return OptLevel >= CodeGenOptLevel::Aggressive ? 6 : 2;
}

bool AArch64InstrInfo::isLegalAddressingMode(unsigned NumBytes, int64_t Offset,
                                             unsigned Scale) const {
  if (Offset && Scale)
    return false;

  // Check Reg + Imm
  if (!Scale) {
    // 9-bit signed offset
    if (isInt<9>(Offset))
      return true;

    // 12-bit unsigned offset
    unsigned Shift = Log2_64(NumBytes);
    if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
        // Must be a multiple of NumBytes (NumBytes is a power of 2)
        (Offset >> Shift) << Shift == Offset)
      return true;
    return false;
  }

  // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
  return Scale == 1 || (Scale > 0 && Scale == NumBytes);
}

unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
  if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
    return AArch64::BLRNoIP;
  else
    return AArch64::BLR;
}

MachineBasicBlock::iterator
AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
                                   Register TargetReg, bool FrameSetup) const {
  assert(TargetReg != AArch64::SP && "New top of stack cannot already be in SP");

  MachineBasicBlock &MBB = *MBBI->getParent();
  MachineFunction &MF = *MBB.getParent();
  const AArch64InstrInfo *TII =
      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
  int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
  DebugLoc DL = MBB.findDebugLoc(MBBI);

  MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
  MachineBasicBlock *LoopTestMBB =
      MF.CreateMachineBasicBlock(MBB.getBasicBlock());
  MF.insert(MBBInsertPoint, LoopTestMBB);
  MachineBasicBlock *LoopBodyMBB =
      MF.CreateMachineBasicBlock(MBB.getBasicBlock());
  MF.insert(MBBInsertPoint, LoopBodyMBB);
  MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
  MF.insert(MBBInsertPoint, ExitMBB);
  MachineInstr::MIFlag Flags =
      FrameSetup ?
MachineInstr::FrameSetup : MachineInstr::NoFlags;95609561// LoopTest:9562// SUB SP, SP, #ProbeSize9563emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP,9564AArch64::SP, StackOffset::getFixed(-ProbeSize), TII, Flags);95659566// CMP SP, TargetReg9567BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64),9568AArch64::XZR)9569.addReg(AArch64::SP)9570.addReg(TargetReg)9571.addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))9572.setMIFlags(Flags);95739574// B.<Cond> LoopExit9575BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc))9576.addImm(AArch64CC::LE)9577.addMBB(ExitMBB)9578.setMIFlags(Flags);95799580// STR XZR, [SP]9581BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui))9582.addReg(AArch64::XZR)9583.addReg(AArch64::SP)9584.addImm(0)9585.setMIFlags(Flags);95869587// B loop9588BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B))9589.addMBB(LoopTestMBB)9590.setMIFlags(Flags);95919592// LoopExit:9593// MOV SP, TargetReg9594BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP)9595.addReg(TargetReg)9596.addImm(0)9597.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))9598.setMIFlags(Flags);95999600// LDR XZR, [SP]9601BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::LDRXui))9602.addReg(AArch64::XZR, RegState::Define)9603.addReg(AArch64::SP)9604.addImm(0)9605.setMIFlags(Flags);96069607ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());9608ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);96099610LoopTestMBB->addSuccessor(ExitMBB);9611LoopTestMBB->addSuccessor(LoopBodyMBB);9612LoopBodyMBB->addSuccessor(LoopTestMBB);9613MBB.addSuccessor(LoopTestMBB);96149615// Update liveins.9616if (MF.getRegInfo().reservedRegsFrozen())9617fullyRecomputeLiveIns({ExitMBB, LoopBodyMBB, LoopTestMBB});96189619return ExitMBB->begin();9620}96219622namespace {9623class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {9624MachineFunction *MF;9625const TargetInstrInfo *TII;9626const TargetRegisterInfo *TRI;9627MachineRegisterInfo &MRI;96289629/// The block of the loop9630MachineBasicBlock *LoopBB;9631/// The conditional branch of the loop9632MachineInstr *CondBranch;9633/// The compare instruction for loop control9634MachineInstr *Comp;9635/// The number of the operand of the loop counter value in Comp9636unsigned CompCounterOprNum;9637/// The instruction that updates the loop counter value9638MachineInstr *Update;9639/// The number of the operand of the loop counter value in Update9640unsigned UpdateCounterOprNum;9641/// The initial value of the loop counter9642Register Init;9643/// True iff Update is a predecessor of Comp9644bool IsUpdatePriorComp;96459646/// The normalized condition used by createTripCountGreaterCondition()9647SmallVector<MachineOperand, 4> Cond;96489649public:9650AArch64PipelinerLoopInfo(MachineBasicBlock *LoopBB, MachineInstr *CondBranch,9651MachineInstr *Comp, unsigned CompCounterOprNum,9652MachineInstr *Update, unsigned UpdateCounterOprNum,9653Register Init, bool IsUpdatePriorComp,9654const SmallVectorImpl<MachineOperand> &Cond)9655: MF(Comp->getParent()->getParent()),9656TII(MF->getSubtarget().getInstrInfo()),9657TRI(MF->getSubtarget().getRegisterInfo()), MRI(MF->getRegInfo()),9658LoopBB(LoopBB), CondBranch(CondBranch), Comp(Comp),9659CompCounterOprNum(CompCounterOprNum), Update(Update),9660UpdateCounterOprNum(UpdateCounterOprNum), Init(Init),9661IsUpdatePriorComp(IsUpdatePriorComp), Cond(Cond.begin(), Cond.end()) {}96629663bool 
shouldIgnoreForPipelining(const MachineInstr *MI) const override {9664// Make the instructions for loop control be placed in stage 0.9665// The predecessors of Comp are considered by the caller.9666return MI == Comp;9667}96689669std::optional<bool> createTripCountGreaterCondition(9670int TC, MachineBasicBlock &MBB,9671SmallVectorImpl<MachineOperand> &CondParam) override {9672// A branch instruction will be inserted as "if (Cond) goto epilogue".9673// Cond is normalized for such use.9674// The predecessors of the branch are assumed to have already been inserted.9675CondParam = Cond;9676return {};9677}96789679void createRemainingIterationsGreaterCondition(9680int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,9681DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) override;96829683void setPreheader(MachineBasicBlock *NewPreheader) override {}96849685void adjustTripCount(int TripCountAdjust) override {}96869687void disposed() override {}9688bool isMVEExpanderSupported() override { return true; }9689};9690} // namespace96919692/// Clone an instruction from MI. The register of ReplaceOprNum-th operand9693/// is replaced by ReplaceReg. The output register is newly created.9694/// The other operands are unchanged from MI.9695static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum,9696Register ReplaceReg, MachineBasicBlock &MBB,9697MachineBasicBlock::iterator InsertTo) {9698MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();9699const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();9700const TargetRegisterInfo *TRI =9701MBB.getParent()->getSubtarget().getRegisterInfo();9702MachineInstr *NewMI = MBB.getParent()->CloneMachineInstr(MI);9703Register Result = 0;9704for (unsigned I = 0; I < NewMI->getNumOperands(); ++I) {9705if (I == 0 && NewMI->getOperand(0).getReg().isVirtual()) {9706Result = MRI.createVirtualRegister(9707MRI.getRegClass(NewMI->getOperand(0).getReg()));9708NewMI->getOperand(I).setReg(Result);9709} else if (I == ReplaceOprNum) {9710MRI.constrainRegClass(9711ReplaceReg,9712TII->getRegClass(NewMI->getDesc(), I, TRI, *MBB.getParent()));9713NewMI->getOperand(I).setReg(ReplaceReg);9714}9715}9716MBB.insert(InsertTo, NewMI);9717return Result;9718}97199720void AArch64PipelinerLoopInfo::createRemainingIterationsGreaterCondition(9721int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,9722DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) {9723// Create and accumulate conditions for next TC iterations.9724// Example:9725// SUBSXrr N, counter, implicit-def $nzcv # compare instruction for the last9726// # iteration of the kernel9727//9728// # insert the following instructions9729// cond = CSINCXr 0, 0, C, implicit $nzcv9730// counter = ADDXri counter, 1 # clone from this->Update9731// SUBSXrr n, counter, implicit-def $nzcv # clone from this->Comp9732// cond = CSINCXr cond, cond, C, implicit $nzcv9733// ... 
// (repeat TC times)
  // SUBSXri cond, 0, implicit-def $nzcv

  assert(CondBranch->getOpcode() == AArch64::Bcc);
  // CondCode to exit the loop
  AArch64CC::CondCode CC =
      (AArch64CC::CondCode)CondBranch->getOperand(0).getImm();
  if (CondBranch->getOperand(1).getMBB() == LoopBB)
    CC = AArch64CC::getInvertedCondCode(CC);

  // Accumulate conditions to exit the loop
  Register AccCond = AArch64::XZR;

  // If CC holds, CurCond+1 is returned; otherwise CurCond is returned.
  auto AccumulateCond = [&](Register CurCond,
                            AArch64CC::CondCode CC) -> Register {
    Register NewCond = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
    BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::CSINCXr))
        .addReg(NewCond, RegState::Define)
        .addReg(CurCond)
        .addReg(CurCond)
        .addImm(AArch64CC::getInvertedCondCode(CC));
    return NewCond;
  };

  if (!LastStage0Insts.empty() && LastStage0Insts[Comp]->getParent() == &MBB) {
    // Update and Comp for I==0 already exist in MBB
    // (MBB is an unrolled kernel).
    Register Counter;
    for (int I = 0; I <= TC; ++I) {
      Register NextCounter;
      if (I != 0)
        NextCounter =
            cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());

      AccCond = AccumulateCond(AccCond, CC);

      if (I != TC) {
        if (I == 0) {
          if (Update != Comp && IsUpdatePriorComp) {
            Counter =
                LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
            NextCounter = cloneInstr(Update, UpdateCounterOprNum, Counter, MBB,
                                     MBB.end());
          } else {
            // Can use the already calculated value.
            NextCounter = LastStage0Insts[Update]->getOperand(0).getReg();
          }
        } else if (Update != Comp) {
          NextCounter =
              cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
        }
      }
      Counter = NextCounter;
    }
  } else {
    Register Counter;
    if (LastStage0Insts.empty()) {
      // Use the initial counter value (testing if the trip count is sufficient
      // to be executed by pipelined code).
      Counter = Init;
      if (IsUpdatePriorComp)
        Counter =
            cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
    } else {
      // MBB is an epilogue block.
LastStage0Insts[Comp] is in the kernel block.9799Counter = LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();9800}98019802for (int I = 0; I <= TC; ++I) {9803Register NextCounter;9804NextCounter =9805cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());9806AccCond = AccumulateCond(AccCond, CC);9807if (I != TC && Update != Comp)9808NextCounter =9809cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());9810Counter = NextCounter;9811}9812}98139814// If AccCond == 0, the remainder is greater than TC.9815BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::SUBSXri))9816.addReg(AArch64::XZR, RegState::Define | RegState::Dead)9817.addReg(AccCond)9818.addImm(0)9819.addImm(0);9820Cond.clear();9821Cond.push_back(MachineOperand::CreateImm(AArch64CC::EQ));9822}98239824static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB,9825Register &RegMBB, Register &RegOther) {9826assert(Phi.getNumOperands() == 5);9827if (Phi.getOperand(2).getMBB() == MBB) {9828RegMBB = Phi.getOperand(1).getReg();9829RegOther = Phi.getOperand(3).getReg();9830} else {9831assert(Phi.getOperand(4).getMBB() == MBB);9832RegMBB = Phi.getOperand(3).getReg();9833RegOther = Phi.getOperand(1).getReg();9834}9835}98369837static bool isDefinedOutside(Register Reg, const MachineBasicBlock *BB) {9838if (!Reg.isVirtual())9839return false;9840const MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();9841return MRI.getVRegDef(Reg)->getParent() != BB;9842}98439844/// If Reg is an induction variable, return true and set some parameters9845static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB,9846MachineInstr *&UpdateInst,9847unsigned &UpdateCounterOprNum, Register &InitReg,9848bool &IsUpdatePriorComp) {9849// Example:9850//9851// Preheader:9852// InitReg = ...9853// LoopBB:9854// Reg0 = PHI (InitReg, Preheader), (Reg1, LoopBB)9855// Reg = COPY Reg0 ; COPY is ignored.9856// Reg1 = ADD Reg, #1; UpdateInst. 
Incremented by a loop invariant value.9857// ; Reg is the value calculated in the previous9858// ; iteration, so IsUpdatePriorComp == false.98599860if (LoopBB->pred_size() != 2)9861return false;9862if (!Reg.isVirtual())9863return false;9864const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();9865UpdateInst = nullptr;9866UpdateCounterOprNum = 0;9867InitReg = 0;9868IsUpdatePriorComp = true;9869Register CurReg = Reg;9870while (true) {9871MachineInstr *Def = MRI.getVRegDef(CurReg);9872if (Def->getParent() != LoopBB)9873return false;9874if (Def->isCopy()) {9875// Ignore copy instructions unless they contain subregisters9876if (Def->getOperand(0).getSubReg() || Def->getOperand(1).getSubReg())9877return false;9878CurReg = Def->getOperand(1).getReg();9879} else if (Def->isPHI()) {9880if (InitReg != 0)9881return false;9882if (!UpdateInst)9883IsUpdatePriorComp = false;9884extractPhiReg(*Def, LoopBB, CurReg, InitReg);9885} else {9886if (UpdateInst)9887return false;9888switch (Def->getOpcode()) {9889case AArch64::ADDSXri:9890case AArch64::ADDSWri:9891case AArch64::SUBSXri:9892case AArch64::SUBSWri:9893case AArch64::ADDXri:9894case AArch64::ADDWri:9895case AArch64::SUBXri:9896case AArch64::SUBWri:9897UpdateInst = Def;9898UpdateCounterOprNum = 1;9899break;9900case AArch64::ADDSXrr:9901case AArch64::ADDSWrr:9902case AArch64::SUBSXrr:9903case AArch64::SUBSWrr:9904case AArch64::ADDXrr:9905case AArch64::ADDWrr:9906case AArch64::SUBXrr:9907case AArch64::SUBWrr:9908UpdateInst = Def;9909if (isDefinedOutside(Def->getOperand(2).getReg(), LoopBB))9910UpdateCounterOprNum = 1;9911else if (isDefinedOutside(Def->getOperand(1).getReg(), LoopBB))9912UpdateCounterOprNum = 2;9913else9914return false;9915break;9916default:9917return false;9918}9919CurReg = Def->getOperand(UpdateCounterOprNum).getReg();9920}99219922if (!CurReg.isVirtual())9923return false;9924if (Reg == CurReg)9925break;9926}99279928if (!UpdateInst)9929return false;99309931return true;9932}99339934std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>9935AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {9936// Accept loops that meet the following conditions9937// * The conditional branch is BCC9938// * The compare instruction is ADDS/SUBS/WHILEXX9939// * One operand of the compare is an induction variable and the other is a9940// loop invariant value9941// * The induction variable is incremented/decremented by a single instruction9942// * Does not contain CALL or instructions which have unmodeled side effects99439944for (MachineInstr &MI : *LoopBB)9945if (MI.isCall() || MI.hasUnmodeledSideEffects())9946// This instruction may use NZCV, which interferes with the instruction to9947// be inserted for loop control.9948return nullptr;99499950MachineBasicBlock *TBB = nullptr, *FBB = nullptr;9951SmallVector<MachineOperand, 4> Cond;9952if (analyzeBranch(*LoopBB, TBB, FBB, Cond))9953return nullptr;99549955// Infinite loops are not supported9956if (TBB == LoopBB && FBB == LoopBB)9957return nullptr;99589959// Must be conditional branch9960if (TBB != LoopBB && FBB == nullptr)9961return nullptr;99629963assert((TBB == LoopBB || FBB == LoopBB) &&9964"The Loop must be a single-basic-block loop");99659966MachineInstr *CondBranch = &*LoopBB->getFirstTerminator();9967const TargetRegisterInfo &TRI = getRegisterInfo();99689969if (CondBranch->getOpcode() != AArch64::Bcc)9970return nullptr;99719972// Normalization for createTripCountGreaterCondition()9973if (TBB == LoopBB)9974reverseBranchCondition(Cond);99759976MachineInstr *Comp = 
nullptr;9977unsigned CompCounterOprNum = 0;9978for (MachineInstr &MI : reverse(*LoopBB)) {9979if (MI.modifiesRegister(AArch64::NZCV, &TRI)) {9980// Guarantee that the compare is SUBS/ADDS/WHILEXX and that one of the9981// operands is a loop invariant value99829983switch (MI.getOpcode()) {9984case AArch64::SUBSXri:9985case AArch64::SUBSWri:9986case AArch64::ADDSXri:9987case AArch64::ADDSWri:9988Comp = &MI;9989CompCounterOprNum = 1;9990break;9991case AArch64::ADDSWrr:9992case AArch64::ADDSXrr:9993case AArch64::SUBSWrr:9994case AArch64::SUBSXrr:9995Comp = &MI;9996break;9997default:9998if (isWhileOpcode(MI.getOpcode())) {9999Comp = &MI;10000break;10001}10002return nullptr;10003}1000410005if (CompCounterOprNum == 0) {10006if (isDefinedOutside(Comp->getOperand(1).getReg(), LoopBB))10007CompCounterOprNum = 2;10008else if (isDefinedOutside(Comp->getOperand(2).getReg(), LoopBB))10009CompCounterOprNum = 1;10010else10011return nullptr;10012}10013break;10014}10015}10016if (!Comp)10017return nullptr;1001810019MachineInstr *Update = nullptr;10020Register Init;10021bool IsUpdatePriorComp;10022unsigned UpdateCounterOprNum;10023if (!getIndVarInfo(Comp->getOperand(CompCounterOprNum).getReg(), LoopBB,10024Update, UpdateCounterOprNum, Init, IsUpdatePriorComp))10025return nullptr;1002610027return std::make_unique<AArch64PipelinerLoopInfo>(10028LoopBB, CondBranch, Comp, CompCounterOprNum, Update, UpdateCounterOprNum,10029Init, IsUpdatePriorComp, Cond);10030}1003110032#define GET_INSTRINFO_HELPERS10033#define GET_INSTRMAP_INFO10034#include "AArch64GenInstrInfo.inc"100351003610037