Path: blob/main/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
35267 views
//===- AArch64MIPeepholeOpt.cpp - AArch64 MI peephole optimization pass ---===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7//8// This pass performs below peephole optimizations on MIR level.9//10// 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri11// MOVi64imm + ANDXrr ==> ANDXri + ANDXri12//13// 2. MOVi32imm + ADDWrr ==> ADDWRi + ADDWRi14// MOVi64imm + ADDXrr ==> ANDXri + ANDXri15//16// 3. MOVi32imm + SUBWrr ==> SUBWRi + SUBWRi17// MOVi64imm + SUBXrr ==> SUBXri + SUBXri18//19// The mov pseudo instruction could be expanded to multiple mov instructions20// later. In this case, we could try to split the constant operand of mov21// instruction into two immediates which can be directly encoded into22// *Wri/*Xri instructions. It makes two AND/ADD/SUB instructions instead of23// multiple `mov` + `and/add/sub` instructions.24//25// 4. Remove redundant ORRWrs which is generated by zero-extend.26//27// %3:gpr32 = ORRWrs $wzr, %2, 028// %4:gpr64 = SUBREG_TO_REG 0, %3, %subreg.sub_3229//30// If AArch64's 32-bit form of instruction defines the source operand of31// ORRWrs, we can remove the ORRWrs because the upper 32 bits of the source32// operand are set to zero.33//34// 5. %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx35// ==> %reg:subidx = SUBREG_TO_REG 0, %subreg, subidx36//37// 6. %intermediate:gpr32 = COPY %src:fpr12838// %dst:fpr128 = INSvi32gpr %dst_vec:fpr128, dst_index, %intermediate:gpr3239// ==> %dst:fpr128 = INSvi32lane %dst_vec:fpr128, dst_index, %src:fpr128, 040//41// In cases where a source FPR is copied to a GPR in order to be copied42// to a destination FPR, we can directly copy the values between the FPRs,43// eliminating the use of the Integer unit. When we match a pattern of44// INSvi[X]gpr that is preceded by a chain of COPY instructions from a FPR45// source, we use the INSvi[X]lane to replace the COPY & INSvi[X]gpr46// instructions.47//48// 7. If MI sets zero for high 64-bits implicitly, remove `mov 0` for high49// 64-bits. For example,50//51// %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr52// %2:fpr64 = MOVID 053// %4:fpr128 = IMPLICIT_DEF54// %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub55// %6:fpr128 = IMPLICIT_DEF56// %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub57// %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 058// ==>59// %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr60// %6:fpr128 = IMPLICIT_DEF61// %7:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub62//63//===----------------------------------------------------------------------===//6465#include "AArch64ExpandImm.h"66#include "AArch64InstrInfo.h"67#include "MCTargetDesc/AArch64AddressingModes.h"68#include "llvm/CodeGen/MachineDominators.h"69#include "llvm/CodeGen/MachineLoopInfo.h"7071using namespace llvm;7273#define DEBUG_TYPE "aarch64-mi-peephole-opt"7475namespace {7677struct AArch64MIPeepholeOpt : public MachineFunctionPass {78static char ID;7980AArch64MIPeepholeOpt() : MachineFunctionPass(ID) {81initializeAArch64MIPeepholeOptPass(*PassRegistry::getPassRegistry());82}8384const AArch64InstrInfo *TII;85const AArch64RegisterInfo *TRI;86MachineLoopInfo *MLI;87MachineRegisterInfo *MRI;8889using OpcodePair = std::pair<unsigned, unsigned>;90template <typename T>91using SplitAndOpcFunc =92std::function<std::optional<OpcodePair>(T, unsigned, T &, T &)>;93using BuildMIFunc =94std::function<void(MachineInstr &, OpcodePair, unsigned, unsigned,95Register, Register, Register)>;9697/// For instructions where an immediate operand could be split into two98/// separate immediate instructions, use the splitTwoPartImm two handle the99/// optimization.100///101/// To implement, the following function types must be passed to102/// splitTwoPartImm. A SplitAndOpcFunc must be implemented that determines if103/// splitting the immediate is valid and returns the associated new opcode. A104/// BuildMIFunc must be implemented to build the two immediate instructions.105///106/// Example Pattern (where IMM would require 2+ MOV instructions):107/// %dst = <Instr>rr %src IMM [...]108/// becomes:109/// %tmp = <Instr>ri %src (encode half IMM) [...]110/// %dst = <Instr>ri %tmp (encode half IMM) [...]111template <typename T>112bool splitTwoPartImm(MachineInstr &MI,113SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr);114115bool checkMovImmInstr(MachineInstr &MI, MachineInstr *&MovMI,116MachineInstr *&SubregToRegMI);117118template <typename T>119bool visitADDSUB(unsigned PosOpc, unsigned NegOpc, MachineInstr &MI);120template <typename T>121bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI);122123template <typename T>124bool visitAND(unsigned Opc, MachineInstr &MI);125bool visitORR(MachineInstr &MI);126bool visitINSERT(MachineInstr &MI);127bool visitINSviGPR(MachineInstr &MI, unsigned Opc);128bool visitINSvi64lane(MachineInstr &MI);129bool visitFMOVDr(MachineInstr &MI);130bool visitCopy(MachineInstr &MI);131bool runOnMachineFunction(MachineFunction &MF) override;132133StringRef getPassName() const override {134return "AArch64 MI Peephole Optimization pass";135}136137void getAnalysisUsage(AnalysisUsage &AU) const override {138AU.setPreservesCFG();139AU.addRequired<MachineLoopInfoWrapperPass>();140MachineFunctionPass::getAnalysisUsage(AU);141}142};143144char AArch64MIPeepholeOpt::ID = 0;145146} // end anonymous namespace147148INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",149"AArch64 MI Peephole Optimization", false, false)150151template <typename T>152static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {153T UImm = static_cast<T>(Imm);154if (AArch64_AM::isLogicalImmediate(UImm, RegSize))155return false;156157// If this immediate can be handled by one instruction, do not split it.158SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;159AArch64_IMM::expandMOVImm(UImm, RegSize, Insn);160if (Insn.size() == 1)161return false;162163// The bitmask immediate consists of consecutive ones. Let's say there is164// constant 0b00000000001000000000010000000000 which does not consist of165// consecutive ones. We can split it in to two bitmask immediate like166// 0b00000000001111111111110000000000 and 0b11111111111000000000011111111111.167// If we do AND with these two bitmask immediate, we can see original one.168unsigned LowestBitSet = llvm::countr_zero(UImm);169unsigned HighestBitSet = Log2_64(UImm);170171// Create a mask which is filled with one from the position of lowest bit set172// to the position of highest bit set.173T NewImm1 = (static_cast<T>(2) << HighestBitSet) -174(static_cast<T>(1) << LowestBitSet);175// Create a mask which is filled with one outside the position of lowest bit176// set and the position of highest bit set.177T NewImm2 = UImm | ~NewImm1;178179// If the split value is not valid bitmask immediate, do not split this180// constant.181if (!AArch64_AM::isLogicalImmediate(NewImm2, RegSize))182return false;183184Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize);185Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize);186return true;187}188189template <typename T>190bool AArch64MIPeepholeOpt::visitAND(191unsigned Opc, MachineInstr &MI) {192// Try below transformation.193//194// MOVi32imm + ANDWrr ==> ANDWri + ANDWri195// MOVi64imm + ANDXrr ==> ANDXri + ANDXri196//197// The mov pseudo instruction could be expanded to multiple mov instructions198// later. Let's try to split the constant operand of mov instruction into two199// bitmask immediates. It makes only two AND instructions intead of multiple200// mov + and instructions.201202return splitTwoPartImm<T>(203MI,204[Opc](T Imm, unsigned RegSize, T &Imm0,205T &Imm1) -> std::optional<OpcodePair> {206if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1))207return std::make_pair(Opc, Opc);208return std::nullopt;209},210[&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,211unsigned Imm1, Register SrcReg, Register NewTmpReg,212Register NewDstReg) {213DebugLoc DL = MI.getDebugLoc();214MachineBasicBlock *MBB = MI.getParent();215BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)216.addReg(SrcReg)217.addImm(Imm0);218BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)219.addReg(NewTmpReg)220.addImm(Imm1);221});222}223224bool AArch64MIPeepholeOpt::visitORR(MachineInstr &MI) {225// Check this ORR comes from below zero-extend pattern.226//227// def : Pat<(i64 (zext GPR32:$src)),228// (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>;229if (MI.getOperand(3).getImm() != 0)230return false;231232if (MI.getOperand(1).getReg() != AArch64::WZR)233return false;234235MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());236if (!SrcMI)237return false;238239// From https://developer.arm.com/documentation/dui0801/b/BABBGCAC240//241// When you use the 32-bit form of an instruction, the upper 32 bits of the242// source registers are ignored and the upper 32 bits of the destination243// register are set to zero.244//245// If AArch64's 32-bit form of instruction defines the source operand of246// zero-extend, we do not need the zero-extend. Let's check the MI's opcode is247// real AArch64 instruction and if it is not, do not process the opcode248// conservatively.249if (SrcMI->getOpcode() == TargetOpcode::COPY &&250SrcMI->getOperand(1).getReg().isVirtual()) {251const TargetRegisterClass *RC =252MRI->getRegClass(SrcMI->getOperand(1).getReg());253254// A COPY from an FPR will become a FMOVSWr, so do so now so that we know255// that the upper bits are zero.256if (RC != &AArch64::FPR32RegClass &&257((RC != &AArch64::FPR64RegClass && RC != &AArch64::FPR128RegClass) ||258SrcMI->getOperand(1).getSubReg() != AArch64::ssub))259return false;260Register CpySrc = SrcMI->getOperand(1).getReg();261if (SrcMI->getOperand(1).getSubReg() == AArch64::ssub) {262CpySrc = MRI->createVirtualRegister(&AArch64::FPR32RegClass);263BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),264TII->get(TargetOpcode::COPY), CpySrc)265.add(SrcMI->getOperand(1));266}267BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),268TII->get(AArch64::FMOVSWr), SrcMI->getOperand(0).getReg())269.addReg(CpySrc);270SrcMI->eraseFromParent();271}272else if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END)273return false;274275Register DefReg = MI.getOperand(0).getReg();276Register SrcReg = MI.getOperand(2).getReg();277MRI->replaceRegWith(DefReg, SrcReg);278MRI->clearKillFlags(SrcReg);279LLVM_DEBUG(dbgs() << "Removed: " << MI << "\n");280MI.eraseFromParent();281282return true;283}284285bool AArch64MIPeepholeOpt::visitINSERT(MachineInstr &MI) {286// Check this INSERT_SUBREG comes from below zero-extend pattern.287//288// From %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx289// To %reg:subidx = SUBREG_TO_REG 0, %subreg, subidx290//291// We're assuming the first operand to INSERT_SUBREG is irrelevant because a292// COPY would destroy the upper part of the register anyway293if (!MI.isRegTiedToDefOperand(1))294return false;295296Register DstReg = MI.getOperand(0).getReg();297const TargetRegisterClass *RC = MRI->getRegClass(DstReg);298MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());299if (!SrcMI)300return false;301302// From https://developer.arm.com/documentation/dui0801/b/BABBGCAC303//304// When you use the 32-bit form of an instruction, the upper 32 bits of the305// source registers are ignored and the upper 32 bits of the destination306// register are set to zero.307//308// If AArch64's 32-bit form of instruction defines the source operand of309// zero-extend, we do not need the zero-extend. Let's check the MI's opcode is310// real AArch64 instruction and if it is not, do not process the opcode311// conservatively.312if ((SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) ||313!AArch64::GPR64allRegClass.hasSubClassEq(RC))314return false;315316// Build a SUBREG_TO_REG instruction317MachineInstr *SubregMI =318BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),319TII->get(TargetOpcode::SUBREG_TO_REG), DstReg)320.addImm(0)321.add(MI.getOperand(2))322.add(MI.getOperand(3));323LLVM_DEBUG(dbgs() << MI << " replace by:\n: " << *SubregMI << "\n");324(void)SubregMI;325MI.eraseFromParent();326327return true;328}329330template <typename T>331static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1) {332// The immediate must be in the form of ((imm0 << 12) + imm1), in which both333// imm0 and imm1 are non-zero 12-bit unsigned int.334if ((Imm & 0xfff000) == 0 || (Imm & 0xfff) == 0 ||335(Imm & ~static_cast<T>(0xffffff)) != 0)336return false;337338// The immediate can not be composed via a single instruction.339SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;340AArch64_IMM::expandMOVImm(Imm, RegSize, Insn);341if (Insn.size() == 1)342return false;343344// Split Imm into (Imm0 << 12) + Imm1;345Imm0 = (Imm >> 12) & 0xfff;346Imm1 = Imm & 0xfff;347return true;348}349350template <typename T>351bool AArch64MIPeepholeOpt::visitADDSUB(352unsigned PosOpc, unsigned NegOpc, MachineInstr &MI) {353// Try below transformation.354//355// ADDWrr X, MOVi32imm ==> ADDWri + ADDWri356// ADDXrr X, MOVi64imm ==> ADDXri + ADDXri357//358// SUBWrr X, MOVi32imm ==> SUBWri + SUBWri359// SUBXrr X, MOVi64imm ==> SUBXri + SUBXri360//361// The mov pseudo instruction could be expanded to multiple mov instructions362// later. Let's try to split the constant operand of mov instruction into two363// legal add/sub immediates. It makes only two ADD/SUB instructions intead of364// multiple `mov` + `and/sub` instructions.365366// We can sometimes have ADDWrr WZR, MULi32imm that have not been constant367// folded. Make sure that we don't generate invalid instructions that use XZR368// in those cases.369if (MI.getOperand(1).getReg() == AArch64::XZR ||370MI.getOperand(1).getReg() == AArch64::WZR)371return false;372373return splitTwoPartImm<T>(374MI,375[PosOpc, NegOpc](T Imm, unsigned RegSize, T &Imm0,376T &Imm1) -> std::optional<OpcodePair> {377if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))378return std::make_pair(PosOpc, PosOpc);379if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))380return std::make_pair(NegOpc, NegOpc);381return std::nullopt;382},383[&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,384unsigned Imm1, Register SrcReg, Register NewTmpReg,385Register NewDstReg) {386DebugLoc DL = MI.getDebugLoc();387MachineBasicBlock *MBB = MI.getParent();388BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)389.addReg(SrcReg)390.addImm(Imm0)391.addImm(12);392BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)393.addReg(NewTmpReg)394.addImm(Imm1)395.addImm(0);396});397}398399template <typename T>400bool AArch64MIPeepholeOpt::visitADDSSUBS(401OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI) {402// Try the same transformation as ADDSUB but with additional requirement403// that the condition code usages are only for Equal and Not Equal404405if (MI.getOperand(1).getReg() == AArch64::XZR ||406MI.getOperand(1).getReg() == AArch64::WZR)407return false;408409return splitTwoPartImm<T>(410MI,411[PosOpcs, NegOpcs, &MI, &TRI = TRI,412&MRI = MRI](T Imm, unsigned RegSize, T &Imm0,413T &Imm1) -> std::optional<OpcodePair> {414OpcodePair OP;415if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))416OP = PosOpcs;417else if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))418OP = NegOpcs;419else420return std::nullopt;421// Check conditional uses last since it is expensive for scanning422// proceeding instructions423MachineInstr &SrcMI = *MRI->getUniqueVRegDef(MI.getOperand(1).getReg());424std::optional<UsedNZCV> NZCVUsed = examineCFlagsUse(SrcMI, MI, *TRI);425if (!NZCVUsed || NZCVUsed->C || NZCVUsed->V)426return std::nullopt;427return OP;428},429[&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,430unsigned Imm1, Register SrcReg, Register NewTmpReg,431Register NewDstReg) {432DebugLoc DL = MI.getDebugLoc();433MachineBasicBlock *MBB = MI.getParent();434BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)435.addReg(SrcReg)436.addImm(Imm0)437.addImm(12);438BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)439.addReg(NewTmpReg)440.addImm(Imm1)441.addImm(0);442});443}444445// Checks if the corresponding MOV immediate instruction is applicable for446// this peephole optimization.447bool AArch64MIPeepholeOpt::checkMovImmInstr(MachineInstr &MI,448MachineInstr *&MovMI,449MachineInstr *&SubregToRegMI) {450// Check whether current MBB is in loop and the AND is loop invariant.451MachineBasicBlock *MBB = MI.getParent();452MachineLoop *L = MLI->getLoopFor(MBB);453if (L && !L->isLoopInvariant(MI))454return false;455456// Check whether current MI's operand is MOV with immediate.457MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());458if (!MovMI)459return false;460461// If it is SUBREG_TO_REG, check its operand.462SubregToRegMI = nullptr;463if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) {464SubregToRegMI = MovMI;465MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg());466if (!MovMI)467return false;468}469470if (MovMI->getOpcode() != AArch64::MOVi32imm &&471MovMI->getOpcode() != AArch64::MOVi64imm)472return false;473474// If the MOV has multiple uses, do not split the immediate because it causes475// more instructions.476if (!MRI->hasOneUse(MovMI->getOperand(0).getReg()))477return false;478if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg()))479return false;480481// It is OK to perform this peephole optimization.482return true;483}484485template <typename T>486bool AArch64MIPeepholeOpt::splitTwoPartImm(487MachineInstr &MI,488SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr) {489unsigned RegSize = sizeof(T) * 8;490assert((RegSize == 32 || RegSize == 64) &&491"Invalid RegSize for legal immediate peephole optimization");492493// Perform several essential checks against current MI.494MachineInstr *MovMI, *SubregToRegMI;495if (!checkMovImmInstr(MI, MovMI, SubregToRegMI))496return false;497498// Split the immediate to Imm0 and Imm1, and calculate the Opcode.499T Imm = static_cast<T>(MovMI->getOperand(1).getImm()), Imm0, Imm1;500// For the 32 bit form of instruction, the upper 32 bits of the destination501// register are set to zero. If there is SUBREG_TO_REG, set the upper 32 bits502// of Imm to zero. This is essential if the Immediate value was a negative503// number since it was sign extended when we assign to the 64-bit Imm.504if (SubregToRegMI)505Imm &= 0xFFFFFFFF;506OpcodePair Opcode;507if (auto R = SplitAndOpc(Imm, RegSize, Imm0, Imm1))508Opcode = *R;509else510return false;511512// Create new MIs using the first and second opcodes. Opcodes might differ for513// flag setting operations that should only set flags on second instruction.514// NewTmpReg = Opcode.first SrcReg Imm0515// NewDstReg = Opcode.second NewTmpReg Imm1516517// Determine register classes for destinations and register operands518MachineFunction *MF = MI.getMF();519const TargetRegisterClass *FirstInstrDstRC =520TII->getRegClass(TII->get(Opcode.first), 0, TRI, *MF);521const TargetRegisterClass *FirstInstrOperandRC =522TII->getRegClass(TII->get(Opcode.first), 1, TRI, *MF);523const TargetRegisterClass *SecondInstrDstRC =524(Opcode.first == Opcode.second)525? FirstInstrDstRC526: TII->getRegClass(TII->get(Opcode.second), 0, TRI, *MF);527const TargetRegisterClass *SecondInstrOperandRC =528(Opcode.first == Opcode.second)529? FirstInstrOperandRC530: TII->getRegClass(TII->get(Opcode.second), 1, TRI, *MF);531532// Get old registers destinations and new register destinations533Register DstReg = MI.getOperand(0).getReg();534Register SrcReg = MI.getOperand(1).getReg();535Register NewTmpReg = MRI->createVirtualRegister(FirstInstrDstRC);536// In the situation that DstReg is not Virtual (likely WZR or XZR), we want to537// reuse that same destination register.538Register NewDstReg = DstReg.isVirtual()539? MRI->createVirtualRegister(SecondInstrDstRC)540: DstReg;541542// Constrain registers based on their new uses543MRI->constrainRegClass(SrcReg, FirstInstrOperandRC);544MRI->constrainRegClass(NewTmpReg, SecondInstrOperandRC);545if (DstReg != NewDstReg)546MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg));547548// Call the delegating operation to build the instruction549BuildInstr(MI, Opcode, Imm0, Imm1, SrcReg, NewTmpReg, NewDstReg);550551// replaceRegWith changes MI's definition register. Keep it for SSA form until552// deleting MI. Only if we made a new destination register.553if (DstReg != NewDstReg) {554MRI->replaceRegWith(DstReg, NewDstReg);555MI.getOperand(0).setReg(DstReg);556}557558// Record the MIs need to be removed.559MI.eraseFromParent();560if (SubregToRegMI)561SubregToRegMI->eraseFromParent();562MovMI->eraseFromParent();563564return true;565}566567bool AArch64MIPeepholeOpt::visitINSviGPR(MachineInstr &MI, unsigned Opc) {568// Check if this INSvi[X]gpr comes from COPY of a source FPR128569//570// From571// %intermediate1:gpr64 = COPY %src:fpr128572// %intermediate2:gpr32 = COPY %intermediate1:gpr64573// %dst:fpr128 = INSvi[X]gpr %dst_vec:fpr128, dst_index, %intermediate2:gpr32574// To575// %dst:fpr128 = INSvi[X]lane %dst_vec:fpr128, dst_index, %src:fpr128,576// src_index577// where src_index = 0, X = [8|16|32|64]578579MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());580581// For a chain of COPY instructions, find the initial source register582// and check if it's an FPR128583while (true) {584if (!SrcMI || SrcMI->getOpcode() != TargetOpcode::COPY)585return false;586587if (!SrcMI->getOperand(1).getReg().isVirtual())588return false;589590if (MRI->getRegClass(SrcMI->getOperand(1).getReg()) ==591&AArch64::FPR128RegClass) {592break;593}594SrcMI = MRI->getUniqueVRegDef(SrcMI->getOperand(1).getReg());595}596597Register DstReg = MI.getOperand(0).getReg();598Register SrcReg = SrcMI->getOperand(1).getReg();599MachineInstr *INSvilaneMI =600BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(Opc), DstReg)601.add(MI.getOperand(1))602.add(MI.getOperand(2))603.addUse(SrcReg, getRegState(SrcMI->getOperand(1)))604.addImm(0);605606LLVM_DEBUG(dbgs() << MI << " replace by:\n: " << *INSvilaneMI << "\n");607(void)INSvilaneMI;608MI.eraseFromParent();609return true;610}611612// All instructions that set a FPR64 will implicitly zero the top bits of the613// register.614static bool is64bitDefwithZeroHigh64bit(MachineInstr *MI,615MachineRegisterInfo *MRI) {616if (!MI->getOperand(0).isReg() || !MI->getOperand(0).isDef())617return false;618const TargetRegisterClass *RC = MRI->getRegClass(MI->getOperand(0).getReg());619if (RC != &AArch64::FPR64RegClass)620return false;621return MI->getOpcode() > TargetOpcode::GENERIC_OP_END;622}623624bool AArch64MIPeepholeOpt::visitINSvi64lane(MachineInstr &MI) {625// Check the MI for low 64-bits sets zero for high 64-bits implicitly.626// We are expecting below case.627//628// %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr629// %6:fpr128 = IMPLICIT_DEF630// %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub631// %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0632MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());633if (Low64MI->getOpcode() != AArch64::INSERT_SUBREG)634return false;635Low64MI = MRI->getUniqueVRegDef(Low64MI->getOperand(2).getReg());636if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))637return false;638639// Check there is `mov 0` MI for high 64-bits.640// We are expecting below cases.641//642// %2:fpr64 = MOVID 0643// %4:fpr128 = IMPLICIT_DEF644// %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub645// %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0646// or647// %5:fpr128 = MOVIv2d_ns 0648// %6:fpr64 = COPY %5.dsub:fpr128649// %8:fpr128 = IMPLICIT_DEF650// %7:fpr128 = INSERT_SUBREG %8:fpr128(tied-def 0), killed %6:fpr64, %subreg.dsub651// %11:fpr128 = INSvi64lane %9:fpr128(tied-def 0), 1, killed %7:fpr128, 0652MachineInstr *High64MI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());653if (!High64MI || High64MI->getOpcode() != AArch64::INSERT_SUBREG)654return false;655High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(2).getReg());656if (High64MI && High64MI->getOpcode() == TargetOpcode::COPY)657High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(1).getReg());658if (!High64MI || (High64MI->getOpcode() != AArch64::MOVID &&659High64MI->getOpcode() != AArch64::MOVIv2d_ns))660return false;661if (High64MI->getOperand(1).getImm() != 0)662return false;663664// Let's remove MIs for high 64-bits.665Register OldDef = MI.getOperand(0).getReg();666Register NewDef = MI.getOperand(1).getReg();667MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef));668MRI->replaceRegWith(OldDef, NewDef);669MI.eraseFromParent();670671return true;672}673674bool AArch64MIPeepholeOpt::visitFMOVDr(MachineInstr &MI) {675// An FMOVDr sets the high 64-bits to zero implicitly, similar to ORR for GPR.676MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());677if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))678return false;679680// Let's remove MIs for high 64-bits.681Register OldDef = MI.getOperand(0).getReg();682Register NewDef = MI.getOperand(1).getReg();683LLVM_DEBUG(dbgs() << "Removing: " << MI << "\n");684MRI->clearKillFlags(OldDef);685MRI->clearKillFlags(NewDef);686MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef));687MRI->replaceRegWith(OldDef, NewDef);688MI.eraseFromParent();689690return true;691}692693// Across a basic-block we might have in i32 extract from a value that only694// operates on upper bits (for example a sxtw). We can replace the COPY with a695// new version skipping the sxtw.696bool AArch64MIPeepholeOpt::visitCopy(MachineInstr &MI) {697Register InputReg = MI.getOperand(1).getReg();698if (MI.getOperand(1).getSubReg() != AArch64::sub_32 ||699!MRI->hasOneNonDBGUse(InputReg))700return false;701702MachineInstr *SrcMI = MRI->getUniqueVRegDef(InputReg);703SmallPtrSet<MachineInstr *, 4> DeadInstrs;704DeadInstrs.insert(SrcMI);705while (SrcMI && SrcMI->isFullCopy() &&706MRI->hasOneNonDBGUse(SrcMI->getOperand(1).getReg())) {707SrcMI = MRI->getUniqueVRegDef(SrcMI->getOperand(1).getReg());708DeadInstrs.insert(SrcMI);709}710711if (!SrcMI || SrcMI->getOpcode() != AArch64::SBFMXri ||712SrcMI->getOperand(2).getImm() != 0 || SrcMI->getOperand(3).getImm() != 31)713return false;714715Register SrcReg = SrcMI->getOperand(1).getReg();716MRI->constrainRegClass(SrcReg, MRI->getRegClass(InputReg));717LLVM_DEBUG(dbgs() << "Optimizing: " << MI);718MI.getOperand(1).setReg(SrcReg);719LLVM_DEBUG(dbgs() << " to: " << MI);720for (auto *DeadMI : DeadInstrs) {721LLVM_DEBUG(dbgs() << " Removing: " << *DeadMI);722DeadMI->eraseFromParent();723}724return true;725}726727bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {728if (skipFunction(MF.getFunction()))729return false;730731TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());732TRI = static_cast<const AArch64RegisterInfo *>(733MF.getSubtarget().getRegisterInfo());734MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();735MRI = &MF.getRegInfo();736737assert(MRI->isSSA() && "Expected to be run on SSA form!");738739bool Changed = false;740741for (MachineBasicBlock &MBB : MF) {742for (MachineInstr &MI : make_early_inc_range(MBB)) {743switch (MI.getOpcode()) {744default:745break;746case AArch64::INSERT_SUBREG:747Changed |= visitINSERT(MI);748break;749case AArch64::ANDWrr:750Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI);751break;752case AArch64::ANDXrr:753Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI);754break;755case AArch64::ORRWrs:756Changed |= visitORR(MI);757break;758case AArch64::ADDWrr:759Changed |= visitADDSUB<uint32_t>(AArch64::ADDWri, AArch64::SUBWri, MI);760break;761case AArch64::SUBWrr:762Changed |= visitADDSUB<uint32_t>(AArch64::SUBWri, AArch64::ADDWri, MI);763break;764case AArch64::ADDXrr:765Changed |= visitADDSUB<uint64_t>(AArch64::ADDXri, AArch64::SUBXri, MI);766break;767case AArch64::SUBXrr:768Changed |= visitADDSUB<uint64_t>(AArch64::SUBXri, AArch64::ADDXri, MI);769break;770case AArch64::ADDSWrr:771Changed |=772visitADDSSUBS<uint32_t>({AArch64::ADDWri, AArch64::ADDSWri},773{AArch64::SUBWri, AArch64::SUBSWri}, MI);774break;775case AArch64::SUBSWrr:776Changed |=777visitADDSSUBS<uint32_t>({AArch64::SUBWri, AArch64::SUBSWri},778{AArch64::ADDWri, AArch64::ADDSWri}, MI);779break;780case AArch64::ADDSXrr:781Changed |=782visitADDSSUBS<uint64_t>({AArch64::ADDXri, AArch64::ADDSXri},783{AArch64::SUBXri, AArch64::SUBSXri}, MI);784break;785case AArch64::SUBSXrr:786Changed |=787visitADDSSUBS<uint64_t>({AArch64::SUBXri, AArch64::SUBSXri},788{AArch64::ADDXri, AArch64::ADDSXri}, MI);789break;790case AArch64::INSvi64gpr:791Changed |= visitINSviGPR(MI, AArch64::INSvi64lane);792break;793case AArch64::INSvi32gpr:794Changed |= visitINSviGPR(MI, AArch64::INSvi32lane);795break;796case AArch64::INSvi16gpr:797Changed |= visitINSviGPR(MI, AArch64::INSvi16lane);798break;799case AArch64::INSvi8gpr:800Changed |= visitINSviGPR(MI, AArch64::INSvi8lane);801break;802case AArch64::INSvi64lane:803Changed |= visitINSvi64lane(MI);804break;805case AArch64::FMOVDr:806Changed |= visitFMOVDr(MI);807break;808case AArch64::COPY:809Changed |= visitCopy(MI);810break;811}812}813}814815return Changed;816}817818FunctionPass *llvm::createAArch64MIPeepholeOptPass() {819return new AArch64MIPeepholeOpt();820}821822823