Path: blob/main/contrib/llvm-project/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
35269 views
//===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---*- C++ -*-===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7/// \file8/// Finalize v8.1-m low-overhead loops by converting the associated pseudo9/// instructions into machine operations.10/// The expectation is that the loop contains three pseudo instructions:11/// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop12/// form should be in the preheader, whereas the while form should be in the13/// preheaders only predecessor.14/// - t2LoopDec - placed within in the loop body.15/// - t2LoopEnd - the loop latch terminator.16///17/// In addition to this, we also look for the presence of the VCTP instruction,18/// which determines whether we can generated the tail-predicated low-overhead19/// loop form.20///21/// Assumptions and Dependencies:22/// Low-overhead loops are constructed and executed using a setup instruction:23/// DLS, WLS, DLSTP or WLSTP and an instruction that loops back: LE or LETP.24/// WLS(TP) and LE(TP) are branching instructions with a (large) limited range25/// but fixed polarity: WLS can only branch forwards and LE can only branch26/// backwards. These restrictions mean that this pass is dependent upon block27/// layout and block sizes, which is why it's the last pass to run. The same is28/// true for ConstantIslands, but this pass does not increase the size of the29/// basic blocks, nor does it change the CFG. Instructions are mainly removed30/// during the transform and pseudo instructions are replaced by real ones. In31/// some cases, when we have to revert to a 'normal' loop, we have to introduce32/// multiple instructions for a single pseudo (see RevertWhile and33/// RevertLoopEnd). To handle this situation, t2WhileLoopStartLR and t2LoopEnd34/// are defined to be as large as this maximum sequence of replacement35/// instructions.36///37/// A note on VPR.P0 (the lane mask):38/// VPT, VCMP, VPNOT and VCTP won't overwrite VPR.P0 when they update it in a39/// "VPT Active" context (which includes low-overhead loops and vpt blocks).40/// They will simply "and" the result of their calculation with the current41/// value of VPR.P0. You can think of it like this:42/// \verbatim43/// if VPT active: ; Between a DLSTP/LETP, or for predicated instrs44/// VPR.P0 &= Value45/// else46/// VPR.P0 = Value47/// \endverbatim48/// When we're inside the low-overhead loop (between DLSTP and LETP), we always49/// fall in the "VPT active" case, so we can consider that all VPR writes by50/// one of those instruction is actually a "and".51//===----------------------------------------------------------------------===//5253#include "ARM.h"54#include "ARMBaseInstrInfo.h"55#include "ARMBaseRegisterInfo.h"56#include "ARMBasicBlockInfo.h"57#include "ARMSubtarget.h"58#include "MVETailPredUtils.h"59#include "Thumb2InstrInfo.h"60#include "llvm/ADT/SetOperations.h"61#include "llvm/ADT/SetVector.h"62#include "llvm/CodeGen/LivePhysRegs.h"63#include "llvm/CodeGen/MachineFrameInfo.h"64#include "llvm/CodeGen/MachineFunctionPass.h"65#include "llvm/CodeGen/MachineLoopInfo.h"66#include "llvm/CodeGen/MachineLoopUtils.h"67#include "llvm/CodeGen/MachineRegisterInfo.h"68#include "llvm/CodeGen/Passes.h"69#include "llvm/CodeGen/ReachingDefAnalysis.h"70#include "llvm/MC/MCInstrDesc.h"7172using namespace llvm;7374#define DEBUG_TYPE "arm-low-overhead-loops"75#define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass"7677static cl::opt<bool>78DisableTailPredication("arm-loloops-disable-tailpred", cl::Hidden,79cl::desc("Disable tail-predication in the ARM LowOverheadLoop pass"),80cl::init(false));8182static cl::opt<bool>83DisableOmitDLS("arm-disable-omit-dls", cl::Hidden,84cl::desc("Disable omitting 'dls lr, lr' instructions"),85cl::init(false));8687static bool isVectorPredicated(MachineInstr *MI) {88int PIdx = llvm::findFirstVPTPredOperandIdx(*MI);89return PIdx != -1 && MI->getOperand(PIdx + 1).getReg() == ARM::VPR;90}9192static bool isVectorPredicate(MachineInstr *MI) {93return MI->findRegisterDefOperandIdx(ARM::VPR, /*TRI=*/nullptr) != -1;94}9596static bool hasVPRUse(MachineInstr &MI) {97return MI.findRegisterUseOperandIdx(ARM::VPR, /*TRI=*/nullptr) != -1;98}99100static bool isDomainMVE(MachineInstr *MI) {101uint64_t Domain = MI->getDesc().TSFlags & ARMII::DomainMask;102return Domain == ARMII::DomainMVE;103}104105static int getVecSize(const MachineInstr &MI) {106const MCInstrDesc &MCID = MI.getDesc();107uint64_t Flags = MCID.TSFlags;108return (Flags & ARMII::VecSize) >> ARMII::VecSizeShift;109}110111static bool shouldInspect(MachineInstr &MI) {112if (MI.isDebugInstr())113return false;114return isDomainMVE(&MI) || isVectorPredicate(&MI) || hasVPRUse(MI);115}116117static bool isHorizontalReduction(const MachineInstr &MI) {118const MCInstrDesc &MCID = MI.getDesc();119uint64_t Flags = MCID.TSFlags;120return (Flags & ARMII::HorizontalReduction) != 0;121}122123namespace {124125using InstSet = SmallPtrSetImpl<MachineInstr *>;126127class PostOrderLoopTraversal {128MachineLoop &ML;129MachineLoopInfo &MLI;130SmallPtrSet<MachineBasicBlock*, 4> Visited;131SmallVector<MachineBasicBlock*, 4> Order;132133public:134PostOrderLoopTraversal(MachineLoop &ML, MachineLoopInfo &MLI)135: ML(ML), MLI(MLI) { }136137const SmallVectorImpl<MachineBasicBlock*> &getOrder() const {138return Order;139}140141// Visit all the blocks within the loop, as well as exit blocks and any142// blocks properly dominating the header.143void ProcessLoop() {144std::function<void(MachineBasicBlock*)> Search = [this, &Search]145(MachineBasicBlock *MBB) -> void {146if (Visited.count(MBB))147return;148149Visited.insert(MBB);150for (auto *Succ : MBB->successors()) {151if (!ML.contains(Succ))152continue;153Search(Succ);154}155Order.push_back(MBB);156};157158// Insert exit blocks.159SmallVector<MachineBasicBlock*, 2> ExitBlocks;160ML.getExitBlocks(ExitBlocks);161append_range(Order, ExitBlocks);162163// Then add the loop body.164Search(ML.getHeader());165166// Then try the preheader and its predecessors.167std::function<void(MachineBasicBlock*)> GetPredecessor =168[this, &GetPredecessor] (MachineBasicBlock *MBB) -> void {169Order.push_back(MBB);170if (MBB->pred_size() == 1)171GetPredecessor(*MBB->pred_begin());172};173174if (auto *Preheader = ML.getLoopPreheader())175GetPredecessor(Preheader);176else if (auto *Preheader = MLI.findLoopPreheader(&ML, true, true))177GetPredecessor(Preheader);178}179};180181class VPTBlock {182SmallVector<MachineInstr *, 4> Insts;183184public:185VPTBlock(MachineInstr *MI) { Insts.push_back(MI); }186187// Have we found an instruction within the block which defines the vpr? If188// so, not all the instructions in the block will have the same predicate.189bool hasUniformPredicate() { return getDivergent() == nullptr; }190191// If it exists, return the first internal instruction which modifies the192// VPR.193MachineInstr *getDivergent() {194SmallVectorImpl<MachineInstr *> &Insts = getInsts();195for (unsigned i = 1; i < Insts.size(); ++i) {196MachineInstr *Next = Insts[i];197if (isVectorPredicate(Next))198return Next; // Found an instruction altering the vpr.199}200return nullptr;201}202203void insert(MachineInstr *MI) {204Insts.push_back(MI);205// VPT/VPST + 4 predicated instructions.206assert(Insts.size() <= 5 && "Too many instructions in VPT block!");207}208209bool containsVCTP() const { return llvm::any_of(Insts, isVCTP); }210211unsigned size() const { return Insts.size(); }212SmallVectorImpl<MachineInstr *> &getInsts() { return Insts; }213};214215// Represent the current state of the VPR and hold all instances which216// represent a VPT block, which is a list of instructions that begins with a217// VPT/VPST and has a maximum of four proceeding instructions. All218// instructions within the block are predicated upon the vpr and we allow219// instructions to define the vpr within in the block too.220class VPTState {221friend struct LowOverheadLoop;222223SmallVector<VPTBlock, 4> Blocks;224SetVector<MachineInstr *> CurrentPredicates;225std::map<MachineInstr *, SetVector<MachineInstr *>> PredicatedInsts;226227void CreateVPTBlock(MachineInstr *MI) {228assert((CurrentPredicates.size() || MI->getParent()->isLiveIn(ARM::VPR))229&& "Can't begin VPT without predicate");230Blocks.emplace_back(MI);231// The execution of MI is predicated upon the current set of instructions232// that are AND'ed together to form the VPR predicate value. In the case233// that MI is a VPT, CurrentPredicates will also just be MI.234PredicatedInsts[MI] = CurrentPredicates;235}236237void addInst(MachineInstr *MI) {238Blocks.back().insert(MI);239PredicatedInsts[MI] = CurrentPredicates;240}241242void addPredicate(MachineInstr *MI) {243LLVM_DEBUG(dbgs() << "ARM Loops: Adding VPT Predicate: " << *MI);244CurrentPredicates.insert(MI);245}246247void resetPredicate(MachineInstr *MI) {248LLVM_DEBUG(dbgs() << "ARM Loops: Resetting VPT Predicate: " << *MI);249CurrentPredicates.clear();250CurrentPredicates.insert(MI);251}252253public:254// Return whether the given instruction is predicated upon a VCTP.255bool isPredicatedOnVCTP(MachineInstr *MI, bool Exclusive = false) {256SetVector<MachineInstr *> &Predicates = PredicatedInsts[MI];257if (Exclusive && Predicates.size() != 1)258return false;259// We do not know how to convert an else predicate of a VCTP.260if (getVPTInstrPredicate(*MI) == ARMVCC::Else)261return false;262return llvm::any_of(Predicates, isVCTP);263}264265// Is the VPST, controlling the block entry, predicated upon a VCTP.266bool isEntryPredicatedOnVCTP(VPTBlock &Block, bool Exclusive = false) {267SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();268return isPredicatedOnVCTP(Insts.front(), Exclusive);269}270271// If this block begins with a VPT, we can check whether it's using272// at least one predicated input(s), as well as possible loop invariant273// which would result in it being implicitly predicated.274bool hasImplicitlyValidVPT(VPTBlock &Block, ReachingDefAnalysis &RDA) {275SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();276MachineInstr *VPT = Insts.front();277assert(isVPTOpcode(VPT->getOpcode()) &&278"Expected VPT block to begin with VPT/VPST");279280if (VPT->getOpcode() == ARM::MVE_VPST)281return false;282283// If the VPT block does not define something that is an "output", then284// the tail-predicated version will just perform a subset of the original285// vpt block, where the last lanes should not be used.286if (isVPTOpcode(VPT->getOpcode()) &&287all_of(Block.getInsts(), [](const MachineInstr *MI) {288return !MI->mayStore() && !MI->mayLoad() &&289!isHorizontalReduction(*MI) && !isVCTP(MI);290}))291return true;292293auto IsOperandPredicated = [&](MachineInstr *MI, unsigned Idx) {294MachineInstr *Op = RDA.getMIOperand(MI, MI->getOperand(Idx));295return Op && PredicatedInsts.count(Op) && isPredicatedOnVCTP(Op);296};297298auto IsOperandInvariant = [&](MachineInstr *MI, unsigned Idx) {299MachineOperand &MO = MI->getOperand(Idx);300if (!MO.isReg() || !MO.getReg())301return true;302303SmallPtrSet<MachineInstr *, 2> Defs;304RDA.getGlobalReachingDefs(MI, MO.getReg(), Defs);305if (Defs.empty())306return true;307308for (auto *Def : Defs)309if (Def->getParent() == VPT->getParent())310return false;311return true;312};313314// Check that at least one of the operands is directly predicated on a315// vctp and allow an invariant value too.316return (IsOperandPredicated(VPT, 1) || IsOperandPredicated(VPT, 2)) &&317(IsOperandPredicated(VPT, 1) || IsOperandInvariant(VPT, 1)) &&318(IsOperandPredicated(VPT, 2) || IsOperandInvariant(VPT, 2));319}320321bool isValid(ReachingDefAnalysis &RDA) {322// All predication within the loop should be based on vctp. If the block323// isn't predicated on entry, check whether the vctp is within the block324// and that all other instructions are then predicated on it.325for (auto &Block : Blocks) {326if (isEntryPredicatedOnVCTP(Block, false) &&327!any_of(drop_begin(Block.getInsts()), [](const MachineInstr *MI) {328return getVPTInstrPredicate(*MI) == ARMVCC::Else;329}))330continue;331if (hasImplicitlyValidVPT(Block, RDA))332continue;333334SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();335// We don't know how to convert a block with just a VPT;VCTP into336// anything valid once we remove the VCTP. For now just bail out.337assert(isVPTOpcode(Insts.front()->getOpcode()) &&338"Expected VPT block to start with a VPST or VPT!");339if (Insts.size() == 2 && Insts.front()->getOpcode() != ARM::MVE_VPST &&340isVCTP(Insts.back()))341return false;342343for (auto *MI : Insts) {344// Check that any internal VCTPs are 'Then' predicated.345if (isVCTP(MI) && getVPTInstrPredicate(*MI) != ARMVCC::Then)346return false;347// Skip other instructions that build up the predicate.348if (MI->getOpcode() == ARM::MVE_VPST || isVectorPredicate(MI))349continue;350// Check that any other instructions are predicated upon a vctp.351// TODO: We could infer when VPTs are implicitly predicated on the352// vctp (when the operands are predicated).353if (!isPredicatedOnVCTP(MI)) {354LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *MI);355return false;356}357}358}359return true;360}361};362363struct LowOverheadLoop {364365MachineLoop &ML;366MachineBasicBlock *Preheader = nullptr;367MachineLoopInfo &MLI;368ReachingDefAnalysis &RDA;369const TargetRegisterInfo &TRI;370const ARMBaseInstrInfo &TII;371MachineFunction *MF = nullptr;372MachineBasicBlock::iterator StartInsertPt;373MachineBasicBlock *StartInsertBB = nullptr;374MachineInstr *Start = nullptr;375MachineInstr *Dec = nullptr;376MachineInstr *End = nullptr;377MachineOperand TPNumElements;378SmallVector<MachineInstr *, 4> VCTPs;379SmallPtrSet<MachineInstr *, 4> ToRemove;380SmallPtrSet<MachineInstr *, 4> BlockMasksToRecompute;381SmallPtrSet<MachineInstr *, 4> DoubleWidthResultInstrs;382SmallPtrSet<MachineInstr *, 4> VMOVCopies;383bool Revert = false;384bool CannotTailPredicate = false;385VPTState VPTstate;386387LowOverheadLoop(MachineLoop &ML, MachineLoopInfo &MLI,388ReachingDefAnalysis &RDA, const TargetRegisterInfo &TRI,389const ARMBaseInstrInfo &TII)390: ML(ML), MLI(MLI), RDA(RDA), TRI(TRI), TII(TII),391TPNumElements(MachineOperand::CreateImm(0)) {392MF = ML.getHeader()->getParent();393if (auto *MBB = ML.getLoopPreheader())394Preheader = MBB;395else if (auto *MBB = MLI.findLoopPreheader(&ML, true, true))396Preheader = MBB;397}398399// If this is an MVE instruction, check that we know how to use tail400// predication with it. Record VPT blocks and return whether the401// instruction is valid for tail predication.402bool ValidateMVEInst(MachineInstr *MI);403404void AnalyseMVEInst(MachineInstr *MI) {405CannotTailPredicate = !ValidateMVEInst(MI);406}407408bool IsTailPredicationLegal() const {409// For now, let's keep things really simple and only support a single410// block for tail predication.411return !Revert && FoundAllComponents() && !VCTPs.empty() &&412!CannotTailPredicate && ML.getNumBlocks() == 1;413}414415// Given that MI is a VCTP, check that is equivalent to any other VCTPs416// found.417bool AddVCTP(MachineInstr *MI);418419// Check that the predication in the loop will be equivalent once we420// perform the conversion. Also ensure that we can provide the number421// of elements to the loop start instruction.422bool ValidateTailPredicate();423424// Check that any values available outside of the loop will be the same425// after tail predication conversion.426bool ValidateLiveOuts();427428// Check the branch targets are within range and we satisfy our429// restrictions.430void Validate(ARMBasicBlockUtils *BBUtils);431432bool FoundAllComponents() const {433return Start && Dec && End;434}435436SmallVectorImpl<VPTBlock> &getVPTBlocks() { return VPTstate.Blocks; }437438// Return the operand for the loop start instruction. This will be the loop439// iteration count, or the number of elements if we're tail predicating.440MachineOperand &getLoopStartOperand() {441if (IsTailPredicationLegal())442return TPNumElements;443return Start->getOperand(1);444}445446unsigned getStartOpcode() const {447bool IsDo = isDoLoopStart(*Start);448if (!IsTailPredicationLegal())449return IsDo ? ARM::t2DLS : ARM::t2WLS;450451return VCTPOpcodeToLSTP(VCTPs.back()->getOpcode(), IsDo);452}453454void dump() const {455if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start;456if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec;457if (End) dbgs() << "ARM Loops: Found Loop End: " << *End;458if (!VCTPs.empty()) {459dbgs() << "ARM Loops: Found VCTP(s):\n";460for (auto *MI : VCTPs)461dbgs() << " - " << *MI;462}463if (!FoundAllComponents())464dbgs() << "ARM Loops: Not a low-overhead loop.\n";465else if (!(Start && Dec && End))466dbgs() << "ARM Loops: Failed to find all loop components.\n";467}468};469470class ARMLowOverheadLoops : public MachineFunctionPass {471MachineFunction *MF = nullptr;472MachineLoopInfo *MLI = nullptr;473ReachingDefAnalysis *RDA = nullptr;474const ARMBaseInstrInfo *TII = nullptr;475MachineRegisterInfo *MRI = nullptr;476const TargetRegisterInfo *TRI = nullptr;477std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr;478479public:480static char ID;481482ARMLowOverheadLoops() : MachineFunctionPass(ID) { }483484void getAnalysisUsage(AnalysisUsage &AU) const override {485AU.setPreservesCFG();486AU.addRequired<MachineLoopInfoWrapperPass>();487AU.addRequired<ReachingDefAnalysis>();488MachineFunctionPass::getAnalysisUsage(AU);489}490491bool runOnMachineFunction(MachineFunction &MF) override;492493MachineFunctionProperties getRequiredProperties() const override {494return MachineFunctionProperties().set(495MachineFunctionProperties::Property::NoVRegs).set(496MachineFunctionProperties::Property::TracksLiveness);497}498499StringRef getPassName() const override {500return ARM_LOW_OVERHEAD_LOOPS_NAME;501}502503private:504bool ProcessLoop(MachineLoop *ML);505506bool RevertNonLoops();507508void RevertWhile(MachineInstr *MI) const;509void RevertDo(MachineInstr *MI) const;510511bool RevertLoopDec(MachineInstr *MI) const;512513void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const;514515void RevertLoopEndDec(MachineInstr *MI) const;516517void ConvertVPTBlocks(LowOverheadLoop &LoLoop);518519MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop);520521void Expand(LowOverheadLoop &LoLoop);522523void IterationCountDCE(LowOverheadLoop &LoLoop);524};525}526527char ARMLowOverheadLoops::ID = 0;528529INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME,530false, false)531532static bool TryRemove(MachineInstr *MI, ReachingDefAnalysis &RDA,533InstSet &ToRemove, InstSet &Ignore) {534535// Check that we can remove all of Killed without having to modify any IT536// blocks.537auto WontCorruptITs = [](InstSet &Killed, ReachingDefAnalysis &RDA) {538// Collect the dead code and the MBBs in which they reside.539SmallPtrSet<MachineBasicBlock*, 2> BasicBlocks;540for (auto *Dead : Killed)541BasicBlocks.insert(Dead->getParent());542543// Collect IT blocks in all affected basic blocks.544std::map<MachineInstr *, SmallPtrSet<MachineInstr *, 2>> ITBlocks;545for (auto *MBB : BasicBlocks) {546for (auto &IT : *MBB) {547if (IT.getOpcode() != ARM::t2IT)548continue;549RDA.getReachingLocalUses(&IT, MCRegister::from(ARM::ITSTATE),550ITBlocks[&IT]);551}552}553554// If we're removing all of the instructions within an IT block, then555// also remove the IT instruction.556SmallPtrSet<MachineInstr *, 2> ModifiedITs;557SmallPtrSet<MachineInstr *, 2> RemoveITs;558for (auto *Dead : Killed) {559if (MachineOperand *MO =560Dead->findRegisterUseOperand(ARM::ITSTATE, /*TRI=*/nullptr)) {561MachineInstr *IT = RDA.getMIOperand(Dead, *MO);562RemoveITs.insert(IT);563auto &CurrentBlock = ITBlocks[IT];564CurrentBlock.erase(Dead);565if (CurrentBlock.empty())566ModifiedITs.erase(IT);567else568ModifiedITs.insert(IT);569}570}571if (!ModifiedITs.empty())572return false;573Killed.insert(RemoveITs.begin(), RemoveITs.end());574return true;575};576577SmallPtrSet<MachineInstr *, 2> Uses;578if (!RDA.isSafeToRemove(MI, Uses, Ignore))579return false;580581if (WontCorruptITs(Uses, RDA)) {582ToRemove.insert(Uses.begin(), Uses.end());583LLVM_DEBUG(dbgs() << "ARM Loops: Able to remove: " << *MI584<< " - can also remove:\n";585for (auto *Use : Uses)586dbgs() << " - " << *Use);587588SmallPtrSet<MachineInstr*, 4> Killed;589RDA.collectKilledOperands(MI, Killed);590if (WontCorruptITs(Killed, RDA)) {591ToRemove.insert(Killed.begin(), Killed.end());592LLVM_DEBUG(for (auto *Dead : Killed)593dbgs() << " - " << *Dead);594}595return true;596}597return false;598}599600bool LowOverheadLoop::ValidateTailPredicate() {601if (!IsTailPredicationLegal()) {602LLVM_DEBUG(if (VCTPs.empty())603dbgs() << "ARM Loops: Didn't find a VCTP instruction.\n";604dbgs() << "ARM Loops: Tail-predication is not valid.\n");605return false;606}607608assert(!VCTPs.empty() && "VCTP instruction expected but is not set");609assert(ML.getBlocks().size() == 1 &&610"Shouldn't be processing a loop with more than one block");611612if (DisableTailPredication) {613LLVM_DEBUG(dbgs() << "ARM Loops: tail-predication is disabled\n");614return false;615}616617if (!VPTstate.isValid(RDA)) {618LLVM_DEBUG(dbgs() << "ARM Loops: Invalid VPT state.\n");619return false;620}621622if (!ValidateLiveOuts()) {623LLVM_DEBUG(dbgs() << "ARM Loops: Invalid live outs.\n");624return false;625}626627// For tail predication, we need to provide the number of elements, instead628// of the iteration count, to the loop start instruction. The number of629// elements is provided to the vctp instruction, so we need to check that630// we can use this register at InsertPt.631MachineInstr *VCTP = VCTPs.back();632if (Start->getOpcode() == ARM::t2DoLoopStartTP ||633Start->getOpcode() == ARM::t2WhileLoopStartTP) {634TPNumElements = Start->getOperand(2);635StartInsertPt = Start;636StartInsertBB = Start->getParent();637} else {638TPNumElements = VCTP->getOperand(1);639MCRegister NumElements = TPNumElements.getReg().asMCReg();640641// If the register is defined within loop, then we can't perform TP.642// TODO: Check whether this is just a mov of a register that would be643// available.644if (RDA.hasLocalDefBefore(VCTP, NumElements)) {645LLVM_DEBUG(dbgs() << "ARM Loops: VCTP operand is defined in the loop.\n");646return false;647}648649// The element count register maybe defined after InsertPt, in which case we650// need to try to move either InsertPt or the def so that the [w|d]lstp can651// use the value.652653if (StartInsertPt != StartInsertBB->end() &&654!RDA.isReachingDefLiveOut(&*StartInsertPt, NumElements)) {655if (auto *ElemDef =656RDA.getLocalLiveOutMIDef(StartInsertBB, NumElements)) {657if (RDA.isSafeToMoveForwards(ElemDef, &*StartInsertPt)) {658ElemDef->removeFromParent();659StartInsertBB->insert(StartInsertPt, ElemDef);660LLVM_DEBUG(dbgs()661<< "ARM Loops: Moved element count def: " << *ElemDef);662} else if (RDA.isSafeToMoveBackwards(&*StartInsertPt, ElemDef)) {663StartInsertPt->removeFromParent();664StartInsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef),665&*StartInsertPt);666LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef);667} else {668// If we fail to move an instruction and the element count is provided669// by a mov, use the mov operand if it will have the same value at the670// insertion point671MachineOperand Operand = ElemDef->getOperand(1);672if (isMovRegOpcode(ElemDef->getOpcode()) &&673RDA.getUniqueReachingMIDef(ElemDef, Operand.getReg().asMCReg()) ==674RDA.getUniqueReachingMIDef(&*StartInsertPt,675Operand.getReg().asMCReg())) {676TPNumElements = Operand;677NumElements = TPNumElements.getReg();678} else {679LLVM_DEBUG(dbgs()680<< "ARM Loops: Unable to move element count to loop "681<< "start instruction.\n");682return false;683}684}685}686}687688// Especially in the case of while loops, InsertBB may not be the689// preheader, so we need to check that the register isn't redefined690// before entering the loop.691auto CannotProvideElements = [this](MachineBasicBlock *MBB,692MCRegister NumElements) {693if (MBB->empty())694return false;695// NumElements is redefined in this block.696if (RDA.hasLocalDefBefore(&MBB->back(), NumElements))697return true;698699// Don't continue searching up through multiple predecessors.700if (MBB->pred_size() > 1)701return true;702703return false;704};705706// Search backwards for a def, until we get to InsertBB.707MachineBasicBlock *MBB = Preheader;708while (MBB && MBB != StartInsertBB) {709if (CannotProvideElements(MBB, NumElements)) {710LLVM_DEBUG(dbgs() << "ARM Loops: Unable to provide element count.\n");711return false;712}713MBB = *MBB->pred_begin();714}715}716717// Could inserting the [W|D]LSTP cause some unintended affects? In a perfect718// world the [w|d]lstp instruction would be last instruction in the preheader719// and so it would only affect instructions within the loop body. But due to720// scheduling, and/or the logic in this pass (above), the insertion point can721// be moved earlier. So if the Loop Start isn't the last instruction in the722// preheader, and if the initial element count is smaller than the vector723// width, the Loop Start instruction will immediately generate one or more724// false lane mask which can, incorrectly, affect the proceeding MVE725// instructions in the preheader.726if (std::any_of(StartInsertPt, StartInsertBB->end(), shouldInspect)) {727LLVM_DEBUG(dbgs() << "ARM Loops: Instruction blocks [W|D]LSTP\n");728return false;729}730731// For any DoubleWidthResultInstrs we found whilst scanning instructions, they732// need to compute an output size that is smaller than the VCTP mask operates733// on. The VecSize of the DoubleWidthResult is the larger vector size - the734// size it extends into, so any VCTP VecSize <= is valid.735unsigned VCTPVecSize = getVecSize(*VCTP);736for (MachineInstr *MI : DoubleWidthResultInstrs) {737unsigned InstrVecSize = getVecSize(*MI);738if (InstrVecSize > VCTPVecSize) {739LLVM_DEBUG(dbgs() << "ARM Loops: Double width result larger than VCTP "740<< "VecSize:\n" << *MI);741return false;742}743}744745// Check that the value change of the element count is what we expect and746// that the predication will be equivalent. For this we need:747// NumElements = NumElements - VectorWidth. The sub will be a sub immediate748// and we can also allow register copies within the chain too.749auto IsValidSub = [](MachineInstr *MI, int ExpectedVecWidth) {750return -getAddSubImmediate(*MI) == ExpectedVecWidth;751};752753MachineBasicBlock *MBB = VCTP->getParent();754// Remove modifications to the element count since they have no purpose in a755// tail predicated loop. Explicitly refer to the vctp operand no matter which756// register NumElements has been assigned to, since that is what the757// modifications will be using758if (auto *Def = RDA.getUniqueReachingMIDef(759&MBB->back(), VCTP->getOperand(1).getReg().asMCReg())) {760SmallPtrSet<MachineInstr*, 2> ElementChain;761SmallPtrSet<MachineInstr*, 2> Ignore;762unsigned ExpectedVectorWidth = getTailPredVectorWidth(VCTP->getOpcode());763764Ignore.insert(VCTPs.begin(), VCTPs.end());765766if (TryRemove(Def, RDA, ElementChain, Ignore)) {767bool FoundSub = false;768769for (auto *MI : ElementChain) {770if (isMovRegOpcode(MI->getOpcode()))771continue;772773if (isSubImmOpcode(MI->getOpcode())) {774if (FoundSub || !IsValidSub(MI, ExpectedVectorWidth)) {775LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element"776" count: " << *MI);777return false;778}779FoundSub = true;780} else {781LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element"782" count: " << *MI);783return false;784}785}786ToRemove.insert(ElementChain.begin(), ElementChain.end());787}788}789790// If we converted the LoopStart to a t2DoLoopStartTP/t2WhileLoopStartTP, we791// can also remove any extra instructions in the preheader, which often792// includes a now unused MOV.793if ((Start->getOpcode() == ARM::t2DoLoopStartTP ||794Start->getOpcode() == ARM::t2WhileLoopStartTP) &&795Preheader && !Preheader->empty() &&796!RDA.hasLocalDefBefore(VCTP, VCTP->getOperand(1).getReg())) {797if (auto *Def = RDA.getUniqueReachingMIDef(798&Preheader->back(), VCTP->getOperand(1).getReg().asMCReg())) {799SmallPtrSet<MachineInstr*, 2> Ignore;800Ignore.insert(VCTPs.begin(), VCTPs.end());801TryRemove(Def, RDA, ToRemove, Ignore);802}803}804805return true;806}807808static bool isRegInClass(const MachineOperand &MO,809const TargetRegisterClass *Class) {810return MO.isReg() && MO.getReg() && Class->contains(MO.getReg());811}812813// MVE 'narrowing' operate on half a lane, reading from half and writing814// to half, which are referred to has the top and bottom half. The other815// half retains its previous value.816static bool retainsPreviousHalfElement(const MachineInstr &MI) {817const MCInstrDesc &MCID = MI.getDesc();818uint64_t Flags = MCID.TSFlags;819return (Flags & ARMII::RetainsPreviousHalfElement) != 0;820}821822// Some MVE instructions read from the top/bottom halves of their operand(s)823// and generate a vector result with result elements that are double the824// width of the input.825static bool producesDoubleWidthResult(const MachineInstr &MI) {826const MCInstrDesc &MCID = MI.getDesc();827uint64_t Flags = MCID.TSFlags;828return (Flags & ARMII::DoubleWidthResult) != 0;829}830831// Can this instruction generate a non-zero result when given only zeroed832// operands? This allows us to know that, given operands with false bytes833// zeroed by masked loads, that the result will also contain zeros in those834// bytes.835static bool canGenerateNonZeros(const MachineInstr &MI) {836837// Check for instructions which can write into a larger element size,838// possibly writing into a previous zero'd lane.839if (producesDoubleWidthResult(MI))840return true;841842switch (MI.getOpcode()) {843default:844break;845// FIXME: VNEG FP and -0? I think we'll need to handle this once we allow846// fp16 -> fp32 vector conversions.847// Instructions that perform a NOT will generate 1s from 0s.848case ARM::MVE_VMVN:849case ARM::MVE_VORN:850// Count leading zeros will do just that!851case ARM::MVE_VCLZs8:852case ARM::MVE_VCLZs16:853case ARM::MVE_VCLZs32:854return true;855}856return false;857}858859// Look at its register uses to see if it only can only receive zeros860// into its false lanes which would then produce zeros. Also check that861// the output register is also defined by an FalseLanesZero instruction862// so that if tail-predication happens, the lanes that aren't updated will863// still be zeros.864static bool producesFalseLanesZero(MachineInstr &MI,865const TargetRegisterClass *QPRs,866const ReachingDefAnalysis &RDA,867InstSet &FalseLanesZero) {868if (canGenerateNonZeros(MI))869return false;870871bool isPredicated = isVectorPredicated(&MI);872// Predicated loads will write zeros to the falsely predicated bytes of the873// destination register.874if (MI.mayLoad())875return isPredicated;876877auto IsZeroInit = [](MachineInstr *Def) {878return !isVectorPredicated(Def) &&879Def->getOpcode() == ARM::MVE_VMOVimmi32 &&880Def->getOperand(1).getImm() == 0;881};882883bool AllowScalars = isHorizontalReduction(MI);884for (auto &MO : MI.operands()) {885if (!MO.isReg() || !MO.getReg())886continue;887if (!isRegInClass(MO, QPRs) && AllowScalars)888continue;889// Skip the lr predicate reg890int PIdx = llvm::findFirstVPTPredOperandIdx(MI);891if (PIdx != -1 && (int)MO.getOperandNo() == PIdx + 2)892continue;893894// Check that this instruction will produce zeros in its false lanes:895// - If it only consumes false lanes zero or constant 0 (vmov #0)896// - If it's predicated, it only matters that it's def register already has897// false lane zeros, so we can ignore the uses.898SmallPtrSet<MachineInstr *, 2> Defs;899RDA.getGlobalReachingDefs(&MI, MO.getReg(), Defs);900if (Defs.empty())901return false;902for (auto *Def : Defs) {903if (Def == &MI || FalseLanesZero.count(Def) || IsZeroInit(Def))904continue;905if (MO.isUse() && isPredicated)906continue;907return false;908}909}910LLVM_DEBUG(dbgs() << "ARM Loops: Always False Zeros: " << MI);911return true;912}913914bool LowOverheadLoop::ValidateLiveOuts() {915// We want to find out if the tail-predicated version of this loop will916// produce the same values as the loop in its original form. For this to917// be true, the newly inserted implicit predication must not change the918// the (observable) results.919// We're doing this because many instructions in the loop will not be920// predicated and so the conversion from VPT predication to tail-predication921// can result in different values being produced; due to the tail-predication922// preventing many instructions from updating their falsely predicated923// lanes. This analysis assumes that all the instructions perform lane-wise924// operations and don't perform any exchanges.925// A masked load, whether through VPT or tail predication, will write zeros926// to any of the falsely predicated bytes. So, from the loads, we know that927// the false lanes are zeroed and here we're trying to track that those false928// lanes remain zero, or where they change, the differences are masked away929// by their user(s).930// All MVE stores have to be predicated, so we know that any predicate load931// operands, or stored results are equivalent already. Other explicitly932// predicated instructions will perform the same operation in the original933// loop and the tail-predicated form too. Because of this, we can insert934// loads, stores and other predicated instructions into our Predicated935// set and build from there.936const TargetRegisterClass *QPRs = TRI.getRegClass(ARM::MQPRRegClassID);937SetVector<MachineInstr *> FalseLanesUnknown;938SmallPtrSet<MachineInstr *, 4> FalseLanesZero;939SmallPtrSet<MachineInstr *, 4> Predicated;940MachineBasicBlock *Header = ML.getHeader();941942LLVM_DEBUG(dbgs() << "ARM Loops: Validating Live outs\n");943944for (auto &MI : *Header) {945if (!shouldInspect(MI))946continue;947948if (isVCTP(&MI) || isVPTOpcode(MI.getOpcode()))949continue;950951bool isPredicated = isVectorPredicated(&MI);952bool retainsOrReduces =953retainsPreviousHalfElement(MI) || isHorizontalReduction(MI);954955if (isPredicated)956Predicated.insert(&MI);957if (producesFalseLanesZero(MI, QPRs, RDA, FalseLanesZero))958FalseLanesZero.insert(&MI);959else if (MI.getNumDefs() == 0)960continue;961else if (!isPredicated && retainsOrReduces) {962LLVM_DEBUG(dbgs() << " Unpredicated instruction that retainsOrReduces: " << MI);963return false;964} else if (!isPredicated && MI.getOpcode() != ARM::MQPRCopy)965FalseLanesUnknown.insert(&MI);966}967968LLVM_DEBUG({969dbgs() << " Predicated:\n";970for (auto *I : Predicated)971dbgs() << " " << *I;972dbgs() << " FalseLanesZero:\n";973for (auto *I : FalseLanesZero)974dbgs() << " " << *I;975dbgs() << " FalseLanesUnknown:\n";976for (auto *I : FalseLanesUnknown)977dbgs() << " " << *I;978});979980auto HasPredicatedUsers = [this](MachineInstr *MI, const MachineOperand &MO,981SmallPtrSetImpl<MachineInstr *> &Predicated) {982SmallPtrSet<MachineInstr *, 2> Uses;983RDA.getGlobalUses(MI, MO.getReg().asMCReg(), Uses);984for (auto *Use : Uses) {985if (Use != MI && !Predicated.count(Use))986return false;987}988return true;989};990991// Visit the unknowns in reverse so that we can start at the values being992// stored and then we can work towards the leaves, hopefully adding more993// instructions to Predicated. Successfully terminating the loop means that994// all the unknown values have to found to be masked by predicated user(s).995// For any unpredicated values, we store them in NonPredicated so that we996// can later check whether these form a reduction.997SmallPtrSet<MachineInstr*, 2> NonPredicated;998for (auto *MI : reverse(FalseLanesUnknown)) {999for (auto &MO : MI->operands()) {1000if (!isRegInClass(MO, QPRs) || !MO.isDef())1001continue;1002if (!HasPredicatedUsers(MI, MO, Predicated)) {1003LLVM_DEBUG(dbgs() << " Found an unknown def of : "1004<< TRI.getRegAsmName(MO.getReg()) << " at " << *MI);1005NonPredicated.insert(MI);1006break;1007}1008}1009// Any unknown false lanes have been masked away by the user(s).1010if (!NonPredicated.contains(MI))1011Predicated.insert(MI);1012}10131014SmallPtrSet<MachineInstr *, 2> LiveOutMIs;1015SmallVector<MachineBasicBlock *, 2> ExitBlocks;1016ML.getExitBlocks(ExitBlocks);1017assert(ML.getNumBlocks() == 1 && "Expected single block loop!");1018assert(ExitBlocks.size() == 1 && "Expected a single exit block");1019MachineBasicBlock *ExitBB = ExitBlocks.front();1020for (const MachineBasicBlock::RegisterMaskPair &RegMask : ExitBB->liveins()) {1021// TODO: Instead of blocking predication, we could move the vctp to the exit1022// block and calculate it's operand there in or the preheader.1023if (RegMask.PhysReg == ARM::VPR) {1024LLVM_DEBUG(dbgs() << " VPR is live in to the exit block.");1025return false;1026}1027// Check Q-regs that are live in the exit blocks. We don't collect scalars1028// because they won't be affected by lane predication.1029if (QPRs->contains(RegMask.PhysReg))1030if (auto *MI = RDA.getLocalLiveOutMIDef(Header, RegMask.PhysReg))1031LiveOutMIs.insert(MI);1032}10331034// We've already validated that any VPT predication within the loop will be1035// equivalent when we perform the predication transformation; so we know that1036// any VPT predicated instruction is predicated upon VCTP. Any live-out1037// instruction needs to be predicated, so check this here. The instructions1038// in NonPredicated have been found to be a reduction that we can ensure its1039// legality. Any MQPRCopy found will need to validate its input as if it was1040// live out.1041SmallVector<MachineInstr *> Worklist(LiveOutMIs.begin(), LiveOutMIs.end());1042while (!Worklist.empty()) {1043MachineInstr *MI = Worklist.pop_back_val();1044if (MI->getOpcode() == ARM::MQPRCopy) {1045VMOVCopies.insert(MI);1046MachineInstr *CopySrc =1047RDA.getUniqueReachingMIDef(MI, MI->getOperand(1).getReg());1048if (CopySrc)1049Worklist.push_back(CopySrc);1050} else if (NonPredicated.count(MI) && FalseLanesUnknown.contains(MI)) {1051LLVM_DEBUG(dbgs() << " Unable to handle live out: " << *MI);1052VMOVCopies.clear();1053return false;1054}1055}10561057return true;1058}10591060void LowOverheadLoop::Validate(ARMBasicBlockUtils *BBUtils) {1061if (Revert)1062return;10631064// Check branch target ranges: WLS[TP] can only branch forwards and LE[TP]1065// can only jump back.1066auto ValidateRanges = [](MachineInstr *Start, MachineInstr *End,1067ARMBasicBlockUtils *BBUtils, MachineLoop &ML) {1068MachineBasicBlock *TgtBB = End->getOpcode() == ARM::t2LoopEnd1069? End->getOperand(1).getMBB()1070: End->getOperand(2).getMBB();1071// TODO Maybe there's cases where the target doesn't have to be the header,1072// but for now be safe and revert.1073if (TgtBB != ML.getHeader()) {1074LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targeting header.\n");1075return false;1076}10771078// The WLS and LE instructions have 12-bits for the label offset. WLS1079// requires a positive offset, while LE uses negative.1080if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML.getHeader()) ||1081!BBUtils->isBBInRange(End, ML.getHeader(), 4094)) {1082LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n");1083return false;1084}10851086if (isWhileLoopStart(*Start)) {1087MachineBasicBlock *TargetBB = getWhileLoopStartTargetBB(*Start);1088if (BBUtils->getOffsetOf(Start) > BBUtils->getOffsetOf(TargetBB) ||1089!BBUtils->isBBInRange(Start, TargetBB, 4094)) {1090LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n");1091return false;1092}1093}1094return true;1095};10961097StartInsertPt = MachineBasicBlock::iterator(Start);1098StartInsertBB = Start->getParent();1099LLVM_DEBUG(dbgs() << "ARM Loops: Will insert LoopStart at "1100<< *StartInsertPt);11011102Revert = !ValidateRanges(Start, End, BBUtils, ML);1103CannotTailPredicate = !ValidateTailPredicate();1104}11051106bool LowOverheadLoop::AddVCTP(MachineInstr *MI) {1107LLVM_DEBUG(dbgs() << "ARM Loops: Adding VCTP: " << *MI);1108if (VCTPs.empty()) {1109VCTPs.push_back(MI);1110return true;1111}11121113// If we find another VCTP, check whether it uses the same value as the main VCTP.1114// If it does, store it in the VCTPs set, else refuse it.1115MachineInstr *Prev = VCTPs.back();1116if (!Prev->getOperand(1).isIdenticalTo(MI->getOperand(1)) ||1117!RDA.hasSameReachingDef(Prev, MI, MI->getOperand(1).getReg().asMCReg())) {1118LLVM_DEBUG(dbgs() << "ARM Loops: Found VCTP with a different reaching "1119"definition from the main VCTP");1120return false;1121}1122VCTPs.push_back(MI);1123return true;1124}11251126static bool ValidateMVEStore(MachineInstr *MI, MachineLoop *ML) {11271128auto GetFrameIndex = [](MachineMemOperand *Operand) {1129const PseudoSourceValue *PseudoValue = Operand->getPseudoValue();1130if (PseudoValue && PseudoValue->kind() == PseudoSourceValue::FixedStack) {1131if (const auto *FS = dyn_cast<FixedStackPseudoSourceValue>(PseudoValue)) {1132return FS->getFrameIndex();1133}1134}1135return -1;1136};11371138auto IsStackOp = [GetFrameIndex](MachineInstr *I) {1139switch (I->getOpcode()) {1140case ARM::MVE_VSTRWU32:1141case ARM::MVE_VLDRWU32: {1142return I->getOperand(1).getReg() == ARM::SP &&1143I->memoperands().size() == 1 &&1144GetFrameIndex(I->memoperands().front()) >= 0;1145}1146default:1147return false;1148}1149};11501151// An unpredicated vector register spill is allowed if all of the uses of the1152// stack slot are within the loop1153if (MI->getOpcode() != ARM::MVE_VSTRWU32 || !IsStackOp(MI))1154return false;11551156// Search all blocks after the loop for accesses to the same stack slot.1157// ReachingDefAnalysis doesn't work for sp as it relies on registers being1158// live-out (which sp never is) to know what blocks to look in1159if (MI->memoperands().size() == 0)1160return false;1161int FI = GetFrameIndex(MI->memoperands().front());11621163auto &FrameInfo = MI->getParent()->getParent()->getFrameInfo();1164if (FI == -1 || !FrameInfo.isSpillSlotObjectIndex(FI))1165return false;11661167SmallVector<MachineBasicBlock *> Frontier;1168ML->getExitBlocks(Frontier);1169SmallPtrSet<MachineBasicBlock *, 4> Visited{MI->getParent()};1170unsigned Idx = 0;1171while (Idx < Frontier.size()) {1172MachineBasicBlock *BB = Frontier[Idx];1173bool LookAtSuccessors = true;1174for (auto &I : *BB) {1175if (!IsStackOp(&I) || I.memoperands().size() == 0)1176continue;1177if (GetFrameIndex(I.memoperands().front()) != FI)1178continue;1179// If this block has a store to the stack slot before any loads then we1180// can ignore the block1181if (I.getOpcode() == ARM::MVE_VSTRWU32) {1182LookAtSuccessors = false;1183break;1184}1185// If the store and the load are using the same stack slot then the1186// store isn't valid for tail predication1187if (I.getOpcode() == ARM::MVE_VLDRWU32)1188return false;1189}11901191if (LookAtSuccessors) {1192for (auto *Succ : BB->successors()) {1193if (!Visited.contains(Succ) && !is_contained(Frontier, Succ))1194Frontier.push_back(Succ);1195}1196}1197Visited.insert(BB);1198Idx++;1199}12001201return true;1202}12031204bool LowOverheadLoop::ValidateMVEInst(MachineInstr *MI) {1205if (CannotTailPredicate)1206return false;12071208if (!shouldInspect(*MI))1209return true;12101211if (MI->getOpcode() == ARM::MVE_VPSEL ||1212MI->getOpcode() == ARM::MVE_VPNOT) {1213// TODO: Allow VPSEL and VPNOT, we currently cannot because:1214// 1) It will use the VPR as a predicate operand, but doesn't have to be1215// instead a VPT block, which means we can assert while building up1216// the VPT block because we don't find another VPT or VPST to being a new1217// one.1218// 2) VPSEL still requires a VPR operand even after tail predicating,1219// which means we can't remove it unless there is another1220// instruction, such as vcmp, that can provide the VPR def.1221return false;1222}12231224// Record all VCTPs and check that they're equivalent to one another.1225if (isVCTP(MI) && !AddVCTP(MI))1226return false;12271228// Inspect uses first so that any instructions that alter the VPR don't1229// alter the predicate upon themselves.1230const MCInstrDesc &MCID = MI->getDesc();1231bool IsUse = false;1232unsigned LastOpIdx = MI->getNumOperands() - 1;1233for (const auto &Op : enumerate(reverse(MCID.operands()))) {1234const MachineOperand &MO = MI->getOperand(LastOpIdx - Op.index());1235if (!MO.isReg() || !MO.isUse() || MO.getReg() != ARM::VPR)1236continue;12371238if (ARM::isVpred(Op.value().OperandType)) {1239VPTstate.addInst(MI);1240IsUse = true;1241} else if (MI->getOpcode() != ARM::MVE_VPST) {1242LLVM_DEBUG(dbgs() << "ARM Loops: Found instruction using vpr: " << *MI);1243return false;1244}1245}12461247// If we find an instruction that has been marked as not valid for tail1248// predication, only allow the instruction if it's contained within a valid1249// VPT block.1250bool RequiresExplicitPredication =1251(MCID.TSFlags & ARMII::ValidForTailPredication) == 0;1252if (isDomainMVE(MI) && RequiresExplicitPredication) {1253if (MI->getOpcode() == ARM::MQPRCopy)1254return true;1255if (!IsUse && producesDoubleWidthResult(*MI)) {1256DoubleWidthResultInstrs.insert(MI);1257return true;1258}12591260LLVM_DEBUG(if (!IsUse) dbgs()1261<< "ARM Loops: Can't tail predicate: " << *MI);1262return IsUse;1263}12641265// If the instruction is already explicitly predicated, then the conversion1266// will be fine, but ensure that all store operations are predicated.1267if (MI->mayStore() && !ValidateMVEStore(MI, &ML))1268return IsUse;12691270// If this instruction defines the VPR, update the predicate for the1271// proceeding instructions.1272if (isVectorPredicate(MI)) {1273// Clear the existing predicate when we're not in VPT Active state,1274// otherwise we add to it.1275if (!isVectorPredicated(MI))1276VPTstate.resetPredicate(MI);1277else1278VPTstate.addPredicate(MI);1279}12801281// Finally once the predicate has been modified, we can start a new VPT1282// block if necessary.1283if (isVPTOpcode(MI->getOpcode()))1284VPTstate.CreateVPTBlock(MI);12851286return true;1287}12881289bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {1290const ARMSubtarget &ST = mf.getSubtarget<ARMSubtarget>();1291if (!ST.hasLOB())1292return false;12931294MF = &mf;1295LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n");12961297MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();1298RDA = &getAnalysis<ReachingDefAnalysis>();1299MF->getProperties().set(MachineFunctionProperties::Property::TracksLiveness);1300MRI = &MF->getRegInfo();1301TII = static_cast<const ARMBaseInstrInfo*>(ST.getInstrInfo());1302TRI = ST.getRegisterInfo();1303BBUtils = std::make_unique<ARMBasicBlockUtils>(*MF);1304BBUtils->computeAllBlockSizes();1305BBUtils->adjustBBOffsetsAfter(&MF->front());13061307bool Changed = false;1308for (auto *ML : *MLI) {1309if (ML->isOutermost())1310Changed |= ProcessLoop(ML);1311}1312Changed |= RevertNonLoops();1313return Changed;1314}13151316bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {1317bool Changed = false;13181319// Process inner loops first.1320for (MachineLoop *L : *ML)1321Changed |= ProcessLoop(L);13221323LLVM_DEBUG({1324dbgs() << "ARM Loops: Processing loop containing:\n";1325if (auto *Preheader = ML->getLoopPreheader())1326dbgs() << " - Preheader: " << printMBBReference(*Preheader) << "\n";1327else if (auto *Preheader = MLI->findLoopPreheader(ML, true, true))1328dbgs() << " - Preheader: " << printMBBReference(*Preheader) << "\n";1329for (auto *MBB : ML->getBlocks())1330dbgs() << " - Block: " << printMBBReference(*MBB) << "\n";1331});13321333// Search the given block for a loop start instruction. If one isn't found,1334// and there's only one predecessor block, search that one too.1335std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart =1336[&SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* {1337for (auto &MI : *MBB) {1338if (isLoopStart(MI))1339return &MI;1340}1341if (MBB->pred_size() == 1)1342return SearchForStart(*MBB->pred_begin());1343return nullptr;1344};13451346LowOverheadLoop LoLoop(*ML, *MLI, *RDA, *TRI, *TII);1347// Search the preheader for the start intrinsic.1348// FIXME: I don't see why we shouldn't be supporting multiple predecessors1349// with potentially multiple set.loop.iterations, so we need to enable this.1350if (LoLoop.Preheader)1351LoLoop.Start = SearchForStart(LoLoop.Preheader);1352else1353return Changed;13541355// Find the low-overhead loop components and decide whether or not to fall1356// back to a normal loop. Also look for a vctp instructions and decide1357// whether we can convert that predicate using tail predication.1358for (auto *MBB : reverse(ML->getBlocks())) {1359for (auto &MI : *MBB) {1360if (MI.isDebugValue())1361continue;1362else if (MI.getOpcode() == ARM::t2LoopDec)1363LoLoop.Dec = &MI;1364else if (MI.getOpcode() == ARM::t2LoopEnd)1365LoLoop.End = &MI;1366else if (MI.getOpcode() == ARM::t2LoopEndDec)1367LoLoop.End = LoLoop.Dec = &MI;1368else if (isLoopStart(MI))1369LoLoop.Start = &MI;1370else if (MI.getDesc().isCall()) {1371// TODO: Though the call will require LE to execute again, does this1372// mean we should revert? Always executing LE hopefully should be1373// faster than performing a sub,cmp,br or even subs,br.1374LoLoop.Revert = true;1375LLVM_DEBUG(dbgs() << "ARM Loops: Found call.\n");1376} else {1377// Record VPR defs and build up their corresponding vpt blocks.1378// Check we know how to tail predicate any mve instructions.1379LoLoop.AnalyseMVEInst(&MI);1380}1381}1382}13831384LLVM_DEBUG(LoLoop.dump());1385if (!LoLoop.FoundAllComponents()) {1386LLVM_DEBUG(dbgs() << "ARM Loops: Didn't find loop start, update, end\n");1387return Changed;1388}13891390assert(LoLoop.Start->getOpcode() != ARM::t2WhileLoopStart &&1391"Expected t2WhileLoopStart to be removed before regalloc!");13921393// Check that the only instruction using LoopDec is LoopEnd. This can only1394// happen when the Dec and End are separate, not a single t2LoopEndDec.1395// TODO: Check for copy chains that really have no effect.1396if (LoLoop.Dec != LoLoop.End) {1397SmallPtrSet<MachineInstr *, 2> Uses;1398RDA->getReachingLocalUses(LoLoop.Dec, MCRegister::from(ARM::LR), Uses);1399if (Uses.size() > 1 || !Uses.count(LoLoop.End)) {1400LLVM_DEBUG(dbgs() << "ARM Loops: Unable to remove LoopDec.\n");1401LoLoop.Revert = true;1402}1403}1404LoLoop.Validate(BBUtils.get());1405Expand(LoLoop);1406return true;1407}14081409// WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a1410// beq that branches to the exit branch.1411// TODO: We could also try to generate a cbz if the value in LR is also in1412// another low register.1413void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const {1414LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI);1415MachineBasicBlock *DestBB = getWhileLoopStartTargetBB(*MI);1416unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?1417ARM::tBcc : ARM::t2Bcc;14181419RevertWhileLoopStartLR(MI, TII, BrOpc);1420}14211422void ARMLowOverheadLoops::RevertDo(MachineInstr *MI) const {1423LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to mov: " << *MI);1424RevertDoLoopStart(MI, TII);1425}14261427bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const {1428LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI);1429MachineBasicBlock *MBB = MI->getParent();1430SmallPtrSet<MachineInstr*, 1> Ignore;1431for (auto I = MachineBasicBlock::iterator(MI), E = MBB->end(); I != E; ++I) {1432if (I->getOpcode() == ARM::t2LoopEnd) {1433Ignore.insert(&*I);1434break;1435}1436}14371438// If nothing defines CPSR between LoopDec and LoopEnd, use a t2SUBS.1439bool SetFlags =1440RDA->isSafeToDefRegAt(MI, MCRegister::from(ARM::CPSR), Ignore);14411442llvm::RevertLoopDec(MI, TII, SetFlags);1443return SetFlags;1444}14451446// Generate a subs, or sub and cmp, and a branch instead of an LE.1447void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const {1448LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI);14491450MachineBasicBlock *DestBB = MI->getOperand(1).getMBB();1451unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?1452ARM::tBcc : ARM::t2Bcc;14531454llvm::RevertLoopEnd(MI, TII, BrOpc, SkipCmp);1455}14561457// Generate a subs, or sub and cmp, and a branch instead of an LE.1458void ARMLowOverheadLoops::RevertLoopEndDec(MachineInstr *MI) const {1459LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to subs, br: " << *MI);1460assert(MI->getOpcode() == ARM::t2LoopEndDec && "Expected a t2LoopEndDec!");1461MachineBasicBlock *MBB = MI->getParent();14621463MachineInstrBuilder MIB =1464BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2SUBri));1465MIB.addDef(ARM::LR);1466MIB.add(MI->getOperand(1));1467MIB.addImm(1);1468MIB.addImm(ARMCC::AL);1469MIB.addReg(ARM::NoRegister);1470MIB.addReg(ARM::CPSR);1471MIB->getOperand(5).setIsDef(true);14721473MachineBasicBlock *DestBB = MI->getOperand(2).getMBB();1474unsigned BrOpc =1475BBUtils->isBBInRange(MI, DestBB, 254) ? ARM::tBcc : ARM::t2Bcc;14761477// Create bne1478MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc));1479MIB.add(MI->getOperand(2)); // branch target1480MIB.addImm(ARMCC::NE); // condition code1481MIB.addReg(ARM::CPSR);14821483MI->eraseFromParent();1484}14851486// Perform dead code elimation on the loop iteration count setup expression.1487// If we are tail-predicating, the number of elements to be processed is the1488// operand of the VCTP instruction in the vector body, see getCount(), which is1489// register $r3 in this example:1490//1491// $lr = big-itercount-expression1492// ..1493// $lr = t2DoLoopStart renamable $lr1494// vector.body:1495// ..1496// $vpr = MVE_VCTP32 renamable $r31497// renamable $lr = t2LoopDec killed renamable $lr, 11498// t2LoopEnd renamable $lr, %vector.body1499// tB %end1500//1501// What we would like achieve here is to replace the do-loop start pseudo1502// instruction t2DoLoopStart with:1503//1504// $lr = MVE_DLSTP_32 killed renamable $r31505//1506// Thus, $r3 which defines the number of elements, is written to $lr,1507// and then we want to delete the whole chain that used to define $lr,1508// see the comment below how this chain could look like.1509//1510void ARMLowOverheadLoops::IterationCountDCE(LowOverheadLoop &LoLoop) {1511if (!LoLoop.IsTailPredicationLegal())1512return;15131514LLVM_DEBUG(dbgs() << "ARM Loops: Trying DCE on loop iteration count.\n");15151516MachineInstr *Def = RDA->getMIOperand(LoLoop.Start, 1);1517if (!Def) {1518LLVM_DEBUG(dbgs() << "ARM Loops: Couldn't find iteration count.\n");1519return;1520}15211522// Collect and remove the users of iteration count.1523SmallPtrSet<MachineInstr*, 4> Killed = { LoLoop.Start, LoLoop.Dec,1524LoLoop.End };1525if (!TryRemove(Def, *RDA, LoLoop.ToRemove, Killed))1526LLVM_DEBUG(dbgs() << "ARM Loops: Unsafe to remove loop iteration count.\n");1527}15281529MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) {1530LLVM_DEBUG(dbgs() << "ARM Loops: Expanding LoopStart.\n");1531// When using tail-predication, try to delete the dead code that was used to1532// calculate the number of loop iterations.1533IterationCountDCE(LoLoop);15341535MachineBasicBlock::iterator InsertPt = LoLoop.StartInsertPt;1536MachineInstr *Start = LoLoop.Start;1537MachineBasicBlock *MBB = LoLoop.StartInsertBB;1538unsigned Opc = LoLoop.getStartOpcode();1539MachineOperand &Count = LoLoop.getLoopStartOperand();15401541// A DLS lr, lr we needn't emit1542MachineInstr* NewStart;1543if (!DisableOmitDLS && Opc == ARM::t2DLS && Count.isReg() &&1544Count.getReg() == ARM::LR) {1545LLVM_DEBUG(dbgs() << "ARM Loops: Didn't insert start: DLS lr, lr");1546NewStart = nullptr;1547} else {1548MachineInstrBuilder MIB =1549BuildMI(*MBB, InsertPt, Start->getDebugLoc(), TII->get(Opc));15501551MIB.addDef(ARM::LR);1552MIB.add(Count);1553if (isWhileLoopStart(*Start))1554MIB.addMBB(getWhileLoopStartTargetBB(*Start));15551556LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB);1557NewStart = &*MIB;1558}15591560LoLoop.ToRemove.insert(Start);1561return NewStart;1562}15631564void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {1565auto RemovePredicate = [](MachineInstr *MI) {1566if (MI->isDebugInstr())1567return;1568LLVM_DEBUG(dbgs() << "ARM Loops: Removing predicate from: " << *MI);1569int PIdx = llvm::findFirstVPTPredOperandIdx(*MI);1570assert(PIdx >= 1 && "Trying to unpredicate a non-predicated instruction");1571assert(MI->getOperand(PIdx).getImm() == ARMVCC::Then &&1572"Expected Then predicate!");1573MI->getOperand(PIdx).setImm(ARMVCC::None);1574MI->getOperand(PIdx + 1).setReg(0);1575};15761577for (auto &Block : LoLoop.getVPTBlocks()) {1578SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();15791580auto ReplaceVCMPWithVPT = [&](MachineInstr *&TheVCMP, MachineInstr *At) {1581assert(TheVCMP && "Replacing a removed or non-existent VCMP");1582// Replace the VCMP with a VPT1583MachineInstrBuilder MIB =1584BuildMI(*At->getParent(), At, At->getDebugLoc(),1585TII->get(VCMPOpcodeToVPT(TheVCMP->getOpcode())));1586MIB.addImm(ARMVCC::Then);1587// Register one1588MIB.add(TheVCMP->getOperand(1));1589// Register two1590MIB.add(TheVCMP->getOperand(2));1591// The comparison code, e.g. ge, eq, lt1592MIB.add(TheVCMP->getOperand(3));1593LLVM_DEBUG(dbgs() << "ARM Loops: Combining with VCMP to VPT: " << *MIB);1594LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());1595LoLoop.ToRemove.insert(TheVCMP);1596TheVCMP = nullptr;1597};15981599if (LoLoop.VPTstate.isEntryPredicatedOnVCTP(Block, /*exclusive*/ true)) {1600MachineInstr *VPST = Insts.front();1601if (Block.hasUniformPredicate()) {1602// A vpt block starting with VPST, is only predicated upon vctp and has no1603// internal vpr defs:1604// - Remove vpst.1605// - Unpredicate the remaining instructions.1606LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);1607for (unsigned i = 1; i < Insts.size(); ++i)1608RemovePredicate(Insts[i]);1609} else {1610// The VPT block has a non-uniform predicate but it uses a vpst and its1611// entry is guarded only by a vctp, which means we:1612// - Need to remove the original vpst.1613// - Then need to unpredicate any following instructions, until1614// we come across the divergent vpr def.1615// - Insert a new vpst to predicate the instruction(s) that following1616// the divergent vpr def.1617MachineInstr *Divergent = Block.getDivergent();1618MachineBasicBlock *MBB = Divergent->getParent();1619auto DivergentNext = ++MachineBasicBlock::iterator(Divergent);1620while (DivergentNext != MBB->end() && DivergentNext->isDebugInstr())1621++DivergentNext;16221623bool DivergentNextIsPredicated =1624DivergentNext != MBB->end() &&1625getVPTInstrPredicate(*DivergentNext) != ARMVCC::None;16261627for (auto I = ++MachineBasicBlock::iterator(VPST), E = DivergentNext;1628I != E; ++I)1629RemovePredicate(&*I);16301631// Check if the instruction defining vpr is a vcmp so it can be combined1632// with the VPST This should be the divergent instruction1633MachineInstr *VCMP =1634VCMPOpcodeToVPT(Divergent->getOpcode()) != 0 ? Divergent : nullptr;16351636if (DivergentNextIsPredicated) {1637// Insert a VPST at the divergent only if the next instruction1638// would actually use it. A VCMP following a VPST can be1639// merged into a VPT so do that instead if the VCMP exists.1640if (!VCMP) {1641// Create a VPST (with a null mask for now, we'll recompute it1642// later)1643MachineInstrBuilder MIB =1644BuildMI(*Divergent->getParent(), Divergent,1645Divergent->getDebugLoc(), TII->get(ARM::MVE_VPST));1646MIB.addImm(0);1647LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);1648LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());1649} else {1650// No RDA checks are necessary here since the VPST would have been1651// directly after the VCMP1652ReplaceVCMPWithVPT(VCMP, VCMP);1653}1654}1655}1656LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);1657LoLoop.ToRemove.insert(VPST);1658} else if (Block.containsVCTP()) {1659// The vctp will be removed, so either the entire block will be dead or1660// the block mask of the vp(s)t will need to be recomputed.1661MachineInstr *VPST = Insts.front();1662if (Block.size() == 2) {1663assert(VPST->getOpcode() == ARM::MVE_VPST &&1664"Found a VPST in an otherwise empty vpt block");1665LoLoop.ToRemove.insert(VPST);1666} else1667LoLoop.BlockMasksToRecompute.insert(VPST);1668} else if (Insts.front()->getOpcode() == ARM::MVE_VPST) {1669// If this block starts with a VPST then attempt to merge it with the1670// preceeding un-merged VCMP into a VPT. This VCMP comes from a VPT1671// block that no longer exists1672MachineInstr *VPST = Insts.front();1673auto Next = ++MachineBasicBlock::iterator(VPST);1674assert(getVPTInstrPredicate(*Next) != ARMVCC::None &&1675"The instruction after a VPST must be predicated");1676(void)Next;1677MachineInstr *VprDef = RDA->getUniqueReachingMIDef(VPST, ARM::VPR);1678if (VprDef && VCMPOpcodeToVPT(VprDef->getOpcode()) &&1679!LoLoop.ToRemove.contains(VprDef)) {1680MachineInstr *VCMP = VprDef;1681// The VCMP and VPST can only be merged if the VCMP's operands will have1682// the same values at the VPST.1683// If any of the instructions between the VCMP and VPST are predicated1684// then a different code path is expected to have merged the VCMP and1685// VPST already.1686if (std::none_of(++MachineBasicBlock::iterator(VCMP),1687MachineBasicBlock::iterator(VPST), hasVPRUse) &&1688RDA->hasSameReachingDef(VCMP, VPST, VCMP->getOperand(1).getReg()) &&1689RDA->hasSameReachingDef(VCMP, VPST, VCMP->getOperand(2).getReg())) {1690ReplaceVCMPWithVPT(VCMP, VPST);1691LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);1692LoLoop.ToRemove.insert(VPST);1693}1694}1695}1696}16971698LoLoop.ToRemove.insert(LoLoop.VCTPs.begin(), LoLoop.VCTPs.end());1699}17001701void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {17021703// Combine the LoopDec and LoopEnd instructions into LE(TP).1704auto ExpandLoopEnd = [this](LowOverheadLoop &LoLoop) {1705MachineInstr *End = LoLoop.End;1706MachineBasicBlock *MBB = End->getParent();1707unsigned Opc = LoLoop.IsTailPredicationLegal() ?1708ARM::MVE_LETP : ARM::t2LEUpdate;1709MachineInstrBuilder MIB = BuildMI(*MBB, End, End->getDebugLoc(),1710TII->get(Opc));1711MIB.addDef(ARM::LR);1712unsigned Off = LoLoop.Dec == LoLoop.End ? 1 : 0;1713MIB.add(End->getOperand(Off + 0));1714MIB.add(End->getOperand(Off + 1));1715LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB);1716LoLoop.ToRemove.insert(LoLoop.Dec);1717LoLoop.ToRemove.insert(End);1718return &*MIB;1719};17201721// TODO: We should be able to automatically remove these branches before we1722// get here - probably by teaching analyzeBranch about the pseudo1723// instructions.1724// If there is an unconditional branch, after I, that just branches to the1725// next block, remove it.1726auto RemoveDeadBranch = [](MachineInstr *I) {1727MachineBasicBlock *BB = I->getParent();1728MachineInstr *Terminator = &BB->instr_back();1729if (Terminator->isUnconditionalBranch() && I != Terminator) {1730MachineBasicBlock *Succ = Terminator->getOperand(0).getMBB();1731if (BB->isLayoutSuccessor(Succ)) {1732LLVM_DEBUG(dbgs() << "ARM Loops: Removing branch: " << *Terminator);1733Terminator->eraseFromParent();1734}1735}1736};17371738// And VMOVCopies need to become 2xVMOVD for tail predication to be valid.1739// Anything other MQPRCopy can be converted to MVE_VORR later on.1740auto ExpandVMOVCopies = [this](SmallPtrSet<MachineInstr *, 4> &VMOVCopies) {1741for (auto *MI : VMOVCopies) {1742LLVM_DEBUG(dbgs() << "Converting copy to VMOVD: " << *MI);1743assert(MI->getOpcode() == ARM::MQPRCopy && "Only expected MQPRCOPY!");1744MachineBasicBlock *MBB = MI->getParent();1745Register Dst = MI->getOperand(0).getReg();1746Register Src = MI->getOperand(1).getReg();1747auto MIB1 = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::VMOVD),1748ARM::D0 + (Dst - ARM::Q0) * 2)1749.addReg(ARM::D0 + (Src - ARM::Q0) * 2)1750.add(predOps(ARMCC::AL));1751(void)MIB1;1752LLVM_DEBUG(dbgs() << " into " << *MIB1);1753auto MIB2 = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::VMOVD),1754ARM::D0 + (Dst - ARM::Q0) * 2 + 1)1755.addReg(ARM::D0 + (Src - ARM::Q0) * 2 + 1)1756.add(predOps(ARMCC::AL));1757LLVM_DEBUG(dbgs() << " and " << *MIB2);1758(void)MIB2;1759MI->eraseFromParent();1760}1761};17621763if (LoLoop.Revert) {1764if (isWhileLoopStart(*LoLoop.Start))1765RevertWhile(LoLoop.Start);1766else1767RevertDo(LoLoop.Start);1768if (LoLoop.Dec == LoLoop.End)1769RevertLoopEndDec(LoLoop.End);1770else1771RevertLoopEnd(LoLoop.End, RevertLoopDec(LoLoop.Dec));1772} else {1773ExpandVMOVCopies(LoLoop.VMOVCopies);1774LoLoop.Start = ExpandLoopStart(LoLoop);1775if (LoLoop.Start)1776RemoveDeadBranch(LoLoop.Start);1777LoLoop.End = ExpandLoopEnd(LoLoop);1778RemoveDeadBranch(LoLoop.End);1779if (LoLoop.IsTailPredicationLegal())1780ConvertVPTBlocks(LoLoop);1781for (auto *I : LoLoop.ToRemove) {1782LLVM_DEBUG(dbgs() << "ARM Loops: Erasing " << *I);1783I->eraseFromParent();1784}1785for (auto *I : LoLoop.BlockMasksToRecompute) {1786LLVM_DEBUG(dbgs() << "ARM Loops: Recomputing VPT/VPST Block Mask: " << *I);1787recomputeVPTBlockMask(*I);1788LLVM_DEBUG(dbgs() << " ... done: " << *I);1789}1790}17911792PostOrderLoopTraversal DFS(LoLoop.ML, *MLI);1793DFS.ProcessLoop();1794const SmallVectorImpl<MachineBasicBlock*> &PostOrder = DFS.getOrder();1795fullyRecomputeLiveIns(PostOrder);17961797for (auto *MBB : reverse(PostOrder))1798recomputeLivenessFlags(*MBB);17991800// We've moved, removed and inserted new instructions, so update RDA.1801RDA->reset();1802}18031804bool ARMLowOverheadLoops::RevertNonLoops() {1805LLVM_DEBUG(dbgs() << "ARM Loops: Reverting any remaining pseudos...\n");1806bool Changed = false;18071808for (auto &MBB : *MF) {1809SmallVector<MachineInstr*, 4> Starts;1810SmallVector<MachineInstr*, 4> Decs;1811SmallVector<MachineInstr*, 4> Ends;1812SmallVector<MachineInstr *, 4> EndDecs;1813SmallVector<MachineInstr *, 4> MQPRCopies;18141815for (auto &I : MBB) {1816if (isLoopStart(I))1817Starts.push_back(&I);1818else if (I.getOpcode() == ARM::t2LoopDec)1819Decs.push_back(&I);1820else if (I.getOpcode() == ARM::t2LoopEnd)1821Ends.push_back(&I);1822else if (I.getOpcode() == ARM::t2LoopEndDec)1823EndDecs.push_back(&I);1824else if (I.getOpcode() == ARM::MQPRCopy)1825MQPRCopies.push_back(&I);1826}18271828if (Starts.empty() && Decs.empty() && Ends.empty() && EndDecs.empty() &&1829MQPRCopies.empty())1830continue;18311832Changed = true;18331834for (auto *Start : Starts) {1835if (isWhileLoopStart(*Start))1836RevertWhile(Start);1837else1838RevertDo(Start);1839}1840for (auto *Dec : Decs)1841RevertLoopDec(Dec);18421843for (auto *End : Ends)1844RevertLoopEnd(End);1845for (auto *End : EndDecs)1846RevertLoopEndDec(End);1847for (auto *MI : MQPRCopies) {1848LLVM_DEBUG(dbgs() << "Converting copy to VORR: " << *MI);1849assert(MI->getOpcode() == ARM::MQPRCopy && "Only expected MQPRCOPY!");1850MachineBasicBlock *MBB = MI->getParent();1851auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::MVE_VORR),1852MI->getOperand(0).getReg())1853.add(MI->getOperand(1))1854.add(MI->getOperand(1));1855addUnpredicatedMveVpredROp(MIB, MI->getOperand(0).getReg());1856MI->eraseFromParent();1857}1858}1859return Changed;1860}18611862FunctionPass *llvm::createARMLowOverheadLoopsPass() {1863return new ARMLowOverheadLoops();1864}186518661867