Path: blob/main/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnroll.cpp
35271 views
//===-- UnrollLoop.cpp - Loop unrolling utilities -------------------------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7//8// This file implements some loop unrolling utilities. It does not define any9// actual pass or policy, but provides a single function to perform loop10// unrolling.11//12// The process of unrolling can produce extraneous basic blocks linked with13// unconditional branches. This will be corrected in the future.14//15//===----------------------------------------------------------------------===//1617#include "llvm/ADT/ArrayRef.h"18#include "llvm/ADT/DenseMap.h"19#include "llvm/ADT/STLExtras.h"20#include "llvm/ADT/ScopedHashTable.h"21#include "llvm/ADT/SetVector.h"22#include "llvm/ADT/SmallVector.h"23#include "llvm/ADT/Statistic.h"24#include "llvm/ADT/StringRef.h"25#include "llvm/ADT/Twine.h"26#include "llvm/ADT/ilist_iterator.h"27#include "llvm/Analysis/AliasAnalysis.h"28#include "llvm/Analysis/AssumptionCache.h"29#include "llvm/Analysis/DomTreeUpdater.h"30#include "llvm/Analysis/InstructionSimplify.h"31#include "llvm/Analysis/LoopInfo.h"32#include "llvm/Analysis/LoopIterator.h"33#include "llvm/Analysis/MemorySSA.h"34#include "llvm/Analysis/OptimizationRemarkEmitter.h"35#include "llvm/Analysis/ScalarEvolution.h"36#include "llvm/IR/BasicBlock.h"37#include "llvm/IR/CFG.h"38#include "llvm/IR/Constants.h"39#include "llvm/IR/DebugInfoMetadata.h"40#include "llvm/IR/DebugLoc.h"41#include "llvm/IR/DiagnosticInfo.h"42#include "llvm/IR/Dominators.h"43#include "llvm/IR/Function.h"44#include "llvm/IR/Instruction.h"45#include "llvm/IR/Instructions.h"46#include "llvm/IR/IntrinsicInst.h"47#include "llvm/IR/Metadata.h"48#include "llvm/IR/Module.h"49#include "llvm/IR/PatternMatch.h"50#include "llvm/IR/Use.h"51#include "llvm/IR/User.h"52#include "llvm/IR/ValueHandle.h"53#include "llvm/IR/ValueMap.h"54#include "llvm/Support/Casting.h"55#include "llvm/Support/CommandLine.h"56#include "llvm/Support/Debug.h"57#include "llvm/Support/GenericDomTree.h"58#include "llvm/Support/MathExtras.h"59#include "llvm/Support/raw_ostream.h"60#include "llvm/Transforms/Utils/BasicBlockUtils.h"61#include "llvm/Transforms/Utils/Cloning.h"62#include "llvm/Transforms/Utils/Local.h"63#include "llvm/Transforms/Utils/LoopSimplify.h"64#include "llvm/Transforms/Utils/LoopUtils.h"65#include "llvm/Transforms/Utils/SimplifyIndVar.h"66#include "llvm/Transforms/Utils/UnrollLoop.h"67#include "llvm/Transforms/Utils/ValueMapper.h"68#include <algorithm>69#include <assert.h>70#include <numeric>71#include <type_traits>72#include <vector>7374namespace llvm {75class DataLayout;76class Value;77} // namespace llvm7879using namespace llvm;8081#define DEBUG_TYPE "loop-unroll"8283// TODO: Should these be here or in LoopUnroll?84STATISTIC(NumCompletelyUnrolled, "Number of loops completely unrolled");85STATISTIC(NumUnrolled, "Number of loops unrolled (completely or otherwise)");86STATISTIC(NumUnrolledNotLatch, "Number of loops unrolled without a conditional "87"latch (completely or otherwise)");8889static cl::opt<bool>90UnrollRuntimeEpilog("unroll-runtime-epilog", cl::init(false), cl::Hidden,91cl::desc("Allow runtime unrolled loops to be unrolled "92"with epilog instead of prolog."));9394static cl::opt<bool>95UnrollVerifyDomtree("unroll-verify-domtree", cl::Hidden,96cl::desc("Verify domtree after unrolling"),97#ifdef EXPENSIVE_CHECKS98cl::init(true)99#else100cl::init(false)101#endif102);103104static cl::opt<bool>105UnrollVerifyLoopInfo("unroll-verify-loopinfo", cl::Hidden,106cl::desc("Verify loopinfo after unrolling"),107#ifdef EXPENSIVE_CHECKS108cl::init(true)109#else110cl::init(false)111#endif112);113114115/// Check if unrolling created a situation where we need to insert phi nodes to116/// preserve LCSSA form.117/// \param Blocks is a vector of basic blocks representing unrolled loop.118/// \param L is the outer loop.119/// It's possible that some of the blocks are in L, and some are not. In this120/// case, if there is a use is outside L, and definition is inside L, we need to121/// insert a phi-node, otherwise LCSSA will be broken.122/// The function is just a helper function for llvm::UnrollLoop that returns123/// true if this situation occurs, indicating that LCSSA needs to be fixed.124static bool needToInsertPhisForLCSSA(Loop *L,125const std::vector<BasicBlock *> &Blocks,126LoopInfo *LI) {127for (BasicBlock *BB : Blocks) {128if (LI->getLoopFor(BB) == L)129continue;130for (Instruction &I : *BB) {131for (Use &U : I.operands()) {132if (const auto *Def = dyn_cast<Instruction>(U)) {133Loop *DefLoop = LI->getLoopFor(Def->getParent());134if (!DefLoop)135continue;136if (DefLoop->contains(L))137return true;138}139}140}141}142return false;143}144145/// Adds ClonedBB to LoopInfo, creates a new loop for ClonedBB if necessary146/// and adds a mapping from the original loop to the new loop to NewLoops.147/// Returns nullptr if no new loop was created and a pointer to the148/// original loop OriginalBB was part of otherwise.149const Loop* llvm::addClonedBlockToLoopInfo(BasicBlock *OriginalBB,150BasicBlock *ClonedBB, LoopInfo *LI,151NewLoopsMap &NewLoops) {152// Figure out which loop New is in.153const Loop *OldLoop = LI->getLoopFor(OriginalBB);154assert(OldLoop && "Should (at least) be in the loop being unrolled!");155156Loop *&NewLoop = NewLoops[OldLoop];157if (!NewLoop) {158// Found a new sub-loop.159assert(OriginalBB == OldLoop->getHeader() &&160"Header should be first in RPO");161162NewLoop = LI->AllocateLoop();163Loop *NewLoopParent = NewLoops.lookup(OldLoop->getParentLoop());164165if (NewLoopParent)166NewLoopParent->addChildLoop(NewLoop);167else168LI->addTopLevelLoop(NewLoop);169170NewLoop->addBasicBlockToLoop(ClonedBB, *LI);171return OldLoop;172} else {173NewLoop->addBasicBlockToLoop(ClonedBB, *LI);174return nullptr;175}176}177178/// The function chooses which type of unroll (epilog or prolog) is more179/// profitabale.180/// Epilog unroll is more profitable when there is PHI that starts from181/// constant. In this case epilog will leave PHI start from constant,182/// but prolog will convert it to non-constant.183///184/// loop:185/// PN = PHI [I, Latch], [CI, PreHeader]186/// I = foo(PN)187/// ...188///189/// Epilog unroll case.190/// loop:191/// PN = PHI [I2, Latch], [CI, PreHeader]192/// I1 = foo(PN)193/// I2 = foo(I1)194/// ...195/// Prolog unroll case.196/// NewPN = PHI [PrologI, Prolog], [CI, PreHeader]197/// loop:198/// PN = PHI [I2, Latch], [NewPN, PreHeader]199/// I1 = foo(PN)200/// I2 = foo(I1)201/// ...202///203static bool isEpilogProfitable(Loop *L) {204BasicBlock *PreHeader = L->getLoopPreheader();205BasicBlock *Header = L->getHeader();206assert(PreHeader && Header);207for (const PHINode &PN : Header->phis()) {208if (isa<ConstantInt>(PN.getIncomingValueForBlock(PreHeader)))209return true;210}211return false;212}213214struct LoadValue {215Instruction *DefI = nullptr;216unsigned Generation = 0;217LoadValue() = default;218LoadValue(Instruction *Inst, unsigned Generation)219: DefI(Inst), Generation(Generation) {}220};221222class StackNode {223ScopedHashTable<const SCEV *, LoadValue>::ScopeTy LoadScope;224unsigned CurrentGeneration;225unsigned ChildGeneration;226DomTreeNode *Node;227DomTreeNode::const_iterator ChildIter;228DomTreeNode::const_iterator EndIter;229bool Processed = false;230231public:232StackNode(ScopedHashTable<const SCEV *, LoadValue> &AvailableLoads,233unsigned cg, DomTreeNode *N, DomTreeNode::const_iterator Child,234DomTreeNode::const_iterator End)235: LoadScope(AvailableLoads), CurrentGeneration(cg), ChildGeneration(cg),236Node(N), ChildIter(Child), EndIter(End) {}237// Accessors.238unsigned currentGeneration() const { return CurrentGeneration; }239unsigned childGeneration() const { return ChildGeneration; }240void childGeneration(unsigned generation) { ChildGeneration = generation; }241DomTreeNode *node() { return Node; }242DomTreeNode::const_iterator childIter() const { return ChildIter; }243244DomTreeNode *nextChild() {245DomTreeNode *Child = *ChildIter;246++ChildIter;247return Child;248}249250DomTreeNode::const_iterator end() const { return EndIter; }251bool isProcessed() const { return Processed; }252void process() { Processed = true; }253};254255Value *getMatchingValue(LoadValue LV, LoadInst *LI, unsigned CurrentGeneration,256BatchAAResults &BAA,257function_ref<MemorySSA *()> GetMSSA) {258if (!LV.DefI)259return nullptr;260if (LV.DefI->getType() != LI->getType())261return nullptr;262if (LV.Generation != CurrentGeneration) {263MemorySSA *MSSA = GetMSSA();264if (!MSSA)265return nullptr;266auto *EarlierMA = MSSA->getMemoryAccess(LV.DefI);267MemoryAccess *LaterDef =268MSSA->getWalker()->getClobberingMemoryAccess(LI, BAA);269if (!MSSA->dominates(LaterDef, EarlierMA))270return nullptr;271}272return LV.DefI;273}274275void loadCSE(Loop *L, DominatorTree &DT, ScalarEvolution &SE, LoopInfo &LI,276BatchAAResults &BAA, function_ref<MemorySSA *()> GetMSSA) {277ScopedHashTable<const SCEV *, LoadValue> AvailableLoads;278SmallVector<std::unique_ptr<StackNode>> NodesToProcess;279DomTreeNode *HeaderD = DT.getNode(L->getHeader());280NodesToProcess.emplace_back(new StackNode(AvailableLoads, 0, HeaderD,281HeaderD->begin(), HeaderD->end()));282283unsigned CurrentGeneration = 0;284while (!NodesToProcess.empty()) {285StackNode *NodeToProcess = &*NodesToProcess.back();286287CurrentGeneration = NodeToProcess->currentGeneration();288289if (!NodeToProcess->isProcessed()) {290// Process the node.291292// If this block has a single predecessor, then the predecessor is the293// parent294// of the domtree node and all of the live out memory values are still295// current in this block. If this block has multiple predecessors, then296// they could have invalidated the live-out memory values of our parent297// value. For now, just be conservative and invalidate memory if this298// block has multiple predecessors.299if (!NodeToProcess->node()->getBlock()->getSinglePredecessor())300++CurrentGeneration;301for (auto &I : make_early_inc_range(*NodeToProcess->node()->getBlock())) {302303auto *Load = dyn_cast<LoadInst>(&I);304if (!Load || !Load->isSimple()) {305if (I.mayWriteToMemory())306CurrentGeneration++;307continue;308}309310const SCEV *PtrSCEV = SE.getSCEV(Load->getPointerOperand());311LoadValue LV = AvailableLoads.lookup(PtrSCEV);312if (Value *M =313getMatchingValue(LV, Load, CurrentGeneration, BAA, GetMSSA)) {314if (LI.replacementPreservesLCSSAForm(Load, M)) {315Load->replaceAllUsesWith(M);316Load->eraseFromParent();317}318} else {319AvailableLoads.insert(PtrSCEV, LoadValue(Load, CurrentGeneration));320}321}322NodeToProcess->childGeneration(CurrentGeneration);323NodeToProcess->process();324} else if (NodeToProcess->childIter() != NodeToProcess->end()) {325// Push the next child onto the stack.326DomTreeNode *Child = NodeToProcess->nextChild();327if (!L->contains(Child->getBlock()))328continue;329NodesToProcess.emplace_back(330new StackNode(AvailableLoads, NodeToProcess->childGeneration(), Child,331Child->begin(), Child->end()));332} else {333// It has been processed, and there are no more children to process,334// so delete it and pop it off the stack.335NodesToProcess.pop_back();336}337}338}339340/// Perform some cleanup and simplifications on loops after unrolling. It is341/// useful to simplify the IV's in the new loop, as well as do a quick342/// simplify/dce pass of the instructions.343void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,344ScalarEvolution *SE, DominatorTree *DT,345AssumptionCache *AC,346const TargetTransformInfo *TTI,347AAResults *AA) {348using namespace llvm::PatternMatch;349350// Simplify any new induction variables in the partially unrolled loop.351if (SE && SimplifyIVs) {352SmallVector<WeakTrackingVH, 16> DeadInsts;353simplifyLoopIVs(L, SE, DT, LI, TTI, DeadInsts);354355// Aggressively clean up dead instructions that simplifyLoopIVs already356// identified. Any remaining should be cleaned up below.357while (!DeadInsts.empty()) {358Value *V = DeadInsts.pop_back_val();359if (Instruction *Inst = dyn_cast_or_null<Instruction>(V))360RecursivelyDeleteTriviallyDeadInstructions(Inst);361}362363if (AA) {364std::unique_ptr<MemorySSA> MSSA = nullptr;365BatchAAResults BAA(*AA);366loadCSE(L, *DT, *SE, *LI, BAA, [L, AA, DT, &MSSA]() -> MemorySSA * {367if (!MSSA)368MSSA.reset(new MemorySSA(*L, AA, DT));369return &*MSSA;370});371}372}373374// At this point, the code is well formed. Perform constprop, instsimplify,375// and dce.376const DataLayout &DL = L->getHeader()->getDataLayout();377SmallVector<WeakTrackingVH, 16> DeadInsts;378for (BasicBlock *BB : L->getBlocks()) {379// Remove repeated debug instructions after loop unrolling.380if (BB->getParent()->getSubprogram())381RemoveRedundantDbgInstrs(BB);382383for (Instruction &Inst : llvm::make_early_inc_range(*BB)) {384if (Value *V = simplifyInstruction(&Inst, {DL, nullptr, DT, AC}))385if (LI->replacementPreservesLCSSAForm(&Inst, V))386Inst.replaceAllUsesWith(V);387if (isInstructionTriviallyDead(&Inst))388DeadInsts.emplace_back(&Inst);389390// Fold ((add X, C1), C2) to (add X, C1+C2). This is very common in391// unrolled loops, and handling this early allows following code to392// identify the IV as a "simple recurrence" without first folding away393// a long chain of adds.394{395Value *X;396const APInt *C1, *C2;397if (match(&Inst, m_Add(m_Add(m_Value(X), m_APInt(C1)), m_APInt(C2)))) {398auto *InnerI = dyn_cast<Instruction>(Inst.getOperand(0));399auto *InnerOBO = cast<OverflowingBinaryOperator>(Inst.getOperand(0));400bool SignedOverflow;401APInt NewC = C1->sadd_ov(*C2, SignedOverflow);402Inst.setOperand(0, X);403Inst.setOperand(1, ConstantInt::get(Inst.getType(), NewC));404Inst.setHasNoUnsignedWrap(Inst.hasNoUnsignedWrap() &&405InnerOBO->hasNoUnsignedWrap());406Inst.setHasNoSignedWrap(Inst.hasNoSignedWrap() &&407InnerOBO->hasNoSignedWrap() &&408!SignedOverflow);409if (InnerI && isInstructionTriviallyDead(InnerI))410DeadInsts.emplace_back(InnerI);411}412}413}414// We can't do recursive deletion until we're done iterating, as we might415// have a phi which (potentially indirectly) uses instructions later in416// the block we're iterating through.417RecursivelyDeleteTriviallyDeadInstructions(DeadInsts);418}419}420421// Loops containing convergent instructions that are uncontrolled or controlled422// from outside the loop must have a count that divides their TripMultiple.423LLVM_ATTRIBUTE_USED424static bool canHaveUnrollRemainder(const Loop *L) {425if (getLoopConvergenceHeart(L))426return false;427428// Check for uncontrolled convergent operations.429for (auto &BB : L->blocks()) {430for (auto &I : *BB) {431if (isa<ConvergenceControlInst>(I))432return true;433if (auto *CB = dyn_cast<CallBase>(&I))434if (CB->isConvergent())435return CB->getConvergenceControlToken();436}437}438return true;439}440441/// Unroll the given loop by Count. The loop must be in LCSSA form. Unrolling442/// can only fail when the loop's latch block is not terminated by a conditional443/// branch instruction. However, if the trip count (and multiple) are not known,444/// loop unrolling will mostly produce more code that is no faster.445///446/// If Runtime is true then UnrollLoop will try to insert a prologue or447/// epilogue that ensures the latch has a trip multiple of Count. UnrollLoop448/// will not runtime-unroll the loop if computing the run-time trip count will449/// be expensive and AllowExpensiveTripCount is false.450///451/// The LoopInfo Analysis that is passed will be kept consistent.452///453/// This utility preserves LoopInfo. It will also preserve ScalarEvolution and454/// DominatorTree if they are non-null.455///456/// If RemainderLoop is non-null, it will receive the remainder loop (if457/// required and not fully unrolled).458LoopUnrollResult459llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,460ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,461const TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE,462bool PreserveLCSSA, Loop **RemainderLoop, AAResults *AA) {463assert(DT && "DomTree is required");464465if (!L->getLoopPreheader()) {466LLVM_DEBUG(dbgs() << " Can't unroll; loop preheader-insertion failed.\n");467return LoopUnrollResult::Unmodified;468}469470if (!L->getLoopLatch()) {471LLVM_DEBUG(dbgs() << " Can't unroll; loop exit-block-insertion failed.\n");472return LoopUnrollResult::Unmodified;473}474475// Loops with indirectbr cannot be cloned.476if (!L->isSafeToClone()) {477LLVM_DEBUG(dbgs() << " Can't unroll; Loop body cannot be cloned.\n");478return LoopUnrollResult::Unmodified;479}480481if (L->getHeader()->hasAddressTaken()) {482// The loop-rotate pass can be helpful to avoid this in many cases.483LLVM_DEBUG(484dbgs() << " Won't unroll loop: address of header block is taken.\n");485return LoopUnrollResult::Unmodified;486}487488assert(ULO.Count > 0);489490// All these values should be taken only after peeling because they might have491// changed.492BasicBlock *Preheader = L->getLoopPreheader();493BasicBlock *Header = L->getHeader();494BasicBlock *LatchBlock = L->getLoopLatch();495SmallVector<BasicBlock *, 4> ExitBlocks;496L->getExitBlocks(ExitBlocks);497std::vector<BasicBlock *> OriginalLoopBlocks = L->getBlocks();498499const unsigned MaxTripCount = SE->getSmallConstantMaxTripCount(L);500const bool MaxOrZero = SE->isBackedgeTakenCountMaxOrZero(L);501unsigned EstimatedLoopInvocationWeight = 0;502std::optional<unsigned> OriginalTripCount =503llvm::getLoopEstimatedTripCount(L, &EstimatedLoopInvocationWeight);504505// Effectively "DCE" unrolled iterations that are beyond the max tripcount506// and will never be executed.507if (MaxTripCount && ULO.Count > MaxTripCount)508ULO.Count = MaxTripCount;509510struct ExitInfo {511unsigned TripCount;512unsigned TripMultiple;513unsigned BreakoutTrip;514bool ExitOnTrue;515BasicBlock *FirstExitingBlock = nullptr;516SmallVector<BasicBlock *> ExitingBlocks;517};518DenseMap<BasicBlock *, ExitInfo> ExitInfos;519SmallVector<BasicBlock *, 4> ExitingBlocks;520L->getExitingBlocks(ExitingBlocks);521for (auto *ExitingBlock : ExitingBlocks) {522// The folding code is not prepared to deal with non-branch instructions523// right now.524auto *BI = dyn_cast<BranchInst>(ExitingBlock->getTerminator());525if (!BI)526continue;527528ExitInfo &Info = ExitInfos.try_emplace(ExitingBlock).first->second;529Info.TripCount = SE->getSmallConstantTripCount(L, ExitingBlock);530Info.TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock);531if (Info.TripCount != 0) {532Info.BreakoutTrip = Info.TripCount % ULO.Count;533Info.TripMultiple = 0;534} else {535Info.BreakoutTrip = Info.TripMultiple =536(unsigned)std::gcd(ULO.Count, Info.TripMultiple);537}538Info.ExitOnTrue = !L->contains(BI->getSuccessor(0));539Info.ExitingBlocks.push_back(ExitingBlock);540LLVM_DEBUG(dbgs() << " Exiting block %" << ExitingBlock->getName()541<< ": TripCount=" << Info.TripCount542<< ", TripMultiple=" << Info.TripMultiple543<< ", BreakoutTrip=" << Info.BreakoutTrip << "\n");544}545546// Are we eliminating the loop control altogether? Note that we can know547// we're eliminating the backedge without knowing exactly which iteration548// of the unrolled body exits.549const bool CompletelyUnroll = ULO.Count == MaxTripCount;550551const bool PreserveOnlyFirst = CompletelyUnroll && MaxOrZero;552553// There's no point in performing runtime unrolling if this unroll count554// results in a full unroll.555if (CompletelyUnroll)556ULO.Runtime = false;557558// Go through all exits of L and see if there are any phi-nodes there. We just559// conservatively assume that they're inserted to preserve LCSSA form, which560// means that complete unrolling might break this form. We need to either fix561// it in-place after the transformation, or entirely rebuild LCSSA. TODO: For562// now we just recompute LCSSA for the outer loop, but it should be possible563// to fix it in-place.564bool NeedToFixLCSSA =565PreserveLCSSA && CompletelyUnroll &&566any_of(ExitBlocks,567[](const BasicBlock *BB) { return isa<PHINode>(BB->begin()); });568569// The current loop unroll pass can unroll loops that have570// (1) single latch; and571// (2a) latch is unconditional; or572// (2b) latch is conditional and is an exiting block573// FIXME: The implementation can be extended to work with more complicated574// cases, e.g. loops with multiple latches.575BranchInst *LatchBI = dyn_cast<BranchInst>(LatchBlock->getTerminator());576577// A conditional branch which exits the loop, which can be optimized to an578// unconditional branch in the unrolled loop in some cases.579bool LatchIsExiting = L->isLoopExiting(LatchBlock);580if (!LatchBI || (LatchBI->isConditional() && !LatchIsExiting)) {581LLVM_DEBUG(582dbgs() << "Can't unroll; a conditional latch must exit the loop");583return LoopUnrollResult::Unmodified;584}585586assert((!ULO.Runtime || canHaveUnrollRemainder(L)) &&587"Can't runtime unroll if loop contains a convergent operation.");588589bool EpilogProfitability =590UnrollRuntimeEpilog.getNumOccurrences() ? UnrollRuntimeEpilog591: isEpilogProfitable(L);592593if (ULO.Runtime &&594!UnrollRuntimeLoopRemainder(L, ULO.Count, ULO.AllowExpensiveTripCount,595EpilogProfitability, ULO.UnrollRemainder,596ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI,597PreserveLCSSA, RemainderLoop)) {598if (ULO.Force)599ULO.Runtime = false;600else {601LLVM_DEBUG(dbgs() << "Won't unroll; remainder loop could not be "602"generated when assuming runtime trip count\n");603return LoopUnrollResult::Unmodified;604}605}606607using namespace ore;608// Report the unrolling decision.609if (CompletelyUnroll) {610LLVM_DEBUG(dbgs() << "COMPLETELY UNROLLING loop %" << Header->getName()611<< " with trip count " << ULO.Count << "!\n");612if (ORE)613ORE->emit([&]() {614return OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),615L->getHeader())616<< "completely unrolled loop with "617<< NV("UnrollCount", ULO.Count) << " iterations";618});619} else {620LLVM_DEBUG(dbgs() << "UNROLLING loop %" << Header->getName() << " by "621<< ULO.Count);622if (ULO.Runtime)623LLVM_DEBUG(dbgs() << " with run-time trip count");624LLVM_DEBUG(dbgs() << "!\n");625626if (ORE)627ORE->emit([&]() {628OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),629L->getHeader());630Diag << "unrolled loop by a factor of " << NV("UnrollCount", ULO.Count);631if (ULO.Runtime)632Diag << " with run-time trip count";633return Diag;634});635}636637// We are going to make changes to this loop. SCEV may be keeping cached info638// about it, in particular about backedge taken count. The changes we make639// are guaranteed to invalidate this information for our loop. It is tempting640// to only invalidate the loop being unrolled, but it is incorrect as long as641// all exiting branches from all inner loops have impact on the outer loops,642// and if something changes inside them then any of outer loops may also643// change. When we forget outermost loop, we also forget all contained loops644// and this is what we need here.645if (SE) {646if (ULO.ForgetAllSCEV)647SE->forgetAllLoops();648else {649SE->forgetTopmostLoop(L);650SE->forgetBlockAndLoopDispositions();651}652}653654if (!LatchIsExiting)655++NumUnrolledNotLatch;656657// For the first iteration of the loop, we should use the precloned values for658// PHI nodes. Insert associations now.659ValueToValueMapTy LastValueMap;660std::vector<PHINode*> OrigPHINode;661for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {662OrigPHINode.push_back(cast<PHINode>(I));663}664665std::vector<BasicBlock *> Headers;666std::vector<BasicBlock *> Latches;667Headers.push_back(Header);668Latches.push_back(LatchBlock);669670// The current on-the-fly SSA update requires blocks to be processed in671// reverse postorder so that LastValueMap contains the correct value at each672// exit.673LoopBlocksDFS DFS(L);674DFS.perform(LI);675676// Stash the DFS iterators before adding blocks to the loop.677LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();678LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();679680std::vector<BasicBlock*> UnrolledLoopBlocks = L->getBlocks();681682// Loop Unrolling might create new loops. While we do preserve LoopInfo, we683// might break loop-simplified form for these loops (as they, e.g., would684// share the same exit blocks). We'll keep track of loops for which we can685// break this so that later we can re-simplify them.686SmallSetVector<Loop *, 4> LoopsToSimplify;687for (Loop *SubLoop : *L)688LoopsToSimplify.insert(SubLoop);689690// When a FSDiscriminator is enabled, we don't need to add the multiply691// factors to the discriminators.692if (Header->getParent()->shouldEmitDebugInfoForProfiling() &&693!EnableFSDiscriminator)694for (BasicBlock *BB : L->getBlocks())695for (Instruction &I : *BB)696if (!I.isDebugOrPseudoInst())697if (const DILocation *DIL = I.getDebugLoc()) {698auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(ULO.Count);699if (NewDIL)700I.setDebugLoc(*NewDIL);701else702LLVM_DEBUG(dbgs()703<< "Failed to create new discriminator: "704<< DIL->getFilename() << " Line: " << DIL->getLine());705}706707// Identify what noalias metadata is inside the loop: if it is inside the708// loop, the associated metadata must be cloned for each iteration.709SmallVector<MDNode *, 6> LoopLocalNoAliasDeclScopes;710identifyNoAliasScopesToClone(L->getBlocks(), LoopLocalNoAliasDeclScopes);711712// We place the unrolled iterations immediately after the original loop713// latch. This is a reasonable default placement if we don't have block714// frequencies, and if we do, well the layout will be adjusted later.715auto BlockInsertPt = std::next(LatchBlock->getIterator());716for (unsigned It = 1; It != ULO.Count; ++It) {717SmallVector<BasicBlock *, 8> NewBlocks;718SmallDenseMap<const Loop *, Loop *, 4> NewLoops;719NewLoops[L] = L;720721for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {722ValueToValueMapTy VMap;723BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));724Header->getParent()->insert(BlockInsertPt, New);725726assert((*BB != Header || LI->getLoopFor(*BB) == L) &&727"Header should not be in a sub-loop");728// Tell LI about New.729const Loop *OldLoop = addClonedBlockToLoopInfo(*BB, New, LI, NewLoops);730if (OldLoop)731LoopsToSimplify.insert(NewLoops[OldLoop]);732733if (*BB == Header) {734// Loop over all of the PHI nodes in the block, changing them to use735// the incoming values from the previous block.736for (PHINode *OrigPHI : OrigPHINode) {737PHINode *NewPHI = cast<PHINode>(VMap[OrigPHI]);738Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock);739if (Instruction *InValI = dyn_cast<Instruction>(InVal))740if (It > 1 && L->contains(InValI))741InVal = LastValueMap[InValI];742VMap[OrigPHI] = InVal;743NewPHI->eraseFromParent();744}745746// Eliminate copies of the loop heart intrinsic, if any.747if (ULO.Heart) {748auto it = VMap.find(ULO.Heart);749assert(it != VMap.end());750Instruction *heartCopy = cast<Instruction>(it->second);751heartCopy->eraseFromParent();752VMap.erase(it);753}754}755756// Update our running map of newest clones757LastValueMap[*BB] = New;758for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();759VI != VE; ++VI)760LastValueMap[VI->first] = VI->second;761762// Add phi entries for newly created values to all exit blocks.763for (BasicBlock *Succ : successors(*BB)) {764if (L->contains(Succ))765continue;766for (PHINode &PHI : Succ->phis()) {767Value *Incoming = PHI.getIncomingValueForBlock(*BB);768ValueToValueMapTy::iterator It = LastValueMap.find(Incoming);769if (It != LastValueMap.end())770Incoming = It->second;771PHI.addIncoming(Incoming, New);772SE->forgetValue(&PHI);773}774}775// Keep track of new headers and latches as we create them, so that776// we can insert the proper branches later.777if (*BB == Header)778Headers.push_back(New);779if (*BB == LatchBlock)780Latches.push_back(New);781782// Keep track of the exiting block and its successor block contained in783// the loop for the current iteration.784auto ExitInfoIt = ExitInfos.find(*BB);785if (ExitInfoIt != ExitInfos.end())786ExitInfoIt->second.ExitingBlocks.push_back(New);787788NewBlocks.push_back(New);789UnrolledLoopBlocks.push_back(New);790791// Update DomTree: since we just copy the loop body, and each copy has a792// dedicated entry block (copy of the header block), this header's copy793// dominates all copied blocks. That means, dominance relations in the794// copied body are the same as in the original body.795if (*BB == Header)796DT->addNewBlock(New, Latches[It - 1]);797else {798auto BBDomNode = DT->getNode(*BB);799auto BBIDom = BBDomNode->getIDom();800BasicBlock *OriginalBBIDom = BBIDom->getBlock();801DT->addNewBlock(802New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));803}804}805806// Remap all instructions in the most recent iteration807remapInstructionsInBlocks(NewBlocks, LastValueMap);808for (BasicBlock *NewBlock : NewBlocks)809for (Instruction &I : *NewBlock)810if (auto *II = dyn_cast<AssumeInst>(&I))811AC->registerAssumption(II);812813{814// Identify what other metadata depends on the cloned version. After815// cloning, replace the metadata with the corrected version for both816// memory instructions and noalias intrinsics.817std::string ext = (Twine("It") + Twine(It)).str();818cloneAndAdaptNoAliasScopes(LoopLocalNoAliasDeclScopes, NewBlocks,819Header->getContext(), ext);820}821}822823// Loop over the PHI nodes in the original block, setting incoming values.824for (PHINode *PN : OrigPHINode) {825if (CompletelyUnroll) {826PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader));827PN->eraseFromParent();828} else if (ULO.Count > 1) {829Value *InVal = PN->removeIncomingValue(LatchBlock, false);830// If this value was defined in the loop, take the value defined by the831// last iteration of the loop.832if (Instruction *InValI = dyn_cast<Instruction>(InVal)) {833if (L->contains(InValI))834InVal = LastValueMap[InVal];835}836assert(Latches.back() == LastValueMap[LatchBlock] && "bad last latch");837PN->addIncoming(InVal, Latches.back());838}839}840841// Connect latches of the unrolled iterations to the headers of the next842// iteration. Currently they point to the header of the same iteration.843for (unsigned i = 0, e = Latches.size(); i != e; ++i) {844unsigned j = (i + 1) % e;845Latches[i]->getTerminator()->replaceSuccessorWith(Headers[i], Headers[j]);846}847848// Update dominators of blocks we might reach through exits.849// Immediate dominator of such block might change, because we add more850// routes which can lead to the exit: we can now reach it from the copied851// iterations too.852if (ULO.Count > 1) {853for (auto *BB : OriginalLoopBlocks) {854auto *BBDomNode = DT->getNode(BB);855SmallVector<BasicBlock *, 16> ChildrenToUpdate;856for (auto *ChildDomNode : BBDomNode->children()) {857auto *ChildBB = ChildDomNode->getBlock();858if (!L->contains(ChildBB))859ChildrenToUpdate.push_back(ChildBB);860}861// The new idom of the block will be the nearest common dominator862// of all copies of the previous idom. This is equivalent to the863// nearest common dominator of the previous idom and the first latch,864// which dominates all copies of the previous idom.865BasicBlock *NewIDom = DT->findNearestCommonDominator(BB, LatchBlock);866for (auto *ChildBB : ChildrenToUpdate)867DT->changeImmediateDominator(ChildBB, NewIDom);868}869}870871assert(!UnrollVerifyDomtree ||872DT->verify(DominatorTree::VerificationLevel::Fast));873874SmallVector<DominatorTree::UpdateType> DTUpdates;875auto SetDest = [&](BasicBlock *Src, bool WillExit, bool ExitOnTrue) {876auto *Term = cast<BranchInst>(Src->getTerminator());877const unsigned Idx = ExitOnTrue ^ WillExit;878BasicBlock *Dest = Term->getSuccessor(Idx);879BasicBlock *DeadSucc = Term->getSuccessor(1-Idx);880881// Remove predecessors from all non-Dest successors.882DeadSucc->removePredecessor(Src, /* KeepOneInputPHIs */ true);883884// Replace the conditional branch with an unconditional one.885BranchInst::Create(Dest, Term->getIterator());886Term->eraseFromParent();887888DTUpdates.emplace_back(DominatorTree::Delete, Src, DeadSucc);889};890891auto WillExit = [&](const ExitInfo &Info, unsigned i, unsigned j,892bool IsLatch) -> std::optional<bool> {893if (CompletelyUnroll) {894if (PreserveOnlyFirst) {895if (i == 0)896return std::nullopt;897return j == 0;898}899// Complete (but possibly inexact) unrolling900if (j == 0)901return true;902if (Info.TripCount && j != Info.TripCount)903return false;904return std::nullopt;905}906907if (ULO.Runtime) {908// If runtime unrolling inserts a prologue, information about non-latch909// exits may be stale.910if (IsLatch && j != 0)911return false;912return std::nullopt;913}914915if (j != Info.BreakoutTrip &&916(Info.TripMultiple == 0 || j % Info.TripMultiple != 0)) {917// If we know the trip count or a multiple of it, we can safely use an918// unconditional branch for some iterations.919return false;920}921return std::nullopt;922};923924// Fold branches for iterations where we know that they will exit or not925// exit.926for (auto &Pair : ExitInfos) {927ExitInfo &Info = Pair.second;928for (unsigned i = 0, e = Info.ExitingBlocks.size(); i != e; ++i) {929// The branch destination.930unsigned j = (i + 1) % e;931bool IsLatch = Pair.first == LatchBlock;932std::optional<bool> KnownWillExit = WillExit(Info, i, j, IsLatch);933if (!KnownWillExit) {934if (!Info.FirstExitingBlock)935Info.FirstExitingBlock = Info.ExitingBlocks[i];936continue;937}938939// We don't fold known-exiting branches for non-latch exits here,940// because this ensures that both all loop blocks and all exit blocks941// remain reachable in the CFG.942// TODO: We could fold these branches, but it would require much more943// sophisticated updates to LoopInfo.944if (*KnownWillExit && !IsLatch) {945if (!Info.FirstExitingBlock)946Info.FirstExitingBlock = Info.ExitingBlocks[i];947continue;948}949950SetDest(Info.ExitingBlocks[i], *KnownWillExit, Info.ExitOnTrue);951}952}953954DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);955DomTreeUpdater *DTUToUse = &DTU;956if (ExitingBlocks.size() == 1 && ExitInfos.size() == 1) {957// Manually update the DT if there's a single exiting node. In that case958// there's a single exit node and it is sufficient to update the nodes959// immediately dominated by the original exiting block. They will become960// dominated by the first exiting block that leaves the loop after961// unrolling. Note that the CFG inside the loop does not change, so there's962// no need to update the DT inside the unrolled loop.963DTUToUse = nullptr;964auto &[OriginalExit, Info] = *ExitInfos.begin();965if (!Info.FirstExitingBlock)966Info.FirstExitingBlock = Info.ExitingBlocks.back();967for (auto *C : to_vector(DT->getNode(OriginalExit)->children())) {968if (L->contains(C->getBlock()))969continue;970C->setIDom(DT->getNode(Info.FirstExitingBlock));971}972} else {973DTU.applyUpdates(DTUpdates);974}975976// When completely unrolling, the last latch becomes unreachable.977if (!LatchIsExiting && CompletelyUnroll) {978// There is no need to update the DT here, because there must be a unique979// latch. Hence if the latch is not exiting it must directly branch back to980// the original loop header and does not dominate any nodes.981assert(LatchBlock->getSingleSuccessor() && "Loop with multiple latches?");982changeToUnreachable(Latches.back()->getTerminator(), PreserveLCSSA);983}984985// Merge adjacent basic blocks, if possible.986for (BasicBlock *Latch : Latches) {987BranchInst *Term = dyn_cast<BranchInst>(Latch->getTerminator());988assert((Term ||989(CompletelyUnroll && !LatchIsExiting && Latch == Latches.back())) &&990"Need a branch as terminator, except when fully unrolling with "991"unconditional latch");992if (Term && Term->isUnconditional()) {993BasicBlock *Dest = Term->getSuccessor(0);994BasicBlock *Fold = Dest->getUniquePredecessor();995if (MergeBlockIntoPredecessor(Dest, /*DTU=*/DTUToUse, LI,996/*MSSAU=*/nullptr, /*MemDep=*/nullptr,997/*PredecessorWithTwoSuccessors=*/false,998DTUToUse ? nullptr : DT)) {999// Dest has been folded into Fold. Update our worklists accordingly.1000std::replace(Latches.begin(), Latches.end(), Dest, Fold);1001llvm::erase(UnrolledLoopBlocks, Dest);1002}1003}1004}10051006if (DTUToUse) {1007// Apply updates to the DomTree.1008DT = &DTU.getDomTree();1009}1010assert(!UnrollVerifyDomtree ||1011DT->verify(DominatorTree::VerificationLevel::Fast));10121013// At this point, the code is well formed. We now simplify the unrolled loop,1014// doing constant propagation and dead code elimination as we go.1015simplifyLoopAfterUnroll(L, !CompletelyUnroll && ULO.Count > 1, LI, SE, DT, AC,1016TTI, AA);10171018NumCompletelyUnrolled += CompletelyUnroll;1019++NumUnrolled;10201021Loop *OuterL = L->getParentLoop();1022// Update LoopInfo if the loop is completely removed.1023if (CompletelyUnroll) {1024LI->erase(L);1025// We shouldn't try to use `L` anymore.1026L = nullptr;1027} else if (OriginalTripCount) {1028// Update the trip count. Note that the remainder has already logic1029// computing it in `UnrollRuntimeLoopRemainder`.1030setLoopEstimatedTripCount(L, *OriginalTripCount / ULO.Count,1031EstimatedLoopInvocationWeight);1032}10331034// LoopInfo should not be valid, confirm that.1035if (UnrollVerifyLoopInfo)1036LI->verify(*DT);10371038// After complete unrolling most of the blocks should be contained in OuterL.1039// However, some of them might happen to be out of OuterL (e.g. if they1040// precede a loop exit). In this case we might need to insert PHI nodes in1041// order to preserve LCSSA form.1042// We don't need to check this if we already know that we need to fix LCSSA1043// form.1044// TODO: For now we just recompute LCSSA for the outer loop in this case, but1045// it should be possible to fix it in-place.1046if (PreserveLCSSA && OuterL && CompletelyUnroll && !NeedToFixLCSSA)1047NeedToFixLCSSA |= ::needToInsertPhisForLCSSA(OuterL, UnrolledLoopBlocks, LI);10481049// Make sure that loop-simplify form is preserved. We want to simplify1050// at least one layer outside of the loop that was unrolled so that any1051// changes to the parent loop exposed by the unrolling are considered.1052if (OuterL) {1053// OuterL includes all loops for which we can break loop-simplify, so1054// it's sufficient to simplify only it (it'll recursively simplify inner1055// loops too).1056if (NeedToFixLCSSA) {1057// LCSSA must be performed on the outermost affected loop. The unrolled1058// loop's last loop latch is guaranteed to be in the outermost loop1059// after LoopInfo's been updated by LoopInfo::erase.1060Loop *LatchLoop = LI->getLoopFor(Latches.back());1061Loop *FixLCSSALoop = OuterL;1062if (!FixLCSSALoop->contains(LatchLoop))1063while (FixLCSSALoop->getParentLoop() != LatchLoop)1064FixLCSSALoop = FixLCSSALoop->getParentLoop();10651066formLCSSARecursively(*FixLCSSALoop, *DT, LI, SE);1067} else if (PreserveLCSSA) {1068assert(OuterL->isLCSSAForm(*DT) &&1069"Loops should be in LCSSA form after loop-unroll.");1070}10711072// TODO: That potentially might be compile-time expensive. We should try1073// to fix the loop-simplified form incrementally.1074simplifyLoop(OuterL, DT, LI, SE, AC, nullptr, PreserveLCSSA);1075} else {1076// Simplify loops for which we might've broken loop-simplify form.1077for (Loop *SubLoop : LoopsToSimplify)1078simplifyLoop(SubLoop, DT, LI, SE, AC, nullptr, PreserveLCSSA);1079}10801081return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled1082: LoopUnrollResult::PartiallyUnrolled;1083}10841085/// Given an llvm.loop loop id metadata node, returns the loop hint metadata1086/// node with the given name (for example, "llvm.loop.unroll.count"). If no1087/// such metadata node exists, then nullptr is returned.1088MDNode *llvm::GetUnrollMetadata(MDNode *LoopID, StringRef Name) {1089// First operand should refer to the loop id itself.1090assert(LoopID->getNumOperands() > 0 && "requires at least one operand");1091assert(LoopID->getOperand(0) == LoopID && "invalid loop id");10921093for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {1094MDNode *MD = dyn_cast<MDNode>(MDO);1095if (!MD)1096continue;10971098MDString *S = dyn_cast<MDString>(MD->getOperand(0));1099if (!S)1100continue;11011102if (Name == S->getString())1103return MD;1104}1105return nullptr;1106}110711081109