Path: blob/main/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFuse.cpp
35266 views
//===- LoopFuse.cpp - Loop Fusion Pass ------------------------------------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7///8/// \file9/// This file implements the loop fusion pass.10/// The implementation is largely based on the following document:11///12/// Code Transformations to Augment the Scope of Loop Fusion in a13/// Production Compiler14/// Christopher Mark Barton15/// MSc Thesis16/// https://webdocs.cs.ualberta.ca/~amaral/thesis/ChristopherBartonMSc.pdf17///18/// The general approach taken is to collect sets of control flow equivalent19/// loops and test whether they can be fused. The necessary conditions for20/// fusion are:21/// 1. The loops must be adjacent (there cannot be any statements between22/// the two loops).23/// 2. The loops must be conforming (they must execute the same number of24/// iterations).25/// 3. The loops must be control flow equivalent (if one loop executes, the26/// other is guaranteed to execute).27/// 4. There cannot be any negative distance dependencies between the loops.28/// If all of these conditions are satisfied, it is safe to fuse the loops.29///30/// This implementation creates FusionCandidates that represent the loop and the31/// necessary information needed by fusion. It then operates on the fusion32/// candidates, first confirming that the candidate is eligible for fusion. The33/// candidates are then collected into control flow equivalent sets, sorted in34/// dominance order. Each set of control flow equivalent candidates is then35/// traversed, attempting to fuse pairs of candidates in the set. If all36/// requirements for fusion are met, the two candidates are fused, creating a37/// new (fused) candidate which is then added back into the set to consider for38/// additional fusion.39///40/// This implementation currently does not make any modifications to remove41/// conditions for fusion. Code transformations to make loops conform to each of42/// the conditions for fusion are discussed in more detail in the document43/// above. These can be added to the current implementation in the future.44//===----------------------------------------------------------------------===//4546#include "llvm/Transforms/Scalar/LoopFuse.h"47#include "llvm/ADT/Statistic.h"48#include "llvm/Analysis/AssumptionCache.h"49#include "llvm/Analysis/DependenceAnalysis.h"50#include "llvm/Analysis/DomTreeUpdater.h"51#include "llvm/Analysis/LoopInfo.h"52#include "llvm/Analysis/OptimizationRemarkEmitter.h"53#include "llvm/Analysis/PostDominators.h"54#include "llvm/Analysis/ScalarEvolution.h"55#include "llvm/Analysis/ScalarEvolutionExpressions.h"56#include "llvm/Analysis/TargetTransformInfo.h"57#include "llvm/IR/Function.h"58#include "llvm/IR/Verifier.h"59#include "llvm/Support/CommandLine.h"60#include "llvm/Support/Debug.h"61#include "llvm/Support/raw_ostream.h"62#include "llvm/Transforms/Utils.h"63#include "llvm/Transforms/Utils/BasicBlockUtils.h"64#include "llvm/Transforms/Utils/CodeMoverUtils.h"65#include "llvm/Transforms/Utils/LoopPeel.h"66#include "llvm/Transforms/Utils/LoopSimplify.h"6768using namespace llvm;6970#define DEBUG_TYPE "loop-fusion"7172STATISTIC(FuseCounter, "Loops fused");73STATISTIC(NumFusionCandidates, "Number of candidates for loop fusion");74STATISTIC(InvalidPreheader, "Loop has invalid preheader");75STATISTIC(InvalidHeader, "Loop has invalid header");76STATISTIC(InvalidExitingBlock, "Loop has invalid exiting blocks");77STATISTIC(InvalidExitBlock, "Loop has invalid exit block");78STATISTIC(InvalidLatch, "Loop has invalid latch");79STATISTIC(InvalidLoop, "Loop is invalid");80STATISTIC(AddressTakenBB, "Basic block has address taken");81STATISTIC(MayThrowException, "Loop may throw an exception");82STATISTIC(ContainsVolatileAccess, "Loop contains a volatile access");83STATISTIC(NotSimplifiedForm, "Loop is not in simplified form");84STATISTIC(InvalidDependencies, "Dependencies prevent fusion");85STATISTIC(UnknownTripCount, "Loop has unknown trip count");86STATISTIC(UncomputableTripCount, "SCEV cannot compute trip count of loop");87STATISTIC(NonEqualTripCount, "Loop trip counts are not the same");88STATISTIC(NonAdjacent, "Loops are not adjacent");89STATISTIC(90NonEmptyPreheader,91"Loop has a non-empty preheader with instructions that cannot be moved");92STATISTIC(FusionNotBeneficial, "Fusion is not beneficial");93STATISTIC(NonIdenticalGuards, "Candidates have different guards");94STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block with "95"instructions that cannot be moved");96STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block with "97"instructions that cannot be moved");98STATISTIC(NotRotated, "Candidate is not rotated");99STATISTIC(OnlySecondCandidateIsGuarded,100"The second candidate is guarded while the first one is not");101STATISTIC(NumHoistedInsts, "Number of hoisted preheader instructions.");102STATISTIC(NumSunkInsts, "Number of hoisted preheader instructions.");103104enum FusionDependenceAnalysisChoice {105FUSION_DEPENDENCE_ANALYSIS_SCEV,106FUSION_DEPENDENCE_ANALYSIS_DA,107FUSION_DEPENDENCE_ANALYSIS_ALL,108};109110static cl::opt<FusionDependenceAnalysisChoice> FusionDependenceAnalysis(111"loop-fusion-dependence-analysis",112cl::desc("Which dependence analysis should loop fusion use?"),113cl::values(clEnumValN(FUSION_DEPENDENCE_ANALYSIS_SCEV, "scev",114"Use the scalar evolution interface"),115clEnumValN(FUSION_DEPENDENCE_ANALYSIS_DA, "da",116"Use the dependence analysis interface"),117clEnumValN(FUSION_DEPENDENCE_ANALYSIS_ALL, "all",118"Use all available analyses")),119cl::Hidden, cl::init(FUSION_DEPENDENCE_ANALYSIS_ALL));120121static cl::opt<unsigned> FusionPeelMaxCount(122"loop-fusion-peel-max-count", cl::init(0), cl::Hidden,123cl::desc("Max number of iterations to be peeled from a loop, such that "124"fusion can take place"));125126#ifndef NDEBUG127static cl::opt<bool>128VerboseFusionDebugging("loop-fusion-verbose-debug",129cl::desc("Enable verbose debugging for Loop Fusion"),130cl::Hidden, cl::init(false));131#endif132133namespace {134/// This class is used to represent a candidate for loop fusion. When it is135/// constructed, it checks the conditions for loop fusion to ensure that it136/// represents a valid candidate. It caches several parts of a loop that are137/// used throughout loop fusion (e.g., loop preheader, loop header, etc) instead138/// of continually querying the underlying Loop to retrieve these values. It is139/// assumed these will not change throughout loop fusion.140///141/// The invalidate method should be used to indicate that the FusionCandidate is142/// no longer a valid candidate for fusion. Similarly, the isValid() method can143/// be used to ensure that the FusionCandidate is still valid for fusion.144struct FusionCandidate {145/// Cache of parts of the loop used throughout loop fusion. These should not146/// need to change throughout the analysis and transformation.147/// These parts are cached to avoid repeatedly looking up in the Loop class.148149/// Preheader of the loop this candidate represents150BasicBlock *Preheader;151/// Header of the loop this candidate represents152BasicBlock *Header;153/// Blocks in the loop that exit the loop154BasicBlock *ExitingBlock;155/// The successor block of this loop (where the exiting blocks go to)156BasicBlock *ExitBlock;157/// Latch of the loop158BasicBlock *Latch;159/// The loop that this fusion candidate represents160Loop *L;161/// Vector of instructions in this loop that read from memory162SmallVector<Instruction *, 16> MemReads;163/// Vector of instructions in this loop that write to memory164SmallVector<Instruction *, 16> MemWrites;165/// Are all of the members of this fusion candidate still valid166bool Valid;167/// Guard branch of the loop, if it exists168BranchInst *GuardBranch;169/// Peeling Paramaters of the Loop.170TTI::PeelingPreferences PP;171/// Can you Peel this Loop?172bool AbleToPeel;173/// Has this loop been Peeled174bool Peeled;175176/// Dominator and PostDominator trees are needed for the177/// FusionCandidateCompare function, required by FusionCandidateSet to178/// determine where the FusionCandidate should be inserted into the set. These179/// are used to establish ordering of the FusionCandidates based on dominance.180DominatorTree &DT;181const PostDominatorTree *PDT;182183OptimizationRemarkEmitter &ORE;184185FusionCandidate(Loop *L, DominatorTree &DT, const PostDominatorTree *PDT,186OptimizationRemarkEmitter &ORE, TTI::PeelingPreferences PP)187: Preheader(L->getLoopPreheader()), Header(L->getHeader()),188ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()),189Latch(L->getLoopLatch()), L(L), Valid(true),190GuardBranch(L->getLoopGuardBranch()), PP(PP), AbleToPeel(canPeel(L)),191Peeled(false), DT(DT), PDT(PDT), ORE(ORE) {192193// Walk over all blocks in the loop and check for conditions that may194// prevent fusion. For each block, walk over all instructions and collect195// the memory reads and writes If any instructions that prevent fusion are196// found, invalidate this object and return.197for (BasicBlock *BB : L->blocks()) {198if (BB->hasAddressTaken()) {199invalidate();200reportInvalidCandidate(AddressTakenBB);201return;202}203204for (Instruction &I : *BB) {205if (I.mayThrow()) {206invalidate();207reportInvalidCandidate(MayThrowException);208return;209}210if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {211if (SI->isVolatile()) {212invalidate();213reportInvalidCandidate(ContainsVolatileAccess);214return;215}216}217if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {218if (LI->isVolatile()) {219invalidate();220reportInvalidCandidate(ContainsVolatileAccess);221return;222}223}224if (I.mayWriteToMemory())225MemWrites.push_back(&I);226if (I.mayReadFromMemory())227MemReads.push_back(&I);228}229}230}231232/// Check if all members of the class are valid.233bool isValid() const {234return Preheader && Header && ExitingBlock && ExitBlock && Latch && L &&235!L->isInvalid() && Valid;236}237238/// Verify that all members are in sync with the Loop object.239void verify() const {240assert(isValid() && "Candidate is not valid!!");241assert(!L->isInvalid() && "Loop is invalid!");242assert(Preheader == L->getLoopPreheader() && "Preheader is out of sync");243assert(Header == L->getHeader() && "Header is out of sync");244assert(ExitingBlock == L->getExitingBlock() &&245"Exiting Blocks is out of sync");246assert(ExitBlock == L->getExitBlock() && "Exit block is out of sync");247assert(Latch == L->getLoopLatch() && "Latch is out of sync");248}249250/// Get the entry block for this fusion candidate.251///252/// If this fusion candidate represents a guarded loop, the entry block is the253/// loop guard block. If it represents an unguarded loop, the entry block is254/// the preheader of the loop.255BasicBlock *getEntryBlock() const {256if (GuardBranch)257return GuardBranch->getParent();258else259return Preheader;260}261262/// After Peeling the loop is modified quite a bit, hence all of the Blocks263/// need to be updated accordingly.264void updateAfterPeeling() {265Preheader = L->getLoopPreheader();266Header = L->getHeader();267ExitingBlock = L->getExitingBlock();268ExitBlock = L->getExitBlock();269Latch = L->getLoopLatch();270verify();271}272273/// Given a guarded loop, get the successor of the guard that is not in the274/// loop.275///276/// This method returns the successor of the loop guard that is not located277/// within the loop (i.e., the successor of the guard that is not the278/// preheader).279/// This method is only valid for guarded loops.280BasicBlock *getNonLoopBlock() const {281assert(GuardBranch && "Only valid on guarded loops.");282assert(GuardBranch->isConditional() &&283"Expecting guard to be a conditional branch.");284if (Peeled)285return GuardBranch->getSuccessor(1);286return (GuardBranch->getSuccessor(0) == Preheader)287? GuardBranch->getSuccessor(1)288: GuardBranch->getSuccessor(0);289}290291#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)292LLVM_DUMP_METHOD void dump() const {293dbgs() << "\tGuardBranch: ";294if (GuardBranch)295dbgs() << *GuardBranch;296else297dbgs() << "nullptr";298dbgs() << "\n"299<< (GuardBranch ? GuardBranch->getName() : "nullptr") << "\n"300<< "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr")301<< "\n"302<< "\tHeader: " << (Header ? Header->getName() : "nullptr") << "\n"303<< "\tExitingBB: "304<< (ExitingBlock ? ExitingBlock->getName() : "nullptr") << "\n"305<< "\tExitBB: " << (ExitBlock ? ExitBlock->getName() : "nullptr")306<< "\n"307<< "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n"308<< "\tEntryBlock: "309<< (getEntryBlock() ? getEntryBlock()->getName() : "nullptr")310<< "\n";311}312#endif313314/// Determine if a fusion candidate (representing a loop) is eligible for315/// fusion. Note that this only checks whether a single loop can be fused - it316/// does not check whether it is *legal* to fuse two loops together.317bool isEligibleForFusion(ScalarEvolution &SE) const {318if (!isValid()) {319LLVM_DEBUG(dbgs() << "FC has invalid CFG requirements!\n");320if (!Preheader)321++InvalidPreheader;322if (!Header)323++InvalidHeader;324if (!ExitingBlock)325++InvalidExitingBlock;326if (!ExitBlock)327++InvalidExitBlock;328if (!Latch)329++InvalidLatch;330if (L->isInvalid())331++InvalidLoop;332333return false;334}335336// Require ScalarEvolution to be able to determine a trip count.337if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {338LLVM_DEBUG(dbgs() << "Loop " << L->getName()339<< " trip count not computable!\n");340return reportInvalidCandidate(UnknownTripCount);341}342343if (!L->isLoopSimplifyForm()) {344LLVM_DEBUG(dbgs() << "Loop " << L->getName()345<< " is not in simplified form!\n");346return reportInvalidCandidate(NotSimplifiedForm);347}348349if (!L->isRotatedForm()) {350LLVM_DEBUG(dbgs() << "Loop " << L->getName() << " is not rotated!\n");351return reportInvalidCandidate(NotRotated);352}353354return true;355}356357private:358// This is only used internally for now, to clear the MemWrites and MemReads359// list and setting Valid to false. I can't envision other uses of this right360// now, since once FusionCandidates are put into the FusionCandidateSet they361// are immutable. Thus, any time we need to change/update a FusionCandidate,362// we must create a new one and insert it into the FusionCandidateSet to363// ensure the FusionCandidateSet remains ordered correctly.364void invalidate() {365MemWrites.clear();366MemReads.clear();367Valid = false;368}369370bool reportInvalidCandidate(llvm::Statistic &Stat) const {371using namespace ore;372assert(L && Preheader && "Fusion candidate not initialized properly!");373#if LLVM_ENABLE_STATS374++Stat;375ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, Stat.getName(),376L->getStartLoc(), Preheader)377<< "[" << Preheader->getParent()->getName() << "]: "378<< "Loop is not a candidate for fusion: " << Stat.getDesc());379#endif380return false;381}382};383384struct FusionCandidateCompare {385/// Comparison functor to sort two Control Flow Equivalent fusion candidates386/// into dominance order.387/// If LHS dominates RHS and RHS post-dominates LHS, return true;388/// If RHS dominates LHS and LHS post-dominates RHS, return false;389/// If both LHS and RHS are not dominating each other then, non-strictly390/// post dominate check will decide the order of candidates. If RHS391/// non-strictly post dominates LHS then, return true. If LHS non-strictly392/// post dominates RHS then, return false. If both are non-strictly post393/// dominate each other then, level in the post dominator tree will decide394/// the order of candidates.395bool operator()(const FusionCandidate &LHS,396const FusionCandidate &RHS) const {397const DominatorTree *DT = &(LHS.DT);398399BasicBlock *LHSEntryBlock = LHS.getEntryBlock();400BasicBlock *RHSEntryBlock = RHS.getEntryBlock();401402// Do not save PDT to local variable as it is only used in asserts and thus403// will trigger an unused variable warning if building without asserts.404assert(DT && LHS.PDT && "Expecting valid dominator tree");405406// Do this compare first so if LHS == RHS, function returns false.407if (DT->dominates(RHSEntryBlock, LHSEntryBlock)) {408// RHS dominates LHS409// Verify LHS post-dominates RHS410assert(LHS.PDT->dominates(LHSEntryBlock, RHSEntryBlock));411return false;412}413414if (DT->dominates(LHSEntryBlock, RHSEntryBlock)) {415// Verify RHS Postdominates LHS416assert(LHS.PDT->dominates(RHSEntryBlock, LHSEntryBlock));417return true;418}419420// If two FusionCandidates are in the same level of dominator tree,421// they will not dominate each other, but may still be control flow422// equivalent. To sort those FusionCandidates, nonStrictlyPostDominate()423// function is needed.424bool WrongOrder =425nonStrictlyPostDominate(LHSEntryBlock, RHSEntryBlock, DT, LHS.PDT);426bool RightOrder =427nonStrictlyPostDominate(RHSEntryBlock, LHSEntryBlock, DT, LHS.PDT);428if (WrongOrder && RightOrder) {429// If common predecessor of LHS and RHS post dominates both430// FusionCandidates then, Order of FusionCandidate can be431// identified by its level in post dominator tree.432DomTreeNode *LNode = LHS.PDT->getNode(LHSEntryBlock);433DomTreeNode *RNode = LHS.PDT->getNode(RHSEntryBlock);434return LNode->getLevel() > RNode->getLevel();435} else if (WrongOrder)436return false;437else if (RightOrder)438return true;439440// If LHS does not non-strict Postdominate RHS and RHS does not non-strict441// Postdominate LHS then, there is no dominance relationship between the442// two FusionCandidates. Thus, they should not be in the same set together.443llvm_unreachable(444"No dominance relationship between these fusion candidates!");445}446};447448using LoopVector = SmallVector<Loop *, 4>;449450// Set of Control Flow Equivalent (CFE) Fusion Candidates, sorted in dominance451// order. Thus, if FC0 comes *before* FC1 in a FusionCandidateSet, then FC0452// dominates FC1 and FC1 post-dominates FC0.453// std::set was chosen because we want a sorted data structure with stable454// iterators. A subsequent patch to loop fusion will enable fusing non-adjacent455// loops by moving intervening code around. When this intervening code contains456// loops, those loops will be moved also. The corresponding FusionCandidates457// will also need to be moved accordingly. As this is done, having stable458// iterators will simplify the logic. Similarly, having an efficient insert that459// keeps the FusionCandidateSet sorted will also simplify the implementation.460using FusionCandidateSet = std::set<FusionCandidate, FusionCandidateCompare>;461using FusionCandidateCollection = SmallVector<FusionCandidateSet, 4>;462463#if !defined(NDEBUG)464static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,465const FusionCandidate &FC) {466if (FC.isValid())467OS << FC.Preheader->getName();468else469OS << "<Invalid>";470471return OS;472}473474static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,475const FusionCandidateSet &CandSet) {476for (const FusionCandidate &FC : CandSet)477OS << FC << '\n';478479return OS;480}481482static void483printFusionCandidates(const FusionCandidateCollection &FusionCandidates) {484dbgs() << "Fusion Candidates: \n";485for (const auto &CandidateSet : FusionCandidates) {486dbgs() << "*** Fusion Candidate Set ***\n";487dbgs() << CandidateSet;488dbgs() << "****************************\n";489}490}491#endif492493/// Collect all loops in function at the same nest level, starting at the494/// outermost level.495///496/// This data structure collects all loops at the same nest level for a497/// given function (specified by the LoopInfo object). It starts at the498/// outermost level.499struct LoopDepthTree {500using LoopsOnLevelTy = SmallVector<LoopVector, 4>;501using iterator = LoopsOnLevelTy::iterator;502using const_iterator = LoopsOnLevelTy::const_iterator;503504LoopDepthTree(LoopInfo &LI) : Depth(1) {505if (!LI.empty())506LoopsOnLevel.emplace_back(LoopVector(LI.rbegin(), LI.rend()));507}508509/// Test whether a given loop has been removed from the function, and thus is510/// no longer valid.511bool isRemovedLoop(const Loop *L) const { return RemovedLoops.count(L); }512513/// Record that a given loop has been removed from the function and is no514/// longer valid.515void removeLoop(const Loop *L) { RemovedLoops.insert(L); }516517/// Descend the tree to the next (inner) nesting level518void descend() {519LoopsOnLevelTy LoopsOnNextLevel;520521for (const LoopVector &LV : *this)522for (Loop *L : LV)523if (!isRemovedLoop(L) && L->begin() != L->end())524LoopsOnNextLevel.emplace_back(LoopVector(L->begin(), L->end()));525526LoopsOnLevel = LoopsOnNextLevel;527RemovedLoops.clear();528Depth++;529}530531bool empty() const { return size() == 0; }532size_t size() const { return LoopsOnLevel.size() - RemovedLoops.size(); }533unsigned getDepth() const { return Depth; }534535iterator begin() { return LoopsOnLevel.begin(); }536iterator end() { return LoopsOnLevel.end(); }537const_iterator begin() const { return LoopsOnLevel.begin(); }538const_iterator end() const { return LoopsOnLevel.end(); }539540private:541/// Set of loops that have been removed from the function and are no longer542/// valid.543SmallPtrSet<const Loop *, 8> RemovedLoops;544545/// Depth of the current level, starting at 1 (outermost loops).546unsigned Depth;547548/// Vector of loops at the current depth level that have the same parent loop549LoopsOnLevelTy LoopsOnLevel;550};551552#ifndef NDEBUG553static void printLoopVector(const LoopVector &LV) {554dbgs() << "****************************\n";555for (auto *L : LV)556printLoop(*L, dbgs());557dbgs() << "****************************\n";558}559#endif560561struct LoopFuser {562private:563// Sets of control flow equivalent fusion candidates for a given nest level.564FusionCandidateCollection FusionCandidates;565566LoopDepthTree LDT;567DomTreeUpdater DTU;568569LoopInfo &LI;570DominatorTree &DT;571DependenceInfo &DI;572ScalarEvolution &SE;573PostDominatorTree &PDT;574OptimizationRemarkEmitter &ORE;575AssumptionCache &AC;576const TargetTransformInfo &TTI;577578public:579LoopFuser(LoopInfo &LI, DominatorTree &DT, DependenceInfo &DI,580ScalarEvolution &SE, PostDominatorTree &PDT,581OptimizationRemarkEmitter &ORE, const DataLayout &DL,582AssumptionCache &AC, const TargetTransformInfo &TTI)583: LDT(LI), DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy), LI(LI),584DT(DT), DI(DI), SE(SE), PDT(PDT), ORE(ORE), AC(AC), TTI(TTI) {}585586/// This is the main entry point for loop fusion. It will traverse the587/// specified function and collect candidate loops to fuse, starting at the588/// outermost nesting level and working inwards.589bool fuseLoops(Function &F) {590#ifndef NDEBUG591if (VerboseFusionDebugging) {592LI.print(dbgs());593}594#endif595596LLVM_DEBUG(dbgs() << "Performing Loop Fusion on function " << F.getName()597<< "\n");598bool Changed = false;599600while (!LDT.empty()) {601LLVM_DEBUG(dbgs() << "Got " << LDT.size() << " loop sets for depth "602<< LDT.getDepth() << "\n";);603604for (const LoopVector &LV : LDT) {605assert(LV.size() > 0 && "Empty loop set was build!");606607// Skip singleton loop sets as they do not offer fusion opportunities on608// this level.609if (LV.size() == 1)610continue;611#ifndef NDEBUG612if (VerboseFusionDebugging) {613LLVM_DEBUG({614dbgs() << " Visit loop set (#" << LV.size() << "):\n";615printLoopVector(LV);616});617}618#endif619620collectFusionCandidates(LV);621Changed |= fuseCandidates();622}623624// Finished analyzing candidates at this level.625// Descend to the next level and clear all of the candidates currently626// collected. Note that it will not be possible to fuse any of the627// existing candidates with new candidates because the new candidates will628// be at a different nest level and thus not be control flow equivalent629// with all of the candidates collected so far.630LLVM_DEBUG(dbgs() << "Descend one level!\n");631LDT.descend();632FusionCandidates.clear();633}634635if (Changed)636LLVM_DEBUG(dbgs() << "Function after Loop Fusion: \n"; F.dump(););637638#ifndef NDEBUG639assert(DT.verify());640assert(PDT.verify());641LI.verify(DT);642SE.verify();643#endif644645LLVM_DEBUG(dbgs() << "Loop Fusion complete\n");646return Changed;647}648649private:650/// Determine if two fusion candidates are control flow equivalent.651///652/// Two fusion candidates are control flow equivalent if when one executes,653/// the other is guaranteed to execute. This is determined using dominators654/// and post-dominators: if A dominates B and B post-dominates A then A and B655/// are control-flow equivalent.656bool isControlFlowEquivalent(const FusionCandidate &FC0,657const FusionCandidate &FC1) const {658assert(FC0.Preheader && FC1.Preheader && "Expecting valid preheaders");659660return ::isControlFlowEquivalent(*FC0.getEntryBlock(), *FC1.getEntryBlock(),661DT, PDT);662}663664/// Iterate over all loops in the given loop set and identify the loops that665/// are eligible for fusion. Place all eligible fusion candidates into Control666/// Flow Equivalent sets, sorted by dominance.667void collectFusionCandidates(const LoopVector &LV) {668for (Loop *L : LV) {669TTI::PeelingPreferences PP =670gatherPeelingPreferences(L, SE, TTI, std::nullopt, std::nullopt);671FusionCandidate CurrCand(L, DT, &PDT, ORE, PP);672if (!CurrCand.isEligibleForFusion(SE))673continue;674675// Go through each list in FusionCandidates and determine if L is control676// flow equivalent with the first loop in that list. If it is, append LV.677// If not, go to the next list.678// If no suitable list is found, start another list and add it to679// FusionCandidates.680bool FoundSet = false;681682for (auto &CurrCandSet : FusionCandidates) {683if (isControlFlowEquivalent(*CurrCandSet.begin(), CurrCand)) {684CurrCandSet.insert(CurrCand);685FoundSet = true;686#ifndef NDEBUG687if (VerboseFusionDebugging)688LLVM_DEBUG(dbgs() << "Adding " << CurrCand689<< " to existing candidate set\n");690#endif691break;692}693}694if (!FoundSet) {695// No set was found. Create a new set and add to FusionCandidates696#ifndef NDEBUG697if (VerboseFusionDebugging)698LLVM_DEBUG(dbgs() << "Adding " << CurrCand << " to new set\n");699#endif700FusionCandidateSet NewCandSet;701NewCandSet.insert(CurrCand);702FusionCandidates.push_back(NewCandSet);703}704NumFusionCandidates++;705}706}707708/// Determine if it is beneficial to fuse two loops.709///710/// For now, this method simply returns true because we want to fuse as much711/// as possible (primarily to test the pass). This method will evolve, over712/// time, to add heuristics for profitability of fusion.713bool isBeneficialFusion(const FusionCandidate &FC0,714const FusionCandidate &FC1) {715return true;716}717718/// Determine if two fusion candidates have the same trip count (i.e., they719/// execute the same number of iterations).720///721/// This function will return a pair of values. The first is a boolean,722/// stating whether or not the two candidates are known at compile time to723/// have the same TripCount. The second is the difference in the two724/// TripCounts. This information can be used later to determine whether or not725/// peeling can be performed on either one of the candidates.726std::pair<bool, std::optional<unsigned>>727haveIdenticalTripCounts(const FusionCandidate &FC0,728const FusionCandidate &FC1) const {729const SCEV *TripCount0 = SE.getBackedgeTakenCount(FC0.L);730if (isa<SCEVCouldNotCompute>(TripCount0)) {731UncomputableTripCount++;732LLVM_DEBUG(dbgs() << "Trip count of first loop could not be computed!");733return {false, std::nullopt};734}735736const SCEV *TripCount1 = SE.getBackedgeTakenCount(FC1.L);737if (isa<SCEVCouldNotCompute>(TripCount1)) {738UncomputableTripCount++;739LLVM_DEBUG(dbgs() << "Trip count of second loop could not be computed!");740return {false, std::nullopt};741}742743LLVM_DEBUG(dbgs() << "\tTrip counts: " << *TripCount0 << " & "744<< *TripCount1 << " are "745<< (TripCount0 == TripCount1 ? "identical" : "different")746<< "\n");747748if (TripCount0 == TripCount1)749return {true, 0};750751LLVM_DEBUG(dbgs() << "The loops do not have the same tripcount, "752"determining the difference between trip counts\n");753754// Currently only considering loops with a single exit point755// and a non-constant trip count.756const unsigned TC0 = SE.getSmallConstantTripCount(FC0.L);757const unsigned TC1 = SE.getSmallConstantTripCount(FC1.L);758759// If any of the tripcounts are zero that means that loop(s) do not have760// a single exit or a constant tripcount.761if (TC0 == 0 || TC1 == 0) {762LLVM_DEBUG(dbgs() << "Loop(s) do not have a single exit point or do not "763"have a constant number of iterations. Peeling "764"is not benefical\n");765return {false, std::nullopt};766}767768std::optional<unsigned> Difference;769int Diff = TC0 - TC1;770771if (Diff > 0)772Difference = Diff;773else {774LLVM_DEBUG(775dbgs() << "Difference is less than 0. FC1 (second loop) has more "776"iterations than the first one. Currently not supported\n");777}778779LLVM_DEBUG(dbgs() << "Difference in loop trip count is: " << Difference780<< "\n");781782return {false, Difference};783}784785void peelFusionCandidate(FusionCandidate &FC0, const FusionCandidate &FC1,786unsigned PeelCount) {787assert(FC0.AbleToPeel && "Should be able to peel loop");788789LLVM_DEBUG(dbgs() << "Attempting to peel first " << PeelCount790<< " iterations of the first loop. \n");791792ValueToValueMapTy VMap;793FC0.Peeled = peelLoop(FC0.L, PeelCount, &LI, &SE, DT, &AC, true, VMap);794if (FC0.Peeled) {795LLVM_DEBUG(dbgs() << "Done Peeling\n");796797#ifndef NDEBUG798auto IdenticalTripCount = haveIdenticalTripCounts(FC0, FC1);799800assert(IdenticalTripCount.first && *IdenticalTripCount.second == 0 &&801"Loops should have identical trip counts after peeling");802#endif803804FC0.PP.PeelCount += PeelCount;805806// Peeling does not update the PDT807PDT.recalculate(*FC0.Preheader->getParent());808809FC0.updateAfterPeeling();810811// In this case the iterations of the loop are constant, so the first812// loop will execute completely (will not jump from one of813// the peeled blocks to the second loop). Here we are updating the814// branch conditions of each of the peeled blocks, such that it will815// branch to its successor which is not the preheader of the second loop816// in the case of unguarded loops, or the succesors of the exit block of817// the first loop otherwise. Doing this update will ensure that the entry818// block of the first loop dominates the entry block of the second loop.819BasicBlock *BB =820FC0.GuardBranch ? FC0.ExitBlock->getUniqueSuccessor() : FC1.Preheader;821if (BB) {822SmallVector<DominatorTree::UpdateType, 8> TreeUpdates;823SmallVector<Instruction *, 8> WorkList;824for (BasicBlock *Pred : predecessors(BB)) {825if (Pred != FC0.ExitBlock) {826WorkList.emplace_back(Pred->getTerminator());827TreeUpdates.emplace_back(828DominatorTree::UpdateType(DominatorTree::Delete, Pred, BB));829}830}831// Cannot modify the predecessors inside the above loop as it will cause832// the iterators to be nullptrs, causing memory errors.833for (Instruction *CurrentBranch : WorkList) {834BasicBlock *Succ = CurrentBranch->getSuccessor(0);835if (Succ == BB)836Succ = CurrentBranch->getSuccessor(1);837ReplaceInstWithInst(CurrentBranch, BranchInst::Create(Succ));838}839840DTU.applyUpdates(TreeUpdates);841DTU.flush();842}843LLVM_DEBUG(844dbgs() << "Sucessfully peeled " << FC0.PP.PeelCount845<< " iterations from the first loop.\n"846"Both Loops have the same number of iterations now.\n");847}848}849850/// Walk each set of control flow equivalent fusion candidates and attempt to851/// fuse them. This does a single linear traversal of all candidates in the852/// set. The conditions for legal fusion are checked at this point. If a pair853/// of fusion candidates passes all legality checks, they are fused together854/// and a new fusion candidate is created and added to the FusionCandidateSet.855/// The original fusion candidates are then removed, as they are no longer856/// valid.857bool fuseCandidates() {858bool Fused = false;859LLVM_DEBUG(printFusionCandidates(FusionCandidates));860for (auto &CandidateSet : FusionCandidates) {861if (CandidateSet.size() < 2)862continue;863864LLVM_DEBUG(dbgs() << "Attempting fusion on Candidate Set:\n"865<< CandidateSet << "\n");866867for (auto FC0 = CandidateSet.begin(); FC0 != CandidateSet.end(); ++FC0) {868assert(!LDT.isRemovedLoop(FC0->L) &&869"Should not have removed loops in CandidateSet!");870auto FC1 = FC0;871for (++FC1; FC1 != CandidateSet.end(); ++FC1) {872assert(!LDT.isRemovedLoop(FC1->L) &&873"Should not have removed loops in CandidateSet!");874875LLVM_DEBUG(dbgs() << "Attempting to fuse candidate \n"; FC0->dump();876dbgs() << " with\n"; FC1->dump(); dbgs() << "\n");877878FC0->verify();879FC1->verify();880881// Check if the candidates have identical tripcounts (first value of882// pair), and if not check the difference in the tripcounts between883// the loops (second value of pair). The difference is not equal to884// std::nullopt iff the loops iterate a constant number of times, and885// have a single exit.886std::pair<bool, std::optional<unsigned>> IdenticalTripCountRes =887haveIdenticalTripCounts(*FC0, *FC1);888bool SameTripCount = IdenticalTripCountRes.first;889std::optional<unsigned> TCDifference = IdenticalTripCountRes.second;890891// Here we are checking that FC0 (the first loop) can be peeled, and892// both loops have different tripcounts.893if (FC0->AbleToPeel && !SameTripCount && TCDifference) {894if (*TCDifference > FusionPeelMaxCount) {895LLVM_DEBUG(dbgs()896<< "Difference in loop trip counts: " << *TCDifference897<< " is greater than maximum peel count specificed: "898<< FusionPeelMaxCount << "\n");899} else {900// Dependent on peeling being performed on the first loop, and901// assuming all other conditions for fusion return true.902SameTripCount = true;903}904}905906if (!SameTripCount) {907LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip "908"counts. Not fusing.\n");909reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,910NonEqualTripCount);911continue;912}913914if (!isAdjacent(*FC0, *FC1)) {915LLVM_DEBUG(dbgs()916<< "Fusion candidates are not adjacent. Not fusing.\n");917reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, NonAdjacent);918continue;919}920921if ((!FC0->GuardBranch && FC1->GuardBranch) ||922(FC0->GuardBranch && !FC1->GuardBranch)) {923LLVM_DEBUG(dbgs() << "The one of candidate is guarded while the "924"another one is not. Not fusing.\n");925reportLoopFusion<OptimizationRemarkMissed>(926*FC0, *FC1, OnlySecondCandidateIsGuarded);927continue;928}929930// Ensure that FC0 and FC1 have identical guards.931// If one (or both) are not guarded, this check is not necessary.932if (FC0->GuardBranch && FC1->GuardBranch &&933!haveIdenticalGuards(*FC0, *FC1) && !TCDifference) {934LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical "935"guards. Not Fusing.\n");936reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,937NonIdenticalGuards);938continue;939}940941if (FC0->GuardBranch) {942assert(FC1->GuardBranch && "Expecting valid FC1 guard branch");943944if (!isSafeToMoveBefore(*FC0->ExitBlock,945*FC1->ExitBlock->getFirstNonPHIOrDbg(), DT,946&PDT, &DI)) {947LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe "948"instructions in exit block. Not fusing.\n");949reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,950NonEmptyExitBlock);951continue;952}953954if (!isSafeToMoveBefore(955*FC1->GuardBranch->getParent(),956*FC0->GuardBranch->getParent()->getTerminator(), DT, &PDT,957&DI)) {958LLVM_DEBUG(dbgs()959<< "Fusion candidate contains unsafe "960"instructions in guard block. Not fusing.\n");961reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,962NonEmptyGuardBlock);963continue;964}965}966967// Check the dependencies across the loops and do not fuse if it would968// violate them.969if (!dependencesAllowFusion(*FC0, *FC1)) {970LLVM_DEBUG(dbgs() << "Memory dependencies do not allow fusion!\n");971reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,972InvalidDependencies);973continue;974}975976// If the second loop has instructions in the pre-header, attempt to977// hoist them up to the first loop's pre-header or sink them into the978// body of the second loop.979SmallVector<Instruction *, 4> SafeToHoist;980SmallVector<Instruction *, 4> SafeToSink;981// At this point, this is the last remaining legality check.982// Which means if we can make this pre-header empty, we can fuse983// these loops984if (!isEmptyPreheader(*FC1)) {985LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty "986"preheader.\n");987988// If it is not safe to hoist/sink all instructions in the989// pre-header, we cannot fuse these loops.990if (!collectMovablePreheaderInsts(*FC0, *FC1, SafeToHoist,991SafeToSink)) {992LLVM_DEBUG(dbgs() << "Could not hoist/sink all instructions in "993"Fusion Candidate Pre-header.\n"994<< "Not Fusing.\n");995reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,996NonEmptyPreheader);997continue;998}999}10001001bool BeneficialToFuse = isBeneficialFusion(*FC0, *FC1);1002LLVM_DEBUG(dbgs()1003<< "\tFusion appears to be "1004<< (BeneficialToFuse ? "" : "un") << "profitable!\n");1005if (!BeneficialToFuse) {1006reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,1007FusionNotBeneficial);1008continue;1009}1010// All analysis has completed and has determined that fusion is legal1011// and profitable. At this point, start transforming the code and1012// perform fusion.10131014// Execute the hoist/sink operations on preheader instructions1015movePreheaderInsts(*FC0, *FC1, SafeToHoist, SafeToSink);10161017LLVM_DEBUG(dbgs() << "\tFusion is performed: " << *FC0 << " and "1018<< *FC1 << "\n");10191020FusionCandidate FC0Copy = *FC0;1021// Peel the loop after determining that fusion is legal. The Loops1022// will still be safe to fuse after the peeling is performed.1023bool Peel = TCDifference && *TCDifference > 0;1024if (Peel)1025peelFusionCandidate(FC0Copy, *FC1, *TCDifference);10261027// Report fusion to the Optimization Remarks.1028// Note this needs to be done *before* performFusion because1029// performFusion will change the original loops, making it not1030// possible to identify them after fusion is complete.1031reportLoopFusion<OptimizationRemark>((Peel ? FC0Copy : *FC0), *FC1,1032FuseCounter);10331034FusionCandidate FusedCand(1035performFusion((Peel ? FC0Copy : *FC0), *FC1), DT, &PDT, ORE,1036FC0Copy.PP);1037FusedCand.verify();1038assert(FusedCand.isEligibleForFusion(SE) &&1039"Fused candidate should be eligible for fusion!");10401041// Notify the loop-depth-tree that these loops are not valid objects1042LDT.removeLoop(FC1->L);10431044CandidateSet.erase(FC0);1045CandidateSet.erase(FC1);10461047auto InsertPos = CandidateSet.insert(FusedCand);10481049assert(InsertPos.second &&1050"Unable to insert TargetCandidate in CandidateSet!");10511052// Reset FC0 and FC1 the new (fused) candidate. Subsequent iterations1053// of the FC1 loop will attempt to fuse the new (fused) loop with the1054// remaining candidates in the current candidate set.1055FC0 = FC1 = InsertPos.first;10561057LLVM_DEBUG(dbgs() << "Candidate Set (after fusion): " << CandidateSet1058<< "\n");10591060Fused = true;1061}1062}1063}1064return Fused;1065}10661067// Returns true if the instruction \p I can be hoisted to the end of the1068// preheader of \p FC0. \p SafeToHoist contains the instructions that are1069// known to be safe to hoist. The instructions encountered that cannot be1070// hoisted are in \p NotHoisting.1071// TODO: Move functionality into CodeMoverUtils1072bool canHoistInst(Instruction &I,1073const SmallVector<Instruction *, 4> &SafeToHoist,1074const SmallVector<Instruction *, 4> &NotHoisting,1075const FusionCandidate &FC0) const {1076const BasicBlock *FC0PreheaderTarget = FC0.Preheader->getSingleSuccessor();1077assert(FC0PreheaderTarget &&1078"Expected single successor for loop preheader.");10791080for (Use &Op : I.operands()) {1081if (auto *OpInst = dyn_cast<Instruction>(Op)) {1082bool OpHoisted = is_contained(SafeToHoist, OpInst);1083// Check if we have already decided to hoist this operand. In this1084// case, it does not dominate FC0 *yet*, but will after we hoist it.1085if (!(OpHoisted || DT.dominates(OpInst, FC0PreheaderTarget))) {1086return false;1087}1088}1089}10901091// PHIs in FC1's header only have FC0 blocks as predecessors. PHIs1092// cannot be hoisted and should be sunk to the exit of the fused loop.1093if (isa<PHINode>(I))1094return false;10951096// If this isn't a memory inst, hoisting is safe1097if (!I.mayReadOrWriteMemory())1098return true;10991100LLVM_DEBUG(dbgs() << "Checking if this mem inst can be hoisted.\n");1101for (Instruction *NotHoistedInst : NotHoisting) {1102if (auto D = DI.depends(&I, NotHoistedInst, true)) {1103// Dependency is not read-before-write, write-before-read or1104// write-before-write1105if (D->isFlow() || D->isAnti() || D->isOutput()) {1106LLVM_DEBUG(dbgs() << "Inst depends on an instruction in FC1's "1107"preheader that is not being hoisted.\n");1108return false;1109}1110}1111}11121113for (Instruction *ReadInst : FC0.MemReads) {1114if (auto D = DI.depends(ReadInst, &I, true)) {1115// Dependency is not read-before-write1116if (D->isAnti()) {1117LLVM_DEBUG(dbgs() << "Inst depends on a read instruction in FC0.\n");1118return false;1119}1120}1121}11221123for (Instruction *WriteInst : FC0.MemWrites) {1124if (auto D = DI.depends(WriteInst, &I, true)) {1125// Dependency is not write-before-read or write-before-write1126if (D->isFlow() || D->isOutput()) {1127LLVM_DEBUG(dbgs() << "Inst depends on a write instruction in FC0.\n");1128return false;1129}1130}1131}1132return true;1133}11341135// Returns true if the instruction \p I can be sunk to the top of the exit1136// block of \p FC1.1137// TODO: Move functionality into CodeMoverUtils1138bool canSinkInst(Instruction &I, const FusionCandidate &FC1) const {1139for (User *U : I.users()) {1140if (auto *UI{dyn_cast<Instruction>(U)}) {1141// Cannot sink if user in loop1142// If FC1 has phi users of this value, we cannot sink it into FC1.1143if (FC1.L->contains(UI)) {1144// Cannot hoist or sink this instruction. No hoisting/sinking1145// should take place, loops should not fuse1146return false;1147}1148}1149}11501151// If this isn't a memory inst, sinking is safe1152if (!I.mayReadOrWriteMemory())1153return true;11541155for (Instruction *ReadInst : FC1.MemReads) {1156if (auto D = DI.depends(&I, ReadInst, true)) {1157// Dependency is not write-before-read1158if (D->isFlow()) {1159LLVM_DEBUG(dbgs() << "Inst depends on a read instruction in FC1.\n");1160return false;1161}1162}1163}11641165for (Instruction *WriteInst : FC1.MemWrites) {1166if (auto D = DI.depends(&I, WriteInst, true)) {1167// Dependency is not write-before-write or read-before-write1168if (D->isOutput() || D->isAnti()) {1169LLVM_DEBUG(dbgs() << "Inst depends on a write instruction in FC1.\n");1170return false;1171}1172}1173}11741175return true;1176}11771178/// Collect instructions in the \p FC1 Preheader that can be hoisted1179/// to the \p FC0 Preheader or sunk into the \p FC1 Body1180bool collectMovablePreheaderInsts(1181const FusionCandidate &FC0, const FusionCandidate &FC1,1182SmallVector<Instruction *, 4> &SafeToHoist,1183SmallVector<Instruction *, 4> &SafeToSink) const {1184BasicBlock *FC1Preheader = FC1.Preheader;1185// Save the instructions that are not being hoisted, so we know not to hoist1186// mem insts that they dominate.1187SmallVector<Instruction *, 4> NotHoisting;11881189for (Instruction &I : *FC1Preheader) {1190// Can't move a branch1191if (&I == FC1Preheader->getTerminator())1192continue;1193// If the instruction has side-effects, give up.1194// TODO: The case of mayReadFromMemory we can handle but requires1195// additional work with a dependence analysis so for now we give1196// up on memory reads.1197if (I.mayThrow() || !I.willReturn()) {1198LLVM_DEBUG(dbgs() << "Inst: " << I << " may throw or won't return.\n");1199return false;1200}12011202LLVM_DEBUG(dbgs() << "Checking Inst: " << I << "\n");12031204if (I.isAtomic() || I.isVolatile()) {1205LLVM_DEBUG(1206dbgs() << "\tInstruction is volatile or atomic. Cannot move it.\n");1207return false;1208}12091210if (canHoistInst(I, SafeToHoist, NotHoisting, FC0)) {1211SafeToHoist.push_back(&I);1212LLVM_DEBUG(dbgs() << "\tSafe to hoist.\n");1213} else {1214LLVM_DEBUG(dbgs() << "\tCould not hoist. Trying to sink...\n");1215NotHoisting.push_back(&I);12161217if (canSinkInst(I, FC1)) {1218SafeToSink.push_back(&I);1219LLVM_DEBUG(dbgs() << "\tSafe to sink.\n");1220} else {1221LLVM_DEBUG(dbgs() << "\tCould not sink.\n");1222return false;1223}1224}1225}1226LLVM_DEBUG(1227dbgs() << "All preheader instructions could be sunk or hoisted!\n");1228return true;1229}12301231/// Rewrite all additive recurrences in a SCEV to use a new loop.1232class AddRecLoopReplacer : public SCEVRewriteVisitor<AddRecLoopReplacer> {1233public:1234AddRecLoopReplacer(ScalarEvolution &SE, const Loop &OldL, const Loop &NewL,1235bool UseMax = true)1236: SCEVRewriteVisitor(SE), Valid(true), UseMax(UseMax), OldL(OldL),1237NewL(NewL) {}12381239const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {1240const Loop *ExprL = Expr->getLoop();1241SmallVector<const SCEV *, 2> Operands;1242if (ExprL == &OldL) {1243append_range(Operands, Expr->operands());1244return SE.getAddRecExpr(Operands, &NewL, Expr->getNoWrapFlags());1245}12461247if (OldL.contains(ExprL)) {1248bool Pos = SE.isKnownPositive(Expr->getStepRecurrence(SE));1249if (!UseMax || !Pos || !Expr->isAffine()) {1250Valid = false;1251return Expr;1252}1253return visit(Expr->getStart());1254}12551256for (const SCEV *Op : Expr->operands())1257Operands.push_back(visit(Op));1258return SE.getAddRecExpr(Operands, ExprL, Expr->getNoWrapFlags());1259}12601261bool wasValidSCEV() const { return Valid; }12621263private:1264bool Valid, UseMax;1265const Loop &OldL, &NewL;1266};12671268/// Return false if the access functions of \p I0 and \p I1 could cause1269/// a negative dependence.1270bool accessDiffIsPositive(const Loop &L0, const Loop &L1, Instruction &I0,1271Instruction &I1, bool EqualIsInvalid) {1272Value *Ptr0 = getLoadStorePointerOperand(&I0);1273Value *Ptr1 = getLoadStorePointerOperand(&I1);1274if (!Ptr0 || !Ptr1)1275return false;12761277const SCEV *SCEVPtr0 = SE.getSCEVAtScope(Ptr0, &L0);1278const SCEV *SCEVPtr1 = SE.getSCEVAtScope(Ptr1, &L1);1279#ifndef NDEBUG1280if (VerboseFusionDebugging)1281LLVM_DEBUG(dbgs() << " Access function check: " << *SCEVPtr0 << " vs "1282<< *SCEVPtr1 << "\n");1283#endif1284AddRecLoopReplacer Rewriter(SE, L0, L1);1285SCEVPtr0 = Rewriter.visit(SCEVPtr0);1286#ifndef NDEBUG1287if (VerboseFusionDebugging)1288LLVM_DEBUG(dbgs() << " Access function after rewrite: " << *SCEVPtr01289<< " [Valid: " << Rewriter.wasValidSCEV() << "]\n");1290#endif1291if (!Rewriter.wasValidSCEV())1292return false;12931294// TODO: isKnownPredicate doesnt work well when one SCEV is loop carried (by1295// L0) and the other is not. We could check if it is monotone and test1296// the beginning and end value instead.12971298BasicBlock *L0Header = L0.getHeader();1299auto HasNonLinearDominanceRelation = [&](const SCEV *S) {1300const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(S);1301if (!AddRec)1302return false;1303return !DT.dominates(L0Header, AddRec->getLoop()->getHeader()) &&1304!DT.dominates(AddRec->getLoop()->getHeader(), L0Header);1305};1306if (SCEVExprContains(SCEVPtr1, HasNonLinearDominanceRelation))1307return false;13081309ICmpInst::Predicate Pred =1310EqualIsInvalid ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_SGE;1311bool IsAlwaysGE = SE.isKnownPredicate(Pred, SCEVPtr0, SCEVPtr1);1312#ifndef NDEBUG1313if (VerboseFusionDebugging)1314LLVM_DEBUG(dbgs() << " Relation: " << *SCEVPtr01315<< (IsAlwaysGE ? " >= " : " may < ") << *SCEVPtr11316<< "\n");1317#endif1318return IsAlwaysGE;1319}13201321/// Return true if the dependences between @p I0 (in @p L0) and @p I1 (in1322/// @p L1) allow loop fusion of @p L0 and @p L1. The dependence analyses1323/// specified by @p DepChoice are used to determine this.1324bool dependencesAllowFusion(const FusionCandidate &FC0,1325const FusionCandidate &FC1, Instruction &I0,1326Instruction &I1, bool AnyDep,1327FusionDependenceAnalysisChoice DepChoice) {1328#ifndef NDEBUG1329if (VerboseFusionDebugging) {1330LLVM_DEBUG(dbgs() << "Check dep: " << I0 << " vs " << I1 << " : "1331<< DepChoice << "\n");1332}1333#endif1334switch (DepChoice) {1335case FUSION_DEPENDENCE_ANALYSIS_SCEV:1336return accessDiffIsPositive(*FC0.L, *FC1.L, I0, I1, AnyDep);1337case FUSION_DEPENDENCE_ANALYSIS_DA: {1338auto DepResult = DI.depends(&I0, &I1, true);1339if (!DepResult)1340return true;1341#ifndef NDEBUG1342if (VerboseFusionDebugging) {1343LLVM_DEBUG(dbgs() << "DA res: "; DepResult->dump(dbgs());1344dbgs() << " [#l: " << DepResult->getLevels() << "][Ordered: "1345<< (DepResult->isOrdered() ? "true" : "false")1346<< "]\n");1347LLVM_DEBUG(dbgs() << "DepResult Levels: " << DepResult->getLevels()1348<< "\n");1349}1350#endif13511352if (DepResult->getNextPredecessor() || DepResult->getNextSuccessor())1353LLVM_DEBUG(1354dbgs() << "TODO: Implement pred/succ dependence handling!\n");13551356// TODO: Can we actually use the dependence info analysis here?1357return false;1358}13591360case FUSION_DEPENDENCE_ANALYSIS_ALL:1361return dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep,1362FUSION_DEPENDENCE_ANALYSIS_SCEV) ||1363dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep,1364FUSION_DEPENDENCE_ANALYSIS_DA);1365}13661367llvm_unreachable("Unknown fusion dependence analysis choice!");1368}13691370/// Perform a dependence check and return if @p FC0 and @p FC1 can be fused.1371bool dependencesAllowFusion(const FusionCandidate &FC0,1372const FusionCandidate &FC1) {1373LLVM_DEBUG(dbgs() << "Check if " << FC0 << " can be fused with " << FC11374<< "\n");1375assert(FC0.L->getLoopDepth() == FC1.L->getLoopDepth());1376assert(DT.dominates(FC0.getEntryBlock(), FC1.getEntryBlock()));13771378for (Instruction *WriteL0 : FC0.MemWrites) {1379for (Instruction *WriteL1 : FC1.MemWrites)1380if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1,1381/* AnyDep */ false,1382FusionDependenceAnalysis)) {1383InvalidDependencies++;1384return false;1385}1386for (Instruction *ReadL1 : FC1.MemReads)1387if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *ReadL1,1388/* AnyDep */ false,1389FusionDependenceAnalysis)) {1390InvalidDependencies++;1391return false;1392}1393}13941395for (Instruction *WriteL1 : FC1.MemWrites) {1396for (Instruction *WriteL0 : FC0.MemWrites)1397if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1,1398/* AnyDep */ false,1399FusionDependenceAnalysis)) {1400InvalidDependencies++;1401return false;1402}1403for (Instruction *ReadL0 : FC0.MemReads)1404if (!dependencesAllowFusion(FC0, FC1, *ReadL0, *WriteL1,1405/* AnyDep */ false,1406FusionDependenceAnalysis)) {1407InvalidDependencies++;1408return false;1409}1410}14111412// Walk through all uses in FC1. For each use, find the reaching def. If the1413// def is located in FC0 then it is not safe to fuse.1414for (BasicBlock *BB : FC1.L->blocks())1415for (Instruction &I : *BB)1416for (auto &Op : I.operands())1417if (Instruction *Def = dyn_cast<Instruction>(Op))1418if (FC0.L->contains(Def->getParent())) {1419InvalidDependencies++;1420return false;1421}14221423return true;1424}14251426/// Determine if two fusion candidates are adjacent in the CFG.1427///1428/// This method will determine if there are additional basic blocks in the CFG1429/// between the exit of \p FC0 and the entry of \p FC1.1430/// If the two candidates are guarded loops, then it checks whether the1431/// non-loop successor of the \p FC0 guard branch is the entry block of \p1432/// FC1. If not, then the loops are not adjacent. If the two candidates are1433/// not guarded loops, then it checks whether the exit block of \p FC0 is the1434/// preheader of \p FC1.1435bool isAdjacent(const FusionCandidate &FC0,1436const FusionCandidate &FC1) const {1437// If the successor of the guard branch is FC1, then the loops are adjacent1438if (FC0.GuardBranch)1439return FC0.getNonLoopBlock() == FC1.getEntryBlock();1440else1441return FC0.ExitBlock == FC1.getEntryBlock();1442}14431444bool isEmptyPreheader(const FusionCandidate &FC) const {1445return FC.Preheader->size() == 1;1446}14471448/// Hoist \p FC1 Preheader instructions to \p FC0 Preheader1449/// and sink others into the body of \p FC1.1450void movePreheaderInsts(const FusionCandidate &FC0,1451const FusionCandidate &FC1,1452SmallVector<Instruction *, 4> &HoistInsts,1453SmallVector<Instruction *, 4> &SinkInsts) const {1454// All preheader instructions except the branch must be hoisted or sunk1455assert(HoistInsts.size() + SinkInsts.size() == FC1.Preheader->size() - 1 &&1456"Attempting to sink and hoist preheader instructions, but not all "1457"the preheader instructions are accounted for.");14581459NumHoistedInsts += HoistInsts.size();1460NumSunkInsts += SinkInsts.size();14611462LLVM_DEBUG(if (VerboseFusionDebugging) {1463if (!HoistInsts.empty())1464dbgs() << "Hoisting: \n";1465for (Instruction *I : HoistInsts)1466dbgs() << *I << "\n";1467if (!SinkInsts.empty())1468dbgs() << "Sinking: \n";1469for (Instruction *I : SinkInsts)1470dbgs() << *I << "\n";1471});14721473for (Instruction *I : HoistInsts) {1474assert(I->getParent() == FC1.Preheader);1475I->moveBefore(*FC0.Preheader,1476FC0.Preheader->getTerminator()->getIterator());1477}1478// insert instructions in reverse order to maintain dominance relationship1479for (Instruction *I : reverse(SinkInsts)) {1480assert(I->getParent() == FC1.Preheader);1481I->moveBefore(*FC1.ExitBlock, FC1.ExitBlock->getFirstInsertionPt());1482}1483}14841485/// Determine if two fusion candidates have identical guards1486///1487/// This method will determine if two fusion candidates have the same guards.1488/// The guards are considered the same if:1489/// 1. The instructions to compute the condition used in the compare are1490/// identical.1491/// 2. The successors of the guard have the same flow into/around the loop.1492/// If the compare instructions are identical, then the first successor of the1493/// guard must go to the same place (either the preheader of the loop or the1494/// NonLoopBlock). In other words, the first successor of both loops must1495/// both go into the loop (i.e., the preheader) or go around the loop (i.e.,1496/// the NonLoopBlock). The same must be true for the second successor.1497bool haveIdenticalGuards(const FusionCandidate &FC0,1498const FusionCandidate &FC1) const {1499assert(FC0.GuardBranch && FC1.GuardBranch &&1500"Expecting FC0 and FC1 to be guarded loops.");15011502if (auto FC0CmpInst =1503dyn_cast<Instruction>(FC0.GuardBranch->getCondition()))1504if (auto FC1CmpInst =1505dyn_cast<Instruction>(FC1.GuardBranch->getCondition()))1506if (!FC0CmpInst->isIdenticalTo(FC1CmpInst))1507return false;15081509// The compare instructions are identical.1510// Now make sure the successor of the guards have the same flow into/around1511// the loop1512if (FC0.GuardBranch->getSuccessor(0) == FC0.Preheader)1513return (FC1.GuardBranch->getSuccessor(0) == FC1.Preheader);1514else1515return (FC1.GuardBranch->getSuccessor(1) == FC1.Preheader);1516}15171518/// Modify the latch branch of FC to be unconditional since successors of the1519/// branch are the same.1520void simplifyLatchBranch(const FusionCandidate &FC) const {1521BranchInst *FCLatchBranch = dyn_cast<BranchInst>(FC.Latch->getTerminator());1522if (FCLatchBranch) {1523assert(FCLatchBranch->isConditional() &&1524FCLatchBranch->getSuccessor(0) == FCLatchBranch->getSuccessor(1) &&1525"Expecting the two successors of FCLatchBranch to be the same");1526BranchInst *NewBranch =1527BranchInst::Create(FCLatchBranch->getSuccessor(0));1528ReplaceInstWithInst(FCLatchBranch, NewBranch);1529}1530}15311532/// Move instructions from FC0.Latch to FC1.Latch. If FC0.Latch has an unique1533/// successor, then merge FC0.Latch with its unique successor.1534void mergeLatch(const FusionCandidate &FC0, const FusionCandidate &FC1) {1535moveInstructionsToTheBeginning(*FC0.Latch, *FC1.Latch, DT, PDT, DI);1536if (BasicBlock *Succ = FC0.Latch->getUniqueSuccessor()) {1537MergeBlockIntoPredecessor(Succ, &DTU, &LI);1538DTU.flush();1539}1540}15411542/// Fuse two fusion candidates, creating a new fused loop.1543///1544/// This method contains the mechanics of fusing two loops, represented by \p1545/// FC0 and \p FC1. It is assumed that \p FC0 dominates \p FC1 and \p FC11546/// postdominates \p FC0 (making them control flow equivalent). It also1547/// assumes that the other conditions for fusion have been met: adjacent,1548/// identical trip counts, and no negative distance dependencies exist that1549/// would prevent fusion. Thus, there is no checking for these conditions in1550/// this method.1551///1552/// Fusion is performed by rewiring the CFG to update successor blocks of the1553/// components of tho loop. Specifically, the following changes are done:1554///1555/// 1. The preheader of \p FC1 is removed as it is no longer necessary1556/// (because it is currently only a single statement block).1557/// 2. The latch of \p FC0 is modified to jump to the header of \p FC1.1558/// 3. The latch of \p FC1 i modified to jump to the header of \p FC0.1559/// 4. All blocks from \p FC1 are removed from FC1 and added to FC0.1560///1561/// All of these modifications are done with dominator tree updates, thus1562/// keeping the dominator (and post dominator) information up-to-date.1563///1564/// This can be improved in the future by actually merging blocks during1565/// fusion. For example, the preheader of \p FC1 can be merged with the1566/// preheader of \p FC0. This would allow loops with more than a single1567/// statement in the preheader to be fused. Similarly, the latch blocks of the1568/// two loops could also be fused into a single block. This will require1569/// analysis to prove it is safe to move the contents of the block past1570/// existing code, which currently has not been implemented.1571Loop *performFusion(const FusionCandidate &FC0, const FusionCandidate &FC1) {1572assert(FC0.isValid() && FC1.isValid() &&1573"Expecting valid fusion candidates");15741575LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n"; FC0.dump();1576dbgs() << "Fusion Candidate 1: \n"; FC1.dump(););15771578// Move instructions from the preheader of FC1 to the end of the preheader1579// of FC0.1580moveInstructionsToTheEnd(*FC1.Preheader, *FC0.Preheader, DT, PDT, DI);15811582// Fusing guarded loops is handled slightly differently than non-guarded1583// loops and has been broken out into a separate method instead of trying to1584// intersperse the logic within a single method.1585if (FC0.GuardBranch)1586return fuseGuardedLoops(FC0, FC1);15871588assert(FC1.Preheader ==1589(FC0.Peeled ? FC0.ExitBlock->getUniqueSuccessor() : FC0.ExitBlock));1590assert(FC1.Preheader->size() == 1 &&1591FC1.Preheader->getSingleSuccessor() == FC1.Header);15921593// Remember the phi nodes originally in the header of FC0 in order to rewire1594// them later. However, this is only necessary if the new loop carried1595// values might not dominate the exiting branch. While we do not generally1596// test if this is the case but simply insert intermediate phi nodes, we1597// need to make sure these intermediate phi nodes have different1598// predecessors. To this end, we filter the special case where the exiting1599// block is the latch block of the first loop. Nothing needs to be done1600// anyway as all loop carried values dominate the latch and thereby also the1601// exiting branch.1602SmallVector<PHINode *, 8> OriginalFC0PHIs;1603if (FC0.ExitingBlock != FC0.Latch)1604for (PHINode &PHI : FC0.Header->phis())1605OriginalFC0PHIs.push_back(&PHI);16061607// Replace incoming blocks for header PHIs first.1608FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader);1609FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch);16101611// Then modify the control flow and update DT and PDT.1612SmallVector<DominatorTree::UpdateType, 8> TreeUpdates;16131614// The old exiting block of the first loop (FC0) has to jump to the header1615// of the second as we need to execute the code in the second header block1616// regardless of the trip count. That is, if the trip count is 0, so the1617// back edge is never taken, we still have to execute both loop headers,1618// especially (but not only!) if the second is a do-while style loop.1619// However, doing so might invalidate the phi nodes of the first loop as1620// the new values do only need to dominate their latch and not the exiting1621// predicate. To remedy this potential problem we always introduce phi1622// nodes in the header of the second loop later that select the loop carried1623// value, if the second header was reached through an old latch of the1624// first, or undef otherwise. This is sound as exiting the first implies the1625// second will exit too, __without__ taking the back-edge. [Their1626// trip-counts are equal after all.1627// KB: Would this sequence be simpler to just make FC0.ExitingBlock go1628// to FC1.Header? I think this is basically what the three sequences are1629// trying to accomplish; however, doing this directly in the CFG may mean1630// the DT/PDT becomes invalid1631if (!FC0.Peeled) {1632FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC1.Preheader,1633FC1.Header);1634TreeUpdates.emplace_back(DominatorTree::UpdateType(1635DominatorTree::Delete, FC0.ExitingBlock, FC1.Preheader));1636TreeUpdates.emplace_back(DominatorTree::UpdateType(1637DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));1638} else {1639TreeUpdates.emplace_back(DominatorTree::UpdateType(1640DominatorTree::Delete, FC0.ExitBlock, FC1.Preheader));16411642// Remove the ExitBlock of the first Loop (also not needed)1643FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock,1644FC1.Header);1645TreeUpdates.emplace_back(DominatorTree::UpdateType(1646DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock));1647FC0.ExitBlock->getTerminator()->eraseFromParent();1648TreeUpdates.emplace_back(DominatorTree::UpdateType(1649DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));1650new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock);1651}16521653// The pre-header of L1 is not necessary anymore.1654assert(pred_empty(FC1.Preheader));1655FC1.Preheader->getTerminator()->eraseFromParent();1656new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader);1657TreeUpdates.emplace_back(DominatorTree::UpdateType(1658DominatorTree::Delete, FC1.Preheader, FC1.Header));16591660// Moves the phi nodes from the second to the first loops header block.1661while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) {1662if (SE.isSCEVable(PHI->getType()))1663SE.forgetValue(PHI);1664if (PHI->hasNUsesOrMore(1))1665PHI->moveBefore(&*FC0.Header->getFirstInsertionPt());1666else1667PHI->eraseFromParent();1668}16691670// Introduce new phi nodes in the second loop header to ensure1671// exiting the first and jumping to the header of the second does not break1672// the SSA property of the phis originally in the first loop. See also the1673// comment above.1674BasicBlock::iterator L1HeaderIP = FC1.Header->begin();1675for (PHINode *LCPHI : OriginalFC0PHIs) {1676int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch);1677assert(L1LatchBBIdx >= 0 &&1678"Expected loop carried value to be rewired at this point!");16791680Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx);16811682PHINode *L1HeaderPHI =1683PHINode::Create(LCV->getType(), 2, LCPHI->getName() + ".afterFC0");1684L1HeaderPHI->insertBefore(L1HeaderIP);1685L1HeaderPHI->addIncoming(LCV, FC0.Latch);1686L1HeaderPHI->addIncoming(PoisonValue::get(LCV->getType()),1687FC0.ExitingBlock);16881689LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI);1690}16911692// Replace latch terminator destinations.1693FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header);1694FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header);16951696// Modify the latch branch of FC0 to be unconditional as both successors of1697// the branch are the same.1698simplifyLatchBranch(FC0);16991700// If FC0.Latch and FC0.ExitingBlock are the same then we have already1701// performed the updates above.1702if (FC0.Latch != FC0.ExitingBlock)1703TreeUpdates.emplace_back(DominatorTree::UpdateType(1704DominatorTree::Insert, FC0.Latch, FC1.Header));17051706TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,1707FC0.Latch, FC0.Header));1708TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert,1709FC1.Latch, FC0.Header));1710TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,1711FC1.Latch, FC1.Header));17121713// Update DT/PDT1714DTU.applyUpdates(TreeUpdates);17151716LI.removeBlock(FC1.Preheader);1717DTU.deleteBB(FC1.Preheader);1718if (FC0.Peeled) {1719LI.removeBlock(FC0.ExitBlock);1720DTU.deleteBB(FC0.ExitBlock);1721}17221723DTU.flush();17241725// Is there a way to keep SE up-to-date so we don't need to forget the loops1726// and rebuild the information in subsequent passes of fusion?1727// Note: Need to forget the loops before merging the loop latches, as1728// mergeLatch may remove the only block in FC1.1729SE.forgetLoop(FC1.L);1730SE.forgetLoop(FC0.L);1731SE.forgetLoopDispositions();17321733// Move instructions from FC0.Latch to FC1.Latch.1734// Note: mergeLatch requires an updated DT.1735mergeLatch(FC0, FC1);17361737// Merge the loops.1738SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks());1739for (BasicBlock *BB : Blocks) {1740FC0.L->addBlockEntry(BB);1741FC1.L->removeBlockFromLoop(BB);1742if (LI.getLoopFor(BB) != FC1.L)1743continue;1744LI.changeLoopFor(BB, FC0.L);1745}1746while (!FC1.L->isInnermost()) {1747const auto &ChildLoopIt = FC1.L->begin();1748Loop *ChildLoop = *ChildLoopIt;1749FC1.L->removeChildLoop(ChildLoopIt);1750FC0.L->addChildLoop(ChildLoop);1751}17521753// Delete the now empty loop L1.1754LI.erase(FC1.L);17551756#ifndef NDEBUG1757assert(!verifyFunction(*FC0.Header->getParent(), &errs()));1758assert(DT.verify(DominatorTree::VerificationLevel::Fast));1759assert(PDT.verify());1760LI.verify(DT);1761SE.verify();1762#endif17631764LLVM_DEBUG(dbgs() << "Fusion done:\n");17651766return FC0.L;1767}17681769/// Report details on loop fusion opportunities.1770///1771/// This template function can be used to report both successful and missed1772/// loop fusion opportunities, based on the RemarkKind. The RemarkKind should1773/// be one of:1774/// - OptimizationRemarkMissed to report when loop fusion is unsuccessful1775/// given two valid fusion candidates.1776/// - OptimizationRemark to report successful fusion of two fusion1777/// candidates.1778/// The remarks will be printed using the form:1779/// <path/filename>:<line number>:<column number>: [<function name>]:1780/// <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description>1781template <typename RemarkKind>1782void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1,1783llvm::Statistic &Stat) {1784assert(FC0.Preheader && FC1.Preheader &&1785"Expecting valid fusion candidates");1786using namespace ore;1787#if LLVM_ENABLE_STATS1788++Stat;1789ORE.emit(RemarkKind(DEBUG_TYPE, Stat.getName(), FC0.L->getStartLoc(),1790FC0.Preheader)1791<< "[" << FC0.Preheader->getParent()->getName()1792<< "]: " << NV("Cand1", StringRef(FC0.Preheader->getName()))1793<< " and " << NV("Cand2", StringRef(FC1.Preheader->getName()))1794<< ": " << Stat.getDesc());1795#endif1796}17971798/// Fuse two guarded fusion candidates, creating a new fused loop.1799///1800/// Fusing guarded loops is handled much the same way as fusing non-guarded1801/// loops. The rewiring of the CFG is slightly different though, because of1802/// the presence of the guards around the loops and the exit blocks after the1803/// loop body. As such, the new loop is rewired as follows:1804/// 1. Keep the guard branch from FC0 and use the non-loop block target1805/// from the FC1 guard branch.1806/// 2. Remove the exit block from FC0 (this exit block should be empty1807/// right now).1808/// 3. Remove the guard branch for FC11809/// 4. Remove the preheader for FC1.1810/// The exit block successor for the latch of FC0 is updated to be the header1811/// of FC1 and the non-exit block successor of the latch of FC1 is updated to1812/// be the header of FC0, thus creating the fused loop.1813Loop *fuseGuardedLoops(const FusionCandidate &FC0,1814const FusionCandidate &FC1) {1815assert(FC0.GuardBranch && FC1.GuardBranch && "Expecting guarded loops");18161817BasicBlock *FC0GuardBlock = FC0.GuardBranch->getParent();1818BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent();1819BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock();1820BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock();1821BasicBlock *FC0ExitBlockSuccessor = FC0.ExitBlock->getUniqueSuccessor();18221823// Move instructions from the exit block of FC0 to the beginning of the exit1824// block of FC1, in the case that the FC0 loop has not been peeled. In the1825// case that FC0 loop is peeled, then move the instructions of the successor1826// of the FC0 Exit block to the beginning of the exit block of FC1.1827moveInstructionsToTheBeginning(1828(FC0.Peeled ? *FC0ExitBlockSuccessor : *FC0.ExitBlock), *FC1.ExitBlock,1829DT, PDT, DI);18301831// Move instructions from the guard block of FC1 to the end of the guard1832// block of FC0.1833moveInstructionsToTheEnd(*FC1GuardBlock, *FC0GuardBlock, DT, PDT, DI);18341835assert(FC0NonLoopBlock == FC1GuardBlock && "Loops are not adjacent");18361837SmallVector<DominatorTree::UpdateType, 8> TreeUpdates;18381839////////////////////////////////////////////////////////////////////////////1840// Update the Loop Guard1841////////////////////////////////////////////////////////////////////////////1842// The guard for FC0 is updated to guard both FC0 and FC1. This is done by1843// changing the NonLoopGuardBlock for FC0 to the NonLoopGuardBlock for FC1.1844// Thus, one path from the guard goes to the preheader for FC0 (and thus1845// executes the new fused loop) and the other path goes to the NonLoopBlock1846// for FC1 (where FC1 guard would have gone if FC1 was not executed).1847FC1NonLoopBlock->replacePhiUsesWith(FC1GuardBlock, FC0GuardBlock);1848FC0.GuardBranch->replaceUsesOfWith(FC0NonLoopBlock, FC1NonLoopBlock);18491850BasicBlock *BBToUpdate = FC0.Peeled ? FC0ExitBlockSuccessor : FC0.ExitBlock;1851BBToUpdate->getTerminator()->replaceUsesOfWith(FC1GuardBlock, FC1.Header);18521853// The guard of FC1 is not necessary anymore.1854FC1.GuardBranch->eraseFromParent();1855new UnreachableInst(FC1GuardBlock->getContext(), FC1GuardBlock);18561857TreeUpdates.emplace_back(DominatorTree::UpdateType(1858DominatorTree::Delete, FC1GuardBlock, FC1.Preheader));1859TreeUpdates.emplace_back(DominatorTree::UpdateType(1860DominatorTree::Delete, FC1GuardBlock, FC1NonLoopBlock));1861TreeUpdates.emplace_back(DominatorTree::UpdateType(1862DominatorTree::Delete, FC0GuardBlock, FC1GuardBlock));1863TreeUpdates.emplace_back(DominatorTree::UpdateType(1864DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock));18651866if (FC0.Peeled) {1867// Remove the Block after the ExitBlock of FC01868TreeUpdates.emplace_back(DominatorTree::UpdateType(1869DominatorTree::Delete, FC0ExitBlockSuccessor, FC1GuardBlock));1870FC0ExitBlockSuccessor->getTerminator()->eraseFromParent();1871new UnreachableInst(FC0ExitBlockSuccessor->getContext(),1872FC0ExitBlockSuccessor);1873}18741875assert(pred_empty(FC1GuardBlock) &&1876"Expecting guard block to have no predecessors");1877assert(succ_empty(FC1GuardBlock) &&1878"Expecting guard block to have no successors");18791880// Remember the phi nodes originally in the header of FC0 in order to rewire1881// them later. However, this is only necessary if the new loop carried1882// values might not dominate the exiting branch. While we do not generally1883// test if this is the case but simply insert intermediate phi nodes, we1884// need to make sure these intermediate phi nodes have different1885// predecessors. To this end, we filter the special case where the exiting1886// block is the latch block of the first loop. Nothing needs to be done1887// anyway as all loop carried values dominate the latch and thereby also the1888// exiting branch.1889// KB: This is no longer necessary because FC0.ExitingBlock == FC0.Latch1890// (because the loops are rotated. Thus, nothing will ever be added to1891// OriginalFC0PHIs.1892SmallVector<PHINode *, 8> OriginalFC0PHIs;1893if (FC0.ExitingBlock != FC0.Latch)1894for (PHINode &PHI : FC0.Header->phis())1895OriginalFC0PHIs.push_back(&PHI);18961897assert(OriginalFC0PHIs.empty() && "Expecting OriginalFC0PHIs to be empty!");18981899// Replace incoming blocks for header PHIs first.1900FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader);1901FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch);19021903// The old exiting block of the first loop (FC0) has to jump to the header1904// of the second as we need to execute the code in the second header block1905// regardless of the trip count. That is, if the trip count is 0, so the1906// back edge is never taken, we still have to execute both loop headers,1907// especially (but not only!) if the second is a do-while style loop.1908// However, doing so might invalidate the phi nodes of the first loop as1909// the new values do only need to dominate their latch and not the exiting1910// predicate. To remedy this potential problem we always introduce phi1911// nodes in the header of the second loop later that select the loop carried1912// value, if the second header was reached through an old latch of the1913// first, or undef otherwise. This is sound as exiting the first implies the1914// second will exit too, __without__ taking the back-edge (their1915// trip-counts are equal after all).1916FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock,1917FC1.Header);19181919TreeUpdates.emplace_back(DominatorTree::UpdateType(1920DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock));1921TreeUpdates.emplace_back(DominatorTree::UpdateType(1922DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));19231924// Remove FC0 Exit Block1925// The exit block for FC0 is no longer needed since control will flow1926// directly to the header of FC1. Since it is an empty block, it can be1927// removed at this point.1928// TODO: In the future, we can handle non-empty exit blocks my merging any1929// instructions from FC0 exit block into FC1 exit block prior to removing1930// the block.1931assert(pred_empty(FC0.ExitBlock) && "Expecting exit block to be empty");1932FC0.ExitBlock->getTerminator()->eraseFromParent();1933new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock);19341935// Remove FC1 Preheader1936// The pre-header of L1 is not necessary anymore.1937assert(pred_empty(FC1.Preheader));1938FC1.Preheader->getTerminator()->eraseFromParent();1939new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader);1940TreeUpdates.emplace_back(DominatorTree::UpdateType(1941DominatorTree::Delete, FC1.Preheader, FC1.Header));19421943// Moves the phi nodes from the second to the first loops header block.1944while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) {1945if (SE.isSCEVable(PHI->getType()))1946SE.forgetValue(PHI);1947if (PHI->hasNUsesOrMore(1))1948PHI->moveBefore(&*FC0.Header->getFirstInsertionPt());1949else1950PHI->eraseFromParent();1951}19521953// Introduce new phi nodes in the second loop header to ensure1954// exiting the first and jumping to the header of the second does not break1955// the SSA property of the phis originally in the first loop. See also the1956// comment above.1957BasicBlock::iterator L1HeaderIP = FC1.Header->begin();1958for (PHINode *LCPHI : OriginalFC0PHIs) {1959int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch);1960assert(L1LatchBBIdx >= 0 &&1961"Expected loop carried value to be rewired at this point!");19621963Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx);19641965PHINode *L1HeaderPHI =1966PHINode::Create(LCV->getType(), 2, LCPHI->getName() + ".afterFC0");1967L1HeaderPHI->insertBefore(L1HeaderIP);1968L1HeaderPHI->addIncoming(LCV, FC0.Latch);1969L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()),1970FC0.ExitingBlock);19711972LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI);1973}19741975// Update the latches19761977// Replace latch terminator destinations.1978FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header);1979FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header);19801981// Modify the latch branch of FC0 to be unconditional as both successors of1982// the branch are the same.1983simplifyLatchBranch(FC0);19841985// If FC0.Latch and FC0.ExitingBlock are the same then we have already1986// performed the updates above.1987if (FC0.Latch != FC0.ExitingBlock)1988TreeUpdates.emplace_back(DominatorTree::UpdateType(1989DominatorTree::Insert, FC0.Latch, FC1.Header));19901991TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,1992FC0.Latch, FC0.Header));1993TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert,1994FC1.Latch, FC0.Header));1995TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,1996FC1.Latch, FC1.Header));19971998// All done1999// Apply the updates to the Dominator Tree and cleanup.20002001assert(succ_empty(FC1GuardBlock) && "FC1GuardBlock has successors!!");2002assert(pred_empty(FC1GuardBlock) && "FC1GuardBlock has predecessors!!");20032004// Update DT/PDT2005DTU.applyUpdates(TreeUpdates);20062007LI.removeBlock(FC1GuardBlock);2008LI.removeBlock(FC1.Preheader);2009LI.removeBlock(FC0.ExitBlock);2010if (FC0.Peeled) {2011LI.removeBlock(FC0ExitBlockSuccessor);2012DTU.deleteBB(FC0ExitBlockSuccessor);2013}2014DTU.deleteBB(FC1GuardBlock);2015DTU.deleteBB(FC1.Preheader);2016DTU.deleteBB(FC0.ExitBlock);2017DTU.flush();20182019// Is there a way to keep SE up-to-date so we don't need to forget the loops2020// and rebuild the information in subsequent passes of fusion?2021// Note: Need to forget the loops before merging the loop latches, as2022// mergeLatch may remove the only block in FC1.2023SE.forgetLoop(FC1.L);2024SE.forgetLoop(FC0.L);2025SE.forgetLoopDispositions();20262027// Move instructions from FC0.Latch to FC1.Latch.2028// Note: mergeLatch requires an updated DT.2029mergeLatch(FC0, FC1);20302031// Merge the loops.2032SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks());2033for (BasicBlock *BB : Blocks) {2034FC0.L->addBlockEntry(BB);2035FC1.L->removeBlockFromLoop(BB);2036if (LI.getLoopFor(BB) != FC1.L)2037continue;2038LI.changeLoopFor(BB, FC0.L);2039}2040while (!FC1.L->isInnermost()) {2041const auto &ChildLoopIt = FC1.L->begin();2042Loop *ChildLoop = *ChildLoopIt;2043FC1.L->removeChildLoop(ChildLoopIt);2044FC0.L->addChildLoop(ChildLoop);2045}20462047// Delete the now empty loop L1.2048LI.erase(FC1.L);20492050#ifndef NDEBUG2051assert(!verifyFunction(*FC0.Header->getParent(), &errs()));2052assert(DT.verify(DominatorTree::VerificationLevel::Fast));2053assert(PDT.verify());2054LI.verify(DT);2055SE.verify();2056#endif20572058LLVM_DEBUG(dbgs() << "Fusion done:\n");20592060return FC0.L;2061}2062};2063} // namespace20642065PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) {2066auto &LI = AM.getResult<LoopAnalysis>(F);2067auto &DT = AM.getResult<DominatorTreeAnalysis>(F);2068auto &DI = AM.getResult<DependenceAnalysis>(F);2069auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);2070auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);2071auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);2072auto &AC = AM.getResult<AssumptionAnalysis>(F);2073const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);2074const DataLayout &DL = F.getDataLayout();20752076// Ensure loops are in simplifed form which is a pre-requisite for loop fusion2077// pass. Added only for new PM since the legacy PM has already added2078// LoopSimplify pass as a dependency.2079bool Changed = false;2080for (auto &L : LI) {2081Changed |=2082simplifyLoop(L, &DT, &LI, &SE, &AC, nullptr, false /* PreserveLCSSA */);2083}2084if (Changed)2085PDT.recalculate(F);20862087LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI);2088Changed |= LF.fuseLoops(F);2089if (!Changed)2090return PreservedAnalyses::all();20912092PreservedAnalyses PA;2093PA.preserve<DominatorTreeAnalysis>();2094PA.preserve<PostDominatorTreeAnalysis>();2095PA.preserve<ScalarEvolutionAnalysis>();2096PA.preserve<LoopAnalysis>();2097return PA;2098}209921002101