Path: blob/main/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
35266 views
//===- LoopInterchange.cpp - Loop interchange pass-------------------------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7//8// This Pass handles loop interchange transform.9// This pass interchanges loops to provide a more cache-friendly memory access10// patterns.11//12//===----------------------------------------------------------------------===//1314#include "llvm/Transforms/Scalar/LoopInterchange.h"15#include "llvm/ADT/STLExtras.h"16#include "llvm/ADT/SmallVector.h"17#include "llvm/ADT/Statistic.h"18#include "llvm/ADT/StringRef.h"19#include "llvm/Analysis/DependenceAnalysis.h"20#include "llvm/Analysis/LoopCacheAnalysis.h"21#include "llvm/Analysis/LoopInfo.h"22#include "llvm/Analysis/LoopNestAnalysis.h"23#include "llvm/Analysis/LoopPass.h"24#include "llvm/Analysis/OptimizationRemarkEmitter.h"25#include "llvm/Analysis/ScalarEvolution.h"26#include "llvm/Analysis/ScalarEvolutionExpressions.h"27#include "llvm/IR/BasicBlock.h"28#include "llvm/IR/Constants.h"29#include "llvm/IR/DiagnosticInfo.h"30#include "llvm/IR/Dominators.h"31#include "llvm/IR/Function.h"32#include "llvm/IR/InstrTypes.h"33#include "llvm/IR/Instruction.h"34#include "llvm/IR/Instructions.h"35#include "llvm/IR/User.h"36#include "llvm/IR/Value.h"37#include "llvm/Support/Casting.h"38#include "llvm/Support/CommandLine.h"39#include "llvm/Support/Debug.h"40#include "llvm/Support/ErrorHandling.h"41#include "llvm/Support/raw_ostream.h"42#include "llvm/Transforms/Scalar/LoopPassManager.h"43#include "llvm/Transforms/Utils/BasicBlockUtils.h"44#include "llvm/Transforms/Utils/LoopUtils.h"45#include <cassert>46#include <utility>47#include <vector>4849using namespace llvm;5051#define DEBUG_TYPE "loop-interchange"5253STATISTIC(LoopsInterchanged, "Number of loops interchanged");5455static cl::opt<int> LoopInterchangeCostThreshold(56"loop-interchange-threshold", cl::init(0), cl::Hidden,57cl::desc("Interchange if you gain more than this number"));5859namespace {6061using LoopVector = SmallVector<Loop *, 8>;6263// TODO: Check if we can use a sparse matrix here.64using CharMatrix = std::vector<std::vector<char>>;6566} // end anonymous namespace6768// Maximum number of dependencies that can be handled in the dependency matrix.69static const unsigned MaxMemInstrCount = 100;7071// Maximum loop depth supported.72static const unsigned MaxLoopNestDepth = 10;7374#ifdef DUMP_DEP_MATRICIES75static void printDepMatrix(CharMatrix &DepMatrix) {76for (auto &Row : DepMatrix) {77for (auto D : Row)78LLVM_DEBUG(dbgs() << D << " ");79LLVM_DEBUG(dbgs() << "\n");80}81}82#endif8384static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,85Loop *L, DependenceInfo *DI,86ScalarEvolution *SE) {87using ValueVector = SmallVector<Value *, 16>;8889ValueVector MemInstr;9091// For each block.92for (BasicBlock *BB : L->blocks()) {93// Scan the BB and collect legal loads and stores.94for (Instruction &I : *BB) {95if (!isa<Instruction>(I))96return false;97if (auto *Ld = dyn_cast<LoadInst>(&I)) {98if (!Ld->isSimple())99return false;100MemInstr.push_back(&I);101} else if (auto *St = dyn_cast<StoreInst>(&I)) {102if (!St->isSimple())103return false;104MemInstr.push_back(&I);105}106}107}108109LLVM_DEBUG(dbgs() << "Found " << MemInstr.size()110<< " Loads and Stores to analyze\n");111112ValueVector::iterator I, IE, J, JE;113114for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {115for (J = I, JE = MemInstr.end(); J != JE; ++J) {116std::vector<char> Dep;117Instruction *Src = cast<Instruction>(*I);118Instruction *Dst = cast<Instruction>(*J);119// Ignore Input dependencies.120if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))121continue;122// Track Output, Flow, and Anti dependencies.123if (auto D = DI->depends(Src, Dst, true)) {124assert(D->isOrdered() && "Expected an output, flow or anti dep.");125// If the direction vector is negative, normalize it to126// make it non-negative.127if (D->normalize(SE))128LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n");129LLVM_DEBUG(StringRef DepType =130D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";131dbgs() << "Found " << DepType132<< " dependency between Src and Dst\n"133<< " Src:" << *Src << "\n Dst:" << *Dst << '\n');134unsigned Levels = D->getLevels();135char Direction;136for (unsigned II = 1; II <= Levels; ++II) {137if (D->isScalar(II)) {138Direction = 'S';139Dep.push_back(Direction);140} else {141unsigned Dir = D->getDirection(II);142if (Dir == Dependence::DVEntry::LT ||143Dir == Dependence::DVEntry::LE)144Direction = '<';145else if (Dir == Dependence::DVEntry::GT ||146Dir == Dependence::DVEntry::GE)147Direction = '>';148else if (Dir == Dependence::DVEntry::EQ)149Direction = '=';150else151Direction = '*';152Dep.push_back(Direction);153}154}155while (Dep.size() != Level) {156Dep.push_back('I');157}158159DepMatrix.push_back(Dep);160if (DepMatrix.size() > MaxMemInstrCount) {161LLVM_DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount162<< " dependencies inside loop\n");163return false;164}165}166}167}168169return true;170}171172// A loop is moved from index 'from' to an index 'to'. Update the Dependence173// matrix by exchanging the two columns.174static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,175unsigned ToIndx) {176for (unsigned I = 0, E = DepMatrix.size(); I < E; ++I)177std::swap(DepMatrix[I][ToIndx], DepMatrix[I][FromIndx]);178}179180// After interchanging, check if the direction vector is valid.181// [Theorem] A permutation of the loops in a perfect nest is legal if and only182// if the direction matrix, after the same permutation is applied to its183// columns, has no ">" direction as the leftmost non-"=" direction in any row.184static bool isLexicographicallyPositive(std::vector<char> &DV) {185for (unsigned char Direction : DV) {186if (Direction == '<')187return true;188if (Direction == '>' || Direction == '*')189return false;190}191return true;192}193194// Checks if it is legal to interchange 2 loops.195static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,196unsigned InnerLoopId,197unsigned OuterLoopId) {198unsigned NumRows = DepMatrix.size();199std::vector<char> Cur;200// For each row check if it is valid to interchange.201for (unsigned Row = 0; Row < NumRows; ++Row) {202// Create temporary DepVector check its lexicographical order203// before and after swapping OuterLoop vs InnerLoop204Cur = DepMatrix[Row];205if (!isLexicographicallyPositive(Cur))206return false;207std::swap(Cur[InnerLoopId], Cur[OuterLoopId]);208if (!isLexicographicallyPositive(Cur))209return false;210}211return true;212}213214static void populateWorklist(Loop &L, LoopVector &LoopList) {215LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "216<< L.getHeader()->getParent()->getName() << " Loop: %"217<< L.getHeader()->getName() << '\n');218assert(LoopList.empty() && "LoopList should initially be empty!");219Loop *CurrentLoop = &L;220const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();221while (!Vec->empty()) {222// The current loop has multiple subloops in it hence it is not tightly223// nested.224// Discard all loops above it added into Worklist.225if (Vec->size() != 1) {226LoopList = {};227return;228}229230LoopList.push_back(CurrentLoop);231CurrentLoop = Vec->front();232Vec = &CurrentLoop->getSubLoops();233}234LoopList.push_back(CurrentLoop);235}236237namespace {238239/// LoopInterchangeLegality checks if it is legal to interchange the loop.240class LoopInterchangeLegality {241public:242LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,243OptimizationRemarkEmitter *ORE)244: OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}245246/// Check if the loops can be interchanged.247bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,248CharMatrix &DepMatrix);249250/// Discover induction PHIs in the header of \p L. Induction251/// PHIs are added to \p Inductions.252bool findInductions(Loop *L, SmallVectorImpl<PHINode *> &Inductions);253254/// Check if the loop structure is understood. We do not handle triangular255/// loops for now.256bool isLoopStructureUnderstood();257258bool currentLimitations();259260const SmallPtrSetImpl<PHINode *> &getOuterInnerReductions() const {261return OuterInnerReductions;262}263264const SmallVectorImpl<PHINode *> &getInnerLoopInductions() const {265return InnerLoopInductions;266}267268private:269bool tightlyNested(Loop *Outer, Loop *Inner);270bool containsUnsafeInstructions(BasicBlock *BB);271272/// Discover induction and reduction PHIs in the header of \p L. Induction273/// PHIs are added to \p Inductions, reductions are added to274/// OuterInnerReductions. When the outer loop is passed, the inner loop needs275/// to be passed as \p InnerLoop.276bool findInductionAndReductions(Loop *L,277SmallVector<PHINode *, 8> &Inductions,278Loop *InnerLoop);279280Loop *OuterLoop;281Loop *InnerLoop;282283ScalarEvolution *SE;284285/// Interface to emit optimization remarks.286OptimizationRemarkEmitter *ORE;287288/// Set of reduction PHIs taking part of a reduction across the inner and289/// outer loop.290SmallPtrSet<PHINode *, 4> OuterInnerReductions;291292/// Set of inner loop induction PHIs293SmallVector<PHINode *, 8> InnerLoopInductions;294};295296/// LoopInterchangeProfitability checks if it is profitable to interchange the297/// loop.298class LoopInterchangeProfitability {299public:300LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,301OptimizationRemarkEmitter *ORE)302: OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}303304/// Check if the loop interchange is profitable.305bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,306unsigned InnerLoopId, unsigned OuterLoopId,307CharMatrix &DepMatrix,308const DenseMap<const Loop *, unsigned> &CostMap,309std::unique_ptr<CacheCost> &CC);310311private:312int getInstrOrderCost();313std::optional<bool> isProfitablePerLoopCacheAnalysis(314const DenseMap<const Loop *, unsigned> &CostMap,315std::unique_ptr<CacheCost> &CC);316std::optional<bool> isProfitablePerInstrOrderCost();317std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId,318unsigned OuterLoopId,319CharMatrix &DepMatrix);320Loop *OuterLoop;321Loop *InnerLoop;322323/// Scev analysis.324ScalarEvolution *SE;325326/// Interface to emit optimization remarks.327OptimizationRemarkEmitter *ORE;328};329330/// LoopInterchangeTransform interchanges the loop.331class LoopInterchangeTransform {332public:333LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,334LoopInfo *LI, DominatorTree *DT,335const LoopInterchangeLegality &LIL)336: OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), LIL(LIL) {}337338/// Interchange OuterLoop and InnerLoop.339bool transform();340void restructureLoops(Loop *NewInner, Loop *NewOuter,341BasicBlock *OrigInnerPreHeader,342BasicBlock *OrigOuterPreHeader);343void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);344345private:346bool adjustLoopLinks();347bool adjustLoopBranches();348349Loop *OuterLoop;350Loop *InnerLoop;351352/// Scev analysis.353ScalarEvolution *SE;354355LoopInfo *LI;356DominatorTree *DT;357358const LoopInterchangeLegality &LIL;359};360361struct LoopInterchange {362ScalarEvolution *SE = nullptr;363LoopInfo *LI = nullptr;364DependenceInfo *DI = nullptr;365DominatorTree *DT = nullptr;366std::unique_ptr<CacheCost> CC = nullptr;367368/// Interface to emit optimization remarks.369OptimizationRemarkEmitter *ORE;370371LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI,372DominatorTree *DT, std::unique_ptr<CacheCost> &CC,373OptimizationRemarkEmitter *ORE)374: SE(SE), LI(LI), DI(DI), DT(DT), CC(std::move(CC)), ORE(ORE) {}375376bool run(Loop *L) {377if (L->getParentLoop())378return false;379SmallVector<Loop *, 8> LoopList;380populateWorklist(*L, LoopList);381return processLoopList(LoopList);382}383384bool run(LoopNest &LN) {385SmallVector<Loop *, 8> LoopList(LN.getLoops().begin(), LN.getLoops().end());386for (unsigned I = 1; I < LoopList.size(); ++I)387if (LoopList[I]->getParentLoop() != LoopList[I - 1])388return false;389return processLoopList(LoopList);390}391392bool isComputableLoopNest(ArrayRef<Loop *> LoopList) {393for (Loop *L : LoopList) {394const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);395if (isa<SCEVCouldNotCompute>(ExitCountOuter)) {396LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n");397return false;398}399if (L->getNumBackEdges() != 1) {400LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");401return false;402}403if (!L->getExitingBlock()) {404LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n");405return false;406}407}408return true;409}410411unsigned selectLoopForInterchange(ArrayRef<Loop *> LoopList) {412// TODO: Add a better heuristic to select the loop to be interchanged based413// on the dependence matrix. Currently we select the innermost loop.414return LoopList.size() - 1;415}416417bool processLoopList(SmallVectorImpl<Loop *> &LoopList) {418bool Changed = false;419unsigned LoopNestDepth = LoopList.size();420if (LoopNestDepth < 2) {421LLVM_DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n");422return false;423}424if (LoopNestDepth > MaxLoopNestDepth) {425LLVM_DEBUG(dbgs() << "Cannot handle loops of depth greater than "426<< MaxLoopNestDepth << "\n");427return false;428}429if (!isComputableLoopNest(LoopList)) {430LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n");431return false;432}433434LLVM_DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth435<< "\n");436437CharMatrix DependencyMatrix;438Loop *OuterMostLoop = *(LoopList.begin());439if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,440OuterMostLoop, DI, SE)) {441LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");442return false;443}444#ifdef DUMP_DEP_MATRICIES445LLVM_DEBUG(dbgs() << "Dependence before interchange\n");446printDepMatrix(DependencyMatrix);447#endif448449// Get the Outermost loop exit.450BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock();451if (!LoopNestExit) {452LLVM_DEBUG(dbgs() << "OuterMostLoop needs an unique exit block");453return false;454}455456unsigned SelecLoopId = selectLoopForInterchange(LoopList);457// Obtain the loop vector returned from loop cache analysis beforehand,458// and put each <Loop, index> pair into a map for constant time query459// later. Indices in loop vector reprsent the optimal order of the460// corresponding loop, e.g., given a loopnest with depth N, index 0461// indicates the loop should be placed as the outermost loop and index N462// indicates the loop should be placed as the innermost loop.463//464// For the old pass manager CacheCost would be null.465DenseMap<const Loop *, unsigned> CostMap;466if (CC != nullptr) {467const auto &LoopCosts = CC->getLoopCosts();468for (unsigned i = 0; i < LoopCosts.size(); i++) {469CostMap[LoopCosts[i].first] = i;470}471}472// We try to achieve the globally optimal memory access for the loopnest,473// and do interchange based on a bubble-sort fasion. We start from474// the innermost loop, move it outwards to the best possible position475// and repeat this process.476for (unsigned j = SelecLoopId; j > 0; j--) {477bool ChangedPerIter = false;478for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {479bool Interchanged = processLoop(LoopList[i], LoopList[i - 1], i, i - 1,480DependencyMatrix, CostMap);481if (!Interchanged)482continue;483// Loops interchanged, update LoopList accordingly.484std::swap(LoopList[i - 1], LoopList[i]);485// Update the DependencyMatrix486interChangeDependencies(DependencyMatrix, i, i - 1);487#ifdef DUMP_DEP_MATRICIES488LLVM_DEBUG(dbgs() << "Dependence after interchange\n");489printDepMatrix(DependencyMatrix);490#endif491ChangedPerIter |= Interchanged;492Changed |= Interchanged;493}494// Early abort if there was no interchange during an entire round of495// moving loops outwards.496if (!ChangedPerIter)497break;498}499return Changed;500}501502bool processLoop(Loop *InnerLoop, Loop *OuterLoop, unsigned InnerLoopId,503unsigned OuterLoopId,504std::vector<std::vector<char>> &DependencyMatrix,505const DenseMap<const Loop *, unsigned> &CostMap) {506LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId507<< " and OuterLoopId = " << OuterLoopId << "\n");508LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE);509if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {510LLVM_DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n");511return false;512}513LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");514LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);515if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,516DependencyMatrix, CostMap, CC)) {517LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");518return false;519}520521ORE->emit([&]() {522return OptimizationRemark(DEBUG_TYPE, "Interchanged",523InnerLoop->getStartLoc(),524InnerLoop->getHeader())525<< "Loop interchanged with enclosing loop.";526});527528LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LIL);529LIT.transform();530LLVM_DEBUG(dbgs() << "Loops interchanged.\n");531LoopsInterchanged++;532533llvm::formLCSSARecursively(*OuterLoop, *DT, LI, SE);534return true;535}536};537538} // end anonymous namespace539540bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB) {541return any_of(*BB, [](const Instruction &I) {542return I.mayHaveSideEffects() || I.mayReadFromMemory();543});544}545546bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {547BasicBlock *OuterLoopHeader = OuterLoop->getHeader();548BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();549BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();550551LLVM_DEBUG(dbgs() << "Checking if loops are tightly nested\n");552553// A perfectly nested loop will not have any branch in between the outer and554// inner block i.e. outer header will branch to either inner preheader and555// outerloop latch.556BranchInst *OuterLoopHeaderBI =557dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());558if (!OuterLoopHeaderBI)559return false;560561for (BasicBlock *Succ : successors(OuterLoopHeaderBI))562if (Succ != InnerLoopPreHeader && Succ != InnerLoop->getHeader() &&563Succ != OuterLoopLatch)564return false;565566LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");567// We do not have any basic block in between now make sure the outer header568// and outer loop latch doesn't contain any unsafe instructions.569if (containsUnsafeInstructions(OuterLoopHeader) ||570containsUnsafeInstructions(OuterLoopLatch))571return false;572573// Also make sure the inner loop preheader does not contain any unsafe574// instructions. Note that all instructions in the preheader will be moved to575// the outer loop header when interchanging.576if (InnerLoopPreHeader != OuterLoopHeader &&577containsUnsafeInstructions(InnerLoopPreHeader))578return false;579580BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();581// Ensure the inner loop exit block flows to the outer loop latch possibly582// through empty blocks.583const BasicBlock &SuccInner =584LoopNest::skipEmptyBlockUntil(InnerLoopExit, OuterLoopLatch);585if (&SuccInner != OuterLoopLatch) {586LLVM_DEBUG(dbgs() << "Inner loop exit block " << *InnerLoopExit587<< " does not lead to the outer loop latch.\n";);588return false;589}590// The inner loop exit block does flow to the outer loop latch and not some591// other BBs, now make sure it contains safe instructions, since it will be592// moved into the (new) inner loop after interchange.593if (containsUnsafeInstructions(InnerLoopExit))594return false;595596LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");597// We have a perfect loop nest.598return true;599}600601bool LoopInterchangeLegality::isLoopStructureUnderstood() {602BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();603for (PHINode *InnerInduction : InnerLoopInductions) {604unsigned Num = InnerInduction->getNumOperands();605for (unsigned i = 0; i < Num; ++i) {606Value *Val = InnerInduction->getOperand(i);607if (isa<Constant>(Val))608continue;609Instruction *I = dyn_cast<Instruction>(Val);610if (!I)611return false;612// TODO: Handle triangular loops.613// e.g. for(int i=0;i<N;i++)614// for(int j=i;j<N;j++)615unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);616if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==617InnerLoopPreheader &&618!OuterLoop->isLoopInvariant(I)) {619return false;620}621}622}623624// TODO: Handle triangular loops of another form.625// e.g. for(int i=0;i<N;i++)626// for(int j=0;j<i;j++)627// or,628// for(int i=0;i<N;i++)629// for(int j=0;j*i<N;j++)630BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();631BranchInst *InnerLoopLatchBI =632dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());633if (!InnerLoopLatchBI->isConditional())634return false;635if (CmpInst *InnerLoopCmp =636dyn_cast<CmpInst>(InnerLoopLatchBI->getCondition())) {637Value *Op0 = InnerLoopCmp->getOperand(0);638Value *Op1 = InnerLoopCmp->getOperand(1);639640// LHS and RHS of the inner loop exit condition, e.g.,641// in "for(int j=0;j<i;j++)", LHS is j and RHS is i.642Value *Left = nullptr;643Value *Right = nullptr;644645// Check if V only involves inner loop induction variable.646// Return true if V is InnerInduction, or a cast from647// InnerInduction, or a binary operator that involves648// InnerInduction and a constant.649std::function<bool(Value *)> IsPathToInnerIndVar;650IsPathToInnerIndVar = [this, &IsPathToInnerIndVar](const Value *V) -> bool {651if (llvm::is_contained(InnerLoopInductions, V))652return true;653if (isa<Constant>(V))654return true;655const Instruction *I = dyn_cast<Instruction>(V);656if (!I)657return false;658if (isa<CastInst>(I))659return IsPathToInnerIndVar(I->getOperand(0));660if (isa<BinaryOperator>(I))661return IsPathToInnerIndVar(I->getOperand(0)) &&662IsPathToInnerIndVar(I->getOperand(1));663return false;664};665666// In case of multiple inner loop indvars, it is okay if LHS and RHS667// are both inner indvar related variables.668if (IsPathToInnerIndVar(Op0) && IsPathToInnerIndVar(Op1))669return true;670671// Otherwise we check if the cmp instruction compares an inner indvar672// related variable (Left) with a outer loop invariant (Right).673if (IsPathToInnerIndVar(Op0) && !isa<Constant>(Op0)) {674Left = Op0;675Right = Op1;676} else if (IsPathToInnerIndVar(Op1) && !isa<Constant>(Op1)) {677Left = Op1;678Right = Op0;679}680681if (Left == nullptr)682return false;683684const SCEV *S = SE->getSCEV(Right);685if (!SE->isLoopInvariant(S, OuterLoop))686return false;687}688689return true;690}691692// If SV is a LCSSA PHI node with a single incoming value, return the incoming693// value.694static Value *followLCSSA(Value *SV) {695PHINode *PHI = dyn_cast<PHINode>(SV);696if (!PHI)697return SV;698699if (PHI->getNumIncomingValues() != 1)700return SV;701return followLCSSA(PHI->getIncomingValue(0));702}703704// Check V's users to see if it is involved in a reduction in L.705static PHINode *findInnerReductionPhi(Loop *L, Value *V) {706// Reduction variables cannot be constants.707if (isa<Constant>(V))708return nullptr;709710for (Value *User : V->users()) {711if (PHINode *PHI = dyn_cast<PHINode>(User)) {712if (PHI->getNumIncomingValues() == 1)713continue;714RecurrenceDescriptor RD;715if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) {716// Detect floating point reduction only when it can be reordered.717if (RD.getExactFPMathInst() != nullptr)718return nullptr;719return PHI;720}721return nullptr;722}723}724725return nullptr;726}727728bool LoopInterchangeLegality::findInductionAndReductions(729Loop *L, SmallVector<PHINode *, 8> &Inductions, Loop *InnerLoop) {730if (!L->getLoopLatch() || !L->getLoopPredecessor())731return false;732for (PHINode &PHI : L->getHeader()->phis()) {733InductionDescriptor ID;734if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID))735Inductions.push_back(&PHI);736else {737// PHIs in inner loops need to be part of a reduction in the outer loop,738// discovered when checking the PHIs of the outer loop earlier.739if (!InnerLoop) {740if (!OuterInnerReductions.count(&PHI)) {741LLVM_DEBUG(dbgs() << "Inner loop PHI is not part of reductions "742"across the outer loop.\n");743return false;744}745} else {746assert(PHI.getNumIncomingValues() == 2 &&747"Phis in loop header should have exactly 2 incoming values");748// Check if we have a PHI node in the outer loop that has a reduction749// result from the inner loop as an incoming value.750Value *V = followLCSSA(PHI.getIncomingValueForBlock(L->getLoopLatch()));751PHINode *InnerRedPhi = findInnerReductionPhi(InnerLoop, V);752if (!InnerRedPhi ||753!llvm::is_contained(InnerRedPhi->incoming_values(), &PHI)) {754LLVM_DEBUG(755dbgs()756<< "Failed to recognize PHI as an induction or reduction.\n");757return false;758}759OuterInnerReductions.insert(&PHI);760OuterInnerReductions.insert(InnerRedPhi);761}762}763}764return true;765}766767// This function indicates the current limitations in the transform as a result768// of which we do not proceed.769bool LoopInterchangeLegality::currentLimitations() {770BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();771772// transform currently expects the loop latches to also be the exiting773// blocks.774if (InnerLoop->getExitingBlock() != InnerLoopLatch ||775OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() ||776!isa<BranchInst>(InnerLoopLatch->getTerminator()) ||777!isa<BranchInst>(OuterLoop->getLoopLatch()->getTerminator())) {778LLVM_DEBUG(779dbgs() << "Loops where the latch is not the exiting block are not"780<< " supported currently.\n");781ORE->emit([&]() {782return OptimizationRemarkMissed(DEBUG_TYPE, "ExitingNotLatch",783OuterLoop->getStartLoc(),784OuterLoop->getHeader())785<< "Loops where the latch is not the exiting block cannot be"786" interchange currently.";787});788return true;789}790791SmallVector<PHINode *, 8> Inductions;792if (!findInductionAndReductions(OuterLoop, Inductions, InnerLoop)) {793LLVM_DEBUG(794dbgs() << "Only outer loops with induction or reduction PHI nodes "795<< "are supported currently.\n");796ORE->emit([&]() {797return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter",798OuterLoop->getStartLoc(),799OuterLoop->getHeader())800<< "Only outer loops with induction or reduction PHI nodes can be"801" interchanged currently.";802});803return true;804}805806Inductions.clear();807// For multi-level loop nests, make sure that all phi nodes for inner loops808// at all levels can be recognized as a induction or reduction phi. Bail out809// if a phi node at a certain nesting level cannot be properly recognized.810Loop *CurLevelLoop = OuterLoop;811while (!CurLevelLoop->getSubLoops().empty()) {812// We already made sure that the loop nest is tightly nested.813CurLevelLoop = CurLevelLoop->getSubLoops().front();814if (!findInductionAndReductions(CurLevelLoop, Inductions, nullptr)) {815LLVM_DEBUG(816dbgs() << "Only inner loops with induction or reduction PHI nodes "817<< "are supported currently.\n");818ORE->emit([&]() {819return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner",820CurLevelLoop->getStartLoc(),821CurLevelLoop->getHeader())822<< "Only inner loops with induction or reduction PHI nodes can be"823" interchange currently.";824});825return true;826}827}828829// TODO: Triangular loops are not handled for now.830if (!isLoopStructureUnderstood()) {831LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n");832ORE->emit([&]() {833return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",834InnerLoop->getStartLoc(),835InnerLoop->getHeader())836<< "Inner loop structure not understood currently.";837});838return true;839}840841return false;842}843844bool LoopInterchangeLegality::findInductions(845Loop *L, SmallVectorImpl<PHINode *> &Inductions) {846for (PHINode &PHI : L->getHeader()->phis()) {847InductionDescriptor ID;848if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID))849Inductions.push_back(&PHI);850}851return !Inductions.empty();852}853854// We currently only support LCSSA PHI nodes in the inner loop exit, if their855// users are either reduction PHIs or PHIs outside the outer loop (which means856// the we are only interested in the final value after the loop).857static bool858areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL,859SmallPtrSetImpl<PHINode *> &Reductions) {860BasicBlock *InnerExit = OuterL->getUniqueExitBlock();861for (PHINode &PHI : InnerExit->phis()) {862// Reduction lcssa phi will have only 1 incoming block that from loop latch.863if (PHI.getNumIncomingValues() > 1)864return false;865if (any_of(PHI.users(), [&Reductions, OuterL](User *U) {866PHINode *PN = dyn_cast<PHINode>(U);867return !PN ||868(!Reductions.count(PN) && OuterL->contains(PN->getParent()));869})) {870return false;871}872}873return true;874}875876// We currently support LCSSA PHI nodes in the outer loop exit, if their877// incoming values do not come from the outer loop latch or if the878// outer loop latch has a single predecessor. In that case, the value will879// be available if both the inner and outer loop conditions are true, which880// will still be true after interchanging. If we have multiple predecessor,881// that may not be the case, e.g. because the outer loop latch may be executed882// if the inner loop is not executed.883static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {884BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();885for (PHINode &PHI : LoopNestExit->phis()) {886for (unsigned i = 0; i < PHI.getNumIncomingValues(); i++) {887Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(i));888if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())889continue;890891// The incoming value is defined in the outer loop latch. Currently we892// only support that in case the outer loop latch has a single predecessor.893// This guarantees that the outer loop latch is executed if and only if894// the inner loop is executed (because tightlyNested() guarantees that the895// outer loop header only branches to the inner loop or the outer loop896// latch).897// FIXME: We could weaken this logic and allow multiple predecessors,898// if the values are produced outside the loop latch. We would need899// additional logic to update the PHI nodes in the exit block as900// well.901if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)902return false;903}904}905return true;906}907908// In case of multi-level nested loops, it may occur that lcssa phis exist in909// the latch of InnerLoop, i.e., when defs of the incoming values are further910// inside the loopnest. Sometimes those incoming values are not available911// after interchange, since the original inner latch will become the new outer912// latch which may have predecessor paths that do not include those incoming913// values.914// TODO: Handle transformation of lcssa phis in the InnerLoop latch in case of915// multi-level loop nests.916static bool areInnerLoopLatchPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {917if (InnerLoop->getSubLoops().empty())918return true;919// If the original outer latch has only one predecessor, then values defined920// further inside the looploop, e.g., in the innermost loop, will be available921// at the new outer latch after interchange.922if (OuterLoop->getLoopLatch()->getUniquePredecessor() != nullptr)923return true;924925// The outer latch has more than one predecessors, i.e., the inner926// exit and the inner header.927// PHI nodes in the inner latch are lcssa phis where the incoming values928// are defined further inside the loopnest. Check if those phis are used929// in the original inner latch. If that is the case then bail out since930// those incoming values may not be available at the new outer latch.931BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();932for (PHINode &PHI : InnerLoopLatch->phis()) {933for (auto *U : PHI.users()) {934Instruction *UI = cast<Instruction>(U);935if (InnerLoopLatch == UI->getParent())936return false;937}938}939return true;940}941942bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,943unsigned OuterLoopId,944CharMatrix &DepMatrix) {945if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {946LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId947<< " and OuterLoopId = " << OuterLoopId948<< " due to dependence\n");949ORE->emit([&]() {950return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",951InnerLoop->getStartLoc(),952InnerLoop->getHeader())953<< "Cannot interchange loops due to dependences.";954});955return false;956}957// Check if outer and inner loop contain legal instructions only.958for (auto *BB : OuterLoop->blocks())959for (Instruction &I : BB->instructionsWithoutDebug())960if (CallInst *CI = dyn_cast<CallInst>(&I)) {961// readnone functions do not prevent interchanging.962if (CI->onlyWritesMemory())963continue;964LLVM_DEBUG(965dbgs() << "Loops with call instructions cannot be interchanged "966<< "safely.");967ORE->emit([&]() {968return OptimizationRemarkMissed(DEBUG_TYPE, "CallInst",969CI->getDebugLoc(),970CI->getParent())971<< "Cannot interchange loops due to call instruction.";972});973974return false;975}976977if (!findInductions(InnerLoop, InnerLoopInductions)) {978LLVM_DEBUG(dbgs() << "Could not find inner loop induction variables.\n");979return false;980}981982if (!areInnerLoopLatchPHIsSupported(OuterLoop, InnerLoop)) {983LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop latch.\n");984ORE->emit([&]() {985return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerLatchPHI",986InnerLoop->getStartLoc(),987InnerLoop->getHeader())988<< "Cannot interchange loops because unsupported PHI nodes found "989"in inner loop latch.";990});991return false;992}993994// TODO: The loops could not be interchanged due to current limitations in the995// transform module.996if (currentLimitations()) {997LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n");998return false;999}10001001// Check if the loops are tightly nested.1002if (!tightlyNested(OuterLoop, InnerLoop)) {1003LLVM_DEBUG(dbgs() << "Loops not tightly nested\n");1004ORE->emit([&]() {1005return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested",1006InnerLoop->getStartLoc(),1007InnerLoop->getHeader())1008<< "Cannot interchange loops because they are not tightly "1009"nested.";1010});1011return false;1012}10131014if (!areInnerLoopExitPHIsSupported(OuterLoop, InnerLoop,1015OuterInnerReductions)) {1016LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n");1017ORE->emit([&]() {1018return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",1019InnerLoop->getStartLoc(),1020InnerLoop->getHeader())1021<< "Found unsupported PHI node in loop exit.";1022});1023return false;1024}10251026if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) {1027LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n");1028ORE->emit([&]() {1029return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",1030OuterLoop->getStartLoc(),1031OuterLoop->getHeader())1032<< "Found unsupported PHI node in loop exit.";1033});1034return false;1035}10361037return true;1038}10391040int LoopInterchangeProfitability::getInstrOrderCost() {1041unsigned GoodOrder, BadOrder;1042BadOrder = GoodOrder = 0;1043for (BasicBlock *BB : InnerLoop->blocks()) {1044for (Instruction &Ins : *BB) {1045if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) {1046unsigned NumOp = GEP->getNumOperands();1047bool FoundInnerInduction = false;1048bool FoundOuterInduction = false;1049for (unsigned i = 0; i < NumOp; ++i) {1050// Skip operands that are not SCEV-able.1051if (!SE->isSCEVable(GEP->getOperand(i)->getType()))1052continue;10531054const SCEV *OperandVal = SE->getSCEV(GEP->getOperand(i));1055const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal);1056if (!AR)1057continue;10581059// If we find the inner induction after an outer induction e.g.1060// for(int i=0;i<N;i++)1061// for(int j=0;j<N;j++)1062// A[i][j] = A[i-1][j-1]+k;1063// then it is a good order.1064if (AR->getLoop() == InnerLoop) {1065// We found an InnerLoop induction after OuterLoop induction. It is1066// a good order.1067FoundInnerInduction = true;1068if (FoundOuterInduction) {1069GoodOrder++;1070break;1071}1072}1073// If we find the outer induction after an inner induction e.g.1074// for(int i=0;i<N;i++)1075// for(int j=0;j<N;j++)1076// A[j][i] = A[j-1][i-1]+k;1077// then it is a bad order.1078if (AR->getLoop() == OuterLoop) {1079// We found an OuterLoop induction after InnerLoop induction. It is1080// a bad order.1081FoundOuterInduction = true;1082if (FoundInnerInduction) {1083BadOrder++;1084break;1085}1086}1087}1088}1089}1090}1091return GoodOrder - BadOrder;1092}10931094std::optional<bool>1095LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis(1096const DenseMap<const Loop *, unsigned> &CostMap,1097std::unique_ptr<CacheCost> &CC) {1098// This is the new cost model returned from loop cache analysis.1099// A smaller index means the loop should be placed an outer loop, and vice1100// versa.1101if (CostMap.contains(InnerLoop) && CostMap.contains(OuterLoop)) {1102unsigned InnerIndex = 0, OuterIndex = 0;1103InnerIndex = CostMap.find(InnerLoop)->second;1104OuterIndex = CostMap.find(OuterLoop)->second;1105LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex1106<< ", OuterIndex = " << OuterIndex << "\n");1107if (InnerIndex < OuterIndex)1108return std::optional<bool>(true);1109assert(InnerIndex != OuterIndex && "CostMap should assign unique "1110"numbers to each loop");1111if (CC->getLoopCost(*OuterLoop) == CC->getLoopCost(*InnerLoop))1112return std::nullopt;1113return std::optional<bool>(false);1114}1115return std::nullopt;1116}11171118std::optional<bool>1119LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {1120// Legacy cost model: this is rough cost estimation algorithm. It counts the1121// good and bad order of induction variables in the instruction and allows1122// reordering if number of bad orders is more than good.1123int Cost = getInstrOrderCost();1124LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");1125if (Cost < 0 && Cost < LoopInterchangeCostThreshold)1126return std::optional<bool>(true);11271128return std::nullopt;1129}11301131std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(1132unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {1133for (auto &Row : DepMatrix) {1134// If the inner loop is loop independent or doesn't carry any dependency1135// it is not profitable to move this to outer position, since we are1136// likely able to do inner loop vectorization already.1137if (Row[InnerLoopId] == 'I' || Row[InnerLoopId] == '=')1138return std::optional<bool>(false);11391140// If the outer loop is not loop independent it is not profitable to move1141// this to inner position, since doing so would not enable inner loop1142// parallelism.1143if (Row[OuterLoopId] != 'I' && Row[OuterLoopId] != '=')1144return std::optional<bool>(false);1145}1146// If inner loop has dependence and outer loop is loop independent then it1147// is/ profitable to interchange to enable inner loop parallelism.1148// If there are no dependences, interchanging will not improve anything.1149return std::optional<bool>(!DepMatrix.empty());1150}11511152bool LoopInterchangeProfitability::isProfitable(1153const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,1154unsigned OuterLoopId, CharMatrix &DepMatrix,1155const DenseMap<const Loop *, unsigned> &CostMap,1156std::unique_ptr<CacheCost> &CC) {1157// isProfitable() is structured to avoid endless loop interchange.1158// If loop cache analysis could decide the profitability then,1159// profitability check will stop and return the analysis result.1160// If cache analysis failed to analyze the loopnest (e.g.,1161// due to delinearization issues) then only check whether it is1162// profitable for InstrOrderCost. Likewise, if InstrOrderCost failed to1163// analysis the profitability then only, isProfitableForVectorization1164// will decide.1165std::optional<bool> shouldInterchange =1166isProfitablePerLoopCacheAnalysis(CostMap, CC);1167if (!shouldInterchange.has_value()) {1168shouldInterchange = isProfitablePerInstrOrderCost();1169if (!shouldInterchange.has_value())1170shouldInterchange =1171isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);1172}1173if (!shouldInterchange.has_value()) {1174ORE->emit([&]() {1175return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",1176InnerLoop->getStartLoc(),1177InnerLoop->getHeader())1178<< "Insufficient information to calculate the cost of loop for "1179"interchange.";1180});1181return false;1182} else if (!shouldInterchange.value()) {1183ORE->emit([&]() {1184return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",1185InnerLoop->getStartLoc(),1186InnerLoop->getHeader())1187<< "Interchanging loops is not considered to improve cache "1188"locality nor vectorization.";1189});1190return false;1191}1192return true;1193}11941195void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,1196Loop *InnerLoop) {1197for (Loop *L : *OuterLoop)1198if (L == InnerLoop) {1199OuterLoop->removeChildLoop(L);1200return;1201}1202llvm_unreachable("Couldn't find loop");1203}12041205/// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the1206/// new inner and outer loop after interchanging: NewInner is the original1207/// outer loop and NewOuter is the original inner loop.1208///1209/// Before interchanging, we have the following structure1210/// Outer preheader1211// Outer header1212// Inner preheader1213// Inner header1214// Inner body1215// Inner latch1216// outer bbs1217// Outer latch1218//1219// After interchanging:1220// Inner preheader1221// Inner header1222// Outer preheader1223// Outer header1224// Inner body1225// outer bbs1226// Outer latch1227// Inner latch1228void LoopInterchangeTransform::restructureLoops(1229Loop *NewInner, Loop *NewOuter, BasicBlock *OrigInnerPreHeader,1230BasicBlock *OrigOuterPreHeader) {1231Loop *OuterLoopParent = OuterLoop->getParentLoop();1232// The original inner loop preheader moves from the new inner loop to1233// the parent loop, if there is one.1234NewInner->removeBlockFromLoop(OrigInnerPreHeader);1235LI->changeLoopFor(OrigInnerPreHeader, OuterLoopParent);12361237// Switch the loop levels.1238if (OuterLoopParent) {1239// Remove the loop from its parent loop.1240removeChildLoop(OuterLoopParent, NewInner);1241removeChildLoop(NewInner, NewOuter);1242OuterLoopParent->addChildLoop(NewOuter);1243} else {1244removeChildLoop(NewInner, NewOuter);1245LI->changeTopLevelLoop(NewInner, NewOuter);1246}1247while (!NewOuter->isInnermost())1248NewInner->addChildLoop(NewOuter->removeChildLoop(NewOuter->begin()));1249NewOuter->addChildLoop(NewInner);12501251// BBs from the original inner loop.1252SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks());12531254// Add BBs from the original outer loop to the original inner loop (excluding1255// BBs already in inner loop)1256for (BasicBlock *BB : NewInner->blocks())1257if (LI->getLoopFor(BB) == NewInner)1258NewOuter->addBlockEntry(BB);12591260// Now remove inner loop header and latch from the new inner loop and move1261// other BBs (the loop body) to the new inner loop.1262BasicBlock *OuterHeader = NewOuter->getHeader();1263BasicBlock *OuterLatch = NewOuter->getLoopLatch();1264for (BasicBlock *BB : OrigInnerBBs) {1265// Nothing will change for BBs in child loops.1266if (LI->getLoopFor(BB) != NewOuter)1267continue;1268// Remove the new outer loop header and latch from the new inner loop.1269if (BB == OuterHeader || BB == OuterLatch)1270NewInner->removeBlockFromLoop(BB);1271else1272LI->changeLoopFor(BB, NewInner);1273}12741275// The preheader of the original outer loop becomes part of the new1276// outer loop.1277NewOuter->addBlockEntry(OrigOuterPreHeader);1278LI->changeLoopFor(OrigOuterPreHeader, NewOuter);12791280// Tell SE that we move the loops around.1281SE->forgetLoop(NewOuter);1282}12831284bool LoopInterchangeTransform::transform() {1285bool Transformed = false;12861287if (InnerLoop->getSubLoops().empty()) {1288BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();1289LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n");1290auto &InductionPHIs = LIL.getInnerLoopInductions();1291if (InductionPHIs.empty()) {1292LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n");1293return false;1294}12951296SmallVector<Instruction *, 8> InnerIndexVarList;1297for (PHINode *CurInductionPHI : InductionPHIs) {1298if (CurInductionPHI->getIncomingBlock(0) == InnerLoopPreHeader)1299InnerIndexVarList.push_back(1300dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(1)));1301else1302InnerIndexVarList.push_back(1303dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(0)));1304}13051306// Create a new latch block for the inner loop. We split at the1307// current latch's terminator and then move the condition and all1308// operands that are not either loop-invariant or the induction PHI into the1309// new latch block.1310BasicBlock *NewLatch =1311SplitBlock(InnerLoop->getLoopLatch(),1312InnerLoop->getLoopLatch()->getTerminator(), DT, LI);13131314SmallSetVector<Instruction *, 4> WorkList;1315unsigned i = 0;1316auto MoveInstructions = [&i, &WorkList, this, &InductionPHIs, NewLatch]() {1317for (; i < WorkList.size(); i++) {1318// Duplicate instruction and move it the new latch. Update uses that1319// have been moved.1320Instruction *NewI = WorkList[i]->clone();1321NewI->insertBefore(NewLatch->getFirstNonPHI());1322assert(!NewI->mayHaveSideEffects() &&1323"Moving instructions with side-effects may change behavior of "1324"the loop nest!");1325for (Use &U : llvm::make_early_inc_range(WorkList[i]->uses())) {1326Instruction *UserI = cast<Instruction>(U.getUser());1327if (!InnerLoop->contains(UserI->getParent()) ||1328UserI->getParent() == NewLatch ||1329llvm::is_contained(InductionPHIs, UserI))1330U.set(NewI);1331}1332// Add operands of moved instruction to the worklist, except if they are1333// outside the inner loop or are the induction PHI.1334for (Value *Op : WorkList[i]->operands()) {1335Instruction *OpI = dyn_cast<Instruction>(Op);1336if (!OpI ||1337this->LI->getLoopFor(OpI->getParent()) != this->InnerLoop ||1338llvm::is_contained(InductionPHIs, OpI))1339continue;1340WorkList.insert(OpI);1341}1342}1343};13441345// FIXME: Should we interchange when we have a constant condition?1346Instruction *CondI = dyn_cast<Instruction>(1347cast<BranchInst>(InnerLoop->getLoopLatch()->getTerminator())1348->getCondition());1349if (CondI)1350WorkList.insert(CondI);1351MoveInstructions();1352for (Instruction *InnerIndexVar : InnerIndexVarList)1353WorkList.insert(cast<Instruction>(InnerIndexVar));1354MoveInstructions();1355}13561357// Ensure the inner loop phi nodes have a separate basic block.1358BasicBlock *InnerLoopHeader = InnerLoop->getHeader();1359if (InnerLoopHeader->getFirstNonPHI() != InnerLoopHeader->getTerminator()) {1360SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI);1361LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n");1362}13631364// Instructions in the original inner loop preheader may depend on values1365// defined in the outer loop header. Move them there, because the original1366// inner loop preheader will become the entry into the interchanged loop nest.1367// Currently we move all instructions and rely on LICM to move invariant1368// instructions outside the loop nest.1369BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();1370BasicBlock *OuterLoopHeader = OuterLoop->getHeader();1371if (InnerLoopPreHeader != OuterLoopHeader) {1372SmallPtrSet<Instruction *, 4> NeedsMoving;1373for (Instruction &I :1374make_early_inc_range(make_range(InnerLoopPreHeader->begin(),1375std::prev(InnerLoopPreHeader->end()))))1376I.moveBeforePreserving(OuterLoopHeader->getTerminator());1377}13781379Transformed |= adjustLoopLinks();1380if (!Transformed) {1381LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n");1382return false;1383}13841385return true;1386}13871388/// \brief Move all instructions except the terminator from FromBB right before1389/// InsertBefore1390static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {1391BasicBlock *ToBB = InsertBefore->getParent();13921393ToBB->splice(InsertBefore->getIterator(), FromBB, FromBB->begin(),1394FromBB->getTerminator()->getIterator());1395}13961397/// Swap instructions between \p BB1 and \p BB2 but keep terminators intact.1398static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2) {1399// Save all non-terminator instructions of BB1 into TempInstrs and unlink them1400// from BB1 afterwards.1401auto Iter = map_range(*BB1, [](Instruction &I) { return &I; });1402SmallVector<Instruction *, 4> TempInstrs(Iter.begin(), std::prev(Iter.end()));1403for (Instruction *I : TempInstrs)1404I->removeFromParent();14051406// Move instructions from BB2 to BB1.1407moveBBContents(BB2, BB1->getTerminator());14081409// Move instructions from TempInstrs to BB2.1410for (Instruction *I : TempInstrs)1411I->insertBefore(BB2->getTerminator());1412}14131414// Update BI to jump to NewBB instead of OldBB. Records updates to the1415// dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that1416// \p OldBB is exactly once in BI's successor list.1417static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB,1418BasicBlock *NewBB,1419std::vector<DominatorTree::UpdateType> &DTUpdates,1420bool MustUpdateOnce = true) {1421assert((!MustUpdateOnce ||1422llvm::count_if(successors(BI),1423[OldBB](BasicBlock *BB) {1424return BB == OldBB;1425}) == 1) && "BI must jump to OldBB exactly once.");1426bool Changed = false;1427for (Use &Op : BI->operands())1428if (Op == OldBB) {1429Op.set(NewBB);1430Changed = true;1431}14321433if (Changed) {1434DTUpdates.push_back(1435{DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB});1436DTUpdates.push_back(1437{DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB});1438}1439assert(Changed && "Expected a successor to be updated");1440}14411442// Move Lcssa PHIs to the right place.1443static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader,1444BasicBlock *InnerLatch, BasicBlock *OuterHeader,1445BasicBlock *OuterLatch, BasicBlock *OuterExit,1446Loop *InnerLoop, LoopInfo *LI) {14471448// Deal with LCSSA PHI nodes in the exit block of the inner loop, that are1449// defined either in the header or latch. Those blocks will become header and1450// latch of the new outer loop, and the only possible users can PHI nodes1451// in the exit block of the loop nest or the outer loop header (reduction1452// PHIs, in that case, the incoming value must be defined in the inner loop1453// header). We can just substitute the user with the incoming value and remove1454// the PHI.1455for (PHINode &P : make_early_inc_range(InnerExit->phis())) {1456assert(P.getNumIncomingValues() == 1 &&1457"Only loops with a single exit are supported!");14581459// Incoming values are guaranteed be instructions currently.1460auto IncI = cast<Instruction>(P.getIncomingValueForBlock(InnerLatch));1461// In case of multi-level nested loops, follow LCSSA to find the incoming1462// value defined from the innermost loop.1463auto IncIInnerMost = cast<Instruction>(followLCSSA(IncI));1464// Skip phis with incoming values from the inner loop body, excluding the1465// header and latch.1466if (IncIInnerMost->getParent() != InnerLatch &&1467IncIInnerMost->getParent() != InnerHeader)1468continue;14691470assert(all_of(P.users(),1471[OuterHeader, OuterExit, IncI, InnerHeader](User *U) {1472return (cast<PHINode>(U)->getParent() == OuterHeader &&1473IncI->getParent() == InnerHeader) ||1474cast<PHINode>(U)->getParent() == OuterExit;1475}) &&1476"Can only replace phis iff the uses are in the loop nest exit or "1477"the incoming value is defined in the inner header (it will "1478"dominate all loop blocks after interchanging)");1479P.replaceAllUsesWith(IncI);1480P.eraseFromParent();1481}14821483SmallVector<PHINode *, 8> LcssaInnerExit;1484for (PHINode &P : InnerExit->phis())1485LcssaInnerExit.push_back(&P);14861487SmallVector<PHINode *, 8> LcssaInnerLatch;1488for (PHINode &P : InnerLatch->phis())1489LcssaInnerLatch.push_back(&P);14901491// Lcssa PHIs for values used outside the inner loop are in InnerExit.1492// If a PHI node has users outside of InnerExit, it has a use outside the1493// interchanged loop and we have to preserve it. We move these to1494// InnerLatch, which will become the new exit block for the innermost1495// loop after interchanging.1496for (PHINode *P : LcssaInnerExit)1497P->moveBefore(InnerLatch->getFirstNonPHI());14981499// If the inner loop latch contains LCSSA PHIs, those come from a child loop1500// and we have to move them to the new inner latch.1501for (PHINode *P : LcssaInnerLatch)1502P->moveBefore(InnerExit->getFirstNonPHI());15031504// Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have1505// incoming values defined in the outer loop, we have to add a new PHI1506// in the inner loop latch, which became the exit block of the outer loop,1507// after interchanging.1508if (OuterExit) {1509for (PHINode &P : OuterExit->phis()) {1510if (P.getNumIncomingValues() != 1)1511continue;1512// Skip Phis with incoming values defined in the inner loop. Those should1513// already have been updated.1514auto I = dyn_cast<Instruction>(P.getIncomingValue(0));1515if (!I || LI->getLoopFor(I->getParent()) == InnerLoop)1516continue;15171518PHINode *NewPhi = dyn_cast<PHINode>(P.clone());1519NewPhi->setIncomingValue(0, P.getIncomingValue(0));1520NewPhi->setIncomingBlock(0, OuterLatch);1521// We might have incoming edges from other BBs, i.e., the original outer1522// header.1523for (auto *Pred : predecessors(InnerLatch)) {1524if (Pred == OuterLatch)1525continue;1526NewPhi->addIncoming(P.getIncomingValue(0), Pred);1527}1528NewPhi->insertBefore(InnerLatch->getFirstNonPHI());1529P.setIncomingValue(0, NewPhi);1530}1531}15321533// Now adjust the incoming blocks for the LCSSA PHIs.1534// For PHIs moved from Inner's exit block, we need to replace Inner's latch1535// with the new latch.1536InnerLatch->replacePhiUsesWith(InnerLatch, OuterLatch);1537}15381539bool LoopInterchangeTransform::adjustLoopBranches() {1540LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n");1541std::vector<DominatorTree::UpdateType> DTUpdates;15421543BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();1544BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();15451546assert(OuterLoopPreHeader != OuterLoop->getHeader() &&1547InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader &&1548InnerLoopPreHeader && "Guaranteed by loop-simplify form");1549// Ensure that both preheaders do not contain PHI nodes and have single1550// predecessors. This allows us to move them easily. We use1551// InsertPreHeaderForLoop to create an 'extra' preheader, if the existing1552// preheaders do not satisfy those conditions.1553if (isa<PHINode>(OuterLoopPreHeader->begin()) ||1554!OuterLoopPreHeader->getUniquePredecessor())1555OuterLoopPreHeader =1556InsertPreheaderForLoop(OuterLoop, DT, LI, nullptr, true);1557if (InnerLoopPreHeader == OuterLoop->getHeader())1558InnerLoopPreHeader =1559InsertPreheaderForLoop(InnerLoop, DT, LI, nullptr, true);15601561// Adjust the loop preheader1562BasicBlock *InnerLoopHeader = InnerLoop->getHeader();1563BasicBlock *OuterLoopHeader = OuterLoop->getHeader();1564BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();1565BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();1566BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();1567BasicBlock *InnerLoopLatchPredecessor =1568InnerLoopLatch->getUniquePredecessor();1569BasicBlock *InnerLoopLatchSuccessor;1570BasicBlock *OuterLoopLatchSuccessor;15711572BranchInst *OuterLoopLatchBI =1573dyn_cast<BranchInst>(OuterLoopLatch->getTerminator());1574BranchInst *InnerLoopLatchBI =1575dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());1576BranchInst *OuterLoopHeaderBI =1577dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());1578BranchInst *InnerLoopHeaderBI =1579dyn_cast<BranchInst>(InnerLoopHeader->getTerminator());15801581if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||1582!OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||1583!InnerLoopHeaderBI)1584return false;15851586BranchInst *InnerLoopLatchPredecessorBI =1587dyn_cast<BranchInst>(InnerLoopLatchPredecessor->getTerminator());1588BranchInst *OuterLoopPredecessorBI =1589dyn_cast<BranchInst>(OuterLoopPredecessor->getTerminator());15901591if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)1592return false;1593BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();1594if (!InnerLoopHeaderSuccessor)1595return false;15961597// Adjust Loop Preheader and headers.1598// The branches in the outer loop predecessor and the outer loop header can1599// be unconditional branches or conditional branches with duplicates. Consider1600// this when updating the successors.1601updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader,1602InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false);1603// The outer loop header might or might not branch to the outer latch.1604// We are guaranteed to branch to the inner loop preheader.1605if (llvm::is_contained(OuterLoopHeaderBI->successors(), OuterLoopLatch)) {1606// In this case the outerLoopHeader should branch to the InnerLoopLatch.1607updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, InnerLoopLatch,1608DTUpdates,1609/*MustUpdateOnce=*/false);1610}1611updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader,1612InnerLoopHeaderSuccessor, DTUpdates,1613/*MustUpdateOnce=*/false);16141615// Adjust reduction PHI's now that the incoming block has changed.1616InnerLoopHeaderSuccessor->replacePhiUsesWith(InnerLoopHeader,1617OuterLoopHeader);16181619updateSuccessor(InnerLoopHeaderBI, InnerLoopHeaderSuccessor,1620OuterLoopPreHeader, DTUpdates);16211622// -------------Adjust loop latches-----------1623if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)1624InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1);1625else1626InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);16271628updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch,1629InnerLoopLatchSuccessor, DTUpdates);16301631if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)1632OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);1633else1634OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);16351636updateSuccessor(InnerLoopLatchBI, InnerLoopLatchSuccessor,1637OuterLoopLatchSuccessor, DTUpdates);1638updateSuccessor(OuterLoopLatchBI, OuterLoopLatchSuccessor, InnerLoopLatch,1639DTUpdates);16401641DT->applyUpdates(DTUpdates);1642restructureLoops(OuterLoop, InnerLoop, InnerLoopPreHeader,1643OuterLoopPreHeader);16441645moveLCSSAPhis(InnerLoopLatchSuccessor, InnerLoopHeader, InnerLoopLatch,1646OuterLoopHeader, OuterLoopLatch, InnerLoop->getExitBlock(),1647InnerLoop, LI);1648// For PHIs in the exit block of the outer loop, outer's latch has been1649// replaced by Inners'.1650OuterLoopLatchSuccessor->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);16511652auto &OuterInnerReductions = LIL.getOuterInnerReductions();1653// Now update the reduction PHIs in the inner and outer loop headers.1654SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;1655for (PHINode &PHI : InnerLoopHeader->phis())1656if (OuterInnerReductions.contains(&PHI))1657InnerLoopPHIs.push_back(&PHI);16581659for (PHINode &PHI : OuterLoopHeader->phis())1660if (OuterInnerReductions.contains(&PHI))1661OuterLoopPHIs.push_back(&PHI);16621663// Now move the remaining reduction PHIs from outer to inner loop header and1664// vice versa. The PHI nodes must be part of a reduction across the inner and1665// outer loop and all the remains to do is and updating the incoming blocks.1666for (PHINode *PHI : OuterLoopPHIs) {1667LLVM_DEBUG(dbgs() << "Outer loop reduction PHIs:\n"; PHI->dump(););1668PHI->moveBefore(InnerLoopHeader->getFirstNonPHI());1669assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");1670}1671for (PHINode *PHI : InnerLoopPHIs) {1672LLVM_DEBUG(dbgs() << "Inner loop reduction PHIs:\n"; PHI->dump(););1673PHI->moveBefore(OuterLoopHeader->getFirstNonPHI());1674assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");1675}16761677// Update the incoming blocks for moved PHI nodes.1678OuterLoopHeader->replacePhiUsesWith(InnerLoopPreHeader, OuterLoopPreHeader);1679OuterLoopHeader->replacePhiUsesWith(InnerLoopLatch, OuterLoopLatch);1680InnerLoopHeader->replacePhiUsesWith(OuterLoopPreHeader, InnerLoopPreHeader);1681InnerLoopHeader->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);16821683// Values defined in the outer loop header could be used in the inner loop1684// latch. In that case, we need to create LCSSA phis for them, because after1685// interchanging they will be defined in the new inner loop and used in the1686// new outer loop.1687SmallVector<Instruction *, 4> MayNeedLCSSAPhis;1688for (Instruction &I :1689make_range(OuterLoopHeader->begin(), std::prev(OuterLoopHeader->end())))1690MayNeedLCSSAPhis.push_back(&I);1691formLCSSAForInstructions(MayNeedLCSSAPhis, *DT, *LI, SE);16921693return true;1694}16951696bool LoopInterchangeTransform::adjustLoopLinks() {1697// Adjust all branches in the inner and outer loop.1698bool Changed = adjustLoopBranches();1699if (Changed) {1700// We have interchanged the preheaders so we need to interchange the data in1701// the preheaders as well. This is because the content of the inner1702// preheader was previously executed inside the outer loop.1703BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();1704BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();1705swapBBContents(OuterLoopPreHeader, InnerLoopPreHeader);1706}1707return Changed;1708}17091710PreservedAnalyses LoopInterchangePass::run(LoopNest &LN,1711LoopAnalysisManager &AM,1712LoopStandardAnalysisResults &AR,1713LPMUpdater &U) {1714Function &F = *LN.getParent();17151716DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);1717std::unique_ptr<CacheCost> CC =1718CacheCost::getCacheCost(LN.getOutermostLoop(), AR, DI);1719OptimizationRemarkEmitter ORE(&F);1720if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, CC, &ORE).run(LN))1721return PreservedAnalyses::all();1722U.markLoopNestChanged(true);1723return getLoopPassPreservedAnalyses();1724}172517261727