Path: blob/main/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
35271 views
//===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7//8// This file implements loop unroll and jam as a routine, much like9// LoopUnroll.cpp implements loop unroll.10//11//===----------------------------------------------------------------------===//1213#include "llvm/ADT/ArrayRef.h"14#include "llvm/ADT/DenseMap.h"15#include "llvm/ADT/STLExtras.h"16#include "llvm/ADT/SmallPtrSet.h"17#include "llvm/ADT/SmallVector.h"18#include "llvm/ADT/Statistic.h"19#include "llvm/ADT/StringRef.h"20#include "llvm/ADT/Twine.h"21#include "llvm/Analysis/AssumptionCache.h"22#include "llvm/Analysis/DependenceAnalysis.h"23#include "llvm/Analysis/DomTreeUpdater.h"24#include "llvm/Analysis/LoopInfo.h"25#include "llvm/Analysis/LoopIterator.h"26#include "llvm/Analysis/MustExecute.h"27#include "llvm/Analysis/OptimizationRemarkEmitter.h"28#include "llvm/Analysis/ScalarEvolution.h"29#include "llvm/IR/BasicBlock.h"30#include "llvm/IR/DebugInfoMetadata.h"31#include "llvm/IR/DebugLoc.h"32#include "llvm/IR/DiagnosticInfo.h"33#include "llvm/IR/Dominators.h"34#include "llvm/IR/Function.h"35#include "llvm/IR/Instruction.h"36#include "llvm/IR/Instructions.h"37#include "llvm/IR/IntrinsicInst.h"38#include "llvm/IR/User.h"39#include "llvm/IR/Value.h"40#include "llvm/IR/ValueHandle.h"41#include "llvm/IR/ValueMap.h"42#include "llvm/Support/Casting.h"43#include "llvm/Support/Debug.h"44#include "llvm/Support/ErrorHandling.h"45#include "llvm/Support/GenericDomTree.h"46#include "llvm/Support/raw_ostream.h"47#include "llvm/Transforms/Utils/BasicBlockUtils.h"48#include "llvm/Transforms/Utils/Cloning.h"49#include "llvm/Transforms/Utils/LoopUtils.h"50#include "llvm/Transforms/Utils/UnrollLoop.h"51#include "llvm/Transforms/Utils/ValueMapper.h"52#include <assert.h>53#include <memory>54#include <type_traits>55#include <vector>5657using namespace llvm;5859#define DEBUG_TYPE "loop-unroll-and-jam"6061STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed");62STATISTIC(NumCompletelyUnrolledAndJammed, "Number of loops unroll and jammed");6364typedef SmallPtrSet<BasicBlock *, 4> BasicBlockSet;6566// Partition blocks in an outer/inner loop pair into blocks before and after67// the loop68static bool partitionLoopBlocks(Loop &L, BasicBlockSet &ForeBlocks,69BasicBlockSet &AftBlocks, DominatorTree &DT) {70Loop *SubLoop = L.getSubLoops()[0];71BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();7273for (BasicBlock *BB : L.blocks()) {74if (!SubLoop->contains(BB)) {75if (DT.dominates(SubLoopLatch, BB))76AftBlocks.insert(BB);77else78ForeBlocks.insert(BB);79}80}8182// Check that all blocks in ForeBlocks together dominate the subloop83// TODO: This might ideally be done better with a dominator/postdominators.84BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader();85for (BasicBlock *BB : ForeBlocks) {86if (BB == SubLoopPreHeader)87continue;88Instruction *TI = BB->getTerminator();89for (BasicBlock *Succ : successors(TI))90if (!ForeBlocks.count(Succ))91return false;92}9394return true;95}9697/// Partition blocks in a loop nest into blocks before and after each inner98/// loop.99static bool partitionOuterLoopBlocks(100Loop &Root, Loop &JamLoop, BasicBlockSet &JamLoopBlocks,101DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap,102DenseMap<Loop *, BasicBlockSet> &AftBlocksMap, DominatorTree &DT) {103JamLoopBlocks.insert(JamLoop.block_begin(), JamLoop.block_end());104105for (Loop *L : Root.getLoopsInPreorder()) {106if (L == &JamLoop)107break;108109if (!partitionLoopBlocks(*L, ForeBlocksMap[L], AftBlocksMap[L], DT))110return false;111}112113return true;114}115116// TODO Remove when UnrollAndJamLoop changed to support unroll and jamming more117// than 2 levels loop.118static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,119BasicBlockSet &ForeBlocks,120BasicBlockSet &SubLoopBlocks,121BasicBlockSet &AftBlocks,122DominatorTree *DT) {123SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end());124return partitionLoopBlocks(*L, ForeBlocks, AftBlocks, *DT);125}126127// Looks at the phi nodes in Header for values coming from Latch. For these128// instructions and all their operands calls Visit on them, keeping going for129// all the operands in AftBlocks. Returns false if Visit returns false,130// otherwise returns true. This is used to process the instructions in the131// Aft blocks that need to be moved before the subloop. It is used in two132// places. One to check that the required set of instructions can be moved133// before the loop. Then to collect the instructions to actually move in134// moveHeaderPhiOperandsToForeBlocks.135template <typename T>136static bool processHeaderPhiOperands(BasicBlock *Header, BasicBlock *Latch,137BasicBlockSet &AftBlocks, T Visit) {138SmallPtrSet<Instruction *, 8> VisitedInstr;139140std::function<bool(Instruction * I)> ProcessInstr = [&](Instruction *I) {141if (VisitedInstr.count(I))142return true;143144VisitedInstr.insert(I);145146if (AftBlocks.count(I->getParent()))147for (auto &U : I->operands())148if (Instruction *II = dyn_cast<Instruction>(U))149if (!ProcessInstr(II))150return false;151152return Visit(I);153};154155for (auto &Phi : Header->phis()) {156Value *V = Phi.getIncomingValueForBlock(Latch);157if (Instruction *I = dyn_cast<Instruction>(V))158if (!ProcessInstr(I))159return false;160}161162return true;163}164165// Move the phi operands of Header from Latch out of AftBlocks to InsertLoc.166static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header,167BasicBlock *Latch,168Instruction *InsertLoc,169BasicBlockSet &AftBlocks) {170// We need to ensure we move the instructions in the correct order,171// starting with the earliest required instruction and moving forward.172processHeaderPhiOperands(Header, Latch, AftBlocks,173[&AftBlocks, &InsertLoc](Instruction *I) {174if (AftBlocks.count(I->getParent()))175I->moveBefore(InsertLoc);176return true;177});178}179180/*181This method performs Unroll and Jam. For a simple loop like:182for (i = ..)183Fore(i)184for (j = ..)185SubLoop(i, j)186Aft(i)187188Instead of doing normal inner or outer unrolling, we do:189for (i = .., i+=2)190Fore(i)191Fore(i+1)192for (j = ..)193SubLoop(i, j)194SubLoop(i+1, j)195Aft(i)196Aft(i+1)197198So the outer loop is essetially unrolled and then the inner loops are fused199("jammed") together into a single loop. This can increase speed when there200are loads in SubLoop that are invariant to i, as they become shared between201the now jammed inner loops.202203We do this by spliting the blocks in the loop into Fore, Subloop and Aft.204Fore blocks are those before the inner loop, Aft are those after. Normal205Unroll code is used to copy each of these sets of blocks and the results are206combined together into the final form above.207208isSafeToUnrollAndJam should be used prior to calling this to make sure the209unrolling will be valid. Checking profitablility is also advisable.210211If EpilogueLoop is non-null, it receives the epilogue loop (if it was212necessary to create one and not fully unrolled).213*/214LoopUnrollResult215llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,216unsigned TripMultiple, bool UnrollRemainder,217LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,218AssumptionCache *AC, const TargetTransformInfo *TTI,219OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop) {220221// When we enter here we should have already checked that it is safe222BasicBlock *Header = L->getHeader();223assert(Header && "No header.");224assert(L->getSubLoops().size() == 1);225Loop *SubLoop = *L->begin();226227// Don't enter the unroll code if there is nothing to do.228if (TripCount == 0 && Count < 2) {229LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; almost nothing to do\n");230return LoopUnrollResult::Unmodified;231}232233assert(Count > 0);234assert(TripMultiple > 0);235assert(TripCount == 0 || TripCount % TripMultiple == 0);236237// Are we eliminating the loop control altogether?238bool CompletelyUnroll = (Count == TripCount);239240// We use the runtime remainder in cases where we don't know trip multiple241if (TripMultiple % Count != 0) {242if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,243/*UseEpilogRemainder*/ true,244UnrollRemainder, /*ForgetAllSCEV*/ false,245LI, SE, DT, AC, TTI, true, EpilogueLoop)) {246LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "247"generated when assuming runtime trip count\n");248return LoopUnrollResult::Unmodified;249}250}251252// Notify ScalarEvolution that the loop will be substantially changed,253// if not outright eliminated.254if (SE) {255SE->forgetLoop(L);256SE->forgetBlockAndLoopDispositions();257}258259using namespace ore;260// Report the unrolling decision.261if (CompletelyUnroll) {262LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %"263<< Header->getName() << " with trip count " << TripCount264<< "!\n");265ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),266L->getHeader())267<< "completely unroll and jammed loop with "268<< NV("UnrollCount", TripCount) << " iterations");269} else {270auto DiagBuilder = [&]() {271OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),272L->getHeader());273return Diag << "unroll and jammed loop by a factor of "274<< NV("UnrollCount", Count);275};276277LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()278<< " by " << Count);279if (TripMultiple != 1) {280LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");281ORE->emit([&]() {282return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)283<< " trips per branch";284});285} else {286LLVM_DEBUG(dbgs() << " with run-time trip count");287ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; });288}289LLVM_DEBUG(dbgs() << "!\n");290}291292BasicBlock *Preheader = L->getLoopPreheader();293BasicBlock *LatchBlock = L->getLoopLatch();294assert(Preheader && "No preheader");295assert(LatchBlock && "No latch block");296BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());297assert(BI && !BI->isUnconditional());298bool ContinueOnTrue = L->contains(BI->getSuccessor(0));299BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);300bool SubLoopContinueOnTrue = SubLoop->contains(301SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0));302303// Partition blocks in an outer/inner loop pair into blocks before and after304// the loop305BasicBlockSet SubLoopBlocks;306BasicBlockSet ForeBlocks;307BasicBlockSet AftBlocks;308partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks,309DT);310311// We keep track of the entering/first and exiting/last block of each of312// Fore/SubLoop/Aft in each iteration. This helps make the stapling up of313// blocks easier.314std::vector<BasicBlock *> ForeBlocksFirst;315std::vector<BasicBlock *> ForeBlocksLast;316std::vector<BasicBlock *> SubLoopBlocksFirst;317std::vector<BasicBlock *> SubLoopBlocksLast;318std::vector<BasicBlock *> AftBlocksFirst;319std::vector<BasicBlock *> AftBlocksLast;320ForeBlocksFirst.push_back(Header);321ForeBlocksLast.push_back(SubLoop->getLoopPreheader());322SubLoopBlocksFirst.push_back(SubLoop->getHeader());323SubLoopBlocksLast.push_back(SubLoop->getExitingBlock());324AftBlocksFirst.push_back(SubLoop->getExitBlock());325AftBlocksLast.push_back(L->getExitingBlock());326// Maps Blocks[0] -> Blocks[It]327ValueToValueMapTy LastValueMap;328329// Move any instructions from fore phi operands from AftBlocks into Fore.330moveHeaderPhiOperandsToForeBlocks(331Header, LatchBlock, ForeBlocksLast[0]->getTerminator(), AftBlocks);332333// The current on-the-fly SSA update requires blocks to be processed in334// reverse postorder so that LastValueMap contains the correct value at each335// exit.336LoopBlocksDFS DFS(L);337DFS.perform(LI);338// Stash the DFS iterators before adding blocks to the loop.339LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();340LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();341342// When a FSDiscriminator is enabled, we don't need to add the multiply343// factors to the discriminators.344if (Header->getParent()->shouldEmitDebugInfoForProfiling() &&345!EnableFSDiscriminator)346for (BasicBlock *BB : L->getBlocks())347for (Instruction &I : *BB)348if (!I.isDebugOrPseudoInst())349if (const DILocation *DIL = I.getDebugLoc()) {350auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(Count);351if (NewDIL)352I.setDebugLoc(*NewDIL);353else354LLVM_DEBUG(dbgs()355<< "Failed to create new discriminator: "356<< DIL->getFilename() << " Line: " << DIL->getLine());357}358359// Copy all blocks360for (unsigned It = 1; It != Count; ++It) {361SmallVector<BasicBlock *, 8> NewBlocks;362// Maps Blocks[It] -> Blocks[It-1]363DenseMap<Value *, Value *> PrevItValueMap;364SmallDenseMap<const Loop *, Loop *, 4> NewLoops;365NewLoops[L] = L;366NewLoops[SubLoop] = SubLoop;367368for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {369ValueToValueMapTy VMap;370BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));371Header->getParent()->insert(Header->getParent()->end(), New);372373// Tell LI about New.374addClonedBlockToLoopInfo(*BB, New, LI, NewLoops);375376if (ForeBlocks.count(*BB)) {377if (*BB == ForeBlocksFirst[0])378ForeBlocksFirst.push_back(New);379if (*BB == ForeBlocksLast[0])380ForeBlocksLast.push_back(New);381} else if (SubLoopBlocks.count(*BB)) {382if (*BB == SubLoopBlocksFirst[0])383SubLoopBlocksFirst.push_back(New);384if (*BB == SubLoopBlocksLast[0])385SubLoopBlocksLast.push_back(New);386} else if (AftBlocks.count(*BB)) {387if (*BB == AftBlocksFirst[0])388AftBlocksFirst.push_back(New);389if (*BB == AftBlocksLast[0])390AftBlocksLast.push_back(New);391} else {392llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");393}394395// Update our running maps of newest clones396PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]);397LastValueMap[*BB] = New;398for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();399VI != VE; ++VI) {400PrevItValueMap[VI->second] =401const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]);402LastValueMap[VI->first] = VI->second;403}404405NewBlocks.push_back(New);406407// Update DomTree:408if (*BB == ForeBlocksFirst[0])409DT->addNewBlock(New, ForeBlocksLast[It - 1]);410else if (*BB == SubLoopBlocksFirst[0])411DT->addNewBlock(New, SubLoopBlocksLast[It - 1]);412else if (*BB == AftBlocksFirst[0])413DT->addNewBlock(New, AftBlocksLast[It - 1]);414else {415// Each set of blocks (Fore/Sub/Aft) will have the same internal domtree416// structure.417auto BBDomNode = DT->getNode(*BB);418auto BBIDom = BBDomNode->getIDom();419BasicBlock *OriginalBBIDom = BBIDom->getBlock();420assert(OriginalBBIDom);421assert(LastValueMap[cast<Value>(OriginalBBIDom)]);422DT->addNewBlock(423New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));424}425}426427// Remap all instructions in the most recent iteration428remapInstructionsInBlocks(NewBlocks, LastValueMap);429for (BasicBlock *NewBlock : NewBlocks) {430for (Instruction &I : *NewBlock) {431if (auto *II = dyn_cast<AssumeInst>(&I))432AC->registerAssumption(II);433}434}435436// Alter the ForeBlocks phi's, pointing them at the latest version of the437// value from the previous iteration's phis438for (PHINode &Phi : ForeBlocksFirst[It]->phis()) {439Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]);440assert(OldValue && "should have incoming edge from Aft[It]");441Value *NewValue = OldValue;442if (Value *PrevValue = PrevItValueMap[OldValue])443NewValue = PrevValue;444445assert(Phi.getNumOperands() == 2);446Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]);447Phi.setIncomingValue(0, NewValue);448Phi.removeIncomingValue(1);449}450}451452// Now that all the basic blocks for the unrolled iterations are in place,453// finish up connecting the blocks and phi nodes. At this point LastValueMap454// is the last unrolled iterations values.455456// Update Phis in BB from OldBB to point to NewBB and use the latest value457// from LastValueMap458auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB,459BasicBlock *NewBB,460ValueToValueMapTy &LastValueMap) {461for (PHINode &Phi : BB->phis()) {462for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) {463if (Phi.getIncomingBlock(b) == OldBB) {464Value *OldValue = Phi.getIncomingValue(b);465if (Value *LastValue = LastValueMap[OldValue])466Phi.setIncomingValue(b, LastValue);467Phi.setIncomingBlock(b, NewBB);468break;469}470}471}472};473// Move all the phis from Src into Dest474auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) {475BasicBlock::iterator insertPoint = Dest->getFirstNonPHIIt();476while (PHINode *Phi = dyn_cast<PHINode>(Src->begin()))477Phi->moveBefore(*Dest, insertPoint);478};479480// Update the PHI values outside the loop to point to the last block481updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(),482LastValueMap);483484// Update ForeBlocks successors and phi nodes485BranchInst *ForeTerm =486cast<BranchInst>(ForeBlocksLast.back()->getTerminator());487assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor");488ForeTerm->setSuccessor(0, SubLoopBlocksFirst[0]);489490if (CompletelyUnroll) {491while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {492Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader));493Phi->eraseFromParent();494}495} else {496// Update the PHI values to point to the last aft block497updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0],498AftBlocksLast.back(), LastValueMap);499}500501for (unsigned It = 1; It != Count; It++) {502// Remap ForeBlock successors from previous iteration to this503BranchInst *ForeTerm =504cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());505assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor");506ForeTerm->setSuccessor(0, ForeBlocksFirst[It]);507}508509// Subloop successors and phis510BranchInst *SubTerm =511cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator());512SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]);513SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]);514SubLoopBlocksFirst[0]->replacePhiUsesWith(ForeBlocksLast[0],515ForeBlocksLast.back());516SubLoopBlocksFirst[0]->replacePhiUsesWith(SubLoopBlocksLast[0],517SubLoopBlocksLast.back());518519for (unsigned It = 1; It != Count; It++) {520// Replace the conditional branch of the previous iteration subloop with an521// unconditional one to this one522BranchInst *SubTerm =523cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());524BranchInst::Create(SubLoopBlocksFirst[It], SubTerm->getIterator());525SubTerm->eraseFromParent();526527SubLoopBlocksFirst[It]->replacePhiUsesWith(ForeBlocksLast[It],528ForeBlocksLast.back());529SubLoopBlocksFirst[It]->replacePhiUsesWith(SubLoopBlocksLast[It],530SubLoopBlocksLast.back());531movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]);532}533534// Aft blocks successors and phis535BranchInst *AftTerm = cast<BranchInst>(AftBlocksLast.back()->getTerminator());536if (CompletelyUnroll) {537BranchInst::Create(LoopExit, AftTerm->getIterator());538AftTerm->eraseFromParent();539} else {540AftTerm->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);541assert(AftTerm->getSuccessor(ContinueOnTrue) == LoopExit &&542"Expecting the ContinueOnTrue successor of AftTerm to be LoopExit");543}544AftBlocksFirst[0]->replacePhiUsesWith(SubLoopBlocksLast[0],545SubLoopBlocksLast.back());546547for (unsigned It = 1; It != Count; It++) {548// Replace the conditional branch of the previous iteration subloop with an549// unconditional one to this one550BranchInst *AftTerm =551cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());552BranchInst::Create(AftBlocksFirst[It], AftTerm->getIterator());553AftTerm->eraseFromParent();554555AftBlocksFirst[It]->replacePhiUsesWith(SubLoopBlocksLast[It],556SubLoopBlocksLast.back());557movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]);558}559560DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);561// Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the562// new ones required.563if (Count != 1) {564SmallVector<DominatorTree::UpdateType, 4> DTUpdates;565DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0],566SubLoopBlocksFirst[0]);567DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete,568SubLoopBlocksLast[0], AftBlocksFirst[0]);569570DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,571ForeBlocksLast.back(), SubLoopBlocksFirst[0]);572DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,573SubLoopBlocksLast.back(), AftBlocksFirst[0]);574DTU.applyUpdatesPermissive(DTUpdates);575}576577// Merge adjacent basic blocks, if possible.578SmallPtrSet<BasicBlock *, 16> MergeBlocks;579MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());580MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());581MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());582583MergeBlockSuccessorsIntoGivenBlocks(MergeBlocks, L, &DTU, LI);584585// Apply updates to the DomTree.586DT = &DTU.getDomTree();587588// At this point, the code is well formed. We now do a quick sweep over the589// inserted code, doing constant propagation and dead code elimination as we590// go.591simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC, TTI);592simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC,593TTI);594595NumCompletelyUnrolledAndJammed += CompletelyUnroll;596++NumUnrolledAndJammed;597598// Update LoopInfo if the loop is completely removed.599if (CompletelyUnroll)600LI->erase(L);601602#ifndef NDEBUG603// We shouldn't have done anything to break loop simplify form or LCSSA.604Loop *OutestLoop = SubLoop->getParentLoop()605? SubLoop->getParentLoop()->getParentLoop()606? SubLoop->getParentLoop()->getParentLoop()607: SubLoop->getParentLoop()608: SubLoop;609assert(DT->verify());610LI->verify(*DT);611assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));612if (!CompletelyUnroll)613assert(L->isLoopSimplifyForm());614assert(SubLoop->isLoopSimplifyForm());615SE->verify();616#endif617618return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled619: LoopUnrollResult::PartiallyUnrolled;620}621622static bool getLoadsAndStores(BasicBlockSet &Blocks,623SmallVector<Instruction *, 4> &MemInstr) {624// Scan the BBs and collect legal loads and stores.625// Returns false if non-simple loads/stores are found.626for (BasicBlock *BB : Blocks) {627for (Instruction &I : *BB) {628if (auto *Ld = dyn_cast<LoadInst>(&I)) {629if (!Ld->isSimple())630return false;631MemInstr.push_back(&I);632} else if (auto *St = dyn_cast<StoreInst>(&I)) {633if (!St->isSimple())634return false;635MemInstr.push_back(&I);636} else if (I.mayReadOrWriteMemory()) {637return false;638}639}640}641return true;642}643644static bool preservesForwardDependence(Instruction *Src, Instruction *Dst,645unsigned UnrollLevel, unsigned JamLevel,646bool Sequentialized, Dependence *D) {647// UnrollLevel might carry the dependency Src --> Dst648// Does a different loop after unrolling?649for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel;650++CurLoopDepth) {651auto JammedDir = D->getDirection(CurLoopDepth);652if (JammedDir == Dependence::DVEntry::LT)653return true;654655if (JammedDir & Dependence::DVEntry::GT)656return false;657}658659return true;660}661662static bool preservesBackwardDependence(Instruction *Src, Instruction *Dst,663unsigned UnrollLevel, unsigned JamLevel,664bool Sequentialized, Dependence *D) {665// UnrollLevel might carry the dependency Dst --> Src666for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel;667++CurLoopDepth) {668auto JammedDir = D->getDirection(CurLoopDepth);669if (JammedDir == Dependence::DVEntry::GT)670return true;671672if (JammedDir & Dependence::DVEntry::LT)673return false;674}675676// Backward dependencies are only preserved if not interleaved.677return Sequentialized;678}679680// Check whether it is semantically safe Src and Dst considering any potential681// dependency between them.682//683// @param UnrollLevel The level of the loop being unrolled684// @param JamLevel The level of the loop being jammed; if Src and Dst are on685// different levels, the outermost common loop counts as jammed level686//687// @return true if is safe and false if there is a dependency violation.688static bool checkDependency(Instruction *Src, Instruction *Dst,689unsigned UnrollLevel, unsigned JamLevel,690bool Sequentialized, DependenceInfo &DI) {691assert(UnrollLevel <= JamLevel &&692"Expecting JamLevel to be at least UnrollLevel");693694if (Src == Dst)695return true;696// Ignore Input dependencies.697if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))698return true;699700// Check whether unroll-and-jam may violate a dependency.701// By construction, every dependency will be lexicographically non-negative702// (if it was, it would violate the current execution order), such as703// (0,0,>,*,*)704// Unroll-and-jam changes the GT execution of two executions to the same705// iteration of the chosen unroll level. That is, a GT dependence becomes a GE706// dependence (or EQ, if we fully unrolled the loop) at the loop's position:707// (0,0,>=,*,*)708// Now, the dependency is not necessarily non-negative anymore, i.e.709// unroll-and-jam may violate correctness.710std::unique_ptr<Dependence> D = DI.depends(Src, Dst, true);711if (!D)712return true;713assert(D->isOrdered() && "Expected an output, flow or anti dep.");714715if (D->isConfused()) {716LLVM_DEBUG(dbgs() << " Confused dependency between:\n"717<< " " << *Src << "\n"718<< " " << *Dst << "\n");719return false;720}721722// If outer levels (levels enclosing the loop being unroll-and-jammed) have a723// non-equal direction, then the locations accessed in the inner levels cannot724// overlap in memory. We assumes the indexes never overlap into neighboring725// dimensions.726for (unsigned CurLoopDepth = 1; CurLoopDepth < UnrollLevel; ++CurLoopDepth)727if (!(D->getDirection(CurLoopDepth) & Dependence::DVEntry::EQ))728return true;729730auto UnrollDirection = D->getDirection(UnrollLevel);731732// If the distance carried by the unrolled loop is 0, then after unrolling733// that distance will become non-zero resulting in non-overlapping accesses in734// the inner loops.735if (UnrollDirection == Dependence::DVEntry::EQ)736return true;737738if (UnrollDirection & Dependence::DVEntry::LT &&739!preservesForwardDependence(Src, Dst, UnrollLevel, JamLevel,740Sequentialized, D.get()))741return false;742743if (UnrollDirection & Dependence::DVEntry::GT &&744!preservesBackwardDependence(Src, Dst, UnrollLevel, JamLevel,745Sequentialized, D.get()))746return false;747748return true;749}750751static bool752checkDependencies(Loop &Root, const BasicBlockSet &SubLoopBlocks,753const DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap,754const DenseMap<Loop *, BasicBlockSet> &AftBlocksMap,755DependenceInfo &DI, LoopInfo &LI) {756SmallVector<BasicBlockSet, 8> AllBlocks;757for (Loop *L : Root.getLoopsInPreorder())758if (ForeBlocksMap.contains(L))759AllBlocks.push_back(ForeBlocksMap.lookup(L));760AllBlocks.push_back(SubLoopBlocks);761for (Loop *L : Root.getLoopsInPreorder())762if (AftBlocksMap.contains(L))763AllBlocks.push_back(AftBlocksMap.lookup(L));764765unsigned LoopDepth = Root.getLoopDepth();766SmallVector<Instruction *, 4> EarlierLoadsAndStores;767SmallVector<Instruction *, 4> CurrentLoadsAndStores;768for (BasicBlockSet &Blocks : AllBlocks) {769CurrentLoadsAndStores.clear();770if (!getLoadsAndStores(Blocks, CurrentLoadsAndStores))771return false;772773Loop *CurLoop = LI.getLoopFor((*Blocks.begin())->front().getParent());774unsigned CurLoopDepth = CurLoop->getLoopDepth();775776for (auto *Earlier : EarlierLoadsAndStores) {777Loop *EarlierLoop = LI.getLoopFor(Earlier->getParent());778unsigned EarlierDepth = EarlierLoop->getLoopDepth();779unsigned CommonLoopDepth = std::min(EarlierDepth, CurLoopDepth);780for (auto *Later : CurrentLoadsAndStores) {781if (!checkDependency(Earlier, Later, LoopDepth, CommonLoopDepth, false,782DI))783return false;784}785}786787size_t NumInsts = CurrentLoadsAndStores.size();788for (size_t I = 0; I < NumInsts; ++I) {789for (size_t J = I; J < NumInsts; ++J) {790if (!checkDependency(CurrentLoadsAndStores[I], CurrentLoadsAndStores[J],791LoopDepth, CurLoopDepth, true, DI))792return false;793}794}795796EarlierLoadsAndStores.append(CurrentLoadsAndStores.begin(),797CurrentLoadsAndStores.end());798}799return true;800}801802static bool isEligibleLoopForm(const Loop &Root) {803// Root must have a child.804if (Root.getSubLoops().size() != 1)805return false;806807const Loop *L = &Root;808do {809// All loops in Root need to be in simplify and rotated form.810if (!L->isLoopSimplifyForm())811return false;812813if (!L->isRotatedForm())814return false;815816if (L->getHeader()->hasAddressTaken()) {817LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n");818return false;819}820821unsigned SubLoopsSize = L->getSubLoops().size();822if (SubLoopsSize == 0)823return true;824825// Only one child is allowed.826if (SubLoopsSize != 1)827return false;828829// Only loops with a single exit block can be unrolled and jammed.830// The function getExitBlock() is used for this check, rather than831// getUniqueExitBlock() to ensure loops with mulitple exit edges are832// disallowed.833if (!L->getExitBlock()) {834LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; only loops with single exit "835"blocks can be unrolled and jammed.\n");836return false;837}838839// Only loops with a single exiting block can be unrolled and jammed.840if (!L->getExitingBlock()) {841LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; only loops with single "842"exiting blocks can be unrolled and jammed.\n");843return false;844}845846L = L->getSubLoops()[0];847} while (L);848849return true;850}851852static Loop *getInnerMostLoop(Loop *L) {853while (!L->getSubLoops().empty())854L = L->getSubLoops()[0];855return L;856}857858bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,859DependenceInfo &DI, LoopInfo &LI) {860if (!isEligibleLoopForm(*L)) {861LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Ineligible loop form\n");862return false;863}864865/* We currently handle outer loops like this:866|867ForeFirst <------\ }868Blocks | } ForeBlocks of L869ForeLast | }870| |871... |872| |873ForeFirst <----\ | }874Blocks | | } ForeBlocks of a inner loop of L875ForeLast | | }876| | |877JamLoopFirst <\ | | }878Blocks | | | } JamLoopBlocks of the innermost loop879JamLoopLast -/ | | }880| | |881AftFirst | | }882Blocks | | } AftBlocks of a inner loop of L883AftLast ------/ | }884| |885... |886| |887AftFirst | }888Blocks | } AftBlocks of L889AftLast --------/ }890|891892There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks893and AftBlocks, providing that there is one edge from Fores to SubLoops,894one edge from SubLoops to Afts and a single outer loop exit (from Afts).895In practice we currently limit Aft blocks to a single block, and limit896things further in the profitablility checks of the unroll and jam pass.897898Because of the way we rearrange basic blocks, we also require that899the Fore blocks of L on all unrolled iterations are safe to move before the900blocks of the direct child of L of all iterations. So we require that the901phi node looping operands of ForeHeader can be moved to at least the end of902ForeEnd, so that we can arrange cloned Fore Blocks before the subloop and903match up Phi's correctly.904905i.e. The old order of blocks used to be906(F1)1 (F2)1 J1_1 J1_2 (A2)1 (A1)1 (F1)2 (F2)2 J2_1 J2_2 (A2)2 (A1)2.907It needs to be safe to transform this to908(F1)1 (F1)2 (F2)1 (F2)2 J1_1 J1_2 J2_1 J2_2 (A2)1 (A2)2 (A1)1 (A1)2.909910There are then a number of checks along the lines of no calls, no911exceptions, inner loop IV is consistent, etc. Note that for loops requiring912runtime unrolling, UnrollRuntimeLoopRemainder can also fail in913UnrollAndJamLoop if the trip count cannot be easily calculated.914*/915916// Split blocks into Fore/SubLoop/Aft based on dominators917Loop *JamLoop = getInnerMostLoop(L);918BasicBlockSet SubLoopBlocks;919DenseMap<Loop *, BasicBlockSet> ForeBlocksMap;920DenseMap<Loop *, BasicBlockSet> AftBlocksMap;921if (!partitionOuterLoopBlocks(*L, *JamLoop, SubLoopBlocks, ForeBlocksMap,922AftBlocksMap, DT)) {923LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Incompatible loop layout\n");924return false;925}926927// Aft blocks may need to move instructions to fore blocks, which becomes more928// difficult if there are multiple (potentially conditionally executed)929// blocks. For now we just exclude loops with multiple aft blocks.930if (AftBlocksMap[L].size() != 1) {931LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Can't currently handle "932"multiple blocks after the loop\n");933return false;934}935936// Check inner loop backedge count is consistent on all iterations of the937// outer loop938if (any_of(L->getLoopsInPreorder(), [&SE](Loop *SubLoop) {939return !hasIterationCountInvariantInParent(SubLoop, SE);940})) {941LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Inner loop iteration count is "942"not consistent on each iteration\n");943return false;944}945946// Check the loop safety info for exceptions.947SimpleLoopSafetyInfo LSI;948LSI.computeLoopSafetyInfo(L);949if (LSI.anyBlockMayThrow()) {950LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Something may throw\n");951return false;952}953954// We've ruled out the easy stuff and now need to check that there are no955// interdependencies which may prevent us from moving the:956// ForeBlocks before Subloop and AftBlocks.957// Subloop before AftBlocks.958// ForeBlock phi operands before the subloop959960// Make sure we can move all instructions we need to before the subloop961BasicBlock *Header = L->getHeader();962BasicBlock *Latch = L->getLoopLatch();963BasicBlockSet AftBlocks = AftBlocksMap[L];964Loop *SubLoop = L->getSubLoops()[0];965if (!processHeaderPhiOperands(966Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) {967if (SubLoop->contains(I->getParent()))968return false;969if (AftBlocks.count(I->getParent())) {970// If we hit a phi node in afts we know we are done (probably971// LCSSA)972if (isa<PHINode>(I))973return false;974// Can't move instructions with side effects or memory975// reads/writes976if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory())977return false;978}979// Keep going980return true;981})) {982LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; can't move required "983"instructions after subloop to before it\n");984return false;985}986987// Check for memory dependencies which prohibit the unrolling we are doing.988// Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check989// there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.990if (!checkDependencies(*L, SubLoopBlocks, ForeBlocksMap, AftBlocksMap, DI,991LI)) {992LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; failed dependency check\n");993return false;994}995996return true;997}9989991000