Path: blob/main/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
35271 views
//===-- UnrollLoopRuntime.cpp - Runtime Loop unrolling utilities ----------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7//8// This file implements some loop unrolling utilities for loops with run-time9// trip counts. See LoopUnroll.cpp for unrolling loops with compile-time10// trip counts.11//12// The functions in this file are used to generate extra code when the13// run-time trip count modulo the unroll factor is not 0. When this is the14// case, we need to generate code to execute these 'left over' iterations.15//16// The current strategy generates an if-then-else sequence prior to the17// unrolled loop to execute the 'left over' iterations before or after the18// unrolled loop.19//20//===----------------------------------------------------------------------===//2122#include "llvm/ADT/Statistic.h"23#include "llvm/Analysis/DomTreeUpdater.h"24#include "llvm/Analysis/InstructionSimplify.h"25#include "llvm/Analysis/LoopIterator.h"26#include "llvm/Analysis/ScalarEvolution.h"27#include "llvm/Analysis/ValueTracking.h"28#include "llvm/IR/BasicBlock.h"29#include "llvm/IR/Dominators.h"30#include "llvm/IR/MDBuilder.h"31#include "llvm/IR/Module.h"32#include "llvm/IR/ProfDataUtils.h"33#include "llvm/Support/CommandLine.h"34#include "llvm/Support/Debug.h"35#include "llvm/Support/raw_ostream.h"36#include "llvm/Transforms/Utils/BasicBlockUtils.h"37#include "llvm/Transforms/Utils/Cloning.h"38#include "llvm/Transforms/Utils/Local.h"39#include "llvm/Transforms/Utils/LoopUtils.h"40#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"41#include "llvm/Transforms/Utils/UnrollLoop.h"42#include <algorithm>4344using namespace llvm;4546#define DEBUG_TYPE "loop-unroll"4748STATISTIC(NumRuntimeUnrolled,49"Number of loops unrolled with run-time trip counts");50static cl::opt<bool> UnrollRuntimeMultiExit(51"unroll-runtime-multi-exit", cl::init(false), cl::Hidden,52cl::desc("Allow runtime unrolling for loops with multiple exits, when "53"epilog is generated"));54static cl::opt<bool> UnrollRuntimeOtherExitPredictable(55"unroll-runtime-other-exit-predictable", cl::init(false), cl::Hidden,56cl::desc("Assume the non latch exit block to be predictable"));5758// Probability that the loop trip count is so small that after the prolog59// we do not enter the unrolled loop at all.60// It is unlikely that the loop trip count is smaller than the unroll factor;61// other than that, the choice of constant is not tuned yet.62static const uint32_t UnrolledLoopHeaderWeights[] = {1, 127};63// Probability that the loop trip count is so small that we skip the unrolled64// loop completely and immediately enter the epilogue loop.65// It is unlikely that the loop trip count is smaller than the unroll factor;66// other than that, the choice of constant is not tuned yet.67static const uint32_t EpilogHeaderWeights[] = {1, 127};6869/// Connect the unrolling prolog code to the original loop.70/// The unrolling prolog code contains code to execute the71/// 'extra' iterations if the run-time trip count modulo the72/// unroll count is non-zero.73///74/// This function performs the following:75/// - Create PHI nodes at prolog end block to combine values76/// that exit the prolog code and jump around the prolog.77/// - Add a PHI operand to a PHI node at the loop exit block78/// for values that exit the prolog and go around the loop.79/// - Branch around the original loop if the trip count is less80/// than the unroll factor.81///82static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,83BasicBlock *PrologExit,84BasicBlock *OriginalLoopLatchExit,85BasicBlock *PreHeader, BasicBlock *NewPreHeader,86ValueToValueMapTy &VMap, DominatorTree *DT,87LoopInfo *LI, bool PreserveLCSSA,88ScalarEvolution &SE) {89// Loop structure should be the following:90// Preheader91// PrologHeader92// ...93// PrologLatch94// PrologExit95// NewPreheader96// Header97// ...98// Latch99// LatchExit100BasicBlock *Latch = L->getLoopLatch();101assert(Latch && "Loop must have a latch");102BasicBlock *PrologLatch = cast<BasicBlock>(VMap[Latch]);103104// Create a PHI node for each outgoing value from the original loop105// (which means it is an outgoing value from the prolog code too).106// The new PHI node is inserted in the prolog end basic block.107// The new PHI node value is added as an operand of a PHI node in either108// the loop header or the loop exit block.109for (BasicBlock *Succ : successors(Latch)) {110for (PHINode &PN : Succ->phis()) {111// Add a new PHI node to the prolog end block and add the112// appropriate incoming values.113// TODO: This code assumes that the PrologExit (or the LatchExit block for114// prolog loop) contains only one predecessor from the loop, i.e. the115// PrologLatch. When supporting multiple-exiting block loops, we can have116// two or more blocks that have the LatchExit as the target in the117// original loop.118PHINode *NewPN = PHINode::Create(PN.getType(), 2, PN.getName() + ".unr");119NewPN->insertBefore(PrologExit->getFirstNonPHIIt());120// Adding a value to the new PHI node from the original loop preheader.121// This is the value that skips all the prolog code.122if (L->contains(&PN)) {123// Succ is loop header.124NewPN->addIncoming(PN.getIncomingValueForBlock(NewPreHeader),125PreHeader);126} else {127// Succ is LatchExit.128NewPN->addIncoming(PoisonValue::get(PN.getType()), PreHeader);129}130131Value *V = PN.getIncomingValueForBlock(Latch);132if (Instruction *I = dyn_cast<Instruction>(V)) {133if (L->contains(I)) {134V = VMap.lookup(I);135}136}137// Adding a value to the new PHI node from the last prolog block138// that was created.139NewPN->addIncoming(V, PrologLatch);140141// Update the existing PHI node operand with the value from the142// new PHI node. How this is done depends on if the existing143// PHI node is in the original loop block, or the exit block.144if (L->contains(&PN))145PN.setIncomingValueForBlock(NewPreHeader, NewPN);146else147PN.addIncoming(NewPN, PrologExit);148SE.forgetValue(&PN);149}150}151152// Make sure that created prolog loop is in simplified form153SmallVector<BasicBlock *, 4> PrologExitPreds;154Loop *PrologLoop = LI->getLoopFor(PrologLatch);155if (PrologLoop) {156for (BasicBlock *PredBB : predecessors(PrologExit))157if (PrologLoop->contains(PredBB))158PrologExitPreds.push_back(PredBB);159160SplitBlockPredecessors(PrologExit, PrologExitPreds, ".unr-lcssa", DT, LI,161nullptr, PreserveLCSSA);162}163164// Create a branch around the original loop, which is taken if there are no165// iterations remaining to be executed after running the prologue.166Instruction *InsertPt = PrologExit->getTerminator();167IRBuilder<> B(InsertPt);168169assert(Count != 0 && "nonsensical Count!");170171// If BECount <u (Count - 1) then (BECount + 1) % Count == (BECount + 1)172// This means %xtraiter is (BECount + 1) and all of the iterations of this173// loop were executed by the prologue. Note that if BECount <u (Count - 1)174// then (BECount + 1) cannot unsigned-overflow.175Value *BrLoopExit =176B.CreateICmpULT(BECount, ConstantInt::get(BECount->getType(), Count - 1));177// Split the exit to maintain loop canonicalization guarantees178SmallVector<BasicBlock *, 4> Preds(predecessors(OriginalLoopLatchExit));179SplitBlockPredecessors(OriginalLoopLatchExit, Preds, ".unr-lcssa", DT, LI,180nullptr, PreserveLCSSA);181// Add the branch to the exit block (around the unrolled loop)182MDNode *BranchWeights = nullptr;183if (hasBranchWeightMD(*Latch->getTerminator())) {184// Assume loop is nearly always entered.185MDBuilder MDB(B.getContext());186BranchWeights = MDB.createBranchWeights(UnrolledLoopHeaderWeights);187}188B.CreateCondBr(BrLoopExit, OriginalLoopLatchExit, NewPreHeader,189BranchWeights);190InsertPt->eraseFromParent();191if (DT) {192auto *NewDom = DT->findNearestCommonDominator(OriginalLoopLatchExit,193PrologExit);194DT->changeImmediateDominator(OriginalLoopLatchExit, NewDom);195}196}197198/// Connect the unrolling epilog code to the original loop.199/// The unrolling epilog code contains code to execute the200/// 'extra' iterations if the run-time trip count modulo the201/// unroll count is non-zero.202///203/// This function performs the following:204/// - Update PHI nodes at the unrolling loop exit and epilog loop exit205/// - Create PHI nodes at the unrolling loop exit to combine206/// values that exit the unrolling loop code and jump around it.207/// - Update PHI operands in the epilog loop by the new PHI nodes208/// - Branch around the epilog loop if extra iters (ModVal) is zero.209///210static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,211BasicBlock *Exit, BasicBlock *PreHeader,212BasicBlock *EpilogPreHeader, BasicBlock *NewPreHeader,213ValueToValueMapTy &VMap, DominatorTree *DT,214LoopInfo *LI, bool PreserveLCSSA, ScalarEvolution &SE,215unsigned Count) {216BasicBlock *Latch = L->getLoopLatch();217assert(Latch && "Loop must have a latch");218BasicBlock *EpilogLatch = cast<BasicBlock>(VMap[Latch]);219220// Loop structure should be the following:221//222// PreHeader223// NewPreHeader224// Header225// ...226// Latch227// NewExit (PN)228// EpilogPreHeader229// EpilogHeader230// ...231// EpilogLatch232// Exit (EpilogPN)233234// Update PHI nodes at NewExit and Exit.235for (PHINode &PN : NewExit->phis()) {236// PN should be used in another PHI located in Exit block as237// Exit was split by SplitBlockPredecessors into Exit and NewExit238// Basically it should look like:239// NewExit:240// PN = PHI [I, Latch]241// ...242// Exit:243// EpilogPN = PHI [PN, EpilogPreHeader], [X, Exit2], [Y, Exit2.epil]244//245// Exits from non-latch blocks point to the original exit block and the246// epilogue edges have already been added.247//248// There is EpilogPreHeader incoming block instead of NewExit as249// NewExit was spilt 1 more time to get EpilogPreHeader.250assert(PN.hasOneUse() && "The phi should have 1 use");251PHINode *EpilogPN = cast<PHINode>(PN.use_begin()->getUser());252assert(EpilogPN->getParent() == Exit && "EpilogPN should be in Exit block");253254// Add incoming PreHeader from branch around the Loop255PN.addIncoming(PoisonValue::get(PN.getType()), PreHeader);256SE.forgetValue(&PN);257258Value *V = PN.getIncomingValueForBlock(Latch);259Instruction *I = dyn_cast<Instruction>(V);260if (I && L->contains(I))261// If value comes from an instruction in the loop add VMap value.262V = VMap.lookup(I);263// For the instruction out of the loop, constant or undefined value264// insert value itself.265EpilogPN->addIncoming(V, EpilogLatch);266267assert(EpilogPN->getBasicBlockIndex(EpilogPreHeader) >= 0 &&268"EpilogPN should have EpilogPreHeader incoming block");269// Change EpilogPreHeader incoming block to NewExit.270EpilogPN->setIncomingBlock(EpilogPN->getBasicBlockIndex(EpilogPreHeader),271NewExit);272// Now PHIs should look like:273// NewExit:274// PN = PHI [I, Latch], [poison, PreHeader]275// ...276// Exit:277// EpilogPN = PHI [PN, NewExit], [VMap[I], EpilogLatch]278}279280// Create PHI nodes at NewExit (from the unrolling loop Latch and PreHeader).281// Update corresponding PHI nodes in epilog loop.282for (BasicBlock *Succ : successors(Latch)) {283// Skip this as we already updated phis in exit blocks.284if (!L->contains(Succ))285continue;286for (PHINode &PN : Succ->phis()) {287// Add new PHI nodes to the loop exit block and update epilog288// PHIs with the new PHI values.289PHINode *NewPN = PHINode::Create(PN.getType(), 2, PN.getName() + ".unr");290NewPN->insertBefore(NewExit->getFirstNonPHIIt());291// Adding a value to the new PHI node from the unrolling loop preheader.292NewPN->addIncoming(PN.getIncomingValueForBlock(NewPreHeader), PreHeader);293// Adding a value to the new PHI node from the unrolling loop latch.294NewPN->addIncoming(PN.getIncomingValueForBlock(Latch), Latch);295296// Update the existing PHI node operand with the value from the new PHI297// node. Corresponding instruction in epilog loop should be PHI.298PHINode *VPN = cast<PHINode>(VMap[&PN]);299VPN->setIncomingValueForBlock(EpilogPreHeader, NewPN);300}301}302303Instruction *InsertPt = NewExit->getTerminator();304IRBuilder<> B(InsertPt);305Value *BrLoopExit = B.CreateIsNotNull(ModVal, "lcmp.mod");306assert(Exit && "Loop must have a single exit block only");307// Split the epilogue exit to maintain loop canonicalization guarantees308SmallVector<BasicBlock*, 4> Preds(predecessors(Exit));309SplitBlockPredecessors(Exit, Preds, ".epilog-lcssa", DT, LI, nullptr,310PreserveLCSSA);311// Add the branch to the exit block (around the unrolling loop)312MDNode *BranchWeights = nullptr;313if (hasBranchWeightMD(*Latch->getTerminator())) {314// Assume equal distribution in interval [0, Count).315MDBuilder MDB(B.getContext());316BranchWeights = MDB.createBranchWeights(1, Count - 1);317}318B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit, BranchWeights);319InsertPt->eraseFromParent();320if (DT) {321auto *NewDom = DT->findNearestCommonDominator(Exit, NewExit);322DT->changeImmediateDominator(Exit, NewDom);323}324325// Split the main loop exit to maintain canonicalization guarantees.326SmallVector<BasicBlock*, 4> NewExitPreds{Latch};327SplitBlockPredecessors(NewExit, NewExitPreds, ".loopexit", DT, LI, nullptr,328PreserveLCSSA);329}330331/// Create a clone of the blocks in a loop and connect them together. A new332/// loop will be created including all cloned blocks, and the iterator of the333/// new loop switched to count NewIter down to 0.334/// The cloned blocks should be inserted between InsertTop and InsertBot.335/// InsertTop should be new preheader, InsertBot new loop exit.336/// Returns the new cloned loop that is created.337static Loop *338CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder,339const bool UnrollRemainder,340BasicBlock *InsertTop,341BasicBlock *InsertBot, BasicBlock *Preheader,342std::vector<BasicBlock *> &NewBlocks,343LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap,344DominatorTree *DT, LoopInfo *LI, unsigned Count) {345StringRef suffix = UseEpilogRemainder ? "epil" : "prol";346BasicBlock *Header = L->getHeader();347BasicBlock *Latch = L->getLoopLatch();348Function *F = Header->getParent();349LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO();350LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO();351Loop *ParentLoop = L->getParentLoop();352NewLoopsMap NewLoops;353NewLoops[ParentLoop] = ParentLoop;354355// For each block in the original loop, create a new copy,356// and update the value map with the newly created values.357for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {358BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, "." + suffix, F);359NewBlocks.push_back(NewBB);360361addClonedBlockToLoopInfo(*BB, NewBB, LI, NewLoops);362363VMap[*BB] = NewBB;364if (Header == *BB) {365// For the first block, add a CFG connection to this newly366// created block.367InsertTop->getTerminator()->setSuccessor(0, NewBB);368}369370if (DT) {371if (Header == *BB) {372// The header is dominated by the preheader.373DT->addNewBlock(NewBB, InsertTop);374} else {375// Copy information from original loop to unrolled loop.376BasicBlock *IDomBB = DT->getNode(*BB)->getIDom()->getBlock();377DT->addNewBlock(NewBB, cast<BasicBlock>(VMap[IDomBB]));378}379}380381if (Latch == *BB) {382// For the last block, create a loop back to cloned head.383VMap.erase((*BB)->getTerminator());384// Use an incrementing IV. Pre-incr/post-incr is backedge/trip count.385// Subtle: NewIter can be 0 if we wrapped when computing the trip count,386// thus we must compare the post-increment (wrapping) value.387BasicBlock *FirstLoopBB = cast<BasicBlock>(VMap[Header]);388BranchInst *LatchBR = cast<BranchInst>(NewBB->getTerminator());389IRBuilder<> Builder(LatchBR);390PHINode *NewIdx =391PHINode::Create(NewIter->getType(), 2, suffix + ".iter");392NewIdx->insertBefore(FirstLoopBB->getFirstNonPHIIt());393auto *Zero = ConstantInt::get(NewIdx->getType(), 0);394auto *One = ConstantInt::get(NewIdx->getType(), 1);395Value *IdxNext =396Builder.CreateAdd(NewIdx, One, NewIdx->getName() + ".next");397Value *IdxCmp = Builder.CreateICmpNE(IdxNext, NewIter, NewIdx->getName() + ".cmp");398MDNode *BranchWeights = nullptr;399if (hasBranchWeightMD(*LatchBR)) {400uint32_t ExitWeight;401uint32_t BackEdgeWeight;402if (Count >= 3) {403// Note: We do not enter this loop for zero-remainders. The check404// is at the end of the loop. We assume equal distribution between405// possible remainders in [1, Count).406ExitWeight = 1;407BackEdgeWeight = (Count - 2) / 2;408} else {409// Unnecessary backedge, should never be taken. The conditional410// jump should be optimized away later.411ExitWeight = 1;412BackEdgeWeight = 0;413}414MDBuilder MDB(Builder.getContext());415BranchWeights = MDB.createBranchWeights(BackEdgeWeight, ExitWeight);416}417Builder.CreateCondBr(IdxCmp, FirstLoopBB, InsertBot, BranchWeights);418NewIdx->addIncoming(Zero, InsertTop);419NewIdx->addIncoming(IdxNext, NewBB);420LatchBR->eraseFromParent();421}422}423424// Change the incoming values to the ones defined in the preheader or425// cloned loop.426for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {427PHINode *NewPHI = cast<PHINode>(VMap[&*I]);428unsigned idx = NewPHI->getBasicBlockIndex(Preheader);429NewPHI->setIncomingBlock(idx, InsertTop);430BasicBlock *NewLatch = cast<BasicBlock>(VMap[Latch]);431idx = NewPHI->getBasicBlockIndex(Latch);432Value *InVal = NewPHI->getIncomingValue(idx);433NewPHI->setIncomingBlock(idx, NewLatch);434if (Value *V = VMap.lookup(InVal))435NewPHI->setIncomingValue(idx, V);436}437438Loop *NewLoop = NewLoops[L];439assert(NewLoop && "L should have been cloned");440MDNode *LoopID = NewLoop->getLoopID();441442// Only add loop metadata if the loop is not going to be completely443// unrolled.444if (UnrollRemainder)445return NewLoop;446447std::optional<MDNode *> NewLoopID = makeFollowupLoopID(448LoopID, {LLVMLoopUnrollFollowupAll, LLVMLoopUnrollFollowupRemainder});449if (NewLoopID) {450NewLoop->setLoopID(*NewLoopID);451452// Do not setLoopAlreadyUnrolled if loop attributes have been defined453// explicitly.454return NewLoop;455}456457// Add unroll disable metadata to disable future unrolling for this loop.458NewLoop->setLoopAlreadyUnrolled();459return NewLoop;460}461462/// Returns true if we can profitably unroll the multi-exit loop L. Currently,463/// we return true only if UnrollRuntimeMultiExit is set to true.464static bool canProfitablyUnrollMultiExitLoop(465Loop *L, SmallVectorImpl<BasicBlock *> &OtherExits, BasicBlock *LatchExit,466bool UseEpilogRemainder) {467468// Priority goes to UnrollRuntimeMultiExit if it's supplied.469if (UnrollRuntimeMultiExit.getNumOccurrences())470return UnrollRuntimeMultiExit;471472// The main pain point with multi-exit loop unrolling is that once unrolled,473// we will not be able to merge all blocks into a straight line code.474// There are branches within the unrolled loop that go to the OtherExits.475// The second point is the increase in code size, but this is true476// irrespective of multiple exits.477478// Note: Both the heuristics below are coarse grained. We are essentially479// enabling unrolling of loops that have a single side exit other than the480// normal LatchExit (i.e. exiting into a deoptimize block).481// The heuristics considered are:482// 1. low number of branches in the unrolled version.483// 2. high predictability of these extra branches.484// We avoid unrolling loops that have more than two exiting blocks. This485// limits the total number of branches in the unrolled loop to be atmost486// the unroll factor (since one of the exiting blocks is the latch block).487SmallVector<BasicBlock*, 4> ExitingBlocks;488L->getExitingBlocks(ExitingBlocks);489if (ExitingBlocks.size() > 2)490return false;491492// Allow unrolling of loops with no non latch exit blocks.493if (OtherExits.size() == 0)494return true;495496// The second heuristic is that L has one exit other than the latchexit and497// that exit is a deoptimize block. We know that deoptimize blocks are rarely498// taken, which also implies the branch leading to the deoptimize block is499// highly predictable. When UnrollRuntimeOtherExitPredictable is specified, we500// assume the other exit branch is predictable even if it has no deoptimize501// call.502return (OtherExits.size() == 1 &&503(UnrollRuntimeOtherExitPredictable ||504OtherExits[0]->getPostdominatingDeoptimizeCall()));505// TODO: These can be fine-tuned further to consider code size or deopt states506// that are captured by the deoptimize exit block.507// Also, we can extend this to support more cases, if we actually508// know of kinds of multiexit loops that would benefit from unrolling.509}510511/// Calculate ModVal = (BECount + 1) % Count on the abstract integer domain512/// accounting for the possibility of unsigned overflow in the 2s complement513/// domain. Preconditions:514/// 1) TripCount = BECount + 1 (allowing overflow)515/// 2) Log2(Count) <= BitWidth(BECount)516static Value *CreateTripRemainder(IRBuilder<> &B, Value *BECount,517Value *TripCount, unsigned Count) {518// Note that TripCount is BECount + 1.519if (isPowerOf2_32(Count))520// If the expression is zero, then either:521// 1. There are no iterations to be run in the prolog/epilog loop.522// OR523// 2. The addition computing TripCount overflowed.524//525// If (2) is true, we know that TripCount really is (1 << BEWidth) and so526// the number of iterations that remain to be run in the original loop is a527// multiple Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth (a528// precondition of this method).529return B.CreateAnd(TripCount, Count - 1, "xtraiter");530531// As (BECount + 1) can potentially unsigned overflow we count532// (BECount % Count) + 1 which is overflow safe as BECount % Count < Count.533Constant *CountC = ConstantInt::get(BECount->getType(), Count);534Value *ModValTmp = B.CreateURem(BECount, CountC);535Value *ModValAdd = B.CreateAdd(ModValTmp,536ConstantInt::get(ModValTmp->getType(), 1));537// At that point (BECount % Count) + 1 could be equal to Count.538// To handle this case we need to take mod by Count one more time.539return B.CreateURem(ModValAdd, CountC, "xtraiter");540}541542543/// Insert code in the prolog/epilog code when unrolling a loop with a544/// run-time trip-count.545///546/// This method assumes that the loop unroll factor is total number547/// of loop bodies in the loop after unrolling. (Some folks refer548/// to the unroll factor as the number of *extra* copies added).549/// We assume also that the loop unroll factor is a power-of-two. So, after550/// unrolling the loop, the number of loop bodies executed is 2,551/// 4, 8, etc. Note - LLVM converts the if-then-sequence to a switch552/// instruction in SimplifyCFG.cpp. Then, the backend decides how code for553/// the switch instruction is generated.554///555/// ***Prolog case***556/// extraiters = tripcount % loopfactor557/// if (extraiters == 0) jump Loop:558/// else jump Prol:559/// Prol: LoopBody;560/// extraiters -= 1 // Omitted if unroll factor is 2.561/// if (extraiters != 0) jump Prol: // Omitted if unroll factor is 2.562/// if (tripcount < loopfactor) jump End:563/// Loop:564/// ...565/// End:566///567/// ***Epilog case***568/// extraiters = tripcount % loopfactor569/// if (tripcount < loopfactor) jump LoopExit:570/// unroll_iters = tripcount - extraiters571/// Loop: LoopBody; (executes unroll_iter times);572/// unroll_iter -= 1573/// if (unroll_iter != 0) jump Loop:574/// LoopExit:575/// if (extraiters == 0) jump EpilExit:576/// Epil: LoopBody; (executes extraiters times)577/// extraiters -= 1 // Omitted if unroll factor is 2.578/// if (extraiters != 0) jump Epil: // Omitted if unroll factor is 2.579/// EpilExit:580581bool llvm::UnrollRuntimeLoopRemainder(582Loop *L, unsigned Count, bool AllowExpensiveTripCount,583bool UseEpilogRemainder, bool UnrollRemainder, bool ForgetAllSCEV,584LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,585const TargetTransformInfo *TTI, bool PreserveLCSSA, Loop **ResultLoop) {586LLVM_DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n");587LLVM_DEBUG(L->dump());588LLVM_DEBUG(UseEpilogRemainder ? dbgs() << "Using epilog remainder.\n"589: dbgs() << "Using prolog remainder.\n");590591// Make sure the loop is in canonical form.592if (!L->isLoopSimplifyForm()) {593LLVM_DEBUG(dbgs() << "Not in simplify form!\n");594return false;595}596597// Guaranteed by LoopSimplifyForm.598BasicBlock *Latch = L->getLoopLatch();599BasicBlock *Header = L->getHeader();600601BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());602603if (!LatchBR || LatchBR->isUnconditional()) {604// The loop-rotate pass can be helpful to avoid this in many cases.605LLVM_DEBUG(606dbgs()607<< "Loop latch not terminated by a conditional branch.\n");608return false;609}610611unsigned ExitIndex = LatchBR->getSuccessor(0) == Header ? 1 : 0;612BasicBlock *LatchExit = LatchBR->getSuccessor(ExitIndex);613614if (L->contains(LatchExit)) {615// Cloning the loop basic blocks (`CloneLoopBlocks`) requires that one of the616// targets of the Latch be an exit block out of the loop.617LLVM_DEBUG(618dbgs()619<< "One of the loop latch successors must be the exit block.\n");620return false;621}622623// These are exit blocks other than the target of the latch exiting block.624SmallVector<BasicBlock *, 4> OtherExits;625L->getUniqueNonLatchExitBlocks(OtherExits);626// Support only single exit and exiting block unless multi-exit loop627// unrolling is enabled.628if (!L->getExitingBlock() || OtherExits.size()) {629// We rely on LCSSA form being preserved when the exit blocks are transformed.630// (Note that only an off-by-default mode of the old PM disables PreserveLCCA.)631if (!PreserveLCSSA)632return false;633634if (!canProfitablyUnrollMultiExitLoop(L, OtherExits, LatchExit,635UseEpilogRemainder)) {636LLVM_DEBUG(637dbgs()638<< "Multiple exit/exiting blocks in loop and multi-exit unrolling not "639"enabled!\n");640return false;641}642}643// Use Scalar Evolution to compute the trip count. This allows more loops to644// be unrolled than relying on induction var simplification.645if (!SE)646return false;647648// Only unroll loops with a computable trip count.649// We calculate the backedge count by using getExitCount on the Latch block,650// which is proven to be the only exiting block in this loop. This is same as651// calculating getBackedgeTakenCount on the loop (which computes SCEV for all652// exiting blocks).653const SCEV *BECountSC = SE->getExitCount(L, Latch);654if (isa<SCEVCouldNotCompute>(BECountSC)) {655LLVM_DEBUG(dbgs() << "Could not compute exit block SCEV\n");656return false;657}658659unsigned BEWidth = cast<IntegerType>(BECountSC->getType())->getBitWidth();660661// Add 1 since the backedge count doesn't include the first loop iteration.662// (Note that overflow can occur, this is handled explicitly below)663const SCEV *TripCountSC =664SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1));665if (isa<SCEVCouldNotCompute>(TripCountSC)) {666LLVM_DEBUG(dbgs() << "Could not compute trip count SCEV.\n");667return false;668}669670BasicBlock *PreHeader = L->getLoopPreheader();671BranchInst *PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());672const DataLayout &DL = Header->getDataLayout();673SCEVExpander Expander(*SE, DL, "loop-unroll");674if (!AllowExpensiveTripCount &&675Expander.isHighCostExpansion(TripCountSC, L, SCEVCheapExpansionBudget,676TTI, PreHeaderBR)) {677LLVM_DEBUG(dbgs() << "High cost for expanding trip count scev!\n");678return false;679}680681// This constraint lets us deal with an overflowing trip count easily; see the682// comment on ModVal below.683if (Log2_32(Count) > BEWidth) {684LLVM_DEBUG(685dbgs()686<< "Count failed constraint on overflow trip count calculation.\n");687return false;688}689690// Loop structure is the following:691//692// PreHeader693// Header694// ...695// Latch696// LatchExit697698BasicBlock *NewPreHeader;699BasicBlock *NewExit = nullptr;700BasicBlock *PrologExit = nullptr;701BasicBlock *EpilogPreHeader = nullptr;702BasicBlock *PrologPreHeader = nullptr;703704if (UseEpilogRemainder) {705// If epilog remainder706// Split PreHeader to insert a branch around loop for unrolling.707NewPreHeader = SplitBlock(PreHeader, PreHeader->getTerminator(), DT, LI);708NewPreHeader->setName(PreHeader->getName() + ".new");709// Split LatchExit to create phi nodes from branch above.710NewExit = SplitBlockPredecessors(LatchExit, {Latch}, ".unr-lcssa", DT, LI,711nullptr, PreserveLCSSA);712// NewExit gets its DebugLoc from LatchExit, which is not part of the713// original Loop.714// Fix this by setting Loop's DebugLoc to NewExit.715auto *NewExitTerminator = NewExit->getTerminator();716NewExitTerminator->setDebugLoc(Header->getTerminator()->getDebugLoc());717// Split NewExit to insert epilog remainder loop.718EpilogPreHeader = SplitBlock(NewExit, NewExitTerminator, DT, LI);719EpilogPreHeader->setName(Header->getName() + ".epil.preheader");720721// If the latch exits from multiple level of nested loops, then722// by assumption there must be another loop exit which branches to the723// outer loop and we must adjust the loop for the newly inserted blocks724// to account for the fact that our epilogue is still in the same outer725// loop. Note that this leaves loopinfo temporarily out of sync with the726// CFG until the actual epilogue loop is inserted.727if (auto *ParentL = L->getParentLoop())728if (LI->getLoopFor(LatchExit) != ParentL) {729LI->removeBlock(NewExit);730ParentL->addBasicBlockToLoop(NewExit, *LI);731LI->removeBlock(EpilogPreHeader);732ParentL->addBasicBlockToLoop(EpilogPreHeader, *LI);733}734735} else {736// If prolog remainder737// Split the original preheader twice to insert prolog remainder loop738PrologPreHeader = SplitEdge(PreHeader, Header, DT, LI);739PrologPreHeader->setName(Header->getName() + ".prol.preheader");740PrologExit = SplitBlock(PrologPreHeader, PrologPreHeader->getTerminator(),741DT, LI);742PrologExit->setName(Header->getName() + ".prol.loopexit");743// Split PrologExit to get NewPreHeader.744NewPreHeader = SplitBlock(PrologExit, PrologExit->getTerminator(), DT, LI);745NewPreHeader->setName(PreHeader->getName() + ".new");746}747// Loop structure should be the following:748// Epilog Prolog749//750// PreHeader PreHeader751// *NewPreHeader *PrologPreHeader752// Header *PrologExit753// ... *NewPreHeader754// Latch Header755// *NewExit ...756// *EpilogPreHeader Latch757// LatchExit LatchExit758759// Calculate conditions for branch around loop for unrolling760// in epilog case and around prolog remainder loop in prolog case.761// Compute the number of extra iterations required, which is:762// extra iterations = run-time trip count % loop unroll factor763PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());764IRBuilder<> B(PreHeaderBR);765Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(),766PreHeaderBR);767Value *BECount;768// If there are other exits before the latch, that may cause the latch exit769// branch to never be executed, and the latch exit count may be poison.770// In this case, freeze the TripCount and base BECount on the frozen771// TripCount. We will introduce two branches using these values, and it's772// important that they see a consistent value (which would not be guaranteed773// if were frozen independently.)774if ((!OtherExits.empty() || !SE->loopHasNoAbnormalExits(L)) &&775!isGuaranteedNotToBeUndefOrPoison(TripCount, AC, PreHeaderBR, DT)) {776TripCount = B.CreateFreeze(TripCount);777BECount =778B.CreateAdd(TripCount, Constant::getAllOnesValue(TripCount->getType()));779} else {780// If we don't need to freeze, use SCEVExpander for BECount as well, to781// allow slightly better value reuse.782BECount =783Expander.expandCodeFor(BECountSC, BECountSC->getType(), PreHeaderBR);784}785786Value * const ModVal = CreateTripRemainder(B, BECount, TripCount, Count);787788Value *BranchVal =789UseEpilogRemainder ? B.CreateICmpULT(BECount,790ConstantInt::get(BECount->getType(),791Count - 1)) :792B.CreateIsNotNull(ModVal, "lcmp.mod");793BasicBlock *RemainderLoop = UseEpilogRemainder ? NewExit : PrologPreHeader;794BasicBlock *UnrollingLoop = UseEpilogRemainder ? NewPreHeader : PrologExit;795// Branch to either remainder (extra iterations) loop or unrolling loop.796MDNode *BranchWeights = nullptr;797if (hasBranchWeightMD(*Latch->getTerminator())) {798// Assume loop is nearly always entered.799MDBuilder MDB(B.getContext());800BranchWeights = MDB.createBranchWeights(EpilogHeaderWeights);801}802B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop, BranchWeights);803PreHeaderBR->eraseFromParent();804if (DT) {805if (UseEpilogRemainder)806DT->changeImmediateDominator(NewExit, PreHeader);807else808DT->changeImmediateDominator(PrologExit, PreHeader);809}810Function *F = Header->getParent();811// Get an ordered list of blocks in the loop to help with the ordering of the812// cloned blocks in the prolog/epilog code813LoopBlocksDFS LoopBlocks(L);814LoopBlocks.perform(LI);815816//817// For each extra loop iteration, create a copy of the loop's basic blocks818// and generate a condition that branches to the copy depending on the819// number of 'left over' iterations.820//821std::vector<BasicBlock *> NewBlocks;822ValueToValueMapTy VMap;823824// Clone all the basic blocks in the loop. If Count is 2, we don't clone825// the loop, otherwise we create a cloned loop to execute the extra826// iterations. This function adds the appropriate CFG connections.827BasicBlock *InsertBot = UseEpilogRemainder ? LatchExit : PrologExit;828BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader;829Loop *remainderLoop = CloneLoopBlocks(830L, ModVal, UseEpilogRemainder, UnrollRemainder, InsertTop, InsertBot,831NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI, Count);832833// Insert the cloned blocks into the function.834F->splice(InsertBot->getIterator(), F, NewBlocks[0]->getIterator(), F->end());835836// Now the loop blocks are cloned and the other exiting blocks from the837// remainder are connected to the original Loop's exit blocks. The remaining838// work is to update the phi nodes in the original loop, and take in the839// values from the cloned region.840for (auto *BB : OtherExits) {841// Given we preserve LCSSA form, we know that the values used outside the842// loop will be used through these phi nodes at the exit blocks that are843// transformed below.844for (PHINode &PN : BB->phis()) {845unsigned oldNumOperands = PN.getNumIncomingValues();846// Add the incoming values from the remainder code to the end of the phi847// node.848for (unsigned i = 0; i < oldNumOperands; i++){849auto *PredBB =PN.getIncomingBlock(i);850if (PredBB == Latch)851// The latch exit is handled separately, see connectX852continue;853if (!L->contains(PredBB))854// Even if we had dedicated exits, the code above inserted an855// extra branch which can reach the latch exit.856continue;857858auto *V = PN.getIncomingValue(i);859if (Instruction *I = dyn_cast<Instruction>(V))860if (L->contains(I))861V = VMap.lookup(I);862PN.addIncoming(V, cast<BasicBlock>(VMap[PredBB]));863}864}865#if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG)866for (BasicBlock *SuccBB : successors(BB)) {867assert(!(llvm::is_contained(OtherExits, SuccBB) || SuccBB == LatchExit) &&868"Breaks the definition of dedicated exits!");869}870#endif871}872873// Update the immediate dominator of the exit blocks and blocks that are874// reachable from the exit blocks. This is needed because we now have paths875// from both the original loop and the remainder code reaching the exit876// blocks. While the IDom of these exit blocks were from the original loop,877// now the IDom is the preheader (which decides whether the original loop or878// remainder code should run).879if (DT && !L->getExitingBlock()) {880SmallVector<BasicBlock *, 16> ChildrenToUpdate;881// NB! We have to examine the dom children of all loop blocks, not just882// those which are the IDom of the exit blocks. This is because blocks883// reachable from the exit blocks can have their IDom as the nearest common884// dominator of the exit blocks.885for (auto *BB : L->blocks()) {886auto *DomNodeBB = DT->getNode(BB);887for (auto *DomChild : DomNodeBB->children()) {888auto *DomChildBB = DomChild->getBlock();889if (!L->contains(LI->getLoopFor(DomChildBB)))890ChildrenToUpdate.push_back(DomChildBB);891}892}893for (auto *BB : ChildrenToUpdate)894DT->changeImmediateDominator(BB, PreHeader);895}896897// Loop structure should be the following:898// Epilog Prolog899//900// PreHeader PreHeader901// NewPreHeader PrologPreHeader902// Header PrologHeader903// ... ...904// Latch PrologLatch905// NewExit PrologExit906// EpilogPreHeader NewPreHeader907// EpilogHeader Header908// ... ...909// EpilogLatch Latch910// LatchExit LatchExit911912// Rewrite the cloned instruction operands to use the values created when the913// clone is created.914for (BasicBlock *BB : NewBlocks) {915Module *M = BB->getModule();916for (Instruction &I : *BB) {917RemapInstruction(&I, VMap,918RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);919RemapDbgRecordRange(M, I.getDbgRecordRange(), VMap,920RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);921}922}923924if (UseEpilogRemainder) {925// Connect the epilog code to the original loop and update the926// PHI functions.927ConnectEpilog(L, ModVal, NewExit, LatchExit, PreHeader, EpilogPreHeader,928NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE, Count);929930// Update counter in loop for unrolling.931// Use an incrementing IV. Pre-incr/post-incr is backedge/trip count.932// Subtle: TestVal can be 0 if we wrapped when computing the trip count,933// thus we must compare the post-increment (wrapping) value.934IRBuilder<> B2(NewPreHeader->getTerminator());935Value *TestVal = B2.CreateSub(TripCount, ModVal, "unroll_iter");936BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());937PHINode *NewIdx = PHINode::Create(TestVal->getType(), 2, "niter");938NewIdx->insertBefore(Header->getFirstNonPHIIt());939B2.SetInsertPoint(LatchBR);940auto *Zero = ConstantInt::get(NewIdx->getType(), 0);941auto *One = ConstantInt::get(NewIdx->getType(), 1);942Value *IdxNext = B2.CreateAdd(NewIdx, One, NewIdx->getName() + ".next");943auto Pred = LatchBR->getSuccessor(0) == Header ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ;944Value *IdxCmp = B2.CreateICmp(Pred, IdxNext, TestVal, NewIdx->getName() + ".ncmp");945NewIdx->addIncoming(Zero, NewPreHeader);946NewIdx->addIncoming(IdxNext, Latch);947LatchBR->setCondition(IdxCmp);948} else {949// Connect the prolog code to the original loop and update the950// PHI functions.951ConnectProlog(L, BECount, Count, PrologExit, LatchExit, PreHeader,952NewPreHeader, VMap, DT, LI, PreserveLCSSA, *SE);953}954955// If this loop is nested, then the loop unroller changes the code in the any956// of its parent loops, so the Scalar Evolution pass needs to be run again.957SE->forgetTopmostLoop(L);958959// Verify that the Dom Tree and Loop Info are correct.960#if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG)961if (DT) {962assert(DT->verify(DominatorTree::VerificationLevel::Full));963LI->verify(*DT);964}965#endif966967// For unroll factor 2 remainder loop will have 1 iteration.968if (Count == 2 && DT && LI && SE) {969// TODO: This code could probably be pulled out into a helper function970// (e.g. breakLoopBackedgeAndSimplify) and reused in loop-deletion.971BasicBlock *RemainderLatch = remainderLoop->getLoopLatch();972assert(RemainderLatch);973SmallVector<BasicBlock*> RemainderBlocks(remainderLoop->getBlocks().begin(),974remainderLoop->getBlocks().end());975breakLoopBackedge(remainderLoop, *DT, *SE, *LI, nullptr);976remainderLoop = nullptr;977978// Simplify loop values after breaking the backedge979const DataLayout &DL = L->getHeader()->getDataLayout();980SmallVector<WeakTrackingVH, 16> DeadInsts;981for (BasicBlock *BB : RemainderBlocks) {982for (Instruction &Inst : llvm::make_early_inc_range(*BB)) {983if (Value *V = simplifyInstruction(&Inst, {DL, nullptr, DT, AC}))984if (LI->replacementPreservesLCSSAForm(&Inst, V))985Inst.replaceAllUsesWith(V);986if (isInstructionTriviallyDead(&Inst))987DeadInsts.emplace_back(&Inst);988}989// We can't do recursive deletion until we're done iterating, as we might990// have a phi which (potentially indirectly) uses instructions later in991// the block we're iterating through.992RecursivelyDeleteTriviallyDeadInstructions(DeadInsts);993}994995// Merge latch into exit block.996auto *ExitBB = RemainderLatch->getSingleSuccessor();997assert(ExitBB && "required after breaking cond br backedge");998DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);999MergeBlockIntoPredecessor(ExitBB, &DTU, LI);1000}10011002// Canonicalize to LoopSimplifyForm both original and remainder loops. We1003// cannot rely on the LoopUnrollPass to do this because it only does1004// canonicalization for parent/subloops and not the sibling loops.1005if (OtherExits.size() > 0) {1006// Generate dedicated exit blocks for the original loop, to preserve1007// LoopSimplifyForm.1008formDedicatedExitBlocks(L, DT, LI, nullptr, PreserveLCSSA);1009// Generate dedicated exit blocks for the remainder loop if one exists, to1010// preserve LoopSimplifyForm.1011if (remainderLoop)1012formDedicatedExitBlocks(remainderLoop, DT, LI, nullptr, PreserveLCSSA);1013}10141015auto UnrollResult = LoopUnrollResult::Unmodified;1016if (remainderLoop && UnrollRemainder) {1017LLVM_DEBUG(dbgs() << "Unrolling remainder loop\n");1018UnrollLoopOptions ULO;1019ULO.Count = Count - 1;1020ULO.Force = false;1021ULO.Runtime = false;1022ULO.AllowExpensiveTripCount = false;1023ULO.UnrollRemainder = false;1024ULO.ForgetAllSCEV = ForgetAllSCEV;1025assert(!getLoopConvergenceHeart(L) &&1026"A loop with a convergence heart does not allow runtime unrolling.");1027UnrollResult = UnrollLoop(remainderLoop, ULO, LI, SE, DT, AC, TTI,1028/*ORE*/ nullptr, PreserveLCSSA);1029}10301031if (ResultLoop && UnrollResult != LoopUnrollResult::FullyUnrolled)1032*ResultLoop = remainderLoop;1033NumRuntimeUnrolled++;1034return true;1035}103610371038