Path: blob/main/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopPeel.cpp
35271 views
//===- LoopPeel.cpp -------------------------------------------------------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7//8// Loop Peeling Utilities.9//===----------------------------------------------------------------------===//1011#include "llvm/Transforms/Utils/LoopPeel.h"12#include "llvm/ADT/DenseMap.h"13#include "llvm/ADT/SmallVector.h"14#include "llvm/ADT/Statistic.h"15#include "llvm/Analysis/Loads.h"16#include "llvm/Analysis/LoopInfo.h"17#include "llvm/Analysis/LoopIterator.h"18#include "llvm/Analysis/ScalarEvolution.h"19#include "llvm/Analysis/ScalarEvolutionExpressions.h"20#include "llvm/Analysis/TargetTransformInfo.h"21#include "llvm/IR/BasicBlock.h"22#include "llvm/IR/Dominators.h"23#include "llvm/IR/Function.h"24#include "llvm/IR/InstrTypes.h"25#include "llvm/IR/Instruction.h"26#include "llvm/IR/Instructions.h"27#include "llvm/IR/LLVMContext.h"28#include "llvm/IR/MDBuilder.h"29#include "llvm/IR/PatternMatch.h"30#include "llvm/IR/ProfDataUtils.h"31#include "llvm/Support/Casting.h"32#include "llvm/Support/CommandLine.h"33#include "llvm/Support/Debug.h"34#include "llvm/Support/raw_ostream.h"35#include "llvm/Transforms/Utils/BasicBlockUtils.h"36#include "llvm/Transforms/Utils/Cloning.h"37#include "llvm/Transforms/Utils/LoopSimplify.h"38#include "llvm/Transforms/Utils/LoopUtils.h"39#include "llvm/Transforms/Utils/ValueMapper.h"40#include <algorithm>41#include <cassert>42#include <cstdint>43#include <optional>4445using namespace llvm;46using namespace llvm::PatternMatch;4748#define DEBUG_TYPE "loop-peel"4950STATISTIC(NumPeeled, "Number of loops peeled");5152static cl::opt<unsigned> UnrollPeelCount(53"unroll-peel-count", cl::Hidden,54cl::desc("Set the unroll peeling count, for testing purposes"));5556static cl::opt<bool>57UnrollAllowPeeling("unroll-allow-peeling", cl::init(true), cl::Hidden,58cl::desc("Allows loops to be peeled when the dynamic "59"trip count is known to be low."));6061static cl::opt<bool>62UnrollAllowLoopNestsPeeling("unroll-allow-loop-nests-peeling",63cl::init(false), cl::Hidden,64cl::desc("Allows loop nests to be peeled."));6566static cl::opt<unsigned> UnrollPeelMaxCount(67"unroll-peel-max-count", cl::init(7), cl::Hidden,68cl::desc("Max average trip count which will cause loop peeling."));6970static cl::opt<unsigned> UnrollForcePeelCount(71"unroll-force-peel-count", cl::init(0), cl::Hidden,72cl::desc("Force a peel count regardless of profiling information."));7374static cl::opt<bool> DisableAdvancedPeeling(75"disable-advanced-peeling", cl::init(false), cl::Hidden,76cl::desc(77"Disable advance peeling. Issues for convergent targets (D134803)."));7879static const char *PeeledCountMetaData = "llvm.loop.peeled.count";8081// Check whether we are capable of peeling this loop.82bool llvm::canPeel(const Loop *L) {83// Make sure the loop is in simplified form84if (!L->isLoopSimplifyForm())85return false;86if (!DisableAdvancedPeeling)87return true;8889SmallVector<BasicBlock *, 4> Exits;90L->getUniqueNonLatchExitBlocks(Exits);91// The latch must either be the only exiting block or all non-latch exit92// blocks have either a deopt or unreachable terminator or compose a chain of93// blocks where the last one is either deopt or unreachable terminated. Both94// deopt and unreachable terminators are a strong indication they are not95// taken. Note that this is a profitability check, not a legality check. Also96// note that LoopPeeling currently can only update the branch weights of latch97// blocks and branch weights to blocks with deopt or unreachable do not need98// updating.99return llvm::all_of(Exits, IsBlockFollowedByDeoptOrUnreachable);100}101102namespace {103104// As a loop is peeled, it may be the case that Phi nodes become105// loop-invariant (ie, known because there is only one choice).106// For example, consider the following function:107// void g(int);108// void binary() {109// int x = 0;110// int y = 0;111// int a = 0;112// for(int i = 0; i <100000; ++i) {113// g(x);114// x = y;115// g(a);116// y = a + 1;117// a = 5;118// }119// }120// Peeling 3 iterations is beneficial because the values for x, y and a121// become known. The IR for this loop looks something like the following:122//123// %i = phi i32 [ 0, %entry ], [ %inc, %if.end ]124// %a = phi i32 [ 0, %entry ], [ 5, %if.end ]125// %y = phi i32 [ 0, %entry ], [ %add, %if.end ]126// %x = phi i32 [ 0, %entry ], [ %y, %if.end ]127// ...128// tail call void @_Z1gi(i32 signext %x)129// tail call void @_Z1gi(i32 signext %a)130// %add = add nuw nsw i32 %a, 1131// %inc = add nuw nsw i32 %i, 1132// %exitcond = icmp eq i32 %inc, 100000133// br i1 %exitcond, label %for.cond.cleanup, label %for.body134//135// The arguments for the calls to g will become known after 3 iterations136// of the loop, because the phi nodes values become known after 3 iterations137// of the loop (ie, they are known on the 4th iteration, so peel 3 iterations).138// The first iteration has g(0), g(0); the second has g(0), g(5); the139// third has g(1), g(5) and the fourth (and all subsequent) have g(6), g(5).140// Now consider the phi nodes:141// %a is a phi with constants so it is determined after iteration 1.142// %y is a phi based on a constant and %a so it is determined on143// the iteration after %a is determined, so iteration 2.144// %x is a phi based on a constant and %y so it is determined on145// the iteration after %y, so iteration 3.146// %i is based on itself (and is an induction variable) so it is147// never determined.148// This means that peeling off 3 iterations will result in being able to149// remove the phi nodes for %a, %y, and %x. The arguments for the150// corresponding calls to g are determined and the code for computing151// x, y, and a can be removed.152//153// The PhiAnalyzer class calculates how many times a loop should be154// peeled based on the above analysis of the phi nodes in the loop while155// respecting the maximum specified.156class PhiAnalyzer {157public:158PhiAnalyzer(const Loop &L, unsigned MaxIterations);159160// Calculate the sufficient minimum number of iterations of the loop to peel161// such that phi instructions become determined (subject to allowable limits)162std::optional<unsigned> calculateIterationsToPeel();163164protected:165using PeelCounter = std::optional<unsigned>;166const PeelCounter Unknown = std::nullopt;167168// Add 1 respecting Unknown and return Unknown if result over MaxIterations169PeelCounter addOne(PeelCounter PC) const {170if (PC == Unknown)171return Unknown;172return (*PC + 1 <= MaxIterations) ? PeelCounter{*PC + 1} : Unknown;173}174175// Calculate the number of iterations after which the given value176// becomes an invariant.177PeelCounter calculate(const Value &);178179const Loop &L;180const unsigned MaxIterations;181182// Map of Values to number of iterations to invariance183SmallDenseMap<const Value *, PeelCounter> IterationsToInvariance;184};185186PhiAnalyzer::PhiAnalyzer(const Loop &L, unsigned MaxIterations)187: L(L), MaxIterations(MaxIterations) {188assert(canPeel(&L) && "loop is not suitable for peeling");189assert(MaxIterations > 0 && "no peeling is allowed?");190}191192// This function calculates the number of iterations after which the value193// becomes an invariant. The pre-calculated values are memorized in a map.194// N.B. This number will be Unknown or <= MaxIterations.195// The function is calculated according to the following definition:196// Given %x = phi <Inputs from above the loop>, ..., [%y, %back.edge].197// F(%x) = G(%y) + 1 (N.B. [MaxIterations | Unknown] + 1 => Unknown)198// G(%y) = 0 if %y is a loop invariant199// G(%y) = G(%BackEdgeValue) if %y is a phi in the header block200// G(%y) = TODO: if %y is an expression based on phis and loop invariants201// The example looks like:202// %x = phi(0, %a) <-- becomes invariant starting from 3rd iteration.203// %y = phi(0, 5)204// %a = %y + 1205// G(%y) = Unknown otherwise (including phi not in header block)206PhiAnalyzer::PeelCounter PhiAnalyzer::calculate(const Value &V) {207// If we already know the answer, take it from the map.208auto I = IterationsToInvariance.find(&V);209if (I != IterationsToInvariance.end())210return I->second;211212// Place Unknown to map to avoid infinite recursion. Such213// cycles can never stop on an invariant.214IterationsToInvariance[&V] = Unknown;215216if (L.isLoopInvariant(&V))217// Loop invariant so known at start.218return (IterationsToInvariance[&V] = 0);219if (const PHINode *Phi = dyn_cast<PHINode>(&V)) {220if (Phi->getParent() != L.getHeader()) {221// Phi is not in header block so Unknown.222assert(IterationsToInvariance[&V] == Unknown && "unexpected value saved");223return Unknown;224}225// We need to analyze the input from the back edge and add 1.226Value *Input = Phi->getIncomingValueForBlock(L.getLoopLatch());227PeelCounter Iterations = calculate(*Input);228assert(IterationsToInvariance[Input] == Iterations &&229"unexpected value saved");230return (IterationsToInvariance[Phi] = addOne(Iterations));231}232if (const Instruction *I = dyn_cast<Instruction>(&V)) {233if (isa<CmpInst>(I) || I->isBinaryOp()) {234// Binary instructions get the max of the operands.235PeelCounter LHS = calculate(*I->getOperand(0));236if (LHS == Unknown)237return Unknown;238PeelCounter RHS = calculate(*I->getOperand(1));239if (RHS == Unknown)240return Unknown;241return (IterationsToInvariance[I] = {std::max(*LHS, *RHS)});242}243if (I->isCast())244// Cast instructions get the value of the operand.245return (IterationsToInvariance[I] = calculate(*I->getOperand(0)));246}247// TODO: handle more expressions248249// Everything else is Unknown.250assert(IterationsToInvariance[&V] == Unknown && "unexpected value saved");251return Unknown;252}253254std::optional<unsigned> PhiAnalyzer::calculateIterationsToPeel() {255unsigned Iterations = 0;256for (auto &PHI : L.getHeader()->phis()) {257PeelCounter ToInvariance = calculate(PHI);258if (ToInvariance != Unknown) {259assert(*ToInvariance <= MaxIterations && "bad result in phi analysis");260Iterations = std::max(Iterations, *ToInvariance);261if (Iterations == MaxIterations)262break;263}264}265assert((Iterations <= MaxIterations) && "bad result in phi analysis");266return Iterations ? std::optional<unsigned>(Iterations) : std::nullopt;267}268269} // unnamed namespace270271// Try to find any invariant memory reads that will become dereferenceable in272// the remainder loop after peeling. The load must also be used (transitively)273// by an exit condition. Returns the number of iterations to peel off (at the274// moment either 0 or 1).275static unsigned peelToTurnInvariantLoadsDerefencebale(Loop &L,276DominatorTree &DT,277AssumptionCache *AC) {278// Skip loops with a single exiting block, because there should be no benefit279// for the heuristic below.280if (L.getExitingBlock())281return 0;282283// All non-latch exit blocks must have an UnreachableInst terminator.284// Otherwise the heuristic below may not be profitable.285SmallVector<BasicBlock *, 4> Exits;286L.getUniqueNonLatchExitBlocks(Exits);287if (any_of(Exits, [](const BasicBlock *BB) {288return !isa<UnreachableInst>(BB->getTerminator());289}))290return 0;291292// Now look for invariant loads that dominate the latch and are not known to293// be dereferenceable. If there are such loads and no writes, they will become294// dereferenceable in the loop if the first iteration is peeled off. Also295// collect the set of instructions controlled by such loads. Only peel if an296// exit condition uses (transitively) such a load.297BasicBlock *Header = L.getHeader();298BasicBlock *Latch = L.getLoopLatch();299SmallPtrSet<Value *, 8> LoadUsers;300const DataLayout &DL = L.getHeader()->getDataLayout();301for (BasicBlock *BB : L.blocks()) {302for (Instruction &I : *BB) {303if (I.mayWriteToMemory())304return 0;305306auto Iter = LoadUsers.find(&I);307if (Iter != LoadUsers.end()) {308for (Value *U : I.users())309LoadUsers.insert(U);310}311// Do not look for reads in the header; they can already be hoisted312// without peeling.313if (BB == Header)314continue;315if (auto *LI = dyn_cast<LoadInst>(&I)) {316Value *Ptr = LI->getPointerOperand();317if (DT.dominates(BB, Latch) && L.isLoopInvariant(Ptr) &&318!isDereferenceablePointer(Ptr, LI->getType(), DL, LI, AC, &DT))319for (Value *U : I.users())320LoadUsers.insert(U);321}322}323}324SmallVector<BasicBlock *> ExitingBlocks;325L.getExitingBlocks(ExitingBlocks);326if (any_of(ExitingBlocks, [&LoadUsers](BasicBlock *Exiting) {327return LoadUsers.contains(Exiting->getTerminator());328}))329return 1;330return 0;331}332333// Return the number of iterations to peel off that make conditions in the334// body true/false. For example, if we peel 2 iterations off the loop below,335// the condition i < 2 can be evaluated at compile time.336// for (i = 0; i < n; i++)337// if (i < 2)338// ..339// else340// ..341// }342static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount,343ScalarEvolution &SE) {344assert(L.isLoopSimplifyForm() && "Loop needs to be in loop simplify form");345unsigned DesiredPeelCount = 0;346347// Do not peel the entire loop.348const SCEV *BE = SE.getConstantMaxBackedgeTakenCount(&L);349if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(BE))350MaxPeelCount =351std::min((unsigned)SC->getAPInt().getLimitedValue() - 1, MaxPeelCount);352353// Increase PeelCount while (IterVal Pred BoundSCEV) condition is satisfied;354// return true if inversed condition become known before reaching the355// MaxPeelCount limit.356auto PeelWhilePredicateIsKnown =357[&](unsigned &PeelCount, const SCEV *&IterVal, const SCEV *BoundSCEV,358const SCEV *Step, ICmpInst::Predicate Pred) {359while (PeelCount < MaxPeelCount &&360SE.isKnownPredicate(Pred, IterVal, BoundSCEV)) {361IterVal = SE.getAddExpr(IterVal, Step);362++PeelCount;363}364return SE.isKnownPredicate(ICmpInst::getInversePredicate(Pred), IterVal,365BoundSCEV);366};367368const unsigned MaxDepth = 4;369std::function<void(Value *, unsigned)> ComputePeelCount =370[&](Value *Condition, unsigned Depth) -> void {371if (!Condition->getType()->isIntegerTy() || Depth >= MaxDepth)372return;373374Value *LeftVal, *RightVal;375if (match(Condition, m_And(m_Value(LeftVal), m_Value(RightVal))) ||376match(Condition, m_Or(m_Value(LeftVal), m_Value(RightVal)))) {377ComputePeelCount(LeftVal, Depth + 1);378ComputePeelCount(RightVal, Depth + 1);379return;380}381382CmpInst::Predicate Pred;383if (!match(Condition, m_ICmp(Pred, m_Value(LeftVal), m_Value(RightVal))))384return;385386const SCEV *LeftSCEV = SE.getSCEV(LeftVal);387const SCEV *RightSCEV = SE.getSCEV(RightVal);388389// Do not consider predicates that are known to be true or false390// independently of the loop iteration.391if (SE.evaluatePredicate(Pred, LeftSCEV, RightSCEV))392return;393394// Check if we have a condition with one AddRec and one non AddRec395// expression. Normalize LeftSCEV to be the AddRec.396if (!isa<SCEVAddRecExpr>(LeftSCEV)) {397if (isa<SCEVAddRecExpr>(RightSCEV)) {398std::swap(LeftSCEV, RightSCEV);399Pred = ICmpInst::getSwappedPredicate(Pred);400} else401return;402}403404const SCEVAddRecExpr *LeftAR = cast<SCEVAddRecExpr>(LeftSCEV);405406// Avoid huge SCEV computations in the loop below, make sure we only407// consider AddRecs of the loop we are trying to peel.408if (!LeftAR->isAffine() || LeftAR->getLoop() != &L)409return;410if (!(ICmpInst::isEquality(Pred) && LeftAR->hasNoSelfWrap()) &&411!SE.getMonotonicPredicateType(LeftAR, Pred))412return;413414// Check if extending the current DesiredPeelCount lets us evaluate Pred415// or !Pred in the loop body statically.416unsigned NewPeelCount = DesiredPeelCount;417418const SCEV *IterVal = LeftAR->evaluateAtIteration(419SE.getConstant(LeftSCEV->getType(), NewPeelCount), SE);420421// If the original condition is not known, get the negated predicate422// (which holds on the else branch) and check if it is known. This allows423// us to peel of iterations that make the original condition false.424if (!SE.isKnownPredicate(Pred, IterVal, RightSCEV))425Pred = ICmpInst::getInversePredicate(Pred);426427const SCEV *Step = LeftAR->getStepRecurrence(SE);428if (!PeelWhilePredicateIsKnown(NewPeelCount, IterVal, RightSCEV, Step,429Pred))430return;431432// However, for equality comparisons, that isn't always sufficient to433// eliminate the comparsion in loop body, we may need to peel one more434// iteration. See if that makes !Pred become unknown again.435const SCEV *NextIterVal = SE.getAddExpr(IterVal, Step);436if (ICmpInst::isEquality(Pred) &&437!SE.isKnownPredicate(ICmpInst::getInversePredicate(Pred), NextIterVal,438RightSCEV) &&439!SE.isKnownPredicate(Pred, IterVal, RightSCEV) &&440SE.isKnownPredicate(Pred, NextIterVal, RightSCEV)) {441if (NewPeelCount >= MaxPeelCount)442return; // Need to peel one more iteration, but can't. Give up.443++NewPeelCount; // Great!444}445446DesiredPeelCount = std::max(DesiredPeelCount, NewPeelCount);447};448449auto ComputePeelCountMinMax = [&](MinMaxIntrinsic *MinMax) {450if (!MinMax->getType()->isIntegerTy())451return;452Value *LHS = MinMax->getLHS(), *RHS = MinMax->getRHS();453const SCEV *BoundSCEV, *IterSCEV;454if (L.isLoopInvariant(LHS)) {455BoundSCEV = SE.getSCEV(LHS);456IterSCEV = SE.getSCEV(RHS);457} else if (L.isLoopInvariant(RHS)) {458BoundSCEV = SE.getSCEV(RHS);459IterSCEV = SE.getSCEV(LHS);460} else461return;462const auto *AddRec = dyn_cast<SCEVAddRecExpr>(IterSCEV);463// For simplicity, we support only affine recurrences.464if (!AddRec || !AddRec->isAffine() || AddRec->getLoop() != &L)465return;466const SCEV *Step = AddRec->getStepRecurrence(SE);467bool IsSigned = MinMax->isSigned();468// To minimize number of peeled iterations, we use strict relational469// predicates here.470ICmpInst::Predicate Pred;471if (SE.isKnownPositive(Step))472Pred = IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;473else if (SE.isKnownNegative(Step))474Pred = IsSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;475else476return;477// Check that AddRec is not wrapping.478if (!(IsSigned ? AddRec->hasNoSignedWrap() : AddRec->hasNoUnsignedWrap()))479return;480unsigned NewPeelCount = DesiredPeelCount;481const SCEV *IterVal = AddRec->evaluateAtIteration(482SE.getConstant(AddRec->getType(), NewPeelCount), SE);483if (!PeelWhilePredicateIsKnown(NewPeelCount, IterVal, BoundSCEV, Step,484Pred))485return;486DesiredPeelCount = NewPeelCount;487};488489for (BasicBlock *BB : L.blocks()) {490for (Instruction &I : *BB) {491if (SelectInst *SI = dyn_cast<SelectInst>(&I))492ComputePeelCount(SI->getCondition(), 0);493if (MinMaxIntrinsic *MinMax = dyn_cast<MinMaxIntrinsic>(&I))494ComputePeelCountMinMax(MinMax);495}496497auto *BI = dyn_cast<BranchInst>(BB->getTerminator());498if (!BI || BI->isUnconditional())499continue;500501// Ignore loop exit condition.502if (L.getLoopLatch() == BB)503continue;504505ComputePeelCount(BI->getCondition(), 0);506}507508return DesiredPeelCount;509}510511/// This "heuristic" exactly matches implicit behavior which used to exist512/// inside getLoopEstimatedTripCount. It was added here to keep an513/// improvement inside that API from causing peeling to become more aggressive.514/// This should probably be removed.515static bool violatesLegacyMultiExitLoopCheck(Loop *L) {516BasicBlock *Latch = L->getLoopLatch();517if (!Latch)518return true;519520BranchInst *LatchBR = dyn_cast<BranchInst>(Latch->getTerminator());521if (!LatchBR || LatchBR->getNumSuccessors() != 2 || !L->isLoopExiting(Latch))522return true;523524assert((LatchBR->getSuccessor(0) == L->getHeader() ||525LatchBR->getSuccessor(1) == L->getHeader()) &&526"At least one edge out of the latch must go to the header");527528SmallVector<BasicBlock *, 4> ExitBlocks;529L->getUniqueNonLatchExitBlocks(ExitBlocks);530return any_of(ExitBlocks, [](const BasicBlock *EB) {531return !EB->getTerminatingDeoptimizeCall();532});533}534535536// Return the number of iterations we want to peel off.537void llvm::computePeelCount(Loop *L, unsigned LoopSize,538TargetTransformInfo::PeelingPreferences &PP,539unsigned TripCount, DominatorTree &DT,540ScalarEvolution &SE, AssumptionCache *AC,541unsigned Threshold) {542assert(LoopSize > 0 && "Zero loop size is not allowed!");543// Save the PP.PeelCount value set by the target in544// TTI.getPeelingPreferences or by the flag -unroll-peel-count.545unsigned TargetPeelCount = PP.PeelCount;546PP.PeelCount = 0;547if (!canPeel(L))548return;549550// Only try to peel innermost loops by default.551// The constraint can be relaxed by the target in TTI.getPeelingPreferences552// or by the flag -unroll-allow-loop-nests-peeling.553if (!PP.AllowLoopNestsPeeling && !L->isInnermost())554return;555556// If the user provided a peel count, use that.557bool UserPeelCount = UnrollForcePeelCount.getNumOccurrences() > 0;558if (UserPeelCount) {559LLVM_DEBUG(dbgs() << "Force-peeling first " << UnrollForcePeelCount560<< " iterations.\n");561PP.PeelCount = UnrollForcePeelCount;562PP.PeelProfiledIterations = true;563return;564}565566// Skip peeling if it's disabled.567if (!PP.AllowPeeling)568return;569570// Check that we can peel at least one iteration.571if (2 * LoopSize > Threshold)572return;573574unsigned AlreadyPeeled = 0;575if (auto Peeled = getOptionalIntLoopAttribute(L, PeeledCountMetaData))576AlreadyPeeled = *Peeled;577// Stop if we already peeled off the maximum number of iterations.578if (AlreadyPeeled >= UnrollPeelMaxCount)579return;580581// Pay respect to limitations implied by loop size and the max peel count.582unsigned MaxPeelCount = UnrollPeelMaxCount;583MaxPeelCount = std::min(MaxPeelCount, Threshold / LoopSize - 1);584585// Start the max computation with the PP.PeelCount value set by the target586// in TTI.getPeelingPreferences or by the flag -unroll-peel-count.587unsigned DesiredPeelCount = TargetPeelCount;588589// Here we try to get rid of Phis which become invariants after 1, 2, ..., N590// iterations of the loop. For this we compute the number for iterations after591// which every Phi is guaranteed to become an invariant, and try to peel the592// maximum number of iterations among these values, thus turning all those593// Phis into invariants.594if (MaxPeelCount > DesiredPeelCount) {595// Check how many iterations are useful for resolving Phis596auto NumPeels = PhiAnalyzer(*L, MaxPeelCount).calculateIterationsToPeel();597if (NumPeels)598DesiredPeelCount = std::max(DesiredPeelCount, *NumPeels);599}600601DesiredPeelCount = std::max(DesiredPeelCount,602countToEliminateCompares(*L, MaxPeelCount, SE));603604if (DesiredPeelCount == 0)605DesiredPeelCount = peelToTurnInvariantLoadsDerefencebale(*L, DT, AC);606607if (DesiredPeelCount > 0) {608DesiredPeelCount = std::min(DesiredPeelCount, MaxPeelCount);609// Consider max peel count limitation.610assert(DesiredPeelCount > 0 && "Wrong loop size estimation?");611if (DesiredPeelCount + AlreadyPeeled <= UnrollPeelMaxCount) {612LLVM_DEBUG(dbgs() << "Peel " << DesiredPeelCount613<< " iteration(s) to turn"614<< " some Phis into invariants.\n");615PP.PeelCount = DesiredPeelCount;616PP.PeelProfiledIterations = false;617return;618}619}620621// Bail if we know the statically calculated trip count.622// In this case we rather prefer partial unrolling.623if (TripCount)624return;625626// Do not apply profile base peeling if it is disabled.627if (!PP.PeelProfiledIterations)628return;629// If we don't know the trip count, but have reason to believe the average630// trip count is low, peeling should be beneficial, since we will usually631// hit the peeled section.632// We only do this in the presence of profile information, since otherwise633// our estimates of the trip count are not reliable enough.634if (L->getHeader()->getParent()->hasProfileData()) {635if (violatesLegacyMultiExitLoopCheck(L))636return;637std::optional<unsigned> EstimatedTripCount = getLoopEstimatedTripCount(L);638if (!EstimatedTripCount)639return;640641LLVM_DEBUG(dbgs() << "Profile-based estimated trip count is "642<< *EstimatedTripCount << "\n");643644if (*EstimatedTripCount) {645if (*EstimatedTripCount + AlreadyPeeled <= MaxPeelCount) {646unsigned PeelCount = *EstimatedTripCount;647LLVM_DEBUG(dbgs() << "Peeling first " << PeelCount << " iterations.\n");648PP.PeelCount = PeelCount;649return;650}651LLVM_DEBUG(dbgs() << "Already peel count: " << AlreadyPeeled << "\n");652LLVM_DEBUG(dbgs() << "Max peel count: " << UnrollPeelMaxCount << "\n");653LLVM_DEBUG(dbgs() << "Loop cost: " << LoopSize << "\n");654LLVM_DEBUG(dbgs() << "Max peel cost: " << Threshold << "\n");655LLVM_DEBUG(dbgs() << "Max peel count by cost: "656<< (Threshold / LoopSize - 1) << "\n");657}658}659}660661struct WeightInfo {662// Weights for current iteration.663SmallVector<uint32_t> Weights;664// Weights to subtract after each iteration.665const SmallVector<uint32_t> SubWeights;666};667668/// Update the branch weights of an exiting block of a peeled-off loop669/// iteration.670/// Let F is a weight of the edge to continue (fallthrough) into the loop.671/// Let E is a weight of the edge to an exit.672/// F/(F+E) is a probability to go to loop and E/(F+E) is a probability to673/// go to exit.674/// Then, Estimated ExitCount = F / E.675/// For I-th (counting from 0) peeled off iteration we set the weights for676/// the peeled exit as (EC - I, 1). It gives us reasonable distribution,677/// The probability to go to exit 1/(EC-I) increases. At the same time678/// the estimated exit count in the remainder loop reduces by I.679/// To avoid dealing with division rounding we can just multiple both part680/// of weights to E and use weight as (F - I * E, E).681static void updateBranchWeights(Instruction *Term, WeightInfo &Info) {682setBranchWeights(*Term, Info.Weights, /*IsExpected=*/false);683for (auto [Idx, SubWeight] : enumerate(Info.SubWeights))684if (SubWeight != 0)685// Don't set the probability of taking the edge from latch to loop header686// to less than 1:1 ratio (meaning Weight should not be lower than687// SubWeight), as this could significantly reduce the loop's hotness,688// which would be incorrect in the case of underestimating the trip count.689Info.Weights[Idx] =690Info.Weights[Idx] > SubWeight691? std::max(Info.Weights[Idx] - SubWeight, SubWeight)692: SubWeight;693}694695/// Initialize the weights for all exiting blocks.696static void initBranchWeights(DenseMap<Instruction *, WeightInfo> &WeightInfos,697Loop *L) {698SmallVector<BasicBlock *> ExitingBlocks;699L->getExitingBlocks(ExitingBlocks);700for (BasicBlock *ExitingBlock : ExitingBlocks) {701Instruction *Term = ExitingBlock->getTerminator();702SmallVector<uint32_t> Weights;703if (!extractBranchWeights(*Term, Weights))704continue;705706// See the comment on updateBranchWeights() for an explanation of what we707// do here.708uint32_t FallThroughWeights = 0;709uint32_t ExitWeights = 0;710for (auto [Succ, Weight] : zip(successors(Term), Weights)) {711if (L->contains(Succ))712FallThroughWeights += Weight;713else714ExitWeights += Weight;715}716717// Don't try to update weights for degenerate case.718if (FallThroughWeights == 0)719continue;720721SmallVector<uint32_t> SubWeights;722for (auto [Succ, Weight] : zip(successors(Term), Weights)) {723if (!L->contains(Succ)) {724// Exit weights stay the same.725SubWeights.push_back(0);726continue;727}728729// Subtract exit weights on each iteration, distributed across all730// fallthrough edges.731double W = (double)Weight / (double)FallThroughWeights;732SubWeights.push_back((uint32_t)(ExitWeights * W));733}734735WeightInfos.insert({Term, {std::move(Weights), std::move(SubWeights)}});736}737}738739/// Clones the body of the loop L, putting it between \p InsertTop and \p740/// InsertBot.741/// \param IterNumber The serial number of the iteration currently being742/// peeled off.743/// \param ExitEdges The exit edges of the original loop.744/// \param[out] NewBlocks A list of the blocks in the newly created clone745/// \param[out] VMap The value map between the loop and the new clone.746/// \param LoopBlocks A helper for DFS-traversal of the loop.747/// \param LVMap A value-map that maps instructions from the original loop to748/// instructions in the last peeled-off iteration.749static void cloneLoopBlocks(750Loop *L, unsigned IterNumber, BasicBlock *InsertTop, BasicBlock *InsertBot,751SmallVectorImpl<std::pair<BasicBlock *, BasicBlock *>> &ExitEdges,752SmallVectorImpl<BasicBlock *> &NewBlocks, LoopBlocksDFS &LoopBlocks,753ValueToValueMapTy &VMap, ValueToValueMapTy &LVMap, DominatorTree *DT,754LoopInfo *LI, ArrayRef<MDNode *> LoopLocalNoAliasDeclScopes,755ScalarEvolution &SE) {756BasicBlock *Header = L->getHeader();757BasicBlock *Latch = L->getLoopLatch();758BasicBlock *PreHeader = L->getLoopPreheader();759760Function *F = Header->getParent();761LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO();762LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO();763Loop *ParentLoop = L->getParentLoop();764765// For each block in the original loop, create a new copy,766// and update the value map with the newly created values.767for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {768BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, ".peel", F);769NewBlocks.push_back(NewBB);770771// If an original block is an immediate child of the loop L, its copy772// is a child of a ParentLoop after peeling. If a block is a child of773// a nested loop, it is handled in the cloneLoop() call below.774if (ParentLoop && LI->getLoopFor(*BB) == L)775ParentLoop->addBasicBlockToLoop(NewBB, *LI);776777VMap[*BB] = NewBB;778779// If dominator tree is available, insert nodes to represent cloned blocks.780if (DT) {781if (Header == *BB)782DT->addNewBlock(NewBB, InsertTop);783else {784DomTreeNode *IDom = DT->getNode(*BB)->getIDom();785// VMap must contain entry for IDom, as the iteration order is RPO.786DT->addNewBlock(NewBB, cast<BasicBlock>(VMap[IDom->getBlock()]));787}788}789}790791{792// Identify what other metadata depends on the cloned version. After793// cloning, replace the metadata with the corrected version for both794// memory instructions and noalias intrinsics.795std::string Ext = (Twine("Peel") + Twine(IterNumber)).str();796cloneAndAdaptNoAliasScopes(LoopLocalNoAliasDeclScopes, NewBlocks,797Header->getContext(), Ext);798}799800// Recursively create the new Loop objects for nested loops, if any,801// to preserve LoopInfo.802for (Loop *ChildLoop : *L) {803cloneLoop(ChildLoop, ParentLoop, VMap, LI, nullptr);804}805806// Hook-up the control flow for the newly inserted blocks.807// The new header is hooked up directly to the "top", which is either808// the original loop preheader (for the first iteration) or the previous809// iteration's exiting block (for every other iteration)810InsertTop->getTerminator()->setSuccessor(0, cast<BasicBlock>(VMap[Header]));811812// Similarly, for the latch:813// The original exiting edge is still hooked up to the loop exit.814// The backedge now goes to the "bottom", which is either the loop's real815// header (for the last peeled iteration) or the copied header of the next816// iteration (for every other iteration)817BasicBlock *NewLatch = cast<BasicBlock>(VMap[Latch]);818auto *LatchTerm = cast<Instruction>(NewLatch->getTerminator());819for (unsigned idx = 0, e = LatchTerm->getNumSuccessors(); idx < e; ++idx)820if (LatchTerm->getSuccessor(idx) == Header) {821LatchTerm->setSuccessor(idx, InsertBot);822break;823}824if (DT)825DT->changeImmediateDominator(InsertBot, NewLatch);826827// The new copy of the loop body starts with a bunch of PHI nodes828// that pick an incoming value from either the preheader, or the previous829// loop iteration. Since this copy is no longer part of the loop, we830// resolve this statically:831// For the first iteration, we use the value from the preheader directly.832// For any other iteration, we replace the phi with the value generated by833// the immediately preceding clone of the loop body (which represents834// the previous iteration).835for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {836PHINode *NewPHI = cast<PHINode>(VMap[&*I]);837if (IterNumber == 0) {838VMap[&*I] = NewPHI->getIncomingValueForBlock(PreHeader);839} else {840Value *LatchVal = NewPHI->getIncomingValueForBlock(Latch);841Instruction *LatchInst = dyn_cast<Instruction>(LatchVal);842if (LatchInst && L->contains(LatchInst))843VMap[&*I] = LVMap[LatchInst];844else845VMap[&*I] = LatchVal;846}847NewPHI->eraseFromParent();848}849850// Fix up the outgoing values - we need to add a value for the iteration851// we've just created. Note that this must happen *after* the incoming852// values are adjusted, since the value going out of the latch may also be853// a value coming into the header.854for (auto Edge : ExitEdges)855for (PHINode &PHI : Edge.second->phis()) {856Value *LatchVal = PHI.getIncomingValueForBlock(Edge.first);857Instruction *LatchInst = dyn_cast<Instruction>(LatchVal);858if (LatchInst && L->contains(LatchInst))859LatchVal = VMap[LatchVal];860PHI.addIncoming(LatchVal, cast<BasicBlock>(VMap[Edge.first]));861SE.forgetLcssaPhiWithNewPredecessor(L, &PHI);862}863864// LastValueMap is updated with the values for the current loop865// which are used the next time this function is called.866for (auto KV : VMap)867LVMap[KV.first] = KV.second;868}869870TargetTransformInfo::PeelingPreferences871llvm::gatherPeelingPreferences(Loop *L, ScalarEvolution &SE,872const TargetTransformInfo &TTI,873std::optional<bool> UserAllowPeeling,874std::optional<bool> UserAllowProfileBasedPeeling,875bool UnrollingSpecficValues) {876TargetTransformInfo::PeelingPreferences PP;877878// Set the default values.879PP.PeelCount = 0;880PP.AllowPeeling = true;881PP.AllowLoopNestsPeeling = false;882PP.PeelProfiledIterations = true;883884// Get the target specifc values.885TTI.getPeelingPreferences(L, SE, PP);886887// User specified values using cl::opt.888if (UnrollingSpecficValues) {889if (UnrollPeelCount.getNumOccurrences() > 0)890PP.PeelCount = UnrollPeelCount;891if (UnrollAllowPeeling.getNumOccurrences() > 0)892PP.AllowPeeling = UnrollAllowPeeling;893if (UnrollAllowLoopNestsPeeling.getNumOccurrences() > 0)894PP.AllowLoopNestsPeeling = UnrollAllowLoopNestsPeeling;895}896897// User specifed values provided by argument.898if (UserAllowPeeling)899PP.AllowPeeling = *UserAllowPeeling;900if (UserAllowProfileBasedPeeling)901PP.PeelProfiledIterations = *UserAllowProfileBasedPeeling;902903return PP;904}905906/// Peel off the first \p PeelCount iterations of loop \p L.907///908/// Note that this does not peel them off as a single straight-line block.909/// Rather, each iteration is peeled off separately, and needs to check the910/// exit condition.911/// For loops that dynamically execute \p PeelCount iterations or less912/// this provides a benefit, since the peeled off iterations, which account913/// for the bulk of dynamic execution, can be further simplified by scalar914/// optimizations.915bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,916ScalarEvolution *SE, DominatorTree &DT, AssumptionCache *AC,917bool PreserveLCSSA, ValueToValueMapTy &LVMap) {918assert(PeelCount > 0 && "Attempt to peel out zero iterations?");919assert(canPeel(L) && "Attempt to peel a loop which is not peelable?");920921LoopBlocksDFS LoopBlocks(L);922LoopBlocks.perform(LI);923924BasicBlock *Header = L->getHeader();925BasicBlock *PreHeader = L->getLoopPreheader();926BasicBlock *Latch = L->getLoopLatch();927SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> ExitEdges;928L->getExitEdges(ExitEdges);929930// Remember dominators of blocks we might reach through exits to change them931// later. Immediate dominator of such block might change, because we add more932// routes which can lead to the exit: we can reach it from the peeled933// iterations too.934DenseMap<BasicBlock *, BasicBlock *> NonLoopBlocksIDom;935for (auto *BB : L->blocks()) {936auto *BBDomNode = DT.getNode(BB);937SmallVector<BasicBlock *, 16> ChildrenToUpdate;938for (auto *ChildDomNode : BBDomNode->children()) {939auto *ChildBB = ChildDomNode->getBlock();940if (!L->contains(ChildBB))941ChildrenToUpdate.push_back(ChildBB);942}943// The new idom of the block will be the nearest common dominator944// of all copies of the previous idom. This is equivalent to the945// nearest common dominator of the previous idom and the first latch,946// which dominates all copies of the previous idom.947BasicBlock *NewIDom = DT.findNearestCommonDominator(BB, Latch);948for (auto *ChildBB : ChildrenToUpdate)949NonLoopBlocksIDom[ChildBB] = NewIDom;950}951952Function *F = Header->getParent();953954// Set up all the necessary basic blocks. It is convenient to split the955// preheader into 3 parts - two blocks to anchor the peeled copy of the loop956// body, and a new preheader for the "real" loop.957958// Peeling the first iteration transforms.959//960// PreHeader:961// ...962// Header:963// LoopBody964// If (cond) goto Header965// Exit:966//967// into968//969// InsertTop:970// LoopBody971// If (!cond) goto Exit972// InsertBot:973// NewPreHeader:974// ...975// Header:976// LoopBody977// If (cond) goto Header978// Exit:979//980// Each following iteration will split the current bottom anchor in two,981// and put the new copy of the loop body between these two blocks. That is,982// after peeling another iteration from the example above, we'll split983// InsertBot, and get:984//985// InsertTop:986// LoopBody987// If (!cond) goto Exit988// InsertBot:989// LoopBody990// If (!cond) goto Exit991// InsertBot.next:992// NewPreHeader:993// ...994// Header:995// LoopBody996// If (cond) goto Header997// Exit:998999BasicBlock *InsertTop = SplitEdge(PreHeader, Header, &DT, LI);1000BasicBlock *InsertBot =1001SplitBlock(InsertTop, InsertTop->getTerminator(), &DT, LI);1002BasicBlock *NewPreHeader =1003SplitBlock(InsertBot, InsertBot->getTerminator(), &DT, LI);10041005InsertTop->setName(Header->getName() + ".peel.begin");1006InsertBot->setName(Header->getName() + ".peel.next");1007NewPreHeader->setName(PreHeader->getName() + ".peel.newph");10081009Instruction *LatchTerm =1010cast<Instruction>(cast<BasicBlock>(Latch)->getTerminator());10111012// If we have branch weight information, we'll want to update it for the1013// newly created branches.1014DenseMap<Instruction *, WeightInfo> Weights;1015initBranchWeights(Weights, L);10161017// Identify what noalias metadata is inside the loop: if it is inside the1018// loop, the associated metadata must be cloned for each iteration.1019SmallVector<MDNode *, 6> LoopLocalNoAliasDeclScopes;1020identifyNoAliasScopesToClone(L->getBlocks(), LoopLocalNoAliasDeclScopes);10211022// For each peeled-off iteration, make a copy of the loop.1023for (unsigned Iter = 0; Iter < PeelCount; ++Iter) {1024SmallVector<BasicBlock *, 8> NewBlocks;1025ValueToValueMapTy VMap;10261027cloneLoopBlocks(L, Iter, InsertTop, InsertBot, ExitEdges, NewBlocks,1028LoopBlocks, VMap, LVMap, &DT, LI,1029LoopLocalNoAliasDeclScopes, *SE);10301031// Remap to use values from the current iteration instead of the1032// previous one.1033remapInstructionsInBlocks(NewBlocks, VMap);10341035// Update IDoms of the blocks reachable through exits.1036if (Iter == 0)1037for (auto BBIDom : NonLoopBlocksIDom)1038DT.changeImmediateDominator(BBIDom.first,1039cast<BasicBlock>(LVMap[BBIDom.second]));1040#ifdef EXPENSIVE_CHECKS1041assert(DT.verify(DominatorTree::VerificationLevel::Fast));1042#endif10431044for (auto &[Term, Info] : Weights) {1045auto *TermCopy = cast<Instruction>(VMap[Term]);1046updateBranchWeights(TermCopy, Info);1047}10481049// Remove Loop metadata from the latch branch instruction1050// because it is not the Loop's latch branch anymore.1051auto *LatchTermCopy = cast<Instruction>(VMap[LatchTerm]);1052LatchTermCopy->setMetadata(LLVMContext::MD_loop, nullptr);10531054InsertTop = InsertBot;1055InsertBot = SplitBlock(InsertBot, InsertBot->getTerminator(), &DT, LI);1056InsertBot->setName(Header->getName() + ".peel.next");10571058F->splice(InsertTop->getIterator(), F, NewBlocks[0]->getIterator(),1059F->end());1060}10611062// Now adjust the phi nodes in the loop header to get their initial values1063// from the last peeled-off iteration instead of the preheader.1064for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {1065PHINode *PHI = cast<PHINode>(I);1066Value *NewVal = PHI->getIncomingValueForBlock(Latch);1067Instruction *LatchInst = dyn_cast<Instruction>(NewVal);1068if (LatchInst && L->contains(LatchInst))1069NewVal = LVMap[LatchInst];10701071PHI->setIncomingValueForBlock(NewPreHeader, NewVal);1072}10731074for (const auto &[Term, Info] : Weights) {1075setBranchWeights(*Term, Info.Weights, /*IsExpected=*/false);1076}10771078// Update Metadata for count of peeled off iterations.1079unsigned AlreadyPeeled = 0;1080if (auto Peeled = getOptionalIntLoopAttribute(L, PeeledCountMetaData))1081AlreadyPeeled = *Peeled;1082addStringMetadataToLoop(L, PeeledCountMetaData, AlreadyPeeled + PeelCount);10831084if (Loop *ParentLoop = L->getParentLoop())1085L = ParentLoop;10861087// We modified the loop, update SE.1088SE->forgetTopmostLoop(L);1089SE->forgetBlockAndLoopDispositions();10901091#ifdef EXPENSIVE_CHECKS1092// Finally DomtTree must be correct.1093assert(DT.verify(DominatorTree::VerificationLevel::Fast));1094#endif10951096// FIXME: Incrementally update loop-simplify1097simplifyLoop(L, &DT, LI, SE, AC, nullptr, PreserveLCSSA);10981099NumPeeled++;11001101return true;1102}110311041105