Path: blob/main/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
35266 views
//===- LoopUnroll.cpp - Loop unroller pass --------------------------------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7//8// This pass implements a simple loop unroller. It works best when loops have9// been canonicalized by the -indvars pass, allowing it to determine the trip10// counts of loops easily.11//===----------------------------------------------------------------------===//1213#include "llvm/Transforms/Scalar/LoopUnrollPass.h"14#include "llvm/ADT/DenseMap.h"15#include "llvm/ADT/DenseMapInfo.h"16#include "llvm/ADT/DenseSet.h"17#include "llvm/ADT/STLExtras.h"18#include "llvm/ADT/ScopedHashTable.h"19#include "llvm/ADT/SetVector.h"20#include "llvm/ADT/SmallPtrSet.h"21#include "llvm/ADT/SmallVector.h"22#include "llvm/ADT/StringRef.h"23#include "llvm/Analysis/AssumptionCache.h"24#include "llvm/Analysis/BlockFrequencyInfo.h"25#include "llvm/Analysis/CodeMetrics.h"26#include "llvm/Analysis/LoopAnalysisManager.h"27#include "llvm/Analysis/LoopInfo.h"28#include "llvm/Analysis/LoopPass.h"29#include "llvm/Analysis/LoopUnrollAnalyzer.h"30#include "llvm/Analysis/MemorySSA.h"31#include "llvm/Analysis/OptimizationRemarkEmitter.h"32#include "llvm/Analysis/ProfileSummaryInfo.h"33#include "llvm/Analysis/ScalarEvolution.h"34#include "llvm/Analysis/TargetTransformInfo.h"35#include "llvm/IR/BasicBlock.h"36#include "llvm/IR/CFG.h"37#include "llvm/IR/Constant.h"38#include "llvm/IR/Constants.h"39#include "llvm/IR/DiagnosticInfo.h"40#include "llvm/IR/Dominators.h"41#include "llvm/IR/Function.h"42#include "llvm/IR/Instruction.h"43#include "llvm/IR/Instructions.h"44#include "llvm/IR/IntrinsicInst.h"45#include "llvm/IR/Metadata.h"46#include "llvm/IR/PassManager.h"47#include "llvm/InitializePasses.h"48#include "llvm/Pass.h"49#include "llvm/Support/Casting.h"50#include "llvm/Support/CommandLine.h"51#include "llvm/Support/Debug.h"52#include "llvm/Support/ErrorHandling.h"53#include "llvm/Support/raw_ostream.h"54#include "llvm/Transforms/Scalar.h"55#include "llvm/Transforms/Scalar/LoopPassManager.h"56#include "llvm/Transforms/Utils.h"57#include "llvm/Transforms/Utils/LoopPeel.h"58#include "llvm/Transforms/Utils/LoopSimplify.h"59#include "llvm/Transforms/Utils/LoopUtils.h"60#include "llvm/Transforms/Utils/SizeOpts.h"61#include "llvm/Transforms/Utils/UnrollLoop.h"62#include <algorithm>63#include <cassert>64#include <cstdint>65#include <limits>66#include <optional>67#include <string>68#include <tuple>69#include <utility>7071using namespace llvm;7273#define DEBUG_TYPE "loop-unroll"7475cl::opt<bool> llvm::ForgetSCEVInLoopUnroll(76"forget-scev-loop-unroll", cl::init(false), cl::Hidden,77cl::desc("Forget everything in SCEV when doing LoopUnroll, instead of just"78" the current top-most loop. This is sometimes preferred to reduce"79" compile time."));8081static cl::opt<unsigned>82UnrollThreshold("unroll-threshold", cl::Hidden,83cl::desc("The cost threshold for loop unrolling"));8485static cl::opt<unsigned>86UnrollOptSizeThreshold(87"unroll-optsize-threshold", cl::init(0), cl::Hidden,88cl::desc("The cost threshold for loop unrolling when optimizing for "89"size"));9091static cl::opt<unsigned> UnrollPartialThreshold(92"unroll-partial-threshold", cl::Hidden,93cl::desc("The cost threshold for partial loop unrolling"));9495static cl::opt<unsigned> UnrollMaxPercentThresholdBoost(96"unroll-max-percent-threshold-boost", cl::init(400), cl::Hidden,97cl::desc("The maximum 'boost' (represented as a percentage >= 100) applied "98"to the threshold when aggressively unrolling a loop due to the "99"dynamic cost savings. If completely unrolling a loop will reduce "100"the total runtime from X to Y, we boost the loop unroll "101"threshold to DefaultThreshold*std::min(MaxPercentThresholdBoost, "102"X/Y). This limit avoids excessive code bloat."));103104static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze(105"unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden,106cl::desc("Don't allow loop unrolling to simulate more than this number of"107"iterations when checking full unroll profitability"));108109static cl::opt<unsigned> UnrollCount(110"unroll-count", cl::Hidden,111cl::desc("Use this unroll count for all loops including those with "112"unroll_count pragma values, for testing purposes"));113114static cl::opt<unsigned> UnrollMaxCount(115"unroll-max-count", cl::Hidden,116cl::desc("Set the max unroll count for partial and runtime unrolling, for"117"testing purposes"));118119static cl::opt<unsigned> UnrollFullMaxCount(120"unroll-full-max-count", cl::Hidden,121cl::desc(122"Set the max unroll count for full unrolling, for testing purposes"));123124static cl::opt<bool>125UnrollAllowPartial("unroll-allow-partial", cl::Hidden,126cl::desc("Allows loops to be partially unrolled until "127"-unroll-threshold loop size is reached."));128129static cl::opt<bool> UnrollAllowRemainder(130"unroll-allow-remainder", cl::Hidden,131cl::desc("Allow generation of a loop remainder (extra iterations) "132"when unrolling a loop."));133134static cl::opt<bool>135UnrollRuntime("unroll-runtime", cl::Hidden,136cl::desc("Unroll loops with run-time trip counts"));137138static cl::opt<unsigned> UnrollMaxUpperBound(139"unroll-max-upperbound", cl::init(8), cl::Hidden,140cl::desc(141"The max of trip count upper bound that is considered in unrolling"));142143static cl::opt<unsigned> PragmaUnrollThreshold(144"pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden,145cl::desc("Unrolled size limit for loops with an unroll(full) or "146"unroll_count pragma."));147148static cl::opt<unsigned> FlatLoopTripCountThreshold(149"flat-loop-tripcount-threshold", cl::init(5), cl::Hidden,150cl::desc("If the runtime tripcount for the loop is lower than the "151"threshold, the loop is considered as flat and will be less "152"aggressively unrolled."));153154static cl::opt<bool> UnrollUnrollRemainder(155"unroll-remainder", cl::Hidden,156cl::desc("Allow the loop remainder to be unrolled."));157158// This option isn't ever intended to be enabled, it serves to allow159// experiments to check the assumptions about when this kind of revisit is160// necessary.161static cl::opt<bool> UnrollRevisitChildLoops(162"unroll-revisit-child-loops", cl::Hidden,163cl::desc("Enqueue and re-visit child loops in the loop PM after unrolling. "164"This shouldn't typically be needed as child loops (or their "165"clones) were already visited."));166167static cl::opt<unsigned> UnrollThresholdAggressive(168"unroll-threshold-aggressive", cl::init(300), cl::Hidden,169cl::desc("Threshold (max size of unrolled loop) to use in aggressive (O3) "170"optimizations"));171static cl::opt<unsigned>172UnrollThresholdDefault("unroll-threshold-default", cl::init(150),173cl::Hidden,174cl::desc("Default threshold (max size of unrolled "175"loop), used in all but O3 optimizations"));176177static cl::opt<unsigned> PragmaUnrollFullMaxIterations(178"pragma-unroll-full-max-iterations", cl::init(1'000'000), cl::Hidden,179cl::desc("Maximum allowed iterations to unroll under pragma unroll full."));180181/// A magic value for use with the Threshold parameter to indicate182/// that the loop unroll should be performed regardless of how much183/// code expansion would result.184static const unsigned NoThreshold = std::numeric_limits<unsigned>::max();185186/// Gather the various unrolling parameters based on the defaults, compiler187/// flags, TTI overrides and user specified parameters.188TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(189Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,190BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,191OptimizationRemarkEmitter &ORE, int OptLevel,192std::optional<unsigned> UserThreshold, std::optional<unsigned> UserCount,193std::optional<bool> UserAllowPartial, std::optional<bool> UserRuntime,194std::optional<bool> UserUpperBound,195std::optional<unsigned> UserFullUnrollMaxCount) {196TargetTransformInfo::UnrollingPreferences UP;197198// Set up the defaults199UP.Threshold =200OptLevel > 2 ? UnrollThresholdAggressive : UnrollThresholdDefault;201UP.MaxPercentThresholdBoost = 400;202UP.OptSizeThreshold = UnrollOptSizeThreshold;203UP.PartialThreshold = 150;204UP.PartialOptSizeThreshold = UnrollOptSizeThreshold;205UP.Count = 0;206UP.DefaultUnrollRuntimeCount = 8;207UP.MaxCount = std::numeric_limits<unsigned>::max();208UP.MaxUpperBound = UnrollMaxUpperBound;209UP.FullUnrollMaxCount = std::numeric_limits<unsigned>::max();210UP.BEInsns = 2;211UP.Partial = false;212UP.Runtime = false;213UP.AllowRemainder = true;214UP.UnrollRemainder = false;215UP.AllowExpensiveTripCount = false;216UP.Force = false;217UP.UpperBound = false;218UP.UnrollAndJam = false;219UP.UnrollAndJamInnerLoopThreshold = 60;220UP.MaxIterationsCountToAnalyze = UnrollMaxIterationsCountToAnalyze;221222// Override with any target specific settings223TTI.getUnrollingPreferences(L, SE, UP, &ORE);224225// Apply size attributes226bool OptForSize = L->getHeader()->getParent()->hasOptSize() ||227// Let unroll hints / pragmas take precedence over PGSO.228(hasUnrollTransformation(L) != TM_ForcedByUser &&229llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,230PGSOQueryType::IRPass));231if (OptForSize) {232UP.Threshold = UP.OptSizeThreshold;233UP.PartialThreshold = UP.PartialOptSizeThreshold;234UP.MaxPercentThresholdBoost = 100;235}236237// Apply any user values specified by cl::opt238if (UnrollThreshold.getNumOccurrences() > 0)239UP.Threshold = UnrollThreshold;240if (UnrollPartialThreshold.getNumOccurrences() > 0)241UP.PartialThreshold = UnrollPartialThreshold;242if (UnrollMaxPercentThresholdBoost.getNumOccurrences() > 0)243UP.MaxPercentThresholdBoost = UnrollMaxPercentThresholdBoost;244if (UnrollMaxCount.getNumOccurrences() > 0)245UP.MaxCount = UnrollMaxCount;246if (UnrollMaxUpperBound.getNumOccurrences() > 0)247UP.MaxUpperBound = UnrollMaxUpperBound;248if (UnrollFullMaxCount.getNumOccurrences() > 0)249UP.FullUnrollMaxCount = UnrollFullMaxCount;250if (UnrollAllowPartial.getNumOccurrences() > 0)251UP.Partial = UnrollAllowPartial;252if (UnrollAllowRemainder.getNumOccurrences() > 0)253UP.AllowRemainder = UnrollAllowRemainder;254if (UnrollRuntime.getNumOccurrences() > 0)255UP.Runtime = UnrollRuntime;256if (UnrollMaxUpperBound == 0)257UP.UpperBound = false;258if (UnrollUnrollRemainder.getNumOccurrences() > 0)259UP.UnrollRemainder = UnrollUnrollRemainder;260if (UnrollMaxIterationsCountToAnalyze.getNumOccurrences() > 0)261UP.MaxIterationsCountToAnalyze = UnrollMaxIterationsCountToAnalyze;262263// Apply user values provided by argument264if (UserThreshold) {265UP.Threshold = *UserThreshold;266UP.PartialThreshold = *UserThreshold;267}268if (UserCount)269UP.Count = *UserCount;270if (UserAllowPartial)271UP.Partial = *UserAllowPartial;272if (UserRuntime)273UP.Runtime = *UserRuntime;274if (UserUpperBound)275UP.UpperBound = *UserUpperBound;276if (UserFullUnrollMaxCount)277UP.FullUnrollMaxCount = *UserFullUnrollMaxCount;278279return UP;280}281282namespace {283284/// A struct to densely store the state of an instruction after unrolling at285/// each iteration.286///287/// This is designed to work like a tuple of <Instruction *, int> for the288/// purposes of hashing and lookup, but to be able to associate two boolean289/// states with each key.290struct UnrolledInstState {291Instruction *I;292int Iteration : 30;293unsigned IsFree : 1;294unsigned IsCounted : 1;295};296297/// Hashing and equality testing for a set of the instruction states.298struct UnrolledInstStateKeyInfo {299using PtrInfo = DenseMapInfo<Instruction *>;300using PairInfo = DenseMapInfo<std::pair<Instruction *, int>>;301302static inline UnrolledInstState getEmptyKey() {303return {PtrInfo::getEmptyKey(), 0, 0, 0};304}305306static inline UnrolledInstState getTombstoneKey() {307return {PtrInfo::getTombstoneKey(), 0, 0, 0};308}309310static inline unsigned getHashValue(const UnrolledInstState &S) {311return PairInfo::getHashValue({S.I, S.Iteration});312}313314static inline bool isEqual(const UnrolledInstState &LHS,315const UnrolledInstState &RHS) {316return PairInfo::isEqual({LHS.I, LHS.Iteration}, {RHS.I, RHS.Iteration});317}318};319320struct EstimatedUnrollCost {321/// The estimated cost after unrolling.322unsigned UnrolledCost;323324/// The estimated dynamic cost of executing the instructions in the325/// rolled form.326unsigned RolledDynamicCost;327};328329struct PragmaInfo {330PragmaInfo(bool UUC, bool PFU, unsigned PC, bool PEU)331: UserUnrollCount(UUC), PragmaFullUnroll(PFU), PragmaCount(PC),332PragmaEnableUnroll(PEU) {}333const bool UserUnrollCount;334const bool PragmaFullUnroll;335const unsigned PragmaCount;336const bool PragmaEnableUnroll;337};338339} // end anonymous namespace340341/// Figure out if the loop is worth full unrolling.342///343/// Complete loop unrolling can make some loads constant, and we need to know344/// if that would expose any further optimization opportunities. This routine345/// estimates this optimization. It computes cost of unrolled loop346/// (UnrolledCost) and dynamic cost of the original loop (RolledDynamicCost). By347/// dynamic cost we mean that we won't count costs of blocks that are known not348/// to be executed (i.e. if we have a branch in the loop and we know that at the349/// given iteration its condition would be resolved to true, we won't add up the350/// cost of the 'false'-block).351/// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If352/// the analysis failed (no benefits expected from the unrolling, or the loop is353/// too big to analyze), the returned value is std::nullopt.354static std::optional<EstimatedUnrollCost> analyzeLoopUnrollCost(355const Loop *L, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE,356const SmallPtrSetImpl<const Value *> &EphValues,357const TargetTransformInfo &TTI, unsigned MaxUnrolledLoopSize,358unsigned MaxIterationsCountToAnalyze) {359// We want to be able to scale offsets by the trip count and add more offsets360// to them without checking for overflows, and we already don't want to361// analyze *massive* trip counts, so we force the max to be reasonably small.362assert(MaxIterationsCountToAnalyze <363(unsigned)(std::numeric_limits<int>::max() / 2) &&364"The unroll iterations max is too large!");365366// Only analyze inner loops. We can't properly estimate cost of nested loops367// and we won't visit inner loops again anyway.368if (!L->isInnermost())369return std::nullopt;370371// Don't simulate loops with a big or unknown tripcount372if (!TripCount || TripCount > MaxIterationsCountToAnalyze)373return std::nullopt;374375SmallSetVector<BasicBlock *, 16> BBWorklist;376SmallSetVector<std::pair<BasicBlock *, BasicBlock *>, 4> ExitWorklist;377DenseMap<Value *, Value *> SimplifiedValues;378SmallVector<std::pair<Value *, Value *>, 4> SimplifiedInputValues;379380// The estimated cost of the unrolled form of the loop. We try to estimate381// this by simplifying as much as we can while computing the estimate.382InstructionCost UnrolledCost = 0;383384// We also track the estimated dynamic (that is, actually executed) cost in385// the rolled form. This helps identify cases when the savings from unrolling386// aren't just exposing dead control flows, but actual reduced dynamic387// instructions due to the simplifications which we expect to occur after388// unrolling.389InstructionCost RolledDynamicCost = 0;390391// We track the simplification of each instruction in each iteration. We use392// this to recursively merge costs into the unrolled cost on-demand so that393// we don't count the cost of any dead code. This is essentially a map from394// <instruction, int> to <bool, bool>, but stored as a densely packed struct.395DenseSet<UnrolledInstState, UnrolledInstStateKeyInfo> InstCostMap;396397// A small worklist used to accumulate cost of instructions from each398// observable and reached root in the loop.399SmallVector<Instruction *, 16> CostWorklist;400401// PHI-used worklist used between iterations while accumulating cost.402SmallVector<Instruction *, 4> PHIUsedList;403404// Helper function to accumulate cost for instructions in the loop.405auto AddCostRecursively = [&](Instruction &RootI, int Iteration) {406assert(Iteration >= 0 && "Cannot have a negative iteration!");407assert(CostWorklist.empty() && "Must start with an empty cost list");408assert(PHIUsedList.empty() && "Must start with an empty phi used list");409CostWorklist.push_back(&RootI);410TargetTransformInfo::TargetCostKind CostKind =411RootI.getFunction()->hasMinSize() ?412TargetTransformInfo::TCK_CodeSize :413TargetTransformInfo::TCK_SizeAndLatency;414for (;; --Iteration) {415do {416Instruction *I = CostWorklist.pop_back_val();417418// InstCostMap only uses I and Iteration as a key, the other two values419// don't matter here.420auto CostIter = InstCostMap.find({I, Iteration, 0, 0});421if (CostIter == InstCostMap.end())422// If an input to a PHI node comes from a dead path through the loop423// we may have no cost data for it here. What that actually means is424// that it is free.425continue;426auto &Cost = *CostIter;427if (Cost.IsCounted)428// Already counted this instruction.429continue;430431// Mark that we are counting the cost of this instruction now.432Cost.IsCounted = true;433434// If this is a PHI node in the loop header, just add it to the PHI set.435if (auto *PhiI = dyn_cast<PHINode>(I))436if (PhiI->getParent() == L->getHeader()) {437assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they "438"inherently simplify during unrolling.");439if (Iteration == 0)440continue;441442// Push the incoming value from the backedge into the PHI used list443// if it is an in-loop instruction. We'll use this to populate the444// cost worklist for the next iteration (as we count backwards).445if (auto *OpI = dyn_cast<Instruction>(446PhiI->getIncomingValueForBlock(L->getLoopLatch())))447if (L->contains(OpI))448PHIUsedList.push_back(OpI);449continue;450}451452// First accumulate the cost of this instruction.453if (!Cost.IsFree) {454// Consider simplified operands in instruction cost.455SmallVector<Value *, 4> Operands;456transform(I->operands(), std::back_inserter(Operands),457[&](Value *Op) {458if (auto Res = SimplifiedValues.lookup(Op))459return Res;460return Op;461});462UnrolledCost += TTI.getInstructionCost(I, Operands, CostKind);463LLVM_DEBUG(dbgs() << "Adding cost of instruction (iteration "464<< Iteration << "): ");465LLVM_DEBUG(I->dump());466}467468// We must count the cost of every operand which is not free,469// recursively. If we reach a loop PHI node, simply add it to the set470// to be considered on the next iteration (backwards!).471for (Value *Op : I->operands()) {472// Check whether this operand is free due to being a constant or473// outside the loop.474auto *OpI = dyn_cast<Instruction>(Op);475if (!OpI || !L->contains(OpI))476continue;477478// Otherwise accumulate its cost.479CostWorklist.push_back(OpI);480}481} while (!CostWorklist.empty());482483if (PHIUsedList.empty())484// We've exhausted the search.485break;486487assert(Iteration > 0 &&488"Cannot track PHI-used values past the first iteration!");489CostWorklist.append(PHIUsedList.begin(), PHIUsedList.end());490PHIUsedList.clear();491}492};493494// Ensure that we don't violate the loop structure invariants relied on by495// this analysis.496assert(L->isLoopSimplifyForm() && "Must put loop into normal form first.");497assert(L->isLCSSAForm(DT) &&498"Must have loops in LCSSA form to track live-out values.");499500LLVM_DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n");501502TargetTransformInfo::TargetCostKind CostKind =503L->getHeader()->getParent()->hasMinSize() ?504TargetTransformInfo::TCK_CodeSize : TargetTransformInfo::TCK_SizeAndLatency;505// Simulate execution of each iteration of the loop counting instructions,506// which would be simplified.507// Since the same load will take different values on different iterations,508// we literally have to go through all loop's iterations.509for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {510LLVM_DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n");511512// Prepare for the iteration by collecting any simplified entry or backedge513// inputs.514for (Instruction &I : *L->getHeader()) {515auto *PHI = dyn_cast<PHINode>(&I);516if (!PHI)517break;518519// The loop header PHI nodes must have exactly two input: one from the520// loop preheader and one from the loop latch.521assert(522PHI->getNumIncomingValues() == 2 &&523"Must have an incoming value only for the preheader and the latch.");524525Value *V = PHI->getIncomingValueForBlock(526Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch());527if (Iteration != 0 && SimplifiedValues.count(V))528V = SimplifiedValues.lookup(V);529SimplifiedInputValues.push_back({PHI, V});530}531532// Now clear and re-populate the map for the next iteration.533SimplifiedValues.clear();534while (!SimplifiedInputValues.empty())535SimplifiedValues.insert(SimplifiedInputValues.pop_back_val());536537UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L);538539BBWorklist.clear();540BBWorklist.insert(L->getHeader());541// Note that we *must not* cache the size, this loop grows the worklist.542for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {543BasicBlock *BB = BBWorklist[Idx];544545// Visit all instructions in the given basic block and try to simplify546// it. We don't change the actual IR, just count optimization547// opportunities.548for (Instruction &I : *BB) {549// These won't get into the final code - don't even try calculating the550// cost for them.551if (isa<DbgInfoIntrinsic>(I) || EphValues.count(&I))552continue;553554// Track this instruction's expected baseline cost when executing the555// rolled loop form.556RolledDynamicCost += TTI.getInstructionCost(&I, CostKind);557558// Visit the instruction to analyze its loop cost after unrolling,559// and if the visitor returns true, mark the instruction as free after560// unrolling and continue.561bool IsFree = Analyzer.visit(I);562bool Inserted = InstCostMap.insert({&I, (int)Iteration,563(unsigned)IsFree,564/*IsCounted*/ false}).second;565(void)Inserted;566assert(Inserted && "Cannot have a state for an unvisited instruction!");567568if (IsFree)569continue;570571// Can't properly model a cost of a call.572// FIXME: With a proper cost model we should be able to do it.573if (auto *CI = dyn_cast<CallInst>(&I)) {574const Function *Callee = CI->getCalledFunction();575if (!Callee || TTI.isLoweredToCall(Callee)) {576LLVM_DEBUG(dbgs() << "Can't analyze cost of loop with call\n");577return std::nullopt;578}579}580581// If the instruction might have a side-effect recursively account for582// the cost of it and all the instructions leading up to it.583if (I.mayHaveSideEffects())584AddCostRecursively(I, Iteration);585586// If unrolled body turns out to be too big, bail out.587if (UnrolledCost > MaxUnrolledLoopSize) {588LLVM_DEBUG(dbgs() << " Exceeded threshold.. exiting.\n"589<< " UnrolledCost: " << UnrolledCost590<< ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize591<< "\n");592return std::nullopt;593}594}595596Instruction *TI = BB->getTerminator();597598auto getSimplifiedConstant = [&](Value *V) -> Constant * {599if (SimplifiedValues.count(V))600V = SimplifiedValues.lookup(V);601return dyn_cast<Constant>(V);602};603604// Add in the live successors by first checking whether we have terminator605// that may be simplified based on the values simplified by this call.606BasicBlock *KnownSucc = nullptr;607if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {608if (BI->isConditional()) {609if (auto *SimpleCond = getSimplifiedConstant(BI->getCondition())) {610// Just take the first successor if condition is undef611if (isa<UndefValue>(SimpleCond))612KnownSucc = BI->getSuccessor(0);613else if (ConstantInt *SimpleCondVal =614dyn_cast<ConstantInt>(SimpleCond))615KnownSucc = BI->getSuccessor(SimpleCondVal->isZero() ? 1 : 0);616}617}618} else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {619if (auto *SimpleCond = getSimplifiedConstant(SI->getCondition())) {620// Just take the first successor if condition is undef621if (isa<UndefValue>(SimpleCond))622KnownSucc = SI->getSuccessor(0);623else if (ConstantInt *SimpleCondVal =624dyn_cast<ConstantInt>(SimpleCond))625KnownSucc = SI->findCaseValue(SimpleCondVal)->getCaseSuccessor();626}627}628if (KnownSucc) {629if (L->contains(KnownSucc))630BBWorklist.insert(KnownSucc);631else632ExitWorklist.insert({BB, KnownSucc});633continue;634}635636// Add BB's successors to the worklist.637for (BasicBlock *Succ : successors(BB))638if (L->contains(Succ))639BBWorklist.insert(Succ);640else641ExitWorklist.insert({BB, Succ});642AddCostRecursively(*TI, Iteration);643}644645// If we found no optimization opportunities on the first iteration, we646// won't find them on later ones too.647if (UnrolledCost == RolledDynamicCost) {648LLVM_DEBUG(dbgs() << " No opportunities found.. exiting.\n"649<< " UnrolledCost: " << UnrolledCost << "\n");650return std::nullopt;651}652}653654while (!ExitWorklist.empty()) {655BasicBlock *ExitingBB, *ExitBB;656std::tie(ExitingBB, ExitBB) = ExitWorklist.pop_back_val();657658for (Instruction &I : *ExitBB) {659auto *PN = dyn_cast<PHINode>(&I);660if (!PN)661break;662663Value *Op = PN->getIncomingValueForBlock(ExitingBB);664if (auto *OpI = dyn_cast<Instruction>(Op))665if (L->contains(OpI))666AddCostRecursively(*OpI, TripCount - 1);667}668}669670assert(UnrolledCost.isValid() && RolledDynamicCost.isValid() &&671"All instructions must have a valid cost, whether the "672"loop is rolled or unrolled.");673674LLVM_DEBUG(dbgs() << "Analysis finished:\n"675<< "UnrolledCost: " << UnrolledCost << ", "676<< "RolledDynamicCost: " << RolledDynamicCost << "\n");677return {{unsigned(*UnrolledCost.getValue()),678unsigned(*RolledDynamicCost.getValue())}};679}680681UnrollCostEstimator::UnrollCostEstimator(682const Loop *L, const TargetTransformInfo &TTI,683const SmallPtrSetImpl<const Value *> &EphValues, unsigned BEInsns) {684CodeMetrics Metrics;685for (BasicBlock *BB : L->blocks())686Metrics.analyzeBasicBlock(BB, TTI, EphValues, /* PrepareForLTO= */ false,687L);688NumInlineCandidates = Metrics.NumInlineCandidates;689NotDuplicatable = Metrics.notDuplicatable;690Convergence = Metrics.Convergence;691LoopSize = Metrics.NumInsts;692ConvergenceAllowsRuntime =693Metrics.Convergence != ConvergenceKind::Uncontrolled &&694!getLoopConvergenceHeart(L);695696// Don't allow an estimate of size zero. This would allows unrolling of loops697// with huge iteration counts, which is a compile time problem even if it's698// not a problem for code quality. Also, the code using this size may assume699// that each loop has at least three instructions (likely a conditional700// branch, a comparison feeding that branch, and some kind of loop increment701// feeding that comparison instruction).702if (LoopSize.isValid() && LoopSize < BEInsns + 1)703// This is an open coded max() on InstructionCost704LoopSize = BEInsns + 1;705}706707bool UnrollCostEstimator::canUnroll() const {708switch (Convergence) {709case ConvergenceKind::ExtendedLoop:710LLVM_DEBUG(dbgs() << " Convergence prevents unrolling.\n");711return false;712default:713break;714}715if (!LoopSize.isValid()) {716LLVM_DEBUG(dbgs() << " Invalid loop size prevents unrolling.\n");717return false;718}719if (NotDuplicatable) {720LLVM_DEBUG(dbgs() << " Non-duplicatable blocks prevent unrolling.\n");721return false;722}723return true;724}725726uint64_t UnrollCostEstimator::getUnrolledLoopSize(727const TargetTransformInfo::UnrollingPreferences &UP,728unsigned CountOverwrite) const {729unsigned LS = *LoopSize.getValue();730assert(LS >= UP.BEInsns && "LoopSize should not be less than BEInsns!");731if (CountOverwrite)732return static_cast<uint64_t>(LS - UP.BEInsns) * CountOverwrite + UP.BEInsns;733else734return static_cast<uint64_t>(LS - UP.BEInsns) * UP.Count + UP.BEInsns;735}736737// Returns the loop hint metadata node with the given name (for example,738// "llvm.loop.unroll.count"). If no such metadata node exists, then nullptr is739// returned.740static MDNode *getUnrollMetadataForLoop(const Loop *L, StringRef Name) {741if (MDNode *LoopID = L->getLoopID())742return GetUnrollMetadata(LoopID, Name);743return nullptr;744}745746// Returns true if the loop has an unroll(full) pragma.747static bool hasUnrollFullPragma(const Loop *L) {748return getUnrollMetadataForLoop(L, "llvm.loop.unroll.full");749}750751// Returns true if the loop has an unroll(enable) pragma. This metadata is used752// for both "#pragma unroll" and "#pragma clang loop unroll(enable)" directives.753static bool hasUnrollEnablePragma(const Loop *L) {754return getUnrollMetadataForLoop(L, "llvm.loop.unroll.enable");755}756757// Returns true if the loop has an runtime unroll(disable) pragma.758static bool hasRuntimeUnrollDisablePragma(const Loop *L) {759return getUnrollMetadataForLoop(L, "llvm.loop.unroll.runtime.disable");760}761762// If loop has an unroll_count pragma return the (necessarily763// positive) value from the pragma. Otherwise return 0.764static unsigned unrollCountPragmaValue(const Loop *L) {765MDNode *MD = getUnrollMetadataForLoop(L, "llvm.loop.unroll.count");766if (MD) {767assert(MD->getNumOperands() == 2 &&768"Unroll count hint metadata should have two operands.");769unsigned Count =770mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();771assert(Count >= 1 && "Unroll count must be positive.");772return Count;773}774return 0;775}776777// Computes the boosting factor for complete unrolling.778// If fully unrolling the loop would save a lot of RolledDynamicCost, it would779// be beneficial to fully unroll the loop even if unrolledcost is large. We780// use (RolledDynamicCost / UnrolledCost) to model the unroll benefits to adjust781// the unroll threshold.782static unsigned getFullUnrollBoostingFactor(const EstimatedUnrollCost &Cost,783unsigned MaxPercentThresholdBoost) {784if (Cost.RolledDynamicCost >= std::numeric_limits<unsigned>::max() / 100)785return 100;786else if (Cost.UnrolledCost != 0)787// The boosting factor is RolledDynamicCost / UnrolledCost788return std::min(100 * Cost.RolledDynamicCost / Cost.UnrolledCost,789MaxPercentThresholdBoost);790else791return MaxPercentThresholdBoost;792}793794static std::optional<unsigned>795shouldPragmaUnroll(Loop *L, const PragmaInfo &PInfo,796const unsigned TripMultiple, const unsigned TripCount,797unsigned MaxTripCount, const UnrollCostEstimator UCE,798const TargetTransformInfo::UnrollingPreferences &UP) {799800// Using unroll pragma801// 1st priority is unroll count set by "unroll-count" option.802803if (PInfo.UserUnrollCount) {804if (UP.AllowRemainder &&805UCE.getUnrolledLoopSize(UP, (unsigned)UnrollCount) < UP.Threshold)806return (unsigned)UnrollCount;807}808809// 2nd priority is unroll count set by pragma.810if (PInfo.PragmaCount > 0) {811if ((UP.AllowRemainder || (TripMultiple % PInfo.PragmaCount == 0)))812return PInfo.PragmaCount;813}814815if (PInfo.PragmaFullUnroll && TripCount != 0) {816// Certain cases with UBSAN can cause trip count to be calculated as817// INT_MAX, Block full unrolling at a reasonable limit so that the compiler818// doesn't hang trying to unroll the loop. See PR77842819if (TripCount > PragmaUnrollFullMaxIterations) {820LLVM_DEBUG(dbgs() << "Won't unroll; trip count is too large\n");821return std::nullopt;822}823824return TripCount;825}826827if (PInfo.PragmaEnableUnroll && !TripCount && MaxTripCount &&828MaxTripCount <= UP.MaxUpperBound)829return MaxTripCount;830831// if didn't return until here, should continue to other priorties832return std::nullopt;833}834835static std::optional<unsigned> shouldFullUnroll(836Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT,837ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues,838const unsigned FullUnrollTripCount, const UnrollCostEstimator UCE,839const TargetTransformInfo::UnrollingPreferences &UP) {840assert(FullUnrollTripCount && "should be non-zero!");841842if (FullUnrollTripCount > UP.FullUnrollMaxCount)843return std::nullopt;844845// When computing the unrolled size, note that BEInsns are not replicated846// like the rest of the loop body.847if (UCE.getUnrolledLoopSize(UP) < UP.Threshold)848return FullUnrollTripCount;849850// The loop isn't that small, but we still can fully unroll it if that851// helps to remove a significant number of instructions.852// To check that, run additional analysis on the loop.853if (std::optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(854L, FullUnrollTripCount, DT, SE, EphValues, TTI,855UP.Threshold * UP.MaxPercentThresholdBoost / 100,856UP.MaxIterationsCountToAnalyze)) {857unsigned Boost =858getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);859if (Cost->UnrolledCost < UP.Threshold * Boost / 100)860return FullUnrollTripCount;861}862return std::nullopt;863}864865static std::optional<unsigned>866shouldPartialUnroll(const unsigned LoopSize, const unsigned TripCount,867const UnrollCostEstimator UCE,868const TargetTransformInfo::UnrollingPreferences &UP) {869870if (!TripCount)871return std::nullopt;872873if (!UP.Partial) {874LLVM_DEBUG(dbgs() << " will not try to unroll partially because "875<< "-unroll-allow-partial not given\n");876return 0;877}878unsigned count = UP.Count;879if (count == 0)880count = TripCount;881if (UP.PartialThreshold != NoThreshold) {882// Reduce unroll count to be modulo of TripCount for partial unrolling.883if (UCE.getUnrolledLoopSize(UP, count) > UP.PartialThreshold)884count = (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) /885(LoopSize - UP.BEInsns);886if (count > UP.MaxCount)887count = UP.MaxCount;888while (count != 0 && TripCount % count != 0)889count--;890if (UP.AllowRemainder && count <= 1) {891// If there is no Count that is modulo of TripCount, set Count to892// largest power-of-two factor that satisfies the threshold limit.893// As we'll create fixup loop, do the type of unrolling only if894// remainder loop is allowed.895count = UP.DefaultUnrollRuntimeCount;896while (count != 0 &&897UCE.getUnrolledLoopSize(UP, count) > UP.PartialThreshold)898count >>= 1;899}900if (count < 2) {901count = 0;902}903} else {904count = TripCount;905}906if (count > UP.MaxCount)907count = UP.MaxCount;908909LLVM_DEBUG(dbgs() << " partially unrolling with count: " << count << "\n");910911return count;912}913// Returns true if unroll count was set explicitly.914// Calculates unroll count and writes it to UP.Count.915// Unless IgnoreUser is true, will also use metadata and command-line options916// that are specific to to the LoopUnroll pass (which, for instance, are917// irrelevant for the LoopUnrollAndJam pass).918// FIXME: This function is used by LoopUnroll and LoopUnrollAndJam, but consumes919// many LoopUnroll-specific options. The shared functionality should be920// refactored into it own function.921bool llvm::computeUnrollCount(922Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,923AssumptionCache *AC, ScalarEvolution &SE,924const SmallPtrSetImpl<const Value *> &EphValues,925OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount,926bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE,927TargetTransformInfo::UnrollingPreferences &UP,928TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound) {929930unsigned LoopSize = UCE.getRolledLoopSize();931932const bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0;933const bool PragmaFullUnroll = hasUnrollFullPragma(L);934const unsigned PragmaCount = unrollCountPragmaValue(L);935const bool PragmaEnableUnroll = hasUnrollEnablePragma(L);936937const bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll ||938PragmaEnableUnroll || UserUnrollCount;939940PragmaInfo PInfo(UserUnrollCount, PragmaFullUnroll, PragmaCount,941PragmaEnableUnroll);942// Use an explicit peel count that has been specified for testing. In this943// case it's not permitted to also specify an explicit unroll count.944if (PP.PeelCount) {945if (UnrollCount.getNumOccurrences() > 0) {946report_fatal_error("Cannot specify both explicit peel count and "947"explicit unroll count", /*GenCrashDiag=*/false);948}949UP.Count = 1;950UP.Runtime = false;951return true;952}953// Check for explicit Count.954// 1st priority is unroll count set by "unroll-count" option.955// 2nd priority is unroll count set by pragma.956if (auto UnrollFactor = shouldPragmaUnroll(L, PInfo, TripMultiple, TripCount,957MaxTripCount, UCE, UP)) {958UP.Count = *UnrollFactor;959960if (UserUnrollCount || (PragmaCount > 0)) {961UP.AllowExpensiveTripCount = true;962UP.Force = true;963}964UP.Runtime |= (PragmaCount > 0);965return ExplicitUnroll;966} else {967if (ExplicitUnroll && TripCount != 0) {968// If the loop has an unrolling pragma, we want to be more aggressive with969// unrolling limits. Set thresholds to at least the PragmaUnrollThreshold970// value which is larger than the default limits.971UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold);972UP.PartialThreshold =973std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold);974}975}976977// 3rd priority is exact full unrolling. This will eliminate all copies978// of some exit test.979UP.Count = 0;980if (TripCount) {981UP.Count = TripCount;982if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues,983TripCount, UCE, UP)) {984UP.Count = *UnrollFactor;985UseUpperBound = false;986return ExplicitUnroll;987}988}989990// 4th priority is bounded unrolling.991// We can unroll by the upper bound amount if it's generally allowed or if992// we know that the loop is executed either the upper bound or zero times.993// (MaxOrZero unrolling keeps only the first loop test, so the number of994// loop tests remains the same compared to the non-unrolled version, whereas995// the generic upper bound unrolling keeps all but the last loop test so the996// number of loop tests goes up which may end up being worse on targets with997// constrained branch predictor resources so is controlled by an option.)998// In addition we only unroll small upper bounds.999// Note that the cost of bounded unrolling is always strictly greater than1000// cost of exact full unrolling. As such, if we have an exact count and1001// found it unprofitable, we'll never chose to bounded unroll.1002if (!TripCount && MaxTripCount && (UP.UpperBound || MaxOrZero) &&1003MaxTripCount <= UP.MaxUpperBound) {1004UP.Count = MaxTripCount;1005if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues,1006MaxTripCount, UCE, UP)) {1007UP.Count = *UnrollFactor;1008UseUpperBound = true;1009return ExplicitUnroll;1010}1011}10121013// 5th priority is loop peeling.1014computePeelCount(L, LoopSize, PP, TripCount, DT, SE, AC, UP.Threshold);1015if (PP.PeelCount) {1016UP.Runtime = false;1017UP.Count = 1;1018return ExplicitUnroll;1019}10201021// Before starting partial unrolling, set up.partial to true,1022// if user explicitly asked for unrolling1023if (TripCount)1024UP.Partial |= ExplicitUnroll;10251026// 6th priority is partial unrolling.1027// Try partial unroll only when TripCount could be statically calculated.1028if (auto UnrollFactor = shouldPartialUnroll(LoopSize, TripCount, UCE, UP)) {1029UP.Count = *UnrollFactor;10301031if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount &&1032UP.Count != TripCount)1033ORE->emit([&]() {1034return OptimizationRemarkMissed(DEBUG_TYPE,1035"FullUnrollAsDirectedTooLarge",1036L->getStartLoc(), L->getHeader())1037<< "Unable to fully unroll loop as directed by unroll pragma "1038"because "1039"unrolled size is too large.";1040});10411042if (UP.PartialThreshold != NoThreshold) {1043if (UP.Count == 0) {1044if (PragmaEnableUnroll)1045ORE->emit([&]() {1046return OptimizationRemarkMissed(DEBUG_TYPE,1047"UnrollAsDirectedTooLarge",1048L->getStartLoc(), L->getHeader())1049<< "Unable to unroll loop as directed by unroll(enable) "1050"pragma "1051"because unrolled size is too large.";1052});1053}1054}1055return ExplicitUnroll;1056}1057assert(TripCount == 0 &&1058"All cases when TripCount is constant should be covered here.");1059if (PragmaFullUnroll)1060ORE->emit([&]() {1061return OptimizationRemarkMissed(1062DEBUG_TYPE, "CantFullUnrollAsDirectedRuntimeTripCount",1063L->getStartLoc(), L->getHeader())1064<< "Unable to fully unroll loop as directed by unroll(full) "1065"pragma "1066"because loop has a runtime trip count.";1067});10681069// 7th priority is runtime unrolling.1070// Don't unroll a runtime trip count loop when it is disabled.1071if (hasRuntimeUnrollDisablePragma(L)) {1072UP.Count = 0;1073return false;1074}10751076// Don't unroll a small upper bound loop unless user or TTI asked to do so.1077if (MaxTripCount && !UP.Force && MaxTripCount < UP.MaxUpperBound) {1078UP.Count = 0;1079return false;1080}10811082// Check if the runtime trip count is too small when profile is available.1083if (L->getHeader()->getParent()->hasProfileData()) {1084if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) {1085if (*ProfileTripCount < FlatLoopTripCountThreshold)1086return false;1087else1088UP.AllowExpensiveTripCount = true;1089}1090}1091UP.Runtime |= PragmaEnableUnroll || PragmaCount > 0 || UserUnrollCount;1092if (!UP.Runtime) {1093LLVM_DEBUG(1094dbgs() << " will not try to unroll loop with runtime trip count "1095<< "-unroll-runtime not given\n");1096UP.Count = 0;1097return false;1098}1099if (UP.Count == 0)1100UP.Count = UP.DefaultUnrollRuntimeCount;11011102// Reduce unroll count to be the largest power-of-two factor of1103// the original count which satisfies the threshold limit.1104while (UP.Count != 0 &&1105UCE.getUnrolledLoopSize(UP) > UP.PartialThreshold)1106UP.Count >>= 1;11071108#ifndef NDEBUG1109unsigned OrigCount = UP.Count;1110#endif11111112if (!UP.AllowRemainder && UP.Count != 0 && (TripMultiple % UP.Count) != 0) {1113while (UP.Count != 0 && TripMultiple % UP.Count != 0)1114UP.Count >>= 1;1115LLVM_DEBUG(1116dbgs() << "Remainder loop is restricted (that could architecture "1117"specific or because the loop contains a convergent "1118"instruction), so unroll count must divide the trip "1119"multiple, "1120<< TripMultiple << ". Reducing unroll count from " << OrigCount1121<< " to " << UP.Count << ".\n");11221123using namespace ore;11241125if (unrollCountPragmaValue(L) > 0 && !UP.AllowRemainder)1126ORE->emit([&]() {1127return OptimizationRemarkMissed(DEBUG_TYPE,1128"DifferentUnrollCountFromDirected",1129L->getStartLoc(), L->getHeader())1130<< "Unable to unroll loop the number of times directed by "1131"unroll_count pragma because remainder loop is restricted "1132"(that could architecture specific or because the loop "1133"contains a convergent instruction) and so must have an "1134"unroll "1135"count that divides the loop trip multiple of "1136<< NV("TripMultiple", TripMultiple) << ". Unrolling instead "1137<< NV("UnrollCount", UP.Count) << " time(s).";1138});1139}11401141if (UP.Count > UP.MaxCount)1142UP.Count = UP.MaxCount;11431144if (MaxTripCount && UP.Count > MaxTripCount)1145UP.Count = MaxTripCount;11461147LLVM_DEBUG(dbgs() << " runtime unrolling with count: " << UP.Count1148<< "\n");1149if (UP.Count < 2)1150UP.Count = 0;1151return ExplicitUnroll;1152}11531154static LoopUnrollResult1155tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,1156const TargetTransformInfo &TTI, AssumptionCache &AC,1157OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI,1158ProfileSummaryInfo *PSI, bool PreserveLCSSA, int OptLevel,1159bool OnlyFullUnroll, bool OnlyWhenForced, bool ForgetAllSCEV,1160std::optional<unsigned> ProvidedCount,1161std::optional<unsigned> ProvidedThreshold,1162std::optional<bool> ProvidedAllowPartial,1163std::optional<bool> ProvidedRuntime,1164std::optional<bool> ProvidedUpperBound,1165std::optional<bool> ProvidedAllowPeeling,1166std::optional<bool> ProvidedAllowProfileBasedPeeling,1167std::optional<unsigned> ProvidedFullUnrollMaxCount,1168AAResults *AA = nullptr) {11691170LLVM_DEBUG(dbgs() << "Loop Unroll: F["1171<< L->getHeader()->getParent()->getName() << "] Loop %"1172<< L->getHeader()->getName() << "\n");1173TransformationMode TM = hasUnrollTransformation(L);1174if (TM & TM_Disable)1175return LoopUnrollResult::Unmodified;11761177// If this loop isn't forced to be unrolled, avoid unrolling it when the1178// parent loop has an explicit unroll-and-jam pragma. This is to prevent1179// automatic unrolling from interfering with the user requested1180// transformation.1181Loop *ParentL = L->getParentLoop();1182if (ParentL != nullptr &&1183hasUnrollAndJamTransformation(ParentL) == TM_ForcedByUser &&1184hasUnrollTransformation(L) != TM_ForcedByUser) {1185LLVM_DEBUG(dbgs() << "Not unrolling loop since parent loop has"1186<< " llvm.loop.unroll_and_jam.\n");1187return LoopUnrollResult::Unmodified;1188}11891190// If this loop isn't forced to be unrolled, avoid unrolling it when the1191// loop has an explicit unroll-and-jam pragma. This is to prevent automatic1192// unrolling from interfering with the user requested transformation.1193if (hasUnrollAndJamTransformation(L) == TM_ForcedByUser &&1194hasUnrollTransformation(L) != TM_ForcedByUser) {1195LLVM_DEBUG(1196dbgs()1197<< " Not unrolling loop since it has llvm.loop.unroll_and_jam.\n");1198return LoopUnrollResult::Unmodified;1199}12001201if (!L->isLoopSimplifyForm()) {1202LLVM_DEBUG(1203dbgs() << " Not unrolling loop which is not in loop-simplify form.\n");1204return LoopUnrollResult::Unmodified;1205}12061207// When automatic unrolling is disabled, do not unroll unless overridden for1208// this loop.1209if (OnlyWhenForced && !(TM & TM_Enable))1210return LoopUnrollResult::Unmodified;12111212bool OptForSize = L->getHeader()->getParent()->hasOptSize();1213TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(1214L, SE, TTI, BFI, PSI, ORE, OptLevel, ProvidedThreshold, ProvidedCount,1215ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,1216ProvidedFullUnrollMaxCount);1217TargetTransformInfo::PeelingPreferences PP = gatherPeelingPreferences(1218L, SE, TTI, ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling, true);12191220// Exit early if unrolling is disabled. For OptForSize, we pick the loop size1221// as threshold later on.1222if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0) &&1223!OptForSize)1224return LoopUnrollResult::Unmodified;12251226SmallPtrSet<const Value *, 32> EphValues;1227CodeMetrics::collectEphemeralValues(L, &AC, EphValues);12281229UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);1230if (!UCE.canUnroll()) {1231LLVM_DEBUG(dbgs() << " Loop not considered unrollable.\n");1232return LoopUnrollResult::Unmodified;1233}12341235unsigned LoopSize = UCE.getRolledLoopSize();1236LLVM_DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n");12371238// When optimizing for size, use LoopSize + 1 as threshold (we use < Threshold1239// later), to (fully) unroll loops, if it does not increase code size.1240if (OptForSize)1241UP.Threshold = std::max(UP.Threshold, LoopSize + 1);12421243if (UCE.NumInlineCandidates != 0) {1244LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");1245return LoopUnrollResult::Unmodified;1246}12471248// Find the smallest exact trip count for any exit. This is an upper bound1249// on the loop trip count, but an exit at an earlier iteration is still1250// possible. An unroll by the smallest exact trip count guarantees that all1251// branches relating to at least one exit can be eliminated. This is unlike1252// the max trip count, which only guarantees that the backedge can be broken.1253unsigned TripCount = 0;1254unsigned TripMultiple = 1;1255SmallVector<BasicBlock *, 8> ExitingBlocks;1256L->getExitingBlocks(ExitingBlocks);1257for (BasicBlock *ExitingBlock : ExitingBlocks)1258if (unsigned TC = SE.getSmallConstantTripCount(L, ExitingBlock))1259if (!TripCount || TC < TripCount)1260TripCount = TripMultiple = TC;12611262if (!TripCount) {1263// If no exact trip count is known, determine the trip multiple of either1264// the loop latch or the single exiting block.1265// TODO: Relax for multiple exits.1266BasicBlock *ExitingBlock = L->getLoopLatch();1267if (!ExitingBlock || !L->isLoopExiting(ExitingBlock))1268ExitingBlock = L->getExitingBlock();1269if (ExitingBlock)1270TripMultiple = SE.getSmallConstantTripMultiple(L, ExitingBlock);1271}12721273// If the loop contains a convergent operation, the prelude we'd add1274// to do the first few instructions before we hit the unrolled loop1275// is unsafe -- it adds a control-flow dependency to the convergent1276// operation. Therefore restrict remainder loop (try unrolling without).1277//1278// TODO: This is somewhat conservative; we could allow the remainder if the1279// trip count is uniform.1280UP.AllowRemainder &= UCE.ConvergenceAllowsRuntime;12811282// Try to find the trip count upper bound if we cannot find the exact trip1283// count.1284unsigned MaxTripCount = 0;1285bool MaxOrZero = false;1286if (!TripCount) {1287MaxTripCount = SE.getSmallConstantMaxTripCount(L);1288MaxOrZero = SE.isBackedgeTakenCountMaxOrZero(L);1289}12901291// computeUnrollCount() decides whether it is beneficial to use upper bound to1292// fully unroll the loop.1293bool UseUpperBound = false;1294bool IsCountSetExplicitly = computeUnrollCount(1295L, TTI, DT, LI, &AC, SE, EphValues, &ORE, TripCount, MaxTripCount,1296MaxOrZero, TripMultiple, UCE, UP, PP, UseUpperBound);1297if (!UP.Count)1298return LoopUnrollResult::Unmodified;12991300UP.Runtime &= UCE.ConvergenceAllowsRuntime;13011302if (PP.PeelCount) {1303assert(UP.Count == 1 && "Cannot perform peel and unroll in the same step");1304LLVM_DEBUG(dbgs() << "PEELING loop %" << L->getHeader()->getName()1305<< " with iteration count " << PP.PeelCount << "!\n");1306ORE.emit([&]() {1307return OptimizationRemark(DEBUG_TYPE, "Peeled", L->getStartLoc(),1308L->getHeader())1309<< " peeled loop by " << ore::NV("PeelCount", PP.PeelCount)1310<< " iterations";1311});13121313ValueToValueMapTy VMap;1314if (peelLoop(L, PP.PeelCount, LI, &SE, DT, &AC, PreserveLCSSA, VMap)) {1315simplifyLoopAfterUnroll(L, true, LI, &SE, &DT, &AC, &TTI, nullptr);1316// If the loop was peeled, we already "used up" the profile information1317// we had, so we don't want to unroll or peel again.1318if (PP.PeelProfiledIterations)1319L->setLoopAlreadyUnrolled();1320return LoopUnrollResult::PartiallyUnrolled;1321}1322return LoopUnrollResult::Unmodified;1323}13241325// Do not attempt partial/runtime unrolling in FullLoopUnrolling1326if (OnlyFullUnroll && (UP.Count < TripCount || UP.Count < MaxTripCount)) {1327LLVM_DEBUG(1328dbgs() << "Not attempting partial/runtime unroll in FullLoopUnroll.\n");1329return LoopUnrollResult::Unmodified;1330}13311332// At this point, UP.Runtime indicates that run-time unrolling is allowed.1333// However, we only want to actually perform it if we don't know the trip1334// count and the unroll count doesn't divide the known trip multiple.1335// TODO: This decision should probably be pushed up into1336// computeUnrollCount().1337UP.Runtime &= TripCount == 0 && TripMultiple % UP.Count != 0;13381339// Save loop properties before it is transformed.1340MDNode *OrigLoopID = L->getLoopID();13411342// Unroll the loop.1343Loop *RemainderLoop = nullptr;1344UnrollLoopOptions ULO;1345ULO.Count = UP.Count;1346ULO.Force = UP.Force;1347ULO.AllowExpensiveTripCount = UP.AllowExpensiveTripCount;1348ULO.UnrollRemainder = UP.UnrollRemainder;1349ULO.Runtime = UP.Runtime;1350ULO.ForgetAllSCEV = ForgetAllSCEV;1351ULO.Heart = getLoopConvergenceHeart(L);1352LoopUnrollResult UnrollResult = UnrollLoop(1353L, ULO, LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop, AA);1354if (UnrollResult == LoopUnrollResult::Unmodified)1355return LoopUnrollResult::Unmodified;13561357if (RemainderLoop) {1358std::optional<MDNode *> RemainderLoopID =1359makeFollowupLoopID(OrigLoopID, {LLVMLoopUnrollFollowupAll,1360LLVMLoopUnrollFollowupRemainder});1361if (RemainderLoopID)1362RemainderLoop->setLoopID(*RemainderLoopID);1363}13641365if (UnrollResult != LoopUnrollResult::FullyUnrolled) {1366std::optional<MDNode *> NewLoopID =1367makeFollowupLoopID(OrigLoopID, {LLVMLoopUnrollFollowupAll,1368LLVMLoopUnrollFollowupUnrolled});1369if (NewLoopID) {1370L->setLoopID(*NewLoopID);13711372// Do not setLoopAlreadyUnrolled if loop attributes have been specified1373// explicitly.1374return UnrollResult;1375}1376}13771378// If loop has an unroll count pragma or unrolled by explicitly set count1379// mark loop as unrolled to prevent unrolling beyond that requested.1380if (UnrollResult != LoopUnrollResult::FullyUnrolled && IsCountSetExplicitly)1381L->setLoopAlreadyUnrolled();13821383return UnrollResult;1384}13851386namespace {13871388class LoopUnroll : public LoopPass {1389public:1390static char ID; // Pass ID, replacement for typeid13911392int OptLevel;13931394/// If false, use a cost model to determine whether unrolling of a loop is1395/// profitable. If true, only loops that explicitly request unrolling via1396/// metadata are considered. All other loops are skipped.1397bool OnlyWhenForced;13981399/// If false, when SCEV is invalidated, only forget everything in the1400/// top-most loop (call forgetTopMostLoop), of the loop being processed.1401/// Otherwise, forgetAllLoops and rebuild when needed next.1402bool ForgetAllSCEV;14031404std::optional<unsigned> ProvidedCount;1405std::optional<unsigned> ProvidedThreshold;1406std::optional<bool> ProvidedAllowPartial;1407std::optional<bool> ProvidedRuntime;1408std::optional<bool> ProvidedUpperBound;1409std::optional<bool> ProvidedAllowPeeling;1410std::optional<bool> ProvidedAllowProfileBasedPeeling;1411std::optional<unsigned> ProvidedFullUnrollMaxCount;14121413LoopUnroll(int OptLevel = 2, bool OnlyWhenForced = false,1414bool ForgetAllSCEV = false,1415std::optional<unsigned> Threshold = std::nullopt,1416std::optional<unsigned> Count = std::nullopt,1417std::optional<bool> AllowPartial = std::nullopt,1418std::optional<bool> Runtime = std::nullopt,1419std::optional<bool> UpperBound = std::nullopt,1420std::optional<bool> AllowPeeling = std::nullopt,1421std::optional<bool> AllowProfileBasedPeeling = std::nullopt,1422std::optional<unsigned> ProvidedFullUnrollMaxCount = std::nullopt)1423: LoopPass(ID), OptLevel(OptLevel), OnlyWhenForced(OnlyWhenForced),1424ForgetAllSCEV(ForgetAllSCEV), ProvidedCount(std::move(Count)),1425ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial),1426ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound),1427ProvidedAllowPeeling(AllowPeeling),1428ProvidedAllowProfileBasedPeeling(AllowProfileBasedPeeling),1429ProvidedFullUnrollMaxCount(ProvidedFullUnrollMaxCount) {1430initializeLoopUnrollPass(*PassRegistry::getPassRegistry());1431}14321433bool runOnLoop(Loop *L, LPPassManager &LPM) override {1434if (skipLoop(L))1435return false;14361437Function &F = *L->getHeader()->getParent();14381439auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();1440LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();1441ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();1442const TargetTransformInfo &TTI =1443getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);1444auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);1445// For the old PM, we can't use OptimizationRemarkEmitter as an analysis1446// pass. Function analyses need to be preserved across loop transformations1447// but ORE cannot be preserved (see comment before the pass definition).1448OptimizationRemarkEmitter ORE(&F);1449bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);14501451LoopUnrollResult Result = tryToUnrollLoop(1452L, DT, LI, SE, TTI, AC, ORE, nullptr, nullptr, PreserveLCSSA, OptLevel,1453/*OnlyFullUnroll*/ false, OnlyWhenForced, ForgetAllSCEV, ProvidedCount,1454ProvidedThreshold, ProvidedAllowPartial, ProvidedRuntime,1455ProvidedUpperBound, ProvidedAllowPeeling,1456ProvidedAllowProfileBasedPeeling, ProvidedFullUnrollMaxCount);14571458if (Result == LoopUnrollResult::FullyUnrolled)1459LPM.markLoopAsDeleted(*L);14601461return Result != LoopUnrollResult::Unmodified;1462}14631464/// This transformation requires natural loop information & requires that1465/// loop preheaders be inserted into the CFG...1466void getAnalysisUsage(AnalysisUsage &AU) const override {1467AU.addRequired<AssumptionCacheTracker>();1468AU.addRequired<TargetTransformInfoWrapperPass>();1469// FIXME: Loop passes are required to preserve domtree, and for now we just1470// recreate dom info if anything gets unrolled.1471getLoopAnalysisUsage(AU);1472}1473};14741475} // end anonymous namespace14761477char LoopUnroll::ID = 0;14781479INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)1480INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)1481INITIALIZE_PASS_DEPENDENCY(LoopPass)1482INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)1483INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)14841485Pass *llvm::createLoopUnrollPass(int OptLevel, bool OnlyWhenForced,1486bool ForgetAllSCEV, int Threshold, int Count,1487int AllowPartial, int Runtime, int UpperBound,1488int AllowPeeling) {1489// TODO: It would make more sense for this function to take the optionals1490// directly, but that's dangerous since it would silently break out of tree1491// callers.1492return new LoopUnroll(1493OptLevel, OnlyWhenForced, ForgetAllSCEV,1494Threshold == -1 ? std::nullopt : std::optional<unsigned>(Threshold),1495Count == -1 ? std::nullopt : std::optional<unsigned>(Count),1496AllowPartial == -1 ? std::nullopt : std::optional<bool>(AllowPartial),1497Runtime == -1 ? std::nullopt : std::optional<bool>(Runtime),1498UpperBound == -1 ? std::nullopt : std::optional<bool>(UpperBound),1499AllowPeeling == -1 ? std::nullopt : std::optional<bool>(AllowPeeling));1500}15011502PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM,1503LoopStandardAnalysisResults &AR,1504LPMUpdater &Updater) {1505// For the new PM, we can't use OptimizationRemarkEmitter as an analysis1506// pass. Function analyses need to be preserved across loop transformations1507// but ORE cannot be preserved (see comment before the pass definition).1508OptimizationRemarkEmitter ORE(L.getHeader()->getParent());15091510// Keep track of the previous loop structure so we can identify new loops1511// created by unrolling.1512Loop *ParentL = L.getParentLoop();1513SmallPtrSet<Loop *, 4> OldLoops;1514if (ParentL)1515OldLoops.insert(ParentL->begin(), ParentL->end());1516else1517OldLoops.insert(AR.LI.begin(), AR.LI.end());15181519std::string LoopName = std::string(L.getName());15201521bool Changed =1522tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, ORE,1523/*BFI*/ nullptr, /*PSI*/ nullptr,1524/*PreserveLCSSA*/ true, OptLevel, /*OnlyFullUnroll*/ true,1525OnlyWhenForced, ForgetSCEV, /*Count*/ std::nullopt,1526/*Threshold*/ std::nullopt, /*AllowPartial*/ false,1527/*Runtime*/ false, /*UpperBound*/ false,1528/*AllowPeeling*/ true,1529/*AllowProfileBasedPeeling*/ false,1530/*FullUnrollMaxCount*/ std::nullopt) !=1531LoopUnrollResult::Unmodified;1532if (!Changed)1533return PreservedAnalyses::all();15341535// The parent must not be damaged by unrolling!1536#ifndef NDEBUG1537if (ParentL)1538ParentL->verifyLoop();1539#endif15401541// Unrolling can do several things to introduce new loops into a loop nest:1542// - Full unrolling clones child loops within the current loop but then1543// removes the current loop making all of the children appear to be new1544// sibling loops.1545//1546// When a new loop appears as a sibling loop after fully unrolling,1547// its nesting structure has fundamentally changed and we want to revisit1548// it to reflect that.1549//1550// When unrolling has removed the current loop, we need to tell the1551// infrastructure that it is gone.1552//1553// Finally, we support a debugging/testing mode where we revisit child loops1554// as well. These are not expected to require further optimizations as either1555// they or the loop they were cloned from have been directly visited already.1556// But the debugging mode allows us to check this assumption.1557bool IsCurrentLoopValid = false;1558SmallVector<Loop *, 4> SibLoops;1559if (ParentL)1560SibLoops.append(ParentL->begin(), ParentL->end());1561else1562SibLoops.append(AR.LI.begin(), AR.LI.end());1563erase_if(SibLoops, [&](Loop *SibLoop) {1564if (SibLoop == &L) {1565IsCurrentLoopValid = true;1566return true;1567}15681569// Otherwise erase the loop from the list if it was in the old loops.1570return OldLoops.contains(SibLoop);1571});1572Updater.addSiblingLoops(SibLoops);15731574if (!IsCurrentLoopValid) {1575Updater.markLoopAsDeleted(L, LoopName);1576} else {1577// We can only walk child loops if the current loop remained valid.1578if (UnrollRevisitChildLoops) {1579// Walk *all* of the child loops.1580SmallVector<Loop *, 4> ChildLoops(L.begin(), L.end());1581Updater.addChildLoops(ChildLoops);1582}1583}15841585return getLoopPassPreservedAnalyses();1586}15871588PreservedAnalyses LoopUnrollPass::run(Function &F,1589FunctionAnalysisManager &AM) {1590auto &LI = AM.getResult<LoopAnalysis>(F);1591// There are no loops in the function. Return before computing other expensive1592// analyses.1593if (LI.empty())1594return PreservedAnalyses::all();1595auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);1596auto &TTI = AM.getResult<TargetIRAnalysis>(F);1597auto &DT = AM.getResult<DominatorTreeAnalysis>(F);1598auto &AC = AM.getResult<AssumptionAnalysis>(F);1599auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);1600AAResults &AA = AM.getResult<AAManager>(F);16011602LoopAnalysisManager *LAM = nullptr;1603if (auto *LAMProxy = AM.getCachedResult<LoopAnalysisManagerFunctionProxy>(F))1604LAM = &LAMProxy->getManager();16051606auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);1607ProfileSummaryInfo *PSI =1608MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());1609auto *BFI = (PSI && PSI->hasProfileSummary()) ?1610&AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;16111612bool Changed = false;16131614// The unroller requires loops to be in simplified form, and also needs LCSSA.1615// Since simplification may add new inner loops, it has to run before the1616// legality and profitability checks. This means running the loop unroller1617// will simplify all loops, regardless of whether anything end up being1618// unrolled.1619for (const auto &L : LI) {1620Changed |=1621simplifyLoop(L, &DT, &LI, &SE, &AC, nullptr, false /* PreserveLCSSA */);1622Changed |= formLCSSARecursively(*L, DT, &LI, &SE);1623}16241625// Add the loop nests in the reverse order of LoopInfo. See method1626// declaration.1627SmallPriorityWorklist<Loop *, 4> Worklist;1628appendLoopsToWorklist(LI, Worklist);16291630while (!Worklist.empty()) {1631// Because the LoopInfo stores the loops in RPO, we walk the worklist1632// from back to front so that we work forward across the CFG, which1633// for unrolling is only needed to get optimization remarks emitted in1634// a forward order.1635Loop &L = *Worklist.pop_back_val();1636#ifndef NDEBUG1637Loop *ParentL = L.getParentLoop();1638#endif16391640// Check if the profile summary indicates that the profiled application1641// has a huge working set size, in which case we disable peeling to avoid1642// bloating it further.1643std::optional<bool> LocalAllowPeeling = UnrollOpts.AllowPeeling;1644if (PSI && PSI->hasHugeWorkingSetSize())1645LocalAllowPeeling = false;1646std::string LoopName = std::string(L.getName());1647// The API here is quite complex to call and we allow to select some1648// flavors of unrolling during construction time (by setting UnrollOpts).1649LoopUnrollResult Result = tryToUnrollLoop(1650&L, DT, &LI, SE, TTI, AC, ORE, BFI, PSI,1651/*PreserveLCSSA*/ true, UnrollOpts.OptLevel, /*OnlyFullUnroll*/ false,1652UnrollOpts.OnlyWhenForced, UnrollOpts.ForgetSCEV,1653/*Count*/ std::nullopt,1654/*Threshold*/ std::nullopt, UnrollOpts.AllowPartial,1655UnrollOpts.AllowRuntime, UnrollOpts.AllowUpperBound, LocalAllowPeeling,1656UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount,1657&AA);1658Changed |= Result != LoopUnrollResult::Unmodified;16591660// The parent must not be damaged by unrolling!1661#ifndef NDEBUG1662if (Result != LoopUnrollResult::Unmodified && ParentL)1663ParentL->verifyLoop();1664#endif16651666// Clear any cached analysis results for L if we removed it completely.1667if (LAM && Result == LoopUnrollResult::FullyUnrolled)1668LAM->clear(L, LoopName);1669}16701671if (!Changed)1672return PreservedAnalyses::all();16731674return getLoopPassPreservedAnalyses();1675}16761677void LoopUnrollPass::printPipeline(1678raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {1679static_cast<PassInfoMixin<LoopUnrollPass> *>(this)->printPipeline(1680OS, MapClassName2PassName);1681OS << '<';1682if (UnrollOpts.AllowPartial != std::nullopt)1683OS << (*UnrollOpts.AllowPartial ? "" : "no-") << "partial;";1684if (UnrollOpts.AllowPeeling != std::nullopt)1685OS << (*UnrollOpts.AllowPeeling ? "" : "no-") << "peeling;";1686if (UnrollOpts.AllowRuntime != std::nullopt)1687OS << (*UnrollOpts.AllowRuntime ? "" : "no-") << "runtime;";1688if (UnrollOpts.AllowUpperBound != std::nullopt)1689OS << (*UnrollOpts.AllowUpperBound ? "" : "no-") << "upperbound;";1690if (UnrollOpts.AllowProfileBasedPeeling != std::nullopt)1691OS << (*UnrollOpts.AllowProfileBasedPeeling ? "" : "no-")1692<< "profile-peeling;";1693if (UnrollOpts.FullUnrollMaxCount != std::nullopt)1694OS << "full-unroll-max=" << UnrollOpts.FullUnrollMaxCount << ';';1695OS << 'O' << UnrollOpts.OptLevel;1696OS << '>';1697}169816991700