Path: blob/main/contrib/llvm-project/llvm/lib/Analysis/InlineCost.cpp
//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/AssemblyAnnotationWriter.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/raw_ostream.h"
#include <climits>
#include <limits>
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "inline-cost"

STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

static cl::opt<int>
    DefaultThreshold("inlinedefault-threshold", cl::Hidden, cl::init(225),
                     cl::desc("Default amount of inlining to perform"));

// We introduce this option since there is a minor compile-time win by avoiding
// addition of TTI attributes (target-features in particular) to inline
// candidates when they are guaranteed to be the same as top level methods in
// some use cases.
// If we avoid adding the attribute, we need an option to avoid
// checking these attributes.
static cl::opt<bool> IgnoreTTIInlineCompatible(
    "ignore-tti-inline-compatible", cl::Hidden, cl::init(false),
    cl::desc("Ignore TTI attributes compatibility check between callee/caller "
             "during inline cost calculation"));

static cl::opt<bool> PrintInstructionComments(
    "print-instruction-comments", cl::Hidden, cl::init(false),
    cl::desc("Prints comments for instruction based on inline cost analysis"));

static cl::opt<int> InlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(225),
    cl::desc("Control the amount of inlining to perform (default = 225)"));

static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(325),
    cl::desc("Threshold for inlining functions with inline hint"));

static cl::opt<int>
    ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
                          cl::init(45),
                          cl::desc("Threshold for inlining cold callsites"));

static cl::opt<bool> InlineEnableCostBenefitAnalysis(
    "inline-enable-cost-benefit-analysis", cl::Hidden, cl::init(false),
    cl::desc("Enable the cost-benefit analysis for the inliner"));

// InlineSavingsMultiplier overrides per TTI multipliers iff it is
// specified explicitly in command line options. This option is exposed
// for tuning and testing.
static cl::opt<int> InlineSavingsMultiplier(
    "inline-savings-multiplier", cl::Hidden, cl::init(8),
    cl::desc("Multiplier to multiply cycle savings by during inlining"));

// InlineSavingsProfitableMultiplier overrides per TTI multipliers iff it is
// specified explicitly in command line options. This option is exposed
// for tuning and testing.
static cl::opt<int> InlineSavingsProfitableMultiplier(
    "inline-savings-profitable-multiplier", cl::Hidden, cl::init(4),
    cl::desc("A multiplier on top of cycle savings to decide whether the "
             "savings won't justify the cost"));

static cl::opt<int>
    InlineSizeAllowance("inline-size-allowance", cl::Hidden, cl::init(100),
                        cl::desc("The maximum size of a callee that gets "
                                 "inlined without sufficient cycle savings"));

// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(45),
    cl::desc("Threshold for inlining functions with cold attribute"));

static cl::opt<int>
    HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
                         cl::desc("Threshold for hot callsites"));

static cl::opt<int> LocallyHotCallSiteThreshold(
    "locally-hot-callsite-threshold", cl::Hidden, cl::init(525),
    cl::desc("Threshold for locally hot callsites"));

static cl::opt<int> ColdCallSiteRelFreq(
    "cold-callsite-rel-freq", cl::Hidden, cl::init(2),
    cl::desc("Maximum block frequency, expressed as a percentage of caller's "
             "entry frequency, for a callsite to be cold in the absence of "
             "profile information."));

static cl::opt<uint64_t> HotCallSiteRelFreq(
    "hot-callsite-rel-freq", cl::Hidden, cl::init(60),
    cl::desc("Minimum block frequency, expressed as a multiple of caller's "
             "entry frequency, for a callsite to be hot in the absence of "
             "profile information."));

static cl::opt<int>
    InstrCost("inline-instr-cost", cl::Hidden, cl::init(5),
              cl::desc("Cost of a single instruction when inlining"));

static cl::opt<int>
    MemAccessCost("inline-memaccess-cost", cl::Hidden, cl::init(0),
                  cl::desc("Cost of load/store instruction when inlining"));

static cl::opt<int> CallPenalty(
    "inline-call-penalty", cl::Hidden, cl::init(25),
    cl::desc("Call penalty that is applied per callsite when inlining"));

static cl::opt<size_t>
    StackSizeThreshold("inline-max-stacksize", cl::Hidden,
                       cl::init(std::numeric_limits<size_t>::max()),
                       cl::desc("Do not inline functions with a stack size "
                                "that exceeds the specified limit"));

static cl::opt<size_t> RecurStackSizeThreshold(
    "recursive-inline-max-stacksize", cl::Hidden,
    cl::init(InlineConstants::TotalAllocaSizeRecursiveCaller),
    cl::desc("Do not inline recursive functions with a stack "
             "size that exceeds the specified limit"));

static cl::opt<bool> OptComputeFullInlineCost(
    "inline-cost-full", cl::Hidden,
    cl::desc("Compute the full inline cost of a call site even when the cost "
             "exceeds the threshold."));

static cl::opt<bool> InlineCallerSupersetNoBuiltin(
    "inline-caller-superset-nobuiltin", cl::Hidden, cl::init(true),
    cl::desc("Allow inlining when caller has a superset of callee's nobuiltin "
             "attributes."));

static cl::opt<bool> DisableGEPConstOperand(
    "disable-gep-const-evaluation", cl::Hidden, cl::init(false),
    cl::desc("Disables evaluation of GetElementPtr with constant operands"));

namespace llvm {
std::optional<int> getStringFnAttrAsInt(const Attribute &Attr) {
  if (Attr.isValid()) {
    int AttrValue = 0;
    if (!Attr.getValueAsString().getAsInteger(10, AttrValue))
      return AttrValue;
  }
  return std::nullopt;
}

std::optional<int> getStringFnAttrAsInt(CallBase &CB, StringRef AttrKind) {
  return getStringFnAttrAsInt(CB.getFnAttr(AttrKind));
}

std::optional<int> getStringFnAttrAsInt(Function *F, StringRef AttrKind) {
  return getStringFnAttrAsInt(F->getFnAttribute(AttrKind));
}
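
// For example, a call site carrying the string attribute
// "function-inline-cost"="15" (consumed further down in this file) yields 15
// through these helpers; a missing attribute or a non-decimal value yields
// std::nullopt.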

namespace InlineConstants {
int getInstrCost() { return InstrCost; }

} // namespace InlineConstants

} // namespace llvm

namespace {
class InlineCostCallAnalyzer;

// This struct is used to store information about inline cost of a
// particular instruction.
struct InstructionCostDetail {
  int CostBefore = 0;
  int CostAfter = 0;
  int ThresholdBefore = 0;
  int ThresholdAfter = 0;

  int getThresholdDelta() const { return ThresholdAfter - ThresholdBefore; }

  int getCostDelta() const { return CostAfter - CostBefore; }

  bool hasThresholdChanged() const { return ThresholdAfter != ThresholdBefore; }
};

class InlineCostAnnotationWriter : public AssemblyAnnotationWriter {
private:
  InlineCostCallAnalyzer *const ICCA;

public:
  InlineCostAnnotationWriter(InlineCostCallAnalyzer *ICCA) : ICCA(ICCA) {}
  void emitInstructionAnnot(const Instruction *I,
                            formatted_raw_ostream &OS) override;
};

/// Carry out call site analysis, in order to evaluate inlinability.
/// NOTE: the type is currently used as implementation detail of functions such
/// as llvm::getInlineCost. Note the function_ref constructor parameters - the
/// expectation is that they come from the outer scope, from the wrapper
/// functions. If we want to support constructing CallAnalyzer objects where
/// lambdas are provided inline at construction, or where the object needs to
/// otherwise survive past the scope of the provided functions, we need to
/// revisit the argument types.
class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

protected:
  virtual ~CallAnalyzer() = default;
  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// Getter for the cache of @llvm.assume intrinsics.
  function_ref<AssumptionCache &(Function &)> GetAssumptionCache;

  /// Getter for BlockFrequencyInfo.
  function_ref<BlockFrequencyInfo &(Function &)> GetBFI;

  /// Profile summary information.
  ProfileSummaryInfo *PSI;

  /// The called function.
  Function &F;

  // Cache the DataLayout since we use it a lot.
  const DataLayout &DL;

  /// The OptimizationRemarkEmitter available for this compilation.
  OptimizationRemarkEmitter *ORE;

  /// The candidate callsite being analyzed. Please do not use this to do
  /// analysis in the caller function; we want the inline cost query to be
  /// easily cacheable. Instead, use the cover function paramHasAttr.
  CallBase &CandidateCall;

  /// Extension points for handling callsite features.
  /// Called before a basic block was analyzed.
  virtual void onBlockStart(const BasicBlock *BB) {}

  /// Called after a basic block was analyzed.
  virtual void onBlockAnalyzed(const BasicBlock *BB) {}

  /// Called before an instruction was analyzed.
  virtual void onInstructionAnalysisStart(const Instruction *I) {}

  /// Called after an instruction was analyzed.
  virtual void onInstructionAnalysisFinish(const Instruction *I) {}

  /// Called at the end of the analysis of the callsite. Return the outcome of
  /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or
  /// the reason it can't.
  virtual InlineResult finalizeAnalysis() { return InlineResult::success(); }

  /// Called when we're about to start processing a basic block, and every time
  /// we are done processing an instruction. Return true if there is no point in
  /// continuing the analysis (e.g. we've determined already the call site is
  /// too expensive to inline).
  virtual bool shouldStop() { return false; }

  /// Called before the analysis of the callee body starts (with callsite
  /// contexts propagated). It checks callsite-specific information. Return a
  /// reason analysis can't continue if that's the case, or 'true' if it may
  /// continue.
  virtual InlineResult onAnalysisStart() { return InlineResult::success(); }

  /// Called if the analysis engine decides SROA cannot be done for the given
  /// alloca.
  virtual void onDisableSROA(AllocaInst *Arg) {}

  /// Called when the analysis engine determines load elimination won't happen.
  virtual void onDisableLoadElimination() {}

  /// Called when we visit a CallBase, before the analysis starts.
  /// Return false to stop further processing of the instruction.
  virtual bool onCallBaseVisitStart(CallBase &Call) { return true; }

  /// Called to account for a call.
  virtual void onCallPenalty() {}

  /// Called to account for a load or store.
  virtual void onMemAccess() {}

  /// Called to account for the expectation the inlining would result in a load
  /// elimination.
  virtual void onLoadEliminationOpportunity() {}

  /// Called to account for the cost of argument setup for the Call in the
  /// callee's body (not the callsite currently under analysis).
  virtual void onCallArgumentSetup(const CallBase &Call) {}

  /// Called to account for a load relative intrinsic.
  virtual void onLoadRelativeIntrinsic() {}

  /// Called to account for a lowered call.
  virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) {
  }

  /// Account for a jump table of given size. Return false to stop further
  /// processing the switch instruction.
  virtual bool onJumpTable(unsigned JumpTableSize) { return true; }

  /// Account for a case cluster of given size. Return false to stop further
  /// processing of the instruction.
  virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; }

  /// Called at the end of processing a switch instruction, with the given
  /// number of case clusters.
  virtual void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
                                bool DefaultDestUndefined) {}

  /// Called to account for any other instruction not specifically accounted
  /// for.
  virtual void onMissedSimplification() {}

  /// Start accounting potential benefits due to SROA for the given alloca.
  virtual void onInitializeSROAArg(AllocaInst *Arg) {}

  /// Account SROA savings for the AllocaInst value.
  virtual void onAggregateSROAUse(AllocaInst *V) {}

  bool handleSROA(Value *V, bool DoNotDisable) {
    // Check for SROA candidates in comparisons.
    if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
      if (DoNotDisable) {
        onAggregateSROAUse(SROAArg);
        return true;
      }
      disableSROAForArg(SROAArg);
    }
    return false;
  }

  bool IsCallerRecursive = false;
  bool IsRecursiveCall = false;
  bool ExposesReturnsTwice = false;
  bool HasDynamicAlloca = false;
  bool ContainsNoDuplicateCall = false;
  bool HasReturn = false;
  bool HasIndirectBr = false;
  bool HasUninlineableIntrinsic = false;
  bool InitsVargArgs = false;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize = 0;
  unsigned NumInstructions = 0;
  unsigned NumVectorInstructions = 0;

  /// While we walk the potentially-inlined instructions, we build up and
  /// maintain a mapping of simplified values specific to this callsite. The
  /// idea is to propagate any special information we have about arguments to
  /// this call through the inlinable section of the function, and account for
  /// likely simplifications post-inlining.
  /// The most important aspect we track is CFG altering simplifications --
  /// when we prove a basic block dead, that can cause dramatic shifts in the
  /// cost of inlining a function.
  DenseMap<Value *, Constant *> SimplifiedValues;

  /// Keep track of the values which map back (through function arguments) to
  /// allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, AllocaInst *> SROAArgValues;

  /// Keep track of Allocas for which we believe we may get SROA optimization.
  DenseSet<AllocaInst *> EnabledSROAAllocas;

  /// Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;

  /// Keep track of dead blocks due to the constant arguments.
  SmallPtrSet<BasicBlock *, 16> DeadBlocks;

  /// The mapping of the blocks to their known unique successors due to the
  /// constant arguments.
  DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;

  /// Model the elimination of repeated loads that is expected to happen
  /// whenever we simplify away the stores that would otherwise cause them to
  /// be loads.
  bool EnableLoadElimination = true;

  /// Whether we allow inlining for recursive call.
  bool AllowRecursiveCall = false;

  SmallPtrSet<Value *, 16> LoadAddrSet;

  AllocaInst *getSROAArgForValueOrNull(Value *V) const {
    auto It = SROAArgValues.find(V);
    if (It == SROAArgValues.end() || EnabledSROAAllocas.count(It->second) == 0)
      return nullptr;
    return It->second;
  }

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  void disableSROAForArg(AllocaInst *SROAArg);
  void disableSROA(Value *V);
  void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
  void disableLoadElimination();
  bool isGEPFree(GetElementPtrInst &GEP);
  bool canFoldInboundsGEP(GetElementPtrInst &I);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallBase &Call);
  bool simplifyInstruction(Instruction &I);
  bool simplifyIntrinsicCallIsConstant(CallBase &CB);
  bool simplifyIntrinsicCallObjectSize(CallBase &CB);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration.
  /// Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Return true if size growth is allowed when inlining the callee at \p Call.
  bool allowSizeGrowth(CallBase &Call);

  // Custom analysis routines.
  InlineResult analyzeBlock(BasicBlock *BB,
                            SmallPtrSetImpl<const Value *> &EphValues);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);
  void visit(Module &);
  void visit(Function *);
  void visit(Function &);
  void visit(BasicBlock *);
  void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitFNeg(UnaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallBase(CallBase &Call);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSelectInst(SelectInst &SI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  CallAnalyzer(Function &Callee, CallBase &Call, const TargetTransformInfo &TTI,
               function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
               function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
               ProfileSummaryInfo *PSI = nullptr,
               OptimizationRemarkEmitter *ORE = nullptr)
      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
        PSI(PSI), F(Callee), DL(F.getDataLayout()), ORE(ORE),
        CandidateCall(Call) {}

  InlineResult analyze();

  std::optional<Constant *> getSimplifiedValue(Instruction *I) {
    if (SimplifiedValues.contains(I))
      return SimplifiedValues[I];
    return std::nullopt;
  }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs = 0;
  unsigned NumConstantOffsetPtrArgs = 0;
  unsigned NumAllocaArgs = 0;
  unsigned NumConstantPtrCmps = 0;
  unsigned NumConstantPtrDiffs = 0;
  unsigned NumInstructionsSimplified = 0;

  void dump();
};

// Considering forming a binary search, we should find the number of nodes,
// which is the same as the number of comparisons when lowered. For a given
// number of clusters, n, we can define a recursive function, f(n), to find
// the number of nodes in the tree. The recursion is:
//   f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
//   and f(n) = n, when n <= 3.
// This will lead to a binary tree where the leaf should be either f(2) or f(3)
// when n > 3. So, the number of comparisons from leaves should be n, while
// the number of non-leaf nodes should be:
//   2^(log2(n) - 1) - 1
//   = 2^log2(n) * 2^-1 - 1
//   = n / 2 - 1.
// Considering comparisons from leaf and non-leaf nodes, we can estimate the
// number of comparisons in a simple closed form:
//   n + n / 2 - 1 = n * 3 / 2 - 1
int64_t getExpectedNumberOfCompare(int NumCaseCluster) {
  return 3 * static_cast<int64_t>(NumCaseCluster) / 2 - 1;
}
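
// For example, for n = 8 case clusters: f(8) = 1 + f(4) + f(4), with
// f(4) = 1 + f(2) + f(2) = 5, giving f(8) = 11 nodes -- which matches the
// closed form 8 * 3 / 2 - 1 = 11 expected comparisons.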

/// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note
/// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer.
class InlineCostCallAnalyzer final : public CallAnalyzer {
  const bool ComputeFullInlineCost;
  int LoadEliminationCost = 0;
  /// Bonus to be applied when percentage of vector instructions in callee is
  /// high (see more details in updateThreshold).
  int VectorBonus = 0;
  /// Bonus to be applied when the callee has only one reachable basic block.
  int SingleBBBonus = 0;

  /// Tunable parameters that control the analysis.
  const InlineParams &Params;

  // This DenseMap stores the delta change in cost and threshold after
  // accounting for the given instruction. The map is filled only with the
  // flag PrintInstructionComments on.
  DenseMap<const Instruction *, InstructionCostDetail> InstructionCostDetailMap;

  /// Upper bound for the inlining cost. Bonuses are being applied to account
  /// for speculative "expected profit" of the inlining decision.
  int Threshold = 0;

  /// The amount of StaticBonus applied.
  int StaticBonusApplied = 0;

  /// Attempt to evaluate indirect calls to boost their inline cost.
  const bool BoostIndirectCalls;

  /// Ignore the threshold when finalizing analysis.
  const bool IgnoreThreshold;

  // True if the cost-benefit-analysis-based inliner is enabled.
  const bool CostBenefitAnalysisEnabled;

  /// Inlining cost measured in abstract units, accounts for all the
  /// instructions expected to be executed for a given function invocation.
  /// Instructions that are statically proven to be dead based on call-site
  /// arguments are not counted here.
  int Cost = 0;

  // The cumulative cost at the beginning of the basic block being analyzed. At
  // the end of analyzing each basic block, "Cost - CostAtBBStart" represents
  // the size of that basic block.
  int CostAtBBStart = 0;

  // The static size of live but cold basic blocks. This is "static" in the
  // sense that it's not weighted by profile counts at all.
  int ColdSize = 0;

  // Whether inlining is decided by cost-threshold analysis.
  bool DecidedByCostThreshold = false;

  // Whether inlining is decided by cost-benefit analysis.
  bool DecidedByCostBenefit = false;

  // The cost-benefit pair computed by cost-benefit analysis.
  std::optional<CostBenefitPair> CostBenefit;

  bool SingleBB = true;

  unsigned SROACostSavings = 0;
  unsigned SROACostSavingsLost = 0;

  /// The mapping of caller Alloca values to their accumulated cost savings. If
  /// we have to disable SROA for one of the allocas, this tells us how much
  /// cost must be added.
  DenseMap<AllocaInst *, int> SROAArgCosts;

  /// Return true if \p Call is a cold callsite.
  bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);

  /// Update Threshold based on callsite properties such as callee
  /// attributes and callee hotness for PGO builds. The Callee is explicitly
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis.
  void updateThreshold(CallBase &Call, Function &Callee);
  /// Return a higher threshold if \p Call is a hot callsite.
  std::optional<int> getHotCallSiteThreshold(CallBase &Call,
                                             BlockFrequencyInfo *CallerBFI);

  /// Handle a capped 'int' increment for Cost.
  void addCost(int64_t Inc) {
    Inc = std::clamp<int64_t>(Inc, INT_MIN, INT_MAX);
    Cost = std::clamp<int64_t>(Inc + Cost, INT_MIN, INT_MAX);
  }
The Callee is explicitly616/// passed to support analyzing indirect calls whose target is inferred by617/// analysis.618void updateThreshold(CallBase &Call, Function &Callee);619/// Return a higher threshold if \p Call is a hot callsite.620std::optional<int> getHotCallSiteThreshold(CallBase &Call,621BlockFrequencyInfo *CallerBFI);622623/// Handle a capped 'int' increment for Cost.624void addCost(int64_t Inc) {625Inc = std::clamp<int64_t>(Inc, INT_MIN, INT_MAX);626Cost = std::clamp<int64_t>(Inc + Cost, INT_MIN, INT_MAX);627}628629void onDisableSROA(AllocaInst *Arg) override {630auto CostIt = SROAArgCosts.find(Arg);631if (CostIt == SROAArgCosts.end())632return;633addCost(CostIt->second);634SROACostSavings -= CostIt->second;635SROACostSavingsLost += CostIt->second;636SROAArgCosts.erase(CostIt);637}638639void onDisableLoadElimination() override {640addCost(LoadEliminationCost);641LoadEliminationCost = 0;642}643644bool onCallBaseVisitStart(CallBase &Call) override {645if (std::optional<int> AttrCallThresholdBonus =646getStringFnAttrAsInt(Call, "call-threshold-bonus"))647Threshold += *AttrCallThresholdBonus;648649if (std::optional<int> AttrCallCost =650getStringFnAttrAsInt(Call, "call-inline-cost")) {651addCost(*AttrCallCost);652// Prevent further processing of the call since we want to override its653// inline cost, not just add to it.654return false;655}656return true;657}658659void onCallPenalty() override { addCost(CallPenalty); }660661void onMemAccess() override { addCost(MemAccessCost); }662663void onCallArgumentSetup(const CallBase &Call) override {664// Pay the price of the argument setup. We account for the average 1665// instruction per call argument setup here.666addCost(Call.arg_size() * InstrCost);667}668void onLoadRelativeIntrinsic() override {669// This is normally lowered to 4 LLVM instructions.670addCost(3 * InstrCost);671}672void onLoweredCall(Function *F, CallBase &Call,673bool IsIndirectCall) override {674// We account for the average 1 instruction per call argument setup here.675addCost(Call.arg_size() * InstrCost);676677// If we have a constant that we are calling as a function, we can peer678// through it and see the function target. This happens not infrequently679// during devirtualization and so we want to give it a hefty bonus for680// inlining, but cap that bonus in the event that inlining wouldn't pan out.681// Pretend to inline the function, with a custom threshold.682if (IsIndirectCall && BoostIndirectCalls) {683auto IndirectCallParams = Params;684IndirectCallParams.DefaultThreshold =685InlineConstants::IndirectCallThreshold;686/// FIXME: if InlineCostCallAnalyzer is derived from, this may need687/// to instantiate the derived class.688InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,689GetAssumptionCache, GetBFI, PSI, ORE, false);690if (CA.analyze().isSuccess()) {691// We were able to inline the indirect call! 

  void onMissedSimplification() override { addCost(InstrCost); }

  void onInitializeSROAArg(AllocaInst *Arg) override {
    assert(Arg != nullptr &&
           "Should not initialize SROA costs for null value.");
    auto SROAArgCost = TTI.getCallerAllocaCost(&CandidateCall, Arg);
    SROACostSavings += SROAArgCost;
    SROAArgCosts[Arg] = SROAArgCost;
  }

  void onAggregateSROAUse(AllocaInst *SROAArg) override {
    auto CostIt = SROAArgCosts.find(SROAArg);
    assert(CostIt != SROAArgCosts.end() &&
           "expected this argument to have a cost");
    CostIt->second += InstrCost;
    SROACostSavings += InstrCost;
  }

  void onBlockStart(const BasicBlock *BB) override { CostAtBBStart = Cost; }

  void onBlockAnalyzed(const BasicBlock *BB) override {
    if (CostBenefitAnalysisEnabled) {
      // Keep track of the static size of live but cold basic blocks. For now,
      // we define a cold basic block to be one that's never executed.
      assert(GetBFI && "GetBFI must be available");
      BlockFrequencyInfo *BFI = &(GetBFI(F));
      assert(BFI && "BFI must be available");
      auto ProfileCount = BFI->getBlockProfileCount(BB);
      if (*ProfileCount == 0)
        ColdSize += Cost - CostAtBBStart;
    }

    auto *TI = BB->getTerminator();
    // If we had any successors at this point, then post-inlining is likely to
    // have them as well. Note that we assume any basic blocks which existed
    // due to branches or switches which folded above will also fold after
    // inlining.
    if (SingleBB && TI->getNumSuccessors() > 1) {
      // Take off the bonus we applied to the threshold.
      Threshold -= SingleBBBonus;
      SingleBB = false;
    }
  }

  void onInstructionAnalysisStart(const Instruction *I) override {
    // This function is called to store the initial cost of inlining before
    // the given instruction was assessed.
    if (!PrintInstructionComments)
      return;
    InstructionCostDetailMap[I].CostBefore = Cost;
    InstructionCostDetailMap[I].ThresholdBefore = Threshold;
  }

  void onInstructionAnalysisFinish(const Instruction *I) override {
    // This function is called to find new values of cost and threshold after
    // the instruction has been assessed.
    if (!PrintInstructionComments)
      return;
    InstructionCostDetailMap[I].CostAfter = Cost;
    InstructionCostDetailMap[I].ThresholdAfter = Threshold;
  }

  bool isCostBenefitAnalysisEnabled() {
    if (!PSI || !PSI->hasProfileSummary())
      return false;

    if (!GetBFI)
      return false;

    if (InlineEnableCostBenefitAnalysis.getNumOccurrences()) {
      // Honor the explicit request from the user.
      if (!InlineEnableCostBenefitAnalysis)
        return false;
    } else {
      // Otherwise, require instrumentation profile.
      if (!PSI->hasInstrumentationProfile())
        return false;
    }

    auto *Caller = CandidateCall.getParent()->getParent();
    if (!Caller->getEntryCount())
      return false;

    BlockFrequencyInfo *CallerBFI = &(GetBFI(*Caller));
    if (!CallerBFI)
      return false;

    // For now, limit to hot call site.
    if (!PSI->isHotCallSite(CandidateCall, CallerBFI))
      return false;

    // Make sure we have a nonzero entry count.
    auto EntryCount = F.getEntryCount();
    if (!EntryCount || !EntryCount->getCount())
      return false;

    BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
    if (!CalleeBFI)
      return false;

    return true;
  }

  // A helper function to choose between command line override and default.
  unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const {
    if (InlineSavingsMultiplier.getNumOccurrences())
      return InlineSavingsMultiplier;
    return TTI.getInliningCostBenefitAnalysisSavingsMultiplier();
  }

  // A helper function to choose between command line override and default.
  unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const {
    if (InlineSavingsProfitableMultiplier.getNumOccurrences())
      return InlineSavingsProfitableMultiplier;
    return TTI.getInliningCostBenefitAnalysisProfitableMultiplier();
  }

  void OverrideCycleSavingsAndSizeForTesting(APInt &CycleSavings, int &Size) {
    if (std::optional<int> AttrCycleSavings = getStringFnAttrAsInt(
            CandidateCall, "inline-cycle-savings-for-test")) {
      CycleSavings = *AttrCycleSavings;
    }

    if (std::optional<int> AttrRuntimeCost = getStringFnAttrAsInt(
            CandidateCall, "inline-runtime-cost-for-test")) {
      Size = *AttrRuntimeCost;
    }
  }

  // Determine whether we should inline the given call site, taking into
  // account both the size cost and the cycle savings. Return std::nullopt if
  // we don't have sufficient profiling information to determine.
  std::optional<bool> costBenefitAnalysis() {
    if (!CostBenefitAnalysisEnabled)
      return std::nullopt;

    // buildInlinerPipeline in the pass builder sets HotCallSiteThreshold to 0
    // for the prelink phase of the AutoFDO + ThinLTO build. Honor the logic by
    // falling back to the cost-based metric.
    // TODO: Improve this hacky condition.
    if (Threshold == 0)
      return std::nullopt;

    assert(GetBFI);
    BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
    assert(CalleeBFI);

    // The cycle savings is expressed as the sum of InstrCost
    // multiplied by the estimated dynamic count of each instruction we can
    // avoid. Savings come from the call site cost, such as argument setup and
    // the call instruction, as well as the instructions that are folded.
    //
    // We use 128-bit APInt here to avoid potential overflow. This variable
    // should stay well below 10^24 (or 2^80) in practice. This "worst" case
    // assumes that we can avoid or fold a billion instructions, each with a
    // profile count of 10^15 -- roughly the number of cycles for a 24-hour
    // period on a 4GHz machine.
    APInt CycleSavings(128, 0);

    for (auto &BB : F) {
      APInt CurrentSavings(128, 0);
      for (auto &I : BB) {
        if (BranchInst *BI = dyn_cast<BranchInst>(&I)) {
          // Count a conditional branch as savings if it becomes unconditional.
          if (BI->isConditional() &&
              isa_and_nonnull<ConstantInt>(
                  SimplifiedValues.lookup(BI->getCondition()))) {
            CurrentSavings += InstrCost;
          }
        } else if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
          if (isa_and_present<ConstantInt>(
                  SimplifiedValues.lookup(SI->getCondition())))
            CurrentSavings += InstrCost;
        } else if (Value *V = dyn_cast<Value>(&I)) {
          // Count an instruction as savings if we can fold it.
          if (SimplifiedValues.count(V)) {
            CurrentSavings += InstrCost;
          }
        }
      }

      auto ProfileCount = CalleeBFI->getBlockProfileCount(&BB);
      CurrentSavings *= *ProfileCount;
      CycleSavings += CurrentSavings;
    }

    // Compute the cycle savings per call.
    auto EntryProfileCount = F.getEntryCount();
    assert(EntryProfileCount && EntryProfileCount->getCount());
    auto EntryCount = EntryProfileCount->getCount();
    CycleSavings += EntryCount / 2;
    CycleSavings = CycleSavings.udiv(EntryCount);

    // Compute the total savings for the call site.
    auto *CallerBB = CandidateCall.getParent();
    BlockFrequencyInfo *CallerBFI = &(GetBFI(*(CallerBB->getParent())));
    CycleSavings += getCallsiteCost(TTI, this->CandidateCall, DL);
    CycleSavings *= *CallerBFI->getBlockProfileCount(CallerBB);

    // Remove the cost of the cold basic blocks to model the runtime cost more
    // accurately. Both machine block placement and function splitting could
    // place cold blocks further from hot blocks.
    int Size = Cost - ColdSize;

    // Allow tiny callees to be inlined regardless of whether they meet the
    // savings threshold.
    Size = Size > InlineSizeAllowance ? Size - InlineSizeAllowance : 1;

    OverrideCycleSavingsAndSizeForTesting(CycleSavings, Size);
    CostBenefit.emplace(APInt(128, Size), CycleSavings);

    // Let R be the ratio of CycleSavings to Size. We accept the inlining
    // opportunity if R is really high and reject if R is really low. If R is
    // somewhere in the middle, we fall back to the cost-based analysis.
    //
    // Specifically, let R = CycleSavings / Size, we accept the inlining
    // opportunity if:
    //
    //             PSI->getOrCompHotCountThreshold()
    // R > -------------------------------------------------
    //     getInliningCostBenefitAnalysisSavingsMultiplier()
    //
    // and reject the inlining opportunity if:
    //
    //                PSI->getOrCompHotCountThreshold()
    // R <= ----------------------------------------------------
    //      getInliningCostBenefitAnalysisProfitableMultiplier()
    //
    // Otherwise, we fall back to the cost-based analysis.
    //
    // Implementation-wise, use multiplication (CycleSavings * Multiplier,
    // HotCountThreshold * Size) rather than division to avoid precision loss.
    APInt Threshold(128, PSI->getOrCompHotCountThreshold());
    Threshold *= Size;

    APInt UpperBoundCycleSavings = CycleSavings;
    UpperBoundCycleSavings *= getInliningCostBenefitAnalysisSavingsMultiplier();
    if (UpperBoundCycleSavings.uge(Threshold))
      return true;

    APInt LowerBoundCycleSavings = CycleSavings;
    LowerBoundCycleSavings *=
        getInliningCostBenefitAnalysisProfitableMultiplier();
    if (LowerBoundCycleSavings.ult(Threshold))
      return false;

    // Otherwise, fall back to the cost-based analysis.
    return std::nullopt;
  }
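
  // To make the rule above concrete: with the default multipliers (8 and 4),
  // Size = 100, and a hot count threshold of 1000, the scaled threshold is
  // 100000, so the call site is accepted once CycleSavings * 8 >= 100000
  // (i.e. CycleSavings >= 12500) and rejected otherwise, since
  // CycleSavings * 4 < 100000 then holds as well; the cost-based fallback is
  // only reachable when the profitable multiplier exceeds the savings
  // multiplier.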

  InlineResult finalizeAnalysis() override {
    // Loops generally act a lot like calls in that they act like barriers to
    // movement, require a certain amount of setup, etc. So when optimising for
    // size, we penalise any call sites that perform loops. We do this after all
    // other costs here, so will likely only be dealing with relatively small
    // functions (and hence DT and LI will hopefully be cheap).
    auto *Caller = CandidateCall.getFunction();
    if (Caller->hasMinSize()) {
      DominatorTree DT(F);
      LoopInfo LI(DT);
      int NumLoops = 0;
      for (Loop *L : LI) {
        // Ignore loops that will not be executed.
        if (DeadBlocks.count(L->getHeader()))
          continue;
        NumLoops++;
      }
      addCost(NumLoops * InlineConstants::LoopPenalty);
    }

    // We applied the maximum possible vector bonus at the beginning. Now,
    // subtract the excess bonus, if any, from the Threshold before
    // comparing against Cost.
    if (NumVectorInstructions <= NumInstructions / 10)
      Threshold -= VectorBonus;
    else if (NumVectorInstructions <= NumInstructions / 2)
      Threshold -= VectorBonus / 2;
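    // The adjustment above means, for example, that a callee with 100
    // instructions keeps none of the vector bonus when at most 10 of them are
    // vector instructions, keeps half of it when at most 50 are, and keeps
    // the full bonus only when vector instructions make up more than half.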

    if (std::optional<int> AttrCost =
            getStringFnAttrAsInt(CandidateCall, "function-inline-cost"))
      Cost = *AttrCost;

    if (std::optional<int> AttrCostMult = getStringFnAttrAsInt(
            CandidateCall,
            InlineConstants::FunctionInlineCostMultiplierAttributeName))
      Cost *= *AttrCostMult;

    if (std::optional<int> AttrThreshold =
            getStringFnAttrAsInt(CandidateCall, "function-inline-threshold"))
      Threshold = *AttrThreshold;

    if (auto Result = costBenefitAnalysis()) {
      DecidedByCostBenefit = true;
      if (*Result)
        return InlineResult::success();
      else
        return InlineResult::failure("Cost over threshold.");
    }

    if (IgnoreThreshold)
      return InlineResult::success();

    DecidedByCostThreshold = true;
    return Cost < std::max(1, Threshold)
               ? InlineResult::success()
               : InlineResult::failure("Cost over threshold.");
  }

  bool shouldStop() override {
    if (IgnoreThreshold || ComputeFullInlineCost)
      return false;
    // Bail out the moment we cross the threshold. This means we'll under-count
    // the cost, but only when undercounting doesn't matter.
    if (Cost < Threshold)
      return false;
    DecidedByCostThreshold = true;
    return true;
  }

  void onLoadEliminationOpportunity() override {
    LoadEliminationCost += InstrCost;
  }

  InlineResult onAnalysisStart() override {
    // Perform some tweaks to the cost and threshold based on the direct
    // callsite information.

    // We want to more aggressively inline vector-dense kernels, so up the
    // threshold, and we'll lower it if the % of vector instructions gets too
    // low. Note that these bonuses are somewhat arbitrary and evolved over
    // time by accident as much as because they are principled bonuses.
    //
    // FIXME: It would be nice to remove all such bonuses. At least it would be
    // nice to base the bonus values on something more scientific.
    assert(NumInstructions == 0);
    assert(NumVectorInstructions == 0);

    // Update the threshold based on callsite properties.
    updateThreshold(CandidateCall, F);

    // While Threshold depends on commandline options that can take negative
    // values, we want to enforce the invariant that the computed threshold and
    // bonuses are non-negative.
    assert(Threshold >= 0);
    assert(SingleBBBonus >= 0);
    assert(VectorBonus >= 0);

    // Speculatively apply all possible bonuses to Threshold. If cost exceeds
    // this Threshold any time, and cost cannot decrease, we can stop processing
    // the rest of the function body.
    Threshold += (SingleBBBonus + VectorBonus);

    // Give out bonuses for the callsite, as the instructions setting them up
    // will be gone after inlining.
    addCost(-getCallsiteCost(TTI, this->CandidateCall, DL));

    // If this function uses the coldcc calling convention, prefer not to inline
    // it.
    if (F.getCallingConv() == CallingConv::Cold)
      Cost += InlineConstants::ColdccPenalty;

    LLVM_DEBUG(dbgs() << "      Initial cost: " << Cost << "\n");

    // Check if we're done. This can happen due to bonuses and penalties.
    if (Cost >= Threshold && !ComputeFullInlineCost)
      return InlineResult::failure("high cost");

    return InlineResult::success();
  }

public:
  InlineCostCallAnalyzer(
      Function &Callee, CallBase &Call, const InlineParams &Params,
      const TargetTransformInfo &TTI,
      function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
      function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
      ProfileSummaryInfo *PSI = nullptr,
      OptimizationRemarkEmitter *ORE = nullptr, bool BoostIndirect = true,
      bool IgnoreThreshold = false)
      : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI, ORE),
        ComputeFullInlineCost(OptComputeFullInlineCost ||
                              Params.ComputeFullInlineCost || ORE ||
                              isCostBenefitAnalysisEnabled()),
        Params(Params), Threshold(Params.DefaultThreshold),
        BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold),
        CostBenefitAnalysisEnabled(isCostBenefitAnalysisEnabled()),
        Writer(this) {
    AllowRecursiveCall = *Params.AllowRecursiveCall;
  }

  /// Annotation Writer for instruction details.
  InlineCostAnnotationWriter Writer;

  void dump();

  // Prints the same analysis as dump(), but its definition is not dependent
  // on the build.
  void print(raw_ostream &OS);

  std::optional<InstructionCostDetail> getCostDetails(const Instruction *I) {
    if (InstructionCostDetailMap.contains(I))
      return InstructionCostDetailMap[I];
    return std::nullopt;
  }

  virtual ~InlineCostCallAnalyzer() = default;
  int getThreshold() const { return Threshold; }
  int getCost() const { return Cost; }
  int getStaticBonusApplied() const { return StaticBonusApplied; }
  std::optional<CostBenefitPair> getCostBenefitPair() { return CostBenefit; }
  bool wasDecidedByCostBenefit() const { return DecidedByCostBenefit; }
  bool wasDecidedByCostThreshold() const { return DecidedByCostThreshold; }
};

// Return true if CB is the sole call to local function Callee.
static bool isSoleCallToLocalFunction(const CallBase &CB,
                                      const Function &Callee) {
  return Callee.hasLocalLinkage() && Callee.hasOneLiveUse() &&
         &Callee == CB.getCalledFunction();
}

class InlineCostFeaturesAnalyzer final : public CallAnalyzer {
private:
  InlineCostFeatures Cost = {};

  // FIXME: These constants are taken from the heuristic-based cost visitor.
  // These should be removed entirely in a later revision to avoid reliance on
  // heuristics in the ML inliner.
  static constexpr int JTCostMultiplier = 2;
  static constexpr int CaseClusterCostMultiplier = 2;
  static constexpr int SwitchDefaultDestCostMultiplier = 2;
  static constexpr int SwitchCostMultiplier = 2;

  // FIXME: These are taken from the heuristic-based cost visitor: we should
  // eventually abstract these to the CallAnalyzer to avoid duplication.
  unsigned SROACostSavingOpportunities = 0;
  int VectorBonus = 0;
  int SingleBBBonus = 0;
  int Threshold = 5;

  DenseMap<AllocaInst *, unsigned> SROACosts;

  void increment(InlineCostFeatureIndex Feature, int64_t Delta = 1) {
    Cost[static_cast<size_t>(Feature)] += Delta;
  }

  void set(InlineCostFeatureIndex Feature, int64_t Value) {
    Cost[static_cast<size_t>(Feature)] = Value;
  }

  void onDisableSROA(AllocaInst *Arg) override {
    auto CostIt = SROACosts.find(Arg);
    if (CostIt == SROACosts.end())
      return;

    increment(InlineCostFeatureIndex::sroa_losses, CostIt->second);
    SROACostSavingOpportunities -= CostIt->second;
    SROACosts.erase(CostIt);
  }

  void onDisableLoadElimination() override {
    set(InlineCostFeatureIndex::load_elimination, 1);
  }

  void onCallPenalty() override {
    increment(InlineCostFeatureIndex::call_penalty, CallPenalty);
  }

  void onCallArgumentSetup(const CallBase &Call) override {
    increment(InlineCostFeatureIndex::call_argument_setup,
              Call.arg_size() * InstrCost);
  }

  void onLoadRelativeIntrinsic() override {
    increment(InlineCostFeatureIndex::load_relative_intrinsic, 3 * InstrCost);
  }

  void onLoweredCall(Function *F, CallBase &Call,
                     bool IsIndirectCall) override {
    increment(InlineCostFeatureIndex::lowered_call_arg_setup,
              Call.arg_size() * InstrCost);

    if (IsIndirectCall) {
      InlineParams IndirectCallParams = {/* DefaultThreshold*/ 0,
                                         /*HintThreshold*/ {},
                                         /*ColdThreshold*/ {},
                                         /*OptSizeThreshold*/ {},
                                         /*OptMinSizeThreshold*/ {},
                                         /*HotCallSiteThreshold*/ {},
                                         /*LocallyHotCallSiteThreshold*/ {},
                                         /*ColdCallSiteThreshold*/ {},
                                         /*ComputeFullInlineCost*/ true,
                                         /*EnableDeferral*/ true};
      IndirectCallParams.DefaultThreshold =
          InlineConstants::IndirectCallThreshold;

      InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
                                GetAssumptionCache, GetBFI, PSI, ORE, false,
                                true);
      if (CA.analyze().isSuccess()) {
        increment(InlineCostFeatureIndex::nested_inline_cost_estimate,
                  CA.getCost());
        increment(InlineCostFeatureIndex::nested_inlines, 1);
      }
    } else {
      onCallPenalty();
    }
  }

  void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
                        bool DefaultDestUndefined) override {
    if (JumpTableSize) {
      if (!DefaultDestUndefined)
        increment(InlineCostFeatureIndex::switch_default_dest_penalty,
                  SwitchDefaultDestCostMultiplier * InstrCost);
      int64_t JTCost = static_cast<int64_t>(JumpTableSize) * InstrCost +
                       JTCostMultiplier * InstrCost;
      increment(InlineCostFeatureIndex::jump_table_penalty, JTCost);
      return;
    }

    if (NumCaseCluster <= 3) {
      increment(InlineCostFeatureIndex::case_cluster_penalty,
                (NumCaseCluster - DefaultDestUndefined) *
                    CaseClusterCostMultiplier * InstrCost);
      return;
    }

    int64_t ExpectedNumberOfCompare =
        getExpectedNumberOfCompare(NumCaseCluster);

    int64_t SwitchCost =
        ExpectedNumberOfCompare * SwitchCostMultiplier * InstrCost;
    increment(InlineCostFeatureIndex::switch_penalty, SwitchCost);
  }

  void onMissedSimplification() override {
    increment(InlineCostFeatureIndex::unsimplified_common_instructions,
              InstrCost);
  }

  void onInitializeSROAArg(AllocaInst *Arg) override {
    auto SROAArgCost = TTI.getCallerAllocaCost(&CandidateCall, Arg);
    SROACosts[Arg] = SROAArgCost;
    SROACostSavingOpportunities += SROAArgCost;
  }

  void onAggregateSROAUse(AllocaInst *Arg) override {
    SROACosts.find(Arg)->second += InstrCost;
    SROACostSavingOpportunities += InstrCost;
  }

  void onBlockAnalyzed(const BasicBlock *BB) override {
    if (BB->getTerminator()->getNumSuccessors() > 1)
      set(InlineCostFeatureIndex::is_multiple_blocks, 1);
    Threshold -= SingleBBBonus;
  }

  InlineResult finalizeAnalysis() override {
    auto *Caller = CandidateCall.getFunction();
    if (Caller->hasMinSize()) {
      DominatorTree DT(F);
      LoopInfo LI(DT);
      for (Loop *L : LI) {
        // Ignore loops that will not be executed.
        if (DeadBlocks.count(L->getHeader()))
          continue;
        increment(InlineCostFeatureIndex::num_loops,
                  InlineConstants::LoopPenalty);
      }
    }
    set(InlineCostFeatureIndex::dead_blocks, DeadBlocks.size());
    set(InlineCostFeatureIndex::simplified_instructions,
        NumInstructionsSimplified);
    set(InlineCostFeatureIndex::constant_args, NumConstantArgs);
    set(InlineCostFeatureIndex::constant_offset_ptr_args,
        NumConstantOffsetPtrArgs);
    set(InlineCostFeatureIndex::sroa_savings, SROACostSavingOpportunities);

    if (NumVectorInstructions <= NumInstructions / 10)
      Threshold -= VectorBonus;
    else if (NumVectorInstructions <= NumInstructions / 2)
      Threshold -= VectorBonus / 2;

    set(InlineCostFeatureIndex::threshold, Threshold);

    return InlineResult::success();
  }

  bool shouldStop() override { return false; }

  void onLoadEliminationOpportunity() override {
    increment(InlineCostFeatureIndex::load_elimination, 1);
  }

  InlineResult onAnalysisStart() override {
    increment(InlineCostFeatureIndex::callsite_cost,
              -1 * getCallsiteCost(TTI, this->CandidateCall, DL));

    set(InlineCostFeatureIndex::cold_cc_penalty,
        (F.getCallingConv() == CallingConv::Cold));

    set(InlineCostFeatureIndex::last_call_to_static_bonus,
        isSoleCallToLocalFunction(CandidateCall, F));

    // FIXME: we shouldn't repeat this logic in both the Features and Cost
    // analyzer - instead, we should abstract it to a common method in the
    // CallAnalyzer.
    int SingleBBBonusPercent = 50;
    int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
    Threshold += TTI.adjustInliningThreshold(&CandidateCall);
    Threshold *= TTI.getInliningThresholdMultiplier();
    SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
    VectorBonus = Threshold * VectorBonusPercent / 100;
    Threshold += (SingleBBBonus + VectorBonus);

    return InlineResult::success();
  }

public:
  InlineCostFeaturesAnalyzer(
      const TargetTransformInfo &TTI,
      function_ref<AssumptionCache &(Function &)> &GetAssumptionCache,
      function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
      ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee,
      CallBase &Call)
      : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI) {}

  const InlineCostFeatures &features() const { return Cost; }
};

} // namespace

/// Test whether the given value is an Alloca-derived function argument.
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
  return SROAArgValues.count(V);
}

void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) {
  onDisableSROA(SROAArg);
  EnabledSROAAllocas.erase(SROAArg);
  disableLoadElimination();
}

void InlineCostAnnotationWriter::emitInstructionAnnot(
    const Instruction *I, formatted_raw_ostream &OS) {
  // The cost of inlining the given instruction is always printed.
  // The threshold delta is printed only when it is non-zero. It happens
  // when we decided to give a bonus at a particular instruction.
  std::optional<InstructionCostDetail> Record = ICCA->getCostDetails(I);
  if (!Record)
    OS << "; No analysis for the instruction";
  else {
    OS << "; cost before = " << Record->CostBefore
       << ", cost after = " << Record->CostAfter
       << ", threshold before = " << Record->ThresholdBefore
       << ", threshold after = " << Record->ThresholdAfter << ", ";
    OS << "cost delta = " << Record->getCostDelta();
    if (Record->hasThresholdChanged())
      OS << ", threshold delta = " << Record->getThresholdDelta();
  }
  auto C = ICCA->getSimplifiedValue(const_cast<Instruction *>(I));
  if (C) {
    OS << ", simplified to ";
    (*C)->print(OS, true);
  }
  OS << "\n";
}

/// If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
  if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
    disableSROAForArg(SROAArg);
  }
}

void CallAnalyzer::disableLoadElimination() {
  if (EnableLoadElimination) {
    onDisableLoadElimination();
    EnableLoadElimination = false;
  }
}

/// Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
  unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
  assert(IntPtrWidth == Offset.getBitWidth());

  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
       GTI != GTE; ++GTI) {
    ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
    if (!OpC)
      if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
        OpC = dyn_cast<ConstantInt>(SimpleOp);
    if (!OpC)
      return false;
    if (OpC->isZero())
      continue;

    // Handle a struct index, which adds its field offset to the pointer.
    if (StructType *STy = GTI.getStructTypeOrNull()) {
      unsigned ElementIdx = OpC->getZExtValue();
      const StructLayout *SL = DL.getStructLayout(STy);
      Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
      continue;
    }

    APInt TypeSize(IntPtrWidth, GTI.getSequentialElementStride(DL));
    Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
  }
  return true;
}
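
// For example, under a typical DataLayout, a GEP into { i64, i32 } with
// constant indices (0, 1) accumulates a field offset of 8 bytes into Offset
// via StructLayout::getElementOffset.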

/// Use TTI to check whether a GEP is free.
///
/// Respects any simplified values known during the analysis of this callsite.
bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
  SmallVector<Value *, 4> Operands;
  Operands.push_back(GEP.getOperand(0));
  for (const Use &Op : GEP.indices())
    if (Constant *SimpleOp = SimplifiedValues.lookup(Op))
      Operands.push_back(SimpleOp);
    else
      Operands.push_back(Op);
  return TTI.getInstructionCost(&GEP, Operands,
                                TargetTransformInfo::TCK_SizeAndLatency) ==
         TargetTransformInfo::TCC_Free;
}

bool CallAnalyzer::visitAlloca(AllocaInst &I) {
  disableSROA(I.getOperand(0));

  // Check whether inlining will turn a dynamic alloca into a static
  // alloca and handle that case.
  if (I.isArrayAllocation()) {
    Constant *Size = SimplifiedValues.lookup(I.getArraySize());
    if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
      // Sometimes a dynamic alloca could be converted into a static alloca
      // after this constant prop, and become a huge static alloca on an
      // unconditional CFG path. Avoid inlining if this is going to happen above
      // a threshold.
      // FIXME: If the threshold is removed or lowered too much, we could end up
      // being too pessimistic and prevent inlining non-problematic code. This
      // could result in unintended perf regressions. A better overall strategy
      // is needed to track stack usage during inlining.
      Type *Ty = I.getAllocatedType();
      AllocatedSize = SaturatingMultiplyAdd(
          AllocSize->getLimitedValue(),
          DL.getTypeAllocSize(Ty).getKnownMinValue(), AllocatedSize);
      if (AllocatedSize > InlineConstants::MaxSimplifiedDynamicAllocaToInline)
        HasDynamicAlloca = true;
      return false;
    }
  }

  // Accumulate the allocated size.
  if (I.isStaticAlloca()) {
    Type *Ty = I.getAllocatedType();
    AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty).getKnownMinValue(),
                                  AllocatedSize);
  }

  // FIXME: This is overly conservative. Dynamic allocas are inefficient for
  // a variety of reasons, and so we would like to not inline them into
  // functions which don't currently have a dynamic alloca. This simply
  // disables inlining altogether in the presence of a dynamic alloca.
  if (!I.isStaticAlloca())
    HasDynamicAlloca = true;

  return false;
}

bool CallAnalyzer::visitPHI(PHINode &I) {
  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
  // though we don't want to propagate its bonuses. The idea is to disable
  // SROA if it *might* be used in an inappropriate manner.

  // Phi nodes are always zero-cost.
  // FIXME: Pointer sizes may differ between different address spaces, so do we
  // need to use correct address space in the call to getPointerSizeInBits here?
  // Or could we skip the getPointerSizeInBits call completely? As far as I can
  // see the ZeroOffset is used as a dummy value, so we can probably use any
  // bit width for the ZeroOffset?
  APInt ZeroOffset = APInt::getZero(DL.getPointerSizeInBits(0));
  bool CheckSROA = I.getType()->isPointerTy();

  // Track the constant or pointer with constant offset we've seen so far.
  Constant *FirstC = nullptr;
  std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
  Value *FirstV = nullptr;

  for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
    BasicBlock *Pred = I.getIncomingBlock(i);
    // If the incoming block is dead, skip the incoming block.
    if (DeadBlocks.count(Pred))
      continue;
    // If the parent block of phi is not the known successor of the incoming
    // block, skip the incoming block.
    BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
    if (KnownSuccessor && KnownSuccessor != I.getParent())
      continue;

    Value *V = I.getIncomingValue(i);
    // If the incoming value is this phi itself, skip the incoming value.
    if (&I == V)
      continue;

    Constant *C = dyn_cast<Constant>(V);
    if (!C)
      C = SimplifiedValues.lookup(V);

    std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
    if (!C && CheckSROA)
      BaseAndOffset = ConstantOffsetPtrs.lookup(V);

    if (!C && !BaseAndOffset.first)
      // The incoming value is neither a constant nor a pointer with constant
      // offset, exit early.
      return true;

    if (FirstC) {
      if (FirstC == C)
        // If we've seen a constant incoming value before and it is the same
        // constant we see this time, continue checking the next incoming value.
        continue;
      // Otherwise early exit because we either see a different constant or saw
      // a constant before but we have a pointer with constant offset this time.
      return true;
    }

    if (FirstV) {
      // The same logic as above, but check pointer with constant offset here.
      if (FirstBaseAndOffset == BaseAndOffset)
        continue;
      return true;
    }

    if (C) {
      // This is the 1st time we've seen a constant, record it.
      FirstC = C;
      continue;
    }

    // The remaining case is that this is the 1st time we've seen a pointer
    // with constant offset, record it.
    FirstV = V;
    FirstBaseAndOffset = BaseAndOffset;
  }

  // Check if we can map phi to a constant.
  if (FirstC) {
    SimplifiedValues[&I] = FirstC;
    return true;
  }

  // Check if we can map phi to a pointer with constant offset.
  if (FirstBaseAndOffset.first) {
    ConstantOffsetPtrs[&I] = FirstBaseAndOffset;

    if (auto *SROAArg = getSROAArgForValueOrNull(FirstV))
      SROAArgValues[&I] = SROAArg;
  }

  return true;
}
bool CallAnalyzer::visitPHI(PHINode &I) {
  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
  // though we don't want to propagate its bonuses. The idea is to disable
  // SROA if it *might* be used in an inappropriate manner.

  // Phi nodes are always zero-cost.
  // FIXME: Pointer sizes may differ between different address spaces, so do
  // we need to use the correct address space in the call to
  // getPointerSizeInBits here? Or could we skip the getPointerSizeInBits call
  // completely? As far as I can see the ZeroOffset is used as a dummy value,
  // so we can probably use any bit width for the ZeroOffset?
  APInt ZeroOffset = APInt::getZero(DL.getPointerSizeInBits(0));
  bool CheckSROA = I.getType()->isPointerTy();

  // Track the constant or pointer with constant offset we've seen so far.
  Constant *FirstC = nullptr;
  std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
  Value *FirstV = nullptr;

  for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
    BasicBlock *Pred = I.getIncomingBlock(i);
    // If the incoming block is dead, skip the incoming block.
    if (DeadBlocks.count(Pred))
      continue;
    // If the parent block of phi is not the known successor of the incoming
    // block, skip the incoming block.
    BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
    if (KnownSuccessor && KnownSuccessor != I.getParent())
      continue;

    Value *V = I.getIncomingValue(i);
    // If the incoming value is this phi itself, skip the incoming value.
    if (&I == V)
      continue;

    Constant *C = dyn_cast<Constant>(V);
    if (!C)
      C = SimplifiedValues.lookup(V);

    std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
    if (!C && CheckSROA)
      BaseAndOffset = ConstantOffsetPtrs.lookup(V);

    if (!C && !BaseAndOffset.first)
      // The incoming value is neither a constant nor a pointer with constant
      // offset, exit early.
      return true;

    if (FirstC) {
      if (FirstC == C)
        // If we've seen a constant incoming value before and it is the same
        // constant we see this time, continue checking the next incoming
        // value.
        continue;
      // Otherwise early exit because we either see a different constant or
      // saw a constant before but we have a pointer with constant offset this
      // time.
      return true;
    }

    if (FirstV) {
      // The same logic as above, but check pointer with constant offset here.
      if (FirstBaseAndOffset == BaseAndOffset)
        continue;
      return true;
    }

    if (C) {
      // This is the 1st time we've seen a constant, record it.
      FirstC = C;
      continue;
    }

    // The remaining case is that this is the 1st time we've seen a pointer
    // with constant offset, record it.
    FirstV = V;
    FirstBaseAndOffset = BaseAndOffset;
  }

  // Check if we can map phi to a constant.
  if (FirstC) {
    SimplifiedValues[&I] = FirstC;
    return true;
  }

  // Check if we can map phi to a pointer with constant offset.
  if (FirstBaseAndOffset.first) {
    ConstantOffsetPtrs[&I] = FirstBaseAndOffset;

    if (auto *SROAArg = getSROAArgForValueOrNull(FirstV))
      SROAArgValues[&I] = SROAArg;
  }

  return true;
}

/// Check we can fold GEPs of constant-offset call site argument pointers.
/// This requires target data and inbounds GEPs.
///
/// \return true if the specified GEP can be folded.
bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
  // Check if we have a base + offset for the pointer.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getPointerOperand());
  if (!BaseAndOffset.first)
    return false;

  // Check if the offset of this GEP is constant, and if so accumulate it
  // into Offset.
  if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
    return false;

  // Add the result as a new mapping to Base + Offset.
  ConstantOffsetPtrs[&I] = BaseAndOffset;

  return true;
}

bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
  auto *SROAArg = getSROAArgForValueOrNull(I.getPointerOperand());

  // Lambda to check whether a GEP's indices are all constant.
  auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
    for (const Use &Op : GEP.indices())
      if (!isa<Constant>(Op) && !SimplifiedValues.lookup(Op))
        return false;
    return true;
  };

  if (!DisableGEPConstOperand)
    if (simplifyInstruction(I))
      return true;

  if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
    if (SROAArg)
      SROAArgValues[&I] = SROAArg;

    // Constant GEPs are modeled as free.
    return true;
  }

  // Variable GEPs will require math and will disable SROA.
  if (SROAArg)
    disableSROAForArg(SROAArg);
  return isGEPFree(I);
}

/// Simplify \p I if its operands are constants and update SimplifiedValues.
bool CallAnalyzer::simplifyInstruction(Instruction &I) {
  SmallVector<Constant *> COps;
  for (Value *Op : I.operands()) {
    Constant *COp = dyn_cast<Constant>(Op);
    if (!COp)
      COp = SimplifiedValues.lookup(Op);
    if (!COp)
      return false;
    COps.push_back(COp);
  }
  auto *C = ConstantFoldInstOperands(&I, COps, DL);
  if (!C)
    return false;
  SimplifiedValues[&I] = C;
  return true;
}
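// Illustrative example: if the call site passes the constant 4 for %arg,
// then 'add i32 %arg, 8' inside the callee folds to 12 here and is recorded
// in SimplifiedValues, so transitive users of the add get simplified too.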
/// Try to simplify a call to llvm.is.constant.
///
/// Duplicate the argument checking from CallAnalyzer::simplifyCallSite since
/// we expect calls of this specific intrinsic to be infrequent.
///
/// FIXME: Given that we know CB's parent (F) caller
/// (CandidateCall->getParent()->getParent()), we might be able to determine
/// whether inlining F into F's caller would change how the call to
/// llvm.is.constant would evaluate.
bool CallAnalyzer::simplifyIntrinsicCallIsConstant(CallBase &CB) {
  Value *Arg = CB.getArgOperand(0);
  auto *C = dyn_cast<Constant>(Arg);

  if (!C)
    C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(Arg));

  Type *RT = CB.getFunctionType()->getReturnType();
  SimplifiedValues[&CB] = ConstantInt::get(RT, C ? 1 : 0);
  return true;
}

bool CallAnalyzer::simplifyIntrinsicCallObjectSize(CallBase &CB) {
  // As per the langref, "The fourth argument to llvm.objectsize determines if
  // the value should be evaluated at runtime."
  if (cast<ConstantInt>(CB.getArgOperand(3))->isOne())
    return false;

  Value *V = lowerObjectSizeCall(&cast<IntrinsicInst>(CB), DL, nullptr,
                                 /*MustSucceed=*/true);
  Constant *C = dyn_cast_or_null<Constant>(V);
  if (C)
    SimplifiedValues[&CB] = C;
  return C;
}
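// Illustrative example: '@llvm.objectsize.i64(ptr %buf, i1 false, i1 true,
// i1 false)' on a pointer known to be a 32-byte alloca lowers to the
// constant 32 here; when the fourth (dynamic) argument is true, the call is
// left for runtime evaluation instead.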
bool CallAnalyzer::visitBitCast(BitCastInst &I) {
  // Propagate constants through bitcasts.
  if (simplifyInstruction(I))
    return true;

  // Track base/offsets through casts
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getOperand(0));
  // Casts don't change the offset, just wrap it up.
  if (BaseAndOffset.first)
    ConstantOffsetPtrs[&I] = BaseAndOffset;

  // Also look for SROA candidates here.
  if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
    SROAArgValues[&I] = SROAArg;

  // Bitcasts are always zero cost.
  return true;
}

bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
  // Propagate constants through ptrtoint.
  if (simplifyInstruction(I))
    return true;

  // Track base/offset pairs when converted to a plain integer provided the
  // integer is large enough to represent the pointer.
  unsigned IntegerSize = I.getType()->getScalarSizeInBits();
  unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
  if (IntegerSize == DL.getPointerSizeInBits(AS)) {
    std::pair<Value *, APInt> BaseAndOffset =
        ConstantOffsetPtrs.lookup(I.getOperand(0));
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // This is really weird. Technically, ptrtoint will disable SROA. However,
  // unless that ptrtoint is *used* somewhere in the live basic blocks after
  // inlining, it will be nuked, and SROA should proceed. All of the uses which
  // would block SROA would also block SROA if applied directly to a pointer,
  // and so we can just add the integer in here. The only places where SROA is
  // preserved either cannot fire on an integer, or won't in-and-of themselves
  // disable SROA (ext) w/o some later use that we would see and disable.
  if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
    SROAArgValues[&I] = SROAArg;

  return TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
         TargetTransformInfo::TCC_Free;
}

bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
  // Propagate constants through inttoptr.
  if (simplifyInstruction(I))
    return true;

  // Track base/offset pairs when round-tripped through a pointer without
  // modifications provided the integer is not too large.
  Value *Op = I.getOperand(0);
  unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
  if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // "Propagate" SROA here in the same manner as we do for ptrtoint above.
  if (auto *SROAArg = getSROAArgForValueOrNull(Op))
    SROAArgValues[&I] = SROAArg;

  return TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
         TargetTransformInfo::TCC_Free;
}

bool CallAnalyzer::visitCastInst(CastInst &I) {
  // Propagate constants through casts.
  if (simplifyInstruction(I))
    return true;

  // Disable SROA in the face of arbitrary casts we don't explicitly list
  // elsewhere.
  disableSROA(I.getOperand(0));

  // If this is a floating-point cast, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such.
  switch (I.getOpcode()) {
  case Instruction::FPTrunc:
  case Instruction::FPExt:
  case Instruction::UIToFP:
  case Instruction::SIToFP:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
    if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
      onCallPenalty();
    break;
  default:
    break;
  }

  return TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
         TargetTransformInfo::TCC_Free;
}

bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
  return CandidateCall.paramHasAttr(A->getArgNo(), Attr);
}
bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
  // Does the *call site* have the NonNull attribute set on an argument? We
  // use the attribute on the call site to memoize any analysis done in the
  // caller. This will also trip if the callee function has a non-null
  // parameter attribute, but that's a less interesting case because hopefully
  // the callee would already have been simplified based on that.
  if (Argument *A = dyn_cast<Argument>(V))
    if (paramHasAttr(A, Attribute::NonNull))
      return true;

  // Is this an alloca in the caller? This is distinct from the attribute case
  // above because attributes aren't updated within the inliner itself and we
  // always want to catch the alloca derived case.
  if (isAllocaDerivedArg(V))
    // We can actually predict the result of comparisons between an
    // alloca-derived value and null. Note that this fires regardless of
    // SROA firing.
    return true;

  return false;
}

bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
  // If the normal destination of the invoke or the parent block of the call
  // site is unreachable-terminated, there is little point in inlining this
  // unless there is literally zero cost.
  // FIXME: Note that it is possible that an unreachable-terminated block has a
  // hot entry. For example, in the scenario below, inlining hot_call_X() may
  // be beneficial:
  // main() {
  //   hot_call_1();
  //   ...
  //   hot_call_N()
  //   exit(0);
  // }
  // For now, we are not handling this corner case here as it is rare in real
  // code. In the future, we should elaborate on this based on BPI and BFI in
  // more general threshold adjusting heuristics in updateThreshold().
  if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
    if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
      return false;
  } else if (isa<UnreachableInst>(Call.getParent()->getTerminator()))
    return false;

  return true;
}

bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call,
                                            BlockFrequencyInfo *CallerBFI) {
  // If a global profile summary is available, then the callsite's coldness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary())
    return PSI->isColdCallSite(Call, CallerBFI);

  // Otherwise we need BFI to be available.
  if (!CallerBFI)
    return false;

  // Determine if the callsite is cold relative to caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
  const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
  auto CallSiteBB = Call.getParent();
  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
  auto CallerEntryFreq =
      CallerBFI->getBlockFreq(&(Call.getCaller()->getEntryBlock()));
  return CallSiteFreq < CallerEntryFreq * ColdProb;
}
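// Illustrative example: with the default ColdCallSiteRelFreq of 2, a callsite
// whose block frequency is below 2% of the caller's entry frequency is
// treated as cold, and updateThreshold() later caps its threshold at
// Params.ColdCallSiteThreshold.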
std::optional<int>
InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
                                                BlockFrequencyInfo *CallerBFI) {

  // If a global profile summary is available, then the callsite's hotness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(Call, CallerBFI))
    return Params.HotCallSiteThreshold;

  // Otherwise we need BFI to be available and to have a locally hot callsite
  // threshold.
  if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
    return std::nullopt;

  // Determine if the callsite is hot relative to caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
  const BasicBlock *CallSiteBB = Call.getParent();
  BlockFrequency CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
  BlockFrequency CallerEntryFreq = CallerBFI->getEntryFreq();
  std::optional<BlockFrequency> Limit = CallerEntryFreq.mul(HotCallSiteRelFreq);
  if (Limit && CallSiteFreq >= *Limit)
    return Params.LocallyHotCallSiteThreshold;

  // Otherwise treat it normally.
  return std::nullopt;
}
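// Illustrative example: with the default HotCallSiteRelFreq of 60, a callsite
// executed at least 60x as often as the caller's entry is given
// Params.LocallyHotCallSiteThreshold (default 525) instead of the base
// threshold.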
void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
  // If no size growth is allowed for this inlining, set Threshold to 0.
  if (!allowSizeGrowth(Call)) {
    Threshold = 0;
    return;
  }

  Function *Caller = Call.getCaller();

  // return min(A, B) if B is valid.
  auto MinIfValid = [](int A, std::optional<int> B) {
    return B ? std::min(A, *B) : A;
  };

  // return max(A, B) if B is valid.
  auto MaxIfValid = [](int A, std::optional<int> B) {
    return B ? std::max(A, *B) : A;
  };

  // Various bonus percentages. These are multiplied by Threshold to get the
  // bonus values.
  // SingleBBBonus: This bonus is applied if the callee has a single reachable
  // basic block at the given callsite context. This is speculatively applied
  // and withdrawn if more than one basic block is seen.
  //
  // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
  // of the last call to a static function as inlining such functions is
  // guaranteed to reduce code size.
  //
  // These bonus percentages may be set to 0 based on properties of the caller
  // and the callsite.
  int SingleBBBonusPercent = 50;
  int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
  int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;

  // Lambda to set all the above bonus and bonus percentages to 0.
  auto DisallowAllBonuses = [&]() {
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
    LastCallToStaticBonus = 0;
  };

  // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
  // and reduce the threshold if the caller has the necessary attribute.
  if (Caller->hasMinSize()) {
    Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
    // For minsize, we want to disable the single BB bonus and the vector
    // bonuses, but not the last-call-to-static bonus. Inlining the last call
    // to a static function will, at the minimum, eliminate the parameter
    // setup and call/return instructions.
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
  } else if (Caller->hasOptSize())
    Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);

  // Adjust the threshold based on inlinehint attribute and profile based
  // hotness information if the caller does not have MinSize attribute.
  if (!Caller->hasMinSize()) {
    if (Callee.hasFnAttribute(Attribute::InlineHint))
      Threshold = MaxIfValid(Threshold, Params.HintThreshold);

    // FIXME: After switching to the new passmanager, simplify the logic below
    // by checking only the callsite hotness/coldness as we will reliably
    // have local profile information.
    //
    // Callsite hotness and coldness can be determined if sample profile is
    // used (which adds hotness metadata to calls) or if caller's
    // BlockFrequencyInfo is available.
    BlockFrequencyInfo *CallerBFI = GetBFI ? &(GetBFI(*Caller)) : nullptr;
    auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI);
    if (!Caller->hasOptSize() && HotCallSiteThreshold) {
      LLVM_DEBUG(dbgs() << "Hot callsite.\n");
      // FIXME: This should update the threshold only if it exceeds the
      // current threshold, but AutoFDO + ThinLTO currently relies on this
      // behavior to prevent inlining of hot callsites during ThinLTO
      // compile phase.
      Threshold = *HotCallSiteThreshold;
    } else if (isColdCallSite(Call, CallerBFI)) {
      LLVM_DEBUG(dbgs() << "Cold callsite.\n");
      // Do not apply bonuses for a cold callsite including the
      // LastCallToStatic bonus. While this bonus might result in code size
      // reduction, it can cause the size of a non-cold caller to increase
      // preventing it from being inlined.
      DisallowAllBonuses();
      Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
    } else if (PSI) {
      // Use callee's global profile information only if we have no way of
      // determining this via callsite information.
      if (PSI->isFunctionEntryHot(&Callee)) {
        LLVM_DEBUG(dbgs() << "Hot callee.\n");
        // If callsite hotness can not be determined, we may still know
        // that the callee is hot and treat it as a weaker hint for threshold
        // increase.
        Threshold = MaxIfValid(Threshold, Params.HintThreshold);
      } else if (PSI->isFunctionEntryCold(&Callee)) {
        LLVM_DEBUG(dbgs() << "Cold callee.\n");
        // Do not apply bonuses for a cold callee including the
        // LastCallToStatic bonus. While this bonus might result in code size
        // reduction, it can cause the size of a non-cold caller to increase
        // preventing it from being inlined.
        DisallowAllBonuses();
        Threshold = MinIfValid(Threshold, Params.ColdThreshold);
      }
    }
  }

  Threshold += TTI.adjustInliningThreshold(&Call);

  // Finally, take the target-specific inlining threshold multiplier into
  // account.
  Threshold *= TTI.getInliningThresholdMultiplier();

  SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
  VectorBonus = Threshold * VectorBonusPercent / 100;

  // If there is only one call of the function, and it has internal linkage,
  // the cost of inlining it drops dramatically. It may seem odd to update
  // Cost in updateThreshold, but the bonus depends on the logic in this
  // method.
  if (isSoleCallToLocalFunction(Call, F)) {
    Cost -= LastCallToStaticBonus;
    StaticBonusApplied = LastCallToStaticBonus;
  }
}
bool CallAnalyzer::visitCmpInst(CmpInst &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  // First try to handle simplified comparisons.
  if (simplifyInstruction(I))
    return true;

  if (I.getOpcode() == Instruction::FCmp)
    return false;

  // Otherwise look for a comparison between constant offset pointers with
  // a common base.
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the icmp to a constant based on the
      // offsets.
      SimplifiedValues[&I] = ConstantInt::getBool(
          I.getType(),
          ICmpInst::compare(LHSOffset, RHSOffset, I.getPredicate()));
      ++NumConstantPtrCmps;
      return true;
    }
  }

  auto isImplicitNullCheckCmp = [](const CmpInst &I) {
    for (auto *User : I.users())
      if (auto *Instr = dyn_cast<Instruction>(User))
        if (!Instr->getMetadata(LLVMContext::MD_make_implicit))
          return false;
    return true;
  };

  // If the comparison is an equality comparison with null, we can simplify it
  // if we know the value (argument) can't be null.
  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1))) {
    if (isKnownNonNullInCallee(I.getOperand(0))) {
      bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
      SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
                                        : ConstantInt::getFalse(I.getType());
      return true;
    }
    // Implicit null checks act as unconditional branches and their comparisons
    // should be treated as simplified and free of cost.
    if (isImplicitNullCheckCmp(I))
      return true;
  }
  return handleSROA(I.getOperand(0), isa<ConstantPointerNull>(I.getOperand(1)));
}

bool CallAnalyzer::visitSub(BinaryOperator &I) {
  // Try to handle a special case: we can fold computing the difference of two
  // constant-related pointers.
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the subtract to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrDiffs;
        return true;
      }
    }
  }

  // Otherwise, fall back to the generic logic for simplifying and handling
  // instructions.
  return Base::visitSub(I);
}
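// Illustrative example: if %p and %q are tracked in ConstantOffsetPtrs as
// (%base, 16) and (%base, 4), then a pointer difference computed via
// 'sub (ptrtoint %p), (ptrtoint %q)' folds to the constant 12 here.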
bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Constant *CLHS = dyn_cast<Constant>(LHS);
  if (!CLHS)
    CLHS = SimplifiedValues.lookup(LHS);
  Constant *CRHS = dyn_cast<Constant>(RHS);
  if (!CRHS)
    CRHS = SimplifiedValues.lookup(RHS);

  Value *SimpleV = nullptr;
  if (auto FI = dyn_cast<FPMathOperator>(&I))
    SimpleV = simplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS,
                            FI->getFastMathFlags(), DL);
  else
    SimpleV =
        simplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);

  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
    SimplifiedValues[&I] = C;

  if (SimpleV)
    return true;

  // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
  disableSROA(LHS);
  disableSROA(RHS);

  // If the instruction is floating point, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such. Unless it's fneg which can be implemented with an xor.
  using namespace llvm::PatternMatch;
  if (I.getType()->isFloatingPointTy() &&
      TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive &&
      !match(&I, m_FNeg(m_Value())))
    onCallPenalty();

  return false;
}

bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
  Value *Op = I.getOperand(0);
  Constant *COp = dyn_cast<Constant>(Op);
  if (!COp)
    COp = SimplifiedValues.lookup(Op);

  Value *SimpleV = simplifyFNegInst(
      COp ? COp : Op, cast<FPMathOperator>(I).getFastMathFlags(), DL);

  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
    SimplifiedValues[&I] = C;

  if (SimpleV)
    return true;

  // Disable any SROA on arguments to arbitrary, unsimplified fneg.
  disableSROA(Op);

  return false;
}

bool CallAnalyzer::visitLoad(LoadInst &I) {
  if (handleSROA(I.getPointerOperand(), I.isSimple()))
    return true;

  // If the data is already loaded from this address and hasn't been clobbered
  // by any stores or calls, this load is likely to be redundant and can be
  // eliminated.
  if (EnableLoadElimination &&
      !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
    onLoadEliminationOpportunity();
    return true;
  }

  onMemAccess();
  return false;
}

bool CallAnalyzer::visitStore(StoreInst &I) {
  if (handleSROA(I.getPointerOperand(), I.isSimple()))
    return true;

  // The store can potentially clobber loads and prevent repeated loads from
  // being eliminated.
  // FIXME:
  // 1. We can probably keep an initial set of eliminatable loads subtracted
  // from the cost even when we finally see a store. We just need to disable
  // *further* accumulation of elimination savings.
  // 2. We should probably at some point thread MemorySSA for the callee into
  // this and then use that to actually compute *really* precise savings.
  disableLoadElimination();

  onMemAccess();
  return false;
}

bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
  // Constant folding for extract value is trivial.
  if (simplifyInstruction(I))
    return true;

  // SROA can't look through these, but they may be free.
  return Base::visitExtractValue(I);
}

bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
  // Constant folding for insert value is trivial.
  if (simplifyInstruction(I))
    return true;

  // SROA can't look through these, but they may be free.
  return Base::visitInsertValue(I);
}
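// Illustrative example: a second 'load i32, ptr %p' with no intervening store
// or opaque call is credited as a load-elimination opportunity rather than a
// memory access; any later store then discards further savings of this kind.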
/// Try to simplify a call site.
///
/// Takes a concrete function and callsite and tries to actually simplify it by
/// analyzing the arguments and call itself with instsimplify. Returns true if
/// it has simplified the callsite to some other entity (a constant), making it
/// free.
bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) {
  // FIXME: Using the instsimplify logic directly for this is inefficient
  // because we have to continually rebuild the argument list even when no
  // simplifications can be performed. Until that is fixed with remapping
  // inside of instsimplify, directly constant fold calls here.
  if (!canConstantFoldCallTo(&Call, F))
    return false;

  // Try to re-map the arguments to constants.
  SmallVector<Constant *, 4> ConstantArgs;
  ConstantArgs.reserve(Call.arg_size());
  for (Value *I : Call.args()) {
    Constant *C = dyn_cast<Constant>(I);
    if (!C)
      C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(I));
    if (!C)
      return false; // This argument doesn't map to a constant.

    ConstantArgs.push_back(C);
  }
  if (Constant *C = ConstantFoldCall(&Call, F, ConstantArgs)) {
    SimplifiedValues[&Call] = C;
    return true;
  }

  return false;
}

bool CallAnalyzer::visitCallBase(CallBase &Call) {
  if (!onCallBaseVisitStart(Call))
    return true;

  if (Call.hasFnAttr(Attribute::ReturnsTwice) &&
      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
    // This aborts the entire analysis.
    ExposesReturnsTwice = true;
    return false;
  }
  if (isa<CallInst>(Call) && cast<CallInst>(Call).cannotDuplicate())
    ContainsNoDuplicateCall = true;

  Function *F = Call.getCalledFunction();
  bool IsIndirectCall = !F;
  if (IsIndirectCall) {
    // Check if this happens to be an indirect function call to a known
    // function in this inline context. If not, we've done all we can.
    Value *Callee = Call.getCalledOperand();
    F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
    if (!F || F->getFunctionType() != Call.getFunctionType()) {
      onCallArgumentSetup(Call);

      if (!Call.onlyReadsMemory())
        disableLoadElimination();
      return Base::visitCallBase(Call);
    }
  }

  assert(F && "Expected a call to a known function");

  // When we have a concrete function, first try to simplify it directly.
  if (simplifyCallSite(F, Call))
    return true;

  // Next check if it is an intrinsic we know about.
  // FIXME: Lift this into part of the InstVisitor.
  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Call)) {
    switch (II->getIntrinsicID()) {
    default:
      if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
        disableLoadElimination();
      return Base::visitCallBase(Call);

    case Intrinsic::load_relative:
      onLoadRelativeIntrinsic();
      return false;

    case Intrinsic::memset:
    case Intrinsic::memcpy:
    case Intrinsic::memmove:
      disableLoadElimination();
      // SROA can usually chew through these intrinsics, but they aren't free.
      return false;
    case Intrinsic::icall_branch_funnel:
    case Intrinsic::localescape:
      HasUninlineableIntrinsic = true;
      return false;
    case Intrinsic::vastart:
      InitsVargArgs = true;
      return false;
    case Intrinsic::launder_invariant_group:
    case Intrinsic::strip_invariant_group:
      if (auto *SROAArg = getSROAArgForValueOrNull(II->getOperand(0)))
        SROAArgValues[II] = SROAArg;
      return true;
    case Intrinsic::is_constant:
      return simplifyIntrinsicCallIsConstant(Call);
    case Intrinsic::objectsize:
      return simplifyIntrinsicCallObjectSize(Call);
    }
  }
  if (F == Call.getFunction()) {
    // This flag will fully abort the analysis, so don't bother with anything
    // else.
    IsRecursiveCall = true;
    if (!AllowRecursiveCall)
      return false;
  }

  if (TTI.isLoweredToCall(F)) {
    onLoweredCall(F, Call, IsIndirectCall);
  }

  if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory())))
    disableLoadElimination();
  return Base::visitCallBase(Call);
}

bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
  // At least one return instruction will be free after inlining.
  bool Free = !HasReturn;
  HasReturn = true;
  return Free;
}

bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
  // We model unconditional branches as essentially free -- they really
  // shouldn't exist at all, but handling them makes the behavior of the
  // inliner more regular and predictable. Interestingly, conditional branches
  // which will fold away are also free.
  return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
         BI.getMetadata(LLVMContext::MD_make_implicit) ||
         isa_and_nonnull<ConstantInt>(
             SimplifiedValues.lookup(BI.getCondition()));
}
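// Illustrative example: 'br i1 %c, label %then, label %else' where %c has
// simplified to true is modeled as free, and findDeadBlocks() can then mark
// %else dead if it has no other live predecessors.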
bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
  bool CheckSROA = SI.getType()->isPointerTy();
  Value *TrueVal = SI.getTrueValue();
  Value *FalseVal = SI.getFalseValue();

  Constant *TrueC = dyn_cast<Constant>(TrueVal);
  if (!TrueC)
    TrueC = SimplifiedValues.lookup(TrueVal);
  Constant *FalseC = dyn_cast<Constant>(FalseVal);
  if (!FalseC)
    FalseC = SimplifiedValues.lookup(FalseVal);
  Constant *CondC =
      dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));

  if (!CondC) {
    // Select C, X, X => X
    if (TrueC == FalseC && TrueC) {
      SimplifiedValues[&SI] = TrueC;
      return true;
    }

    if (!CheckSROA)
      return Base::visitSelectInst(SI);

    std::pair<Value *, APInt> TrueBaseAndOffset =
        ConstantOffsetPtrs.lookup(TrueVal);
    std::pair<Value *, APInt> FalseBaseAndOffset =
        ConstantOffsetPtrs.lookup(FalseVal);
    if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
      ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;

      if (auto *SROAArg = getSROAArgForValueOrNull(TrueVal))
        SROAArgValues[&SI] = SROAArg;
      return true;
    }

    return Base::visitSelectInst(SI);
  }

  // Select condition is a constant.
  Value *SelectedV = CondC->isAllOnesValue()  ? TrueVal
                     : (CondC->isNullValue()) ? FalseVal
                                              : nullptr;
  if (!SelectedV) {
    // Condition is a vector constant that is not all 1s or all 0s. If all
    // operands are constants, ConstantFoldSelectInstruction() can handle the
    // cases such as select vectors.
    if (TrueC && FalseC) {
      if (auto *C = ConstantFoldSelectInstruction(CondC, TrueC, FalseC)) {
        SimplifiedValues[&SI] = C;
        return true;
      }
    }
    return Base::visitSelectInst(SI);
  }

  // Condition is either all 1s or all 0s. SI can be simplified.
  if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
    SimplifiedValues[&SI] = SelectedC;
    return true;
  }

  if (!CheckSROA)
    return true;

  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(SelectedV);
  if (BaseAndOffset.first) {
    ConstantOffsetPtrs[&SI] = BaseAndOffset;

    if (auto *SROAArg = getSROAArgForValueOrNull(SelectedV))
      SROAArgValues[&SI] = SROAArg;
  }

  return true;
}

bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
  // We model unconditional switches as free, see the comments on handling
  // branches.
  if (isa<ConstantInt>(SI.getCondition()))
    return true;
  if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
    if (isa<ConstantInt>(V))
      return true;

  // Assume the most general case where the switch is lowered into
  // either a jump table, bit test, or a balanced binary tree consisting of
  // case clusters without merging adjacent clusters with the same
  // destination. We do not consider the switches that are lowered with a mix
  // of jump table/bit test/binary search tree. The cost of the switch is
  // proportional to the size of the tree or the size of jump table range.
  //
  // NB: We convert large switches which are just used to initialize large phi
  // nodes to lookup tables instead in simplifycfg, so this shouldn't prevent
  // inlining those. It will prevent inlining in cases where the optimization
  // does not (yet) fire.

  unsigned JumpTableSize = 0;
  BlockFrequencyInfo *BFI = GetBFI ? &(GetBFI(F)) : nullptr;
  unsigned NumCaseCluster =
      TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI);

  onFinalizeSwitch(JumpTableSize, NumCaseCluster, SI.defaultDestUndefined());
  return false;
}
bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
  // We never want to inline functions that contain an indirectbr. This is
  // incorrect because all the blockaddress's (in static global initializers
  // for example) would be referring to the original function, and this
  // indirect jump would jump from the inlined copy of the function into the
  // original function which is extremely undefined behavior.
  // FIXME: This logic isn't really right; we can safely inline functions with
  // indirectbr's as long as no other function or global references the
  // blockaddress of a block within the current function.
  HasIndirectBr = true;
  return false;
}

bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a resume instruction.
  return false;
}

bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a cleanupret instruction.
  return false;
}

bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a catchret instruction.
  return false;
}

bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
  // FIXME: It might be reasonable to discount the cost of instructions leading
  // to unreachable as they have the lowest possible impact on both runtime and
  // code size.
  return true; // No actual code is needed for unreachable.
}

bool CallAnalyzer::visitInstruction(Instruction &I) {
  // Some instructions are free. All of the free intrinsics can also be
  // handled by SROA, etc.
  if (TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
      TargetTransformInfo::TCC_Free)
    return true;

  // We found something we don't understand or can't handle. Mark any SROA-able
  // values in the operand list as no longer viable.
  for (const Use &Op : I.operands())
    disableSROA(Op);

  return false;
}
/// Analyze a basic block for its contribution to the inline cost.
///
/// This method walks the analyzer over every instruction in the given basic
/// block and accounts for their cost during inlining at this callsite. It
/// aborts early if the threshold has been exceeded or an impossible-to-inline
/// construct has been detected. It returns false if inlining is no longer
/// viable, and true if inlining remains viable.
InlineResult
CallAnalyzer::analyzeBlock(BasicBlock *BB,
                           SmallPtrSetImpl<const Value *> &EphValues) {
  for (Instruction &I : *BB) {
    // FIXME: Currently, the number of instructions in a function, regardless
    // of our ability to simplify them during inlining to constants or dead
    // code, is actually used by the vector bonus heuristic. As long as that's
    // true, we have to special case debug intrinsics here to prevent
    // differences in inlining due to debug symbols. Eventually, the number of
    // unsimplified instructions shouldn't factor into the cost computation,
    // but until then, hack around it here.
    // Similarly, skip pseudo-probes.
    if (I.isDebugOrPseudoInst())
      continue;

    // Skip ephemeral values.
    if (EphValues.count(&I))
      continue;

    ++NumInstructions;
    if (isa<ExtractElementInst>(I) || I.getType()->isVectorTy())
      ++NumVectorInstructions;

    // If the instruction simplified to a constant, there is no cost to this
    // instruction. Visit the instructions using our InstVisitor to account for
    // all of the per-instruction logic. The visit tree returns true if we
    // consumed the instruction in any way, and false if the instruction's base
    // cost should count against inlining.
    onInstructionAnalysisStart(&I);

    if (Base::visit(&I))
      ++NumInstructionsSimplified;
    else
      onMissedSimplification();

    onInstructionAnalysisFinish(&I);
    using namespace ore;
    // If the visit of this instruction detected an uninlinable pattern, abort.
    InlineResult IR = InlineResult::success();
    if (IsRecursiveCall && !AllowRecursiveCall)
      IR = InlineResult::failure("recursive");
    else if (ExposesReturnsTwice)
      IR = InlineResult::failure("exposes returns twice");
    else if (HasDynamicAlloca)
      IR = InlineResult::failure("dynamic alloca");
    else if (HasIndirectBr)
      IR = InlineResult::failure("indirect branch");
    else if (HasUninlineableIntrinsic)
      IR = InlineResult::failure("uninlinable intrinsic");
    else if (InitsVargArgs)
      IR = InlineResult::failure("varargs");
    if (!IR.isSuccess()) {
      if (ORE)
        ORE->emit([&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
                                          &CandidateCall)
                 << NV("Callee", &F) << " has uninlinable pattern ("
                 << NV("InlineResult", IR.getFailureReason())
                 << ") and cost is not fully computed";
        });
      return IR;
    }

    // If the caller is a recursive function then we don't want to inline
    // functions which allocate a lot of stack space because it would increase
    // the caller stack usage dramatically.
    if (IsCallerRecursive && AllocatedSize > RecurStackSizeThreshold) {
      auto IR =
          InlineResult::failure("recursive and allocates too much stack space");
      if (ORE)
        ORE->emit([&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
                                          &CandidateCall)
                 << NV("Callee", &F) << " is "
                 << NV("InlineResult", IR.getFailureReason())
                 << ". Cost is not fully computed";
        });
      return IR;
    }

    if (shouldStop())
      return InlineResult::failure(
          "Call site analysis is not favorable to inlining.");
  }

  return InlineResult::success();
}
/// Compute the base pointer and cumulative constant offsets for V.
///
/// This strips all constant offsets off of V, leaving it the base pointer, and
/// accumulates the total constant offset applied in the returned constant. It
/// returns nullptr if V is not a pointer, and returns the constant '0' if
/// there are no constant offsets applied.
ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
  if (!V->getType()->isPointerTy())
    return nullptr;

  unsigned AS = V->getType()->getPointerAddressSpace();
  unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
  APInt Offset = APInt::getZero(IntPtrWidth);

  // Even though we don't look through PHI nodes, we could be called on an
  // instruction in an unreachable block, which may be on a cycle.
  SmallPtrSet<Value *, 4> Visited;
  Visited.insert(V);
  do {
    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
      if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
        return nullptr;
      V = GEP->getPointerOperand();
    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
      if (GA->isInterposable())
        break;
      V = GA->getAliasee();
    } else {
      break;
    }
    assert(V->getType()->isPointerTy() && "Unexpected operand type!");
  } while (Visited.insert(V).second);

  Type *IdxPtrTy = DL.getIndexType(V->getType());
  return cast<ConstantInt>(ConstantInt::get(IdxPtrTy, Offset));
}

/// Find dead blocks due to deleted CFG edges during inlining.
///
/// If we know the successor of the current block, \p CurrBB, has to be \p
/// NextBB, the other successors of \p CurrBB are dead if these successors have
/// no live incoming CFG edges. If one block is found to be dead, we can
/// continue growing the dead block list by checking the successors of the dead
/// blocks to see if all their incoming edges are dead or not.
void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
  auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
    // A CFG edge is dead if the predecessor is dead or the predecessor has a
    // known successor which is not the one under exam.
    return (DeadBlocks.count(Pred) ||
            (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
  };

  auto IsNewlyDead = [&](BasicBlock *BB) {
    // If all the edges to a block are dead, the block is also dead.
    return (!DeadBlocks.count(BB) &&
            llvm::all_of(predecessors(BB),
                         [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
  };

  for (BasicBlock *Succ : successors(CurrBB)) {
    if (Succ == NextBB || !IsNewlyDead(Succ))
      continue;
    SmallVector<BasicBlock *, 4> NewDead;
    NewDead.push_back(Succ);
    while (!NewDead.empty()) {
      BasicBlock *Dead = NewDead.pop_back_val();
      if (DeadBlocks.insert(Dead).second)
        // Continue growing the dead block lists.
        for (BasicBlock *S : successors(Dead))
          if (IsNewlyDead(S))
            NewDead.push_back(S);
    }
  }
}
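// Illustrative example: if the terminator of %entry is known to branch to
// %then, then %else is dead unless it has another live predecessor, and any
// block reachable only through %else is transitively marked dead as well.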
/// Analyze a call site for potential inlining.
///
/// Returns true if inlining this call is viable, and false if it is not
/// viable. It computes the cost and adjusts the threshold based on numerous
/// factors and heuristics. If this method returns false but the computed cost
/// is below the computed threshold, then inlining was forcibly disabled by
/// some artifact of the routine.
InlineResult CallAnalyzer::analyze() {
  ++NumCallsAnalyzed;

  auto Result = onAnalysisStart();
  if (!Result.isSuccess())
    return Result;

  if (F.empty())
    return InlineResult::success();

  Function *Caller = CandidateCall.getFunction();
  // Check if the caller function is recursive itself.
  for (User *U : Caller->users()) {
    CallBase *Call = dyn_cast<CallBase>(U);
    if (Call && Call->getFunction() == Caller) {
      IsCallerRecursive = true;
      break;
    }
  }

  // Populate our simplified values by mapping from function arguments to call
  // arguments with known important simplifications.
  auto CAI = CandidateCall.arg_begin();
  for (Argument &FAI : F.args()) {
    assert(CAI != CandidateCall.arg_end());
    if (Constant *C = dyn_cast<Constant>(CAI))
      SimplifiedValues[&FAI] = C;

    Value *PtrArg = *CAI;
    if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
      ConstantOffsetPtrs[&FAI] = std::make_pair(PtrArg, C->getValue());

      // We can SROA any pointer arguments derived from alloca instructions.
      if (auto *SROAArg = dyn_cast<AllocaInst>(PtrArg)) {
        SROAArgValues[&FAI] = SROAArg;
        onInitializeSROAArg(SROAArg);
        EnabledSROAAllocas.insert(SROAArg);
      }
    }
    ++CAI;
  }
  NumConstantArgs = SimplifiedValues.size();
  NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
  NumAllocaArgs = SROAArgValues.size();

  // FIXME: If a caller has multiple calls to a callee, we end up recomputing
  // the ephemeral values multiple times (and they're completely determined by
  // the callee, so this is purely duplicate work).
  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);

  // The worklist of live basic blocks in the callee *after* inlining. We avoid
  // adding basic blocks of the callee which can be proven to be dead for this
  // particular call site in order to get more accurate cost estimates. This
  // requires a somewhat heavyweight iteration pattern: we need to walk the
  // basic blocks in a breadth-first order as we insert live successors. To
  // accomplish this, prioritizing for small iterations because we exit after
  // crossing our threshold, we use a small-size optimized SetVector.
  typedef SmallSetVector<BasicBlock *, 16> BBSetVector;
  BBSetVector BBWorklist;
  BBWorklist.insert(&F.getEntryBlock());

  // Note that we *must not* cache the size, this loop grows the worklist.
  for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
    if (shouldStop())
      break;

    BasicBlock *BB = BBWorklist[Idx];
    if (BB->empty())
      continue;

    onBlockStart(BB);
    // Disallow inlining a blockaddress with uses other than strictly callbr.
    // A blockaddress only has defined behavior for an indirect branch in the
    // same function, and we do not currently support inlining indirect
    // branches. But, the inliner may not see an indirect branch that ends up
    // being dead code at a particular call site. If the blockaddress escapes
    // the function, e.g., via a global variable, inlining may lead to an
    // invalid cross-function reference.
    // FIXME: pr/39560: continue relaxing this overt restriction.
    if (BB->hasAddressTaken())
      for (User *U : BlockAddress::get(&*BB)->users())
        if (!isa<CallBrInst>(*U))
          return InlineResult::failure("blockaddress used outside of callbr");

    // Analyze the cost of this block. If we blow through the threshold, this
    // returns false, and we can bail out.
    InlineResult IR = analyzeBlock(BB, EphValues);
    if (!IR.isSuccess())
      return IR;

    Instruction *TI = BB->getTerminator();

    // Add in the live successors by first checking whether we have a
    // terminator that may be simplified based on the values simplified by
    // this call.
    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      if (BI->isConditional()) {
        Value *Cond = BI->getCondition();
        if (ConstantInt *SimpleCond =
                dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
          BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
          BBWorklist.insert(NextBB);
          KnownSuccessors[BB] = NextBB;
          findDeadBlocks(BB, NextBB);
          continue;
        }
      }
    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
      Value *Cond = SI->getCondition();
      if (ConstantInt *SimpleCond =
              dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
        BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
        BBWorklist.insert(NextBB);
        KnownSuccessors[BB] = NextBB;
        findDeadBlocks(BB, NextBB);
        continue;
      }
    }

    // If we're unable to select a particular successor, just count all of
    // them.
    for (BasicBlock *Succ : successors(BB))
      BBWorklist.insert(Succ);

    onBlockAnalyzed(BB);
  }

  // If this is a noduplicate call, we can still inline as long as
  // inlining this would cause the removal of the caller (so the instruction
  // is not actually duplicated, just moved).
  if (!isSoleCallToLocalFunction(CandidateCall, F) && ContainsNoDuplicateCall)
    return InlineResult::failure("noduplicate");

  // If the callee's stack size exceeds the user-specified threshold,
  // do not let it be inlined.
  // The command line option overrides a limit set in the function attributes.
  size_t FinalStackSizeThreshold = StackSizeThreshold;
  if (!StackSizeThreshold.getNumOccurrences())
    if (std::optional<int> AttrMaxStackSize = getStringFnAttrAsInt(
            Caller, InlineConstants::MaxInlineStackSizeAttributeName))
      FinalStackSizeThreshold = *AttrMaxStackSize;
  if (AllocatedSize > FinalStackSizeThreshold)
    return InlineResult::failure("stacksize");

  return finalizeAnalysis();
}

void InlineCostCallAnalyzer::print(raw_ostream &OS) {
#define DEBUG_PRINT_STAT(x) OS << " " #x ": " << x << "\n"
  if (PrintInstructionComments)
    F.print(OS, &Writer);
  DEBUG_PRINT_STAT(NumConstantArgs);
  DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
  DEBUG_PRINT_STAT(NumAllocaArgs);
  DEBUG_PRINT_STAT(NumConstantPtrCmps);
  DEBUG_PRINT_STAT(NumConstantPtrDiffs);
  DEBUG_PRINT_STAT(NumInstructionsSimplified);
  DEBUG_PRINT_STAT(NumInstructions);
  DEBUG_PRINT_STAT(SROACostSavings);
  DEBUG_PRINT_STAT(SROACostSavingsLost);
  DEBUG_PRINT_STAT(LoadEliminationCost);
  DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
  DEBUG_PRINT_STAT(Cost);
  DEBUG_PRINT_STAT(Threshold);
#undef DEBUG_PRINT_STAT
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Dump stats about this call's analysis.
LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { print(dbgs()); }
#endif

/// Test that there are no attribute conflicts between Caller and Callee
/// that prevent inlining.
static bool functionsHaveCompatibleAttributes(
    Function *Caller, Function *Callee, TargetTransformInfo &TTI,
    function_ref<const TargetLibraryInfo &(Function &)> &GetTLI) {
  // Note that CalleeTLI must be a copy not a reference. The legacy pass
  // manager caches the most recently created TLI in the
  // TargetLibraryInfoWrapperPass object, and always returns the same object
  // (which is overwritten on each GetTLI call). Therefore we copy the first
  // result.
  auto CalleeTLI = GetTLI(*Callee);
  return (IgnoreTTIInlineCompatible ||
          TTI.areInlineCompatible(Caller, Callee)) &&
         GetTLI(*Caller).areInlineCompatible(CalleeTLI,
                                             InlineCallerSupersetNoBuiltin) &&
         AttributeFuncs::areInlineCompatible(*Caller, *Callee);
}

int llvm::getCallsiteCost(const TargetTransformInfo &TTI, const CallBase &Call,
                          const DataLayout &DL) {
  int64_t Cost = 0;
  for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) {
    if (Call.isByValArgument(I)) {
      // We approximate the number of loads and stores needed by dividing the
      // size of the byval type by the target's pointer size.
      PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
      unsigned TypeSize = DL.getTypeSizeInBits(Call.getParamByValType(I));
      unsigned AS = PTy->getAddressSpace();
      unsigned PointerSize = DL.getPointerSizeInBits(AS);
      // Ceiling division.
      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;

      // If it generates more than 8 stores it is likely to be expanded as an
      // inline memcpy so we take that as an upper bound. Otherwise we assume
      // one load and one store per word copied.
      // FIXME: The maxStoresPerMemcpy setting from the target should be used
      // here instead of a magic number of 8, but it's not available via
      // DataLayout.
      NumStores = std::min(NumStores, 8U);

      Cost += 2 * NumStores * InstrCost;
    } else {
      // For non-byval arguments subtract off one instruction per call
      // argument.
      Cost += InstrCost;
    }
  }
  // The call instruction also disappears after inlining.
  Cost += InstrCost;
  Cost += TTI.getInlineCallPenalty(Call.getCaller(), Call, CallPenalty);

  return std::min<int64_t>(Cost, INT_MAX);
}
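// Illustrative example: a 128-byte byval argument with 64-bit pointers needs
// ceil(1024 / 64) = 16 word copies, capped at 8, contributing
// 2 * 8 * InstrCost = 80 with the default per-instruction cost of 5.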
InlineCost llvm::getInlineCost(
    CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
  return getInlineCost(Call, Call.getCalledFunction(), Params, CalleeTTI,
                       GetAssumptionCache, GetTLI, GetBFI, PSI, ORE);
}

std::optional<int> llvm::getInliningCostEstimate(
    CallBase &Call, TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
  const InlineParams Params = {/* DefaultThreshold*/ 0,
                               /*HintThreshold*/ {},
                               /*ColdThreshold*/ {},
                               /*OptSizeThreshold*/ {},
                               /*OptMinSizeThreshold*/ {},
                               /*HotCallSiteThreshold*/ {},
                               /*LocallyHotCallSiteThreshold*/ {},
                               /*ColdCallSiteThreshold*/ {},
                               /*ComputeFullInlineCost*/ true,
                               /*EnableDeferral*/ true};

  InlineCostCallAnalyzer CA(*Call.getCalledFunction(), Call, Params, CalleeTTI,
                            GetAssumptionCache, GetBFI, PSI, ORE, true,
                            /*IgnoreThreshold*/ true);
  auto R = CA.analyze();
  if (!R.isSuccess())
    return std::nullopt;
  return CA.getCost();
}

std::optional<InlineCostFeatures> llvm::getInliningCostFeatures(
    CallBase &Call, TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
  InlineCostFeaturesAnalyzer CFA(CalleeTTI, GetAssumptionCache, GetBFI, PSI,
                                 ORE, *Call.getCalledFunction(), Call);
  auto R = CFA.analyze();
  if (!R.isSuccess())
    return std::nullopt;
  return CFA.features();
}
std::optional<InlineResult> llvm::getAttributeBasedInliningDecision(
    CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI,
    function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {

  // Cannot inline indirect calls.
  if (!Callee)
    return InlineResult::failure("indirect call");

  // When a callee coroutine function is inlined into a caller coroutine
  // function before the coro-split pass, the coro-early pass cannot handle
  // this case well. So we don't inline a coroutine function that has not yet
  // been split.
  if (Callee->isPresplitCoroutine())
    return InlineResult::failure("unsplited coroutine call");

  // Never inline calls with byval arguments that do not have the alloca
  // address space. Since byval arguments can be replaced with a copy to an
  // alloca, the inlined code would need to be adjusted to handle the fact
  // that the argument is in the alloca address space (so it is a little bit
  // complicated to solve).
  unsigned AllocaAS = Callee->getDataLayout().getAllocaAddrSpace();
  for (unsigned I = 0, E = Call.arg_size(); I != E; ++I)
    if (Call.isByValArgument(I)) {
      PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
      if (PTy->getAddressSpace() != AllocaAS)
        return InlineResult::failure("byval arguments without alloca"
                                     " address space");
    }

  // Calls to functions with always-inline attributes should be inlined
  // whenever possible.
  if (Call.hasFnAttr(Attribute::AlwaysInline)) {
    if (Call.getAttributes().hasFnAttr(Attribute::NoInline))
      return InlineResult::failure("noinline call site attribute");

    auto IsViable = isInlineViable(*Callee);
    if (IsViable.isSuccess())
      return InlineResult::success();
    return InlineResult::failure(IsViable.getFailureReason());
  }

  // Never inline functions with conflicting attributes (unless callee has
  // always-inline attribute).
  Function *Caller = Call.getCaller();
  if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI, GetTLI))
    return InlineResult::failure("conflicting attributes");

  // Don't inline this call if the caller has the optnone attribute.
  if (Caller->hasOptNone())
    return InlineResult::failure("optnone attribute");

  // Don't inline a function that treats null pointer as valid into a caller
  // that does not have this attribute.
  if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
    return InlineResult::failure("nullptr definitions incompatible");

  // Don't inline functions which can be interposed at link-time.
  if (Callee->isInterposable())
    return InlineResult::failure("interposable");

  // Don't inline functions marked noinline.
  if (Callee->hasFnAttribute(Attribute::NoInline))
    return InlineResult::failure("noinline function attribute");

  // Don't inline call sites marked noinline.
  if (Call.isNoInline())
    return InlineResult::failure("noinline call site attribute");

  return std::nullopt;
}
(caller:" << Call.getCaller()->getName()3069<< ")\n");30703071InlineCostCallAnalyzer CA(*Callee, Call, Params, CalleeTTI,3072GetAssumptionCache, GetBFI, PSI, ORE);3073InlineResult ShouldInline = CA.analyze();30743075LLVM_DEBUG(CA.dump());30763077// Always make cost benefit based decision explicit.3078// We use always/never here since threshold is not meaningful,3079// as it's not what drives cost-benefit analysis.3080if (CA.wasDecidedByCostBenefit()) {3081if (ShouldInline.isSuccess())3082return InlineCost::getAlways("benefit over cost",3083CA.getCostBenefitPair());3084else3085return InlineCost::getNever("cost over benefit", CA.getCostBenefitPair());3086}30873088if (CA.wasDecidedByCostThreshold())3089return InlineCost::get(CA.getCost(), CA.getThreshold(),3090CA.getStaticBonusApplied());30913092// No details on how the decision was made, simply return always or never.3093return ShouldInline.isSuccess()3094? InlineCost::getAlways("empty function")3095: InlineCost::getNever(ShouldInline.getFailureReason());3096}30973098InlineResult llvm::isInlineViable(Function &F) {3099bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);3100for (BasicBlock &BB : F) {3101// Disallow inlining of functions which contain indirect branches.3102if (isa<IndirectBrInst>(BB.getTerminator()))3103return InlineResult::failure("contains indirect branches");31043105// Disallow inlining of blockaddresses which are used by non-callbr3106// instructions.3107if (BB.hasAddressTaken())3108for (User *U : BlockAddress::get(&BB)->users())3109if (!isa<CallBrInst>(*U))3110return InlineResult::failure("blockaddress used outside of callbr");31113112for (auto &II : BB) {3113CallBase *Call = dyn_cast<CallBase>(&II);3114if (!Call)3115continue;31163117// Disallow recursive calls.3118Function *Callee = Call->getCalledFunction();3119if (&F == Callee)3120return InlineResult::failure("recursive call");31213122// Disallow calls which expose returns-twice to a function not previously3123// attributed as such.3124if (!ReturnsTwice && isa<CallInst>(Call) &&3125cast<CallInst>(Call)->canReturnTwice())3126return InlineResult::failure("exposes returns-twice attribute");31273128if (Callee)3129switch (Callee->getIntrinsicID()) {3130default:3131break;3132case llvm::Intrinsic::icall_branch_funnel:3133// Disallow inlining of @llvm.icall.branch.funnel because current3134// backend can't separate call targets from call arguments.3135return InlineResult::failure(3136"disallowed inlining of @llvm.icall.branch.funnel");3137case llvm::Intrinsic::localescape:3138// Disallow inlining functions that call @llvm.localescape. Doing this3139// correctly would require major changes to the inliner.3140return InlineResult::failure(3141"disallowed inlining of @llvm.localescape");3142case llvm::Intrinsic::vastart:3143// Disallow inlining of functions that initialize VarArgs with3144// va_start.3145return InlineResult::failure(3146"contains VarArgs initialized with va_start");3147}3148}3149}31503151return InlineResult::success();3152}31533154// APIs to create InlineParams based on command line flags and/or other3155// parameters.31563157InlineParams llvm::getInlineParams(int Threshold) {3158InlineParams Params;31593160// This field is the threshold to use for a callee by default. 
// APIs to create InlineParams based on command line flags and/or other
// parameters.

InlineParams llvm::getInlineParams(int Threshold) {
  InlineParams Params;

  // This field is the threshold to use for a callee by default. This is
  // derived from one or more of:
  //  * optimization or size-optimization levels,
  //  * a value passed to the createFunctionInliningPass function, or
  //  * the -inline-threshold flag.
  // If the -inline-threshold flag is explicitly specified, that is used
  // irrespective of anything else.
  if (InlineThreshold.getNumOccurrences() > 0)
    Params.DefaultThreshold = InlineThreshold;
  else
    Params.DefaultThreshold = Threshold;

  // Set the HintThreshold knob from the -inlinehint-threshold.
  Params.HintThreshold = HintThreshold;

  // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
  Params.HotCallSiteThreshold = HotCallSiteThreshold;

  // If the -locally-hot-callsite-threshold is explicitly specified, use it to
  // populate LocallyHotCallSiteThreshold. Later, we populate
  // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold
  // if we know that the optimization level is O3 (in the getInlineParams
  // variant that takes the opt and size levels).
  // FIXME: Remove this check (and make the assignment unconditional) after
  // addressing size regression issues at O2.
  if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;

  // Set the ColdCallSiteThreshold knob from the
  // -inline-cold-callsite-threshold.
  Params.ColdCallSiteThreshold = ColdCallSiteThreshold;

  // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
  // -inline-threshold command-line option is not explicitly given. If that
  // option is present, then its value applies even for callees with size and
  // minsize attributes.
  // If -inline-threshold is not specified, set the ColdThreshold from the
  // -inlinecold-threshold even if it is not explicitly passed. If
  // -inline-threshold is specified, then -inlinecold-threshold needs to be
  // explicitly specified to set the ColdThreshold knob.
  if (InlineThreshold.getNumOccurrences() == 0) {
    Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
    Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
    Params.ColdThreshold = ColdThreshold;
  } else if (ColdThreshold.getNumOccurrences() > 0) {
    Params.ColdThreshold = ColdThreshold;
  }
  return Params;
}

InlineParams llvm::getInlineParams() {
  return getInlineParams(DefaultThreshold);
}

// Compute the default threshold for inlining based on the opt level and the
// size opt level.
static int computeThresholdFromOptLevels(unsigned OptLevel,
                                         unsigned SizeOptLevel) {
  if (OptLevel > 2)
    return InlineConstants::OptAggressiveThreshold;
  if (SizeOptLevel == 1) // -Os
    return InlineConstants::OptSizeThreshold;
  if (SizeOptLevel == 2) // -Oz
    return InlineConstants::OptMinSizeThreshold;
  return DefaultThreshold;
}
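
// Worked examples for the mapping above, using the constants referenced by
// this file (the opt-level check comes first, so -O3 combined with a size
// level still takes the aggressive threshold):
//   OptLevel > 2  (-O3)         -> InlineConstants::OptAggressiveThreshold
//   SizeOptLevel == 1 (-Os)     -> InlineConstants::OptSizeThreshold
//   SizeOptLevel == 2 (-Oz)     -> InlineConstants::OptMinSizeThreshold
//   otherwise (e.g. plain -O2)  -> DefaultThreshold (225 unless
//                                  -inlinedefault-threshold is given)
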
InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
  auto Params =
      getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
  // At O3, use the value of the -locally-hot-callsite-threshold option to
  // populate Params.LocallyHotCallSiteThreshold. Below O3, this flag has an
  // effect only when it is specified explicitly.
  if (OptLevel > 2)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
  return Params;
}

PreservedAnalyses
InlineCostAnnotationPrinterPass::run(Function &F,
                                     FunctionAnalysisManager &FAM) {
  PrintInstructionComments = true;
  std::function<AssumptionCache &(Function &)> GetAssumptionCache =
      [&](Function &F) -> AssumptionCache & {
    return FAM.getResult<AssumptionAnalysis>(F);
  };
  Module *M = F.getParent();
  ProfileSummaryInfo PSI(*M);
  DataLayout DL(M);
  TargetTransformInfo TTI(DL);
  // FIXME: Redesign the usage of InlineParams to expand the scope of this
  // pass. In the current implementation, the type of InlineParams doesn't
  // matter as the pass serves only for verification of the inliner's
  // decisions. We can add a flag which determines InlineParams for this run.
  // Right now, the default InlineParams are used.
  const InlineParams Params = llvm::getInlineParams();
  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      if (CallInst *CI = dyn_cast<CallInst>(&I)) {
        Function *CalledFunction = CI->getCalledFunction();
        if (!CalledFunction || CalledFunction->isDeclaration())
          continue;
        OptimizationRemarkEmitter ORE(CalledFunction);
        InlineCostCallAnalyzer ICCA(*CalledFunction, *CI, Params, TTI,
                                    GetAssumptionCache, nullptr, &PSI, &ORE);
        ICCA.analyze();
        OS << " Analyzing call of " << CalledFunction->getName()
           << "... (caller:" << CI->getCaller()->getName() << ")\n";
        ICCA.print(OS);
        OS << "\n";
      }
    }
  }
  return PreservedAnalyses::all();
}
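
// Usage sketch for the annotation printer above, assuming the pass name
// registered in PassRegistry.def (verify against your LLVM version):
//
//   opt -passes='print<inline-cost>' -disable-output input.ll
//
// This runs InlineCostCallAnalyzer on every direct call to a defined callee
// in each function and prints the per-instruction cost annotations enabled
// by PrintInstructionComments.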