Path: blob/main/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp
35266 views
//===- PartialInlining.cpp - Inline parts of functions --------------------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7//8// This pass performs partial inlining, typically by inlining an if statement9// that surrounds the body of the function.10//11//===----------------------------------------------------------------------===//1213#include "llvm/Transforms/IPO/PartialInlining.h"14#include "llvm/ADT/DenseMap.h"15#include "llvm/ADT/DenseSet.h"16#include "llvm/ADT/DepthFirstIterator.h"17#include "llvm/ADT/STLExtras.h"18#include "llvm/ADT/SmallVector.h"19#include "llvm/ADT/Statistic.h"20#include "llvm/Analysis/BlockFrequencyInfo.h"21#include "llvm/Analysis/BranchProbabilityInfo.h"22#include "llvm/Analysis/InlineCost.h"23#include "llvm/Analysis/LoopInfo.h"24#include "llvm/Analysis/OptimizationRemarkEmitter.h"25#include "llvm/Analysis/ProfileSummaryInfo.h"26#include "llvm/Analysis/TargetLibraryInfo.h"27#include "llvm/Analysis/TargetTransformInfo.h"28#include "llvm/IR/Attributes.h"29#include "llvm/IR/BasicBlock.h"30#include "llvm/IR/CFG.h"31#include "llvm/IR/DebugLoc.h"32#include "llvm/IR/DiagnosticInfo.h"33#include "llvm/IR/Dominators.h"34#include "llvm/IR/Function.h"35#include "llvm/IR/InstrTypes.h"36#include "llvm/IR/Instruction.h"37#include "llvm/IR/Instructions.h"38#include "llvm/IR/IntrinsicInst.h"39#include "llvm/IR/Intrinsics.h"40#include "llvm/IR/Module.h"41#include "llvm/IR/Operator.h"42#include "llvm/IR/ProfDataUtils.h"43#include "llvm/IR/User.h"44#include "llvm/Support/BlockFrequency.h"45#include "llvm/Support/BranchProbability.h"46#include "llvm/Support/Casting.h"47#include "llvm/Support/CommandLine.h"48#include "llvm/Support/ErrorHandling.h"49#include "llvm/Transforms/IPO.h"50#include "llvm/Transforms/Utils/Cloning.h"51#include "llvm/Transforms/Utils/CodeExtractor.h"52#include "llvm/Transforms/Utils/ValueMapper.h"53#include <algorithm>54#include <cassert>55#include <cstdint>56#include <memory>57#include <tuple>58#include <vector>5960using namespace llvm;6162#define DEBUG_TYPE "partial-inlining"6364STATISTIC(NumPartialInlined,65"Number of callsites functions partially inlined into.");66STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "67"cold outlined regions were partially "68"inlined into its caller(s).");69STATISTIC(NumColdRegionsFound,70"Number of cold single entry/exit regions found.");71STATISTIC(NumColdRegionsOutlined,72"Number of cold single entry/exit regions outlined.");7374// Command line option to disable partial-inlining. The default is false:75static cl::opt<bool>76DisablePartialInlining("disable-partial-inlining", cl::init(false),77cl::Hidden, cl::desc("Disable partial inlining"));78// Command line option to disable multi-region partial-inlining. The default is79// false:80static cl::opt<bool> DisableMultiRegionPartialInline(81"disable-mr-partial-inlining", cl::init(false), cl::Hidden,82cl::desc("Disable multi-region partial inlining"));8384// Command line option to force outlining in regions with live exit variables.85// The default is false:86static cl::opt<bool>87ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,88cl::desc("Force outline regions with live exits"));8990// Command line option to enable marking outline functions with Cold Calling91// Convention. The default is false:92static cl::opt<bool>93MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,94cl::desc("Mark outline function calls with ColdCC"));9596// This is an option used by testing:97static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",9899cl::ReallyHidden,100cl::desc("Skip Cost Analysis"));101// Used to determine if a cold region is worth outlining based on102// its inlining cost compared to the original function. Default is set at 10%.103// ie. if the cold region reduces the inlining cost of the original function by104// at least 10%.105static cl::opt<float> MinRegionSizeRatio(106"min-region-size-ratio", cl::init(0.1), cl::Hidden,107cl::desc("Minimum ratio comparing relative sizes of each "108"outline candidate and original function"));109// Used to tune the minimum number of execution counts needed in the predecessor110// block to the cold edge. ie. confidence interval.111static cl::opt<unsigned>112MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,113cl::desc("Minimum block executions to consider "114"its BranchProbabilityInfo valid"));115// Used to determine when an edge is considered cold. Default is set to 10%. ie.116// if the branch probability is 10% or less, then it is deemed as 'cold'.117static cl::opt<float> ColdBranchRatio(118"cold-branch-ratio", cl::init(0.1), cl::Hidden,119cl::desc("Minimum BranchProbability to consider a region cold."));120121static cl::opt<unsigned> MaxNumInlineBlocks(122"max-num-inline-blocks", cl::init(5), cl::Hidden,123cl::desc("Max number of blocks to be partially inlined"));124125// Command line option to set the maximum number of partial inlining allowed126// for the module. The default value of -1 means no limit.127static cl::opt<int> MaxNumPartialInlining(128"max-partial-inlining", cl::init(-1), cl::Hidden,129cl::desc("Max number of partial inlining. The default is unlimited"));130131// Used only when PGO or user annotated branch data is absent. It is132// the least value that is used to weigh the outline region. If BFI133// produces larger value, the BFI value will be used.134static cl::opt<int>135OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),136cl::Hidden,137cl::desc("Relative frequency of outline region to "138"the entry block"));139140static cl::opt<unsigned> ExtraOutliningPenalty(141"partial-inlining-extra-penalty", cl::init(0), cl::Hidden,142cl::desc("A debug option to add additional penalty to the computed one."));143144namespace {145146struct FunctionOutliningInfo {147FunctionOutliningInfo() = default;148149// Returns the number of blocks to be inlined including all blocks150// in Entries and one return block.151unsigned getNumInlinedBlocks() const { return Entries.size() + 1; }152153// A set of blocks including the function entry that guard154// the region to be outlined.155SmallVector<BasicBlock *, 4> Entries;156157// The return block that is not included in the outlined region.158BasicBlock *ReturnBlock = nullptr;159160// The dominating block of the region to be outlined.161BasicBlock *NonReturnBlock = nullptr;162163// The set of blocks in Entries that are predecessors to ReturnBlock164SmallVector<BasicBlock *, 4> ReturnBlockPreds;165};166167struct FunctionOutliningMultiRegionInfo {168FunctionOutliningMultiRegionInfo() = default;169170// Container for outline regions171struct OutlineRegionInfo {172OutlineRegionInfo(ArrayRef<BasicBlock *> Region,173BasicBlock *EntryBlock, BasicBlock *ExitBlock,174BasicBlock *ReturnBlock)175: Region(Region.begin(), Region.end()), EntryBlock(EntryBlock),176ExitBlock(ExitBlock), ReturnBlock(ReturnBlock) {}177SmallVector<BasicBlock *, 8> Region;178BasicBlock *EntryBlock;179BasicBlock *ExitBlock;180BasicBlock *ReturnBlock;181};182183SmallVector<OutlineRegionInfo, 4> ORI;184};185186struct PartialInlinerImpl {187188PartialInlinerImpl(189function_ref<AssumptionCache &(Function &)> GetAC,190function_ref<AssumptionCache *(Function &)> LookupAC,191function_ref<TargetTransformInfo &(Function &)> GTTI,192function_ref<const TargetLibraryInfo &(Function &)> GTLI,193ProfileSummaryInfo &ProfSI,194function_ref<BlockFrequencyInfo &(Function &)> GBFI = nullptr)195: GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC),196GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {}197198bool run(Module &M);199// Main part of the transformation that calls helper functions to find200// outlining candidates, clone & outline the function, and attempt to201// partially inline the resulting function. Returns true if202// inlining was successful, false otherwise. Also returns the outline203// function (only if we partially inlined early returns) as there is a204// possibility to further "peel" early return statements that were left in the205// outline function due to code size.206std::pair<bool, Function *> unswitchFunction(Function &F);207208// This class speculatively clones the function to be partial inlined.209// At the end of partial inlining, the remaining callsites to the cloned210// function that are not partially inlined will be fixed up to reference211// the original function, and the cloned function will be erased.212struct FunctionCloner {213// Two constructors, one for single region outlining, the other for214// multi-region outlining.215FunctionCloner(Function *F, FunctionOutliningInfo *OI,216OptimizationRemarkEmitter &ORE,217function_ref<AssumptionCache *(Function &)> LookupAC,218function_ref<TargetTransformInfo &(Function &)> GetTTI);219FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,220OptimizationRemarkEmitter &ORE,221function_ref<AssumptionCache *(Function &)> LookupAC,222function_ref<TargetTransformInfo &(Function &)> GetTTI);223224~FunctionCloner();225226// Prepare for function outlining: making sure there is only227// one incoming edge from the extracted/outlined region to228// the return block.229void normalizeReturnBlock() const;230231// Do function outlining for cold regions.232bool doMultiRegionFunctionOutlining();233// Do function outlining for region after early return block(s).234// NOTE: For vararg functions that do the vararg handling in the outlined235// function, we temporarily generate IR that does not properly236// forward varargs to the outlined function. Calling InlineFunction237// will update calls to the outlined functions to properly forward238// the varargs.239Function *doSingleRegionFunctionOutlining();240241Function *OrigFunc = nullptr;242Function *ClonedFunc = nullptr;243244typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;245// Keep track of Outlined Functions and the basic block they're called from.246SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;247248// ClonedFunc is inlined in one of its callers after function249// outlining.250bool IsFunctionInlined = false;251// The cost of the region to be outlined.252InstructionCost OutlinedRegionCost = 0;253// ClonedOI is specific to outlining non-early return blocks.254std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;255// ClonedOMRI is specific to outlining cold regions.256std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;257std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;258OptimizationRemarkEmitter &ORE;259function_ref<AssumptionCache *(Function &)> LookupAC;260function_ref<TargetTransformInfo &(Function &)> GetTTI;261};262263private:264int NumPartialInlining = 0;265function_ref<AssumptionCache &(Function &)> GetAssumptionCache;266function_ref<AssumptionCache *(Function &)> LookupAssumptionCache;267function_ref<TargetTransformInfo &(Function &)> GetTTI;268function_ref<BlockFrequencyInfo &(Function &)> GetBFI;269function_ref<const TargetLibraryInfo &(Function &)> GetTLI;270ProfileSummaryInfo &PSI;271272// Return the frequency of the OutlininingBB relative to F's entry point.273// The result is no larger than 1 and is represented using BP.274// (Note that the outlined region's 'head' block can only have incoming275// edges from the guarding entry blocks).276BranchProbability277getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) const;278279// Return true if the callee of CB should be partially inlined with280// profit.281bool shouldPartialInline(CallBase &CB, FunctionCloner &Cloner,282BlockFrequency WeightedOutliningRcost,283OptimizationRemarkEmitter &ORE) const;284285// Try to inline DuplicateFunction (cloned from F with call to286// the OutlinedFunction into its callers. Return true287// if there is any successful inlining.288bool tryPartialInline(FunctionCloner &Cloner);289290// Compute the mapping from use site of DuplicationFunction to the enclosing291// BB's profile count.292void293computeCallsiteToProfCountMap(Function *DuplicateFunction,294DenseMap<User *, uint64_t> &SiteCountMap) const;295296bool isLimitReached() const {297return (MaxNumPartialInlining != -1 &&298NumPartialInlining >= MaxNumPartialInlining);299}300301static CallBase *getSupportedCallBase(User *U) {302if (isa<CallInst>(U) || isa<InvokeInst>(U))303return cast<CallBase>(U);304llvm_unreachable("All uses must be calls");305return nullptr;306}307308static CallBase *getOneCallSiteTo(Function &F) {309User *User = *F.user_begin();310return getSupportedCallBase(User);311}312313std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function &F) const {314CallBase *CB = getOneCallSiteTo(F);315DebugLoc DLoc = CB->getDebugLoc();316BasicBlock *Block = CB->getParent();317return std::make_tuple(DLoc, Block);318}319320// Returns the costs associated with function outlining:321// - The first value is the non-weighted runtime cost for making the call322// to the outlined function, including the addtional setup cost in the323// outlined function itself;324// - The second value is the estimated size of the new call sequence in325// basic block Cloner.OutliningCallBB;326std::tuple<InstructionCost, InstructionCost>327computeOutliningCosts(FunctionCloner &Cloner) const;328329// Compute the 'InlineCost' of block BB. InlineCost is a proxy used to330// approximate both the size and runtime cost (Note that in the current331// inline cost analysis, there is no clear distinction there either).332static InstructionCost computeBBInlineCost(BasicBlock *BB,333TargetTransformInfo *TTI);334335std::unique_ptr<FunctionOutliningInfo>336computeOutliningInfo(Function &F) const;337338std::unique_ptr<FunctionOutliningMultiRegionInfo>339computeOutliningColdRegionsInfo(Function &F,340OptimizationRemarkEmitter &ORE) const;341};342343} // end anonymous namespace344345std::unique_ptr<FunctionOutliningMultiRegionInfo>346PartialInlinerImpl::computeOutliningColdRegionsInfo(347Function &F, OptimizationRemarkEmitter &ORE) const {348BasicBlock *EntryBlock = &F.front();349350DominatorTree DT(F);351LoopInfo LI(DT);352BranchProbabilityInfo BPI(F, LI);353std::unique_ptr<BlockFrequencyInfo> ScopedBFI;354BlockFrequencyInfo *BFI;355if (!GetBFI) {356ScopedBFI.reset(new BlockFrequencyInfo(F, BPI, LI));357BFI = ScopedBFI.get();358} else359BFI = &(GetBFI(F));360361// Return if we don't have profiling information.362if (!PSI.hasInstrumentationProfile())363return std::unique_ptr<FunctionOutliningMultiRegionInfo>();364365std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =366std::make_unique<FunctionOutliningMultiRegionInfo>();367368auto IsSingleExit =369[&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {370BasicBlock *ExitBlock = nullptr;371for (auto *Block : BlockList) {372for (BasicBlock *Succ : successors(Block)) {373if (!is_contained(BlockList, Succ)) {374if (ExitBlock) {375ORE.emit([&]() {376return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",377&Succ->front())378<< "Region dominated by "379<< ore::NV("Block", BlockList.front()->getName())380<< " has more than one region exit edge.";381});382return nullptr;383}384385ExitBlock = Block;386}387}388}389return ExitBlock;390};391392auto BBProfileCount = [BFI](BasicBlock *BB) {393return BFI->getBlockProfileCount(BB).value_or(0);394};395396// Use the same computeBBInlineCost function to compute the cost savings of397// the outlining the candidate region.398TargetTransformInfo *FTTI = &GetTTI(F);399InstructionCost OverallFunctionCost = 0;400for (auto &BB : F)401OverallFunctionCost += computeBBInlineCost(&BB, FTTI);402403LLVM_DEBUG(dbgs() << "OverallFunctionCost = " << OverallFunctionCost404<< "\n";);405406InstructionCost MinOutlineRegionCost = OverallFunctionCost.map(407[&](auto Cost) { return Cost * MinRegionSizeRatio; });408409BranchProbability MinBranchProbability(410static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),411MinBlockCounterExecution);412bool ColdCandidateFound = false;413BasicBlock *CurrEntry = EntryBlock;414std::vector<BasicBlock *> DFS;415DenseMap<BasicBlock *, bool> VisitedMap;416DFS.push_back(CurrEntry);417VisitedMap[CurrEntry] = true;418419// Use Depth First Search on the basic blocks to find CFG edges that are420// considered cold.421// Cold regions considered must also have its inline cost compared to the422// overall inline cost of the original function. The region is outlined only423// if it reduced the inline cost of the function by 'MinOutlineRegionCost' or424// more.425while (!DFS.empty()) {426auto *ThisBB = DFS.back();427DFS.pop_back();428// Only consider regions with predecessor blocks that are considered429// not-cold (default: part of the top 99.99% of all block counters)430// AND greater than our minimum block execution count (default: 100).431if (PSI.isColdBlock(ThisBB, BFI) ||432BBProfileCount(ThisBB) < MinBlockCounterExecution)433continue;434for (auto SI = succ_begin(ThisBB); SI != succ_end(ThisBB); ++SI) {435if (VisitedMap[*SI])436continue;437VisitedMap[*SI] = true;438DFS.push_back(*SI);439// If branch isn't cold, we skip to the next one.440BranchProbability SuccProb = BPI.getEdgeProbability(ThisBB, *SI);441if (SuccProb > MinBranchProbability)442continue;443444LLVM_DEBUG(dbgs() << "Found cold edge: " << ThisBB->getName() << "->"445<< SI->getName()446<< "\nBranch Probability = " << SuccProb << "\n";);447448SmallVector<BasicBlock *, 8> DominateVector;449DT.getDescendants(*SI, DominateVector);450assert(!DominateVector.empty() &&451"SI should be reachable and have at least itself as descendant");452453// We can only outline single entry regions (for now).454if (!DominateVector.front()->hasNPredecessors(1)) {455LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()456<< " doesn't have a single predecessor in the "457"dominator tree\n";);458continue;459}460461BasicBlock *ExitBlock = nullptr;462// We can only outline single exit regions (for now).463if (!(ExitBlock = IsSingleExit(DominateVector))) {464LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()465<< " doesn't have a unique successor\n";);466continue;467}468469InstructionCost OutlineRegionCost = 0;470for (auto *BB : DominateVector)471OutlineRegionCost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));472473LLVM_DEBUG(dbgs() << "OutlineRegionCost = " << OutlineRegionCost474<< "\n";);475476if (!SkipCostAnalysis && OutlineRegionCost < MinOutlineRegionCost) {477ORE.emit([&]() {478return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",479&SI->front())480<< ore::NV("Callee", &F)481<< " inline cost-savings smaller than "482<< ore::NV("Cost", MinOutlineRegionCost);483});484485LLVM_DEBUG(dbgs() << "ABORT: Outline region cost is smaller than "486<< MinOutlineRegionCost << "\n";);487continue;488}489490// For now, ignore blocks that belong to a SISE region that is a491// candidate for outlining. In the future, we may want to look492// at inner regions because the outer region may have live-exit493// variables.494for (auto *BB : DominateVector)495VisitedMap[BB] = true;496497// ReturnBlock here means the block after the outline call498BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();499FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(500DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);501OutliningInfo->ORI.push_back(RegInfo);502LLVM_DEBUG(dbgs() << "Found Cold Candidate starting at block: "503<< DominateVector.front()->getName() << "\n";);504ColdCandidateFound = true;505NumColdRegionsFound++;506}507}508509if (ColdCandidateFound)510return OutliningInfo;511512return std::unique_ptr<FunctionOutliningMultiRegionInfo>();513}514515std::unique_ptr<FunctionOutliningInfo>516PartialInlinerImpl::computeOutliningInfo(Function &F) const {517BasicBlock *EntryBlock = &F.front();518BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());519if (!BR || BR->isUnconditional())520return std::unique_ptr<FunctionOutliningInfo>();521522// Returns true if Succ is BB's successor523auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {524return is_contained(successors(BB), Succ);525};526527auto IsReturnBlock = [](BasicBlock *BB) {528Instruction *TI = BB->getTerminator();529return isa<ReturnInst>(TI);530};531532auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {533if (IsReturnBlock(Succ1))534return std::make_tuple(Succ1, Succ2);535if (IsReturnBlock(Succ2))536return std::make_tuple(Succ2, Succ1);537538return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);539};540541// Detect a triangular shape:542auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {543if (IsSuccessor(Succ1, Succ2))544return std::make_tuple(Succ1, Succ2);545if (IsSuccessor(Succ2, Succ1))546return std::make_tuple(Succ2, Succ1);547548return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);549};550551std::unique_ptr<FunctionOutliningInfo> OutliningInfo =552std::make_unique<FunctionOutliningInfo>();553554BasicBlock *CurrEntry = EntryBlock;555bool CandidateFound = false;556do {557// The number of blocks to be inlined has already reached558// the limit. When MaxNumInlineBlocks is set to 0 or 1, this559// disables partial inlining for the function.560if (OutliningInfo->getNumInlinedBlocks() >= MaxNumInlineBlocks)561break;562563if (succ_size(CurrEntry) != 2)564break;565566BasicBlock *Succ1 = *succ_begin(CurrEntry);567BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);568569BasicBlock *ReturnBlock, *NonReturnBlock;570std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);571572if (ReturnBlock) {573OutliningInfo->Entries.push_back(CurrEntry);574OutliningInfo->ReturnBlock = ReturnBlock;575OutliningInfo->NonReturnBlock = NonReturnBlock;576CandidateFound = true;577break;578}579580BasicBlock *CommSucc, *OtherSucc;581std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);582583if (!CommSucc)584break;585586OutliningInfo->Entries.push_back(CurrEntry);587CurrEntry = OtherSucc;588} while (true);589590if (!CandidateFound)591return std::unique_ptr<FunctionOutliningInfo>();592593// There should not be any successors (not in the entry set) other than594// {ReturnBlock, NonReturnBlock}595assert(OutliningInfo->Entries[0] == &F.front() &&596"Function Entry must be the first in Entries vector");597DenseSet<BasicBlock *> Entries;598for (BasicBlock *E : OutliningInfo->Entries)599Entries.insert(E);600601// Returns true of BB has Predecessor which is not602// in Entries set.603auto HasNonEntryPred = [Entries](BasicBlock *BB) {604for (auto *Pred : predecessors(BB)) {605if (!Entries.count(Pred))606return true;607}608return false;609};610auto CheckAndNormalizeCandidate =611[Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {612for (BasicBlock *E : OutliningInfo->Entries) {613for (auto *Succ : successors(E)) {614if (Entries.count(Succ))615continue;616if (Succ == OutliningInfo->ReturnBlock)617OutliningInfo->ReturnBlockPreds.push_back(E);618else if (Succ != OutliningInfo->NonReturnBlock)619return false;620}621// There should not be any outside incoming edges either:622if (HasNonEntryPred(E))623return false;624}625return true;626};627628if (!CheckAndNormalizeCandidate(OutliningInfo.get()))629return std::unique_ptr<FunctionOutliningInfo>();630631// Now further growing the candidate's inlining region by632// peeling off dominating blocks from the outlining region:633while (OutliningInfo->getNumInlinedBlocks() < MaxNumInlineBlocks) {634BasicBlock *Cand = OutliningInfo->NonReturnBlock;635if (succ_size(Cand) != 2)636break;637638if (HasNonEntryPred(Cand))639break;640641BasicBlock *Succ1 = *succ_begin(Cand);642BasicBlock *Succ2 = *(succ_begin(Cand) + 1);643644BasicBlock *ReturnBlock, *NonReturnBlock;645std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);646if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)647break;648649if (NonReturnBlock->getSinglePredecessor() != Cand)650break;651652// Now grow and update OutlininigInfo:653OutliningInfo->Entries.push_back(Cand);654OutliningInfo->NonReturnBlock = NonReturnBlock;655OutliningInfo->ReturnBlockPreds.push_back(Cand);656Entries.insert(Cand);657}658659return OutliningInfo;660}661662// Check if there is PGO data or user annotated branch data:663static bool hasProfileData(const Function &F, const FunctionOutliningInfo &OI) {664if (F.hasProfileData())665return true;666// Now check if any of the entry block has MD_prof data:667for (auto *E : OI.Entries) {668BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());669if (!BR || BR->isUnconditional())670continue;671if (hasBranchWeightMD(*BR))672return true;673}674return false;675}676677BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(678FunctionCloner &Cloner) const {679BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;680auto EntryFreq =681Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());682auto OutliningCallFreq =683Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);684// FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE685// we outlined any regions, so we may encounter situations where the686// OutliningCallFreq is *slightly* bigger than the EntryFreq.687if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency())688OutliningCallFreq = EntryFreq;689690auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(691OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());692693if (hasProfileData(*Cloner.OrigFunc, *Cloner.ClonedOI))694return OutlineRegionRelFreq;695696// When profile data is not available, we need to be conservative in697// estimating the overall savings. Static branch prediction can usually698// guess the branch direction right (taken/non-taken), but the guessed699// branch probability is usually not biased enough. In case when the700// outlined region is predicted to be likely, its probability needs701// to be made higher (more biased) to not under-estimate the cost of702// function outlining. On the other hand, if the outlined region703// is predicted to be less likely, the predicted probablity is usually704// higher than the actual. For instance, the actual probability of the705// less likely target is only 5%, but the guessed probablity can be706// 40%. In the latter case, there is no need for further adjustment.707// FIXME: add an option for this.708if (OutlineRegionRelFreq < BranchProbability(45, 100))709return OutlineRegionRelFreq;710711OutlineRegionRelFreq = std::max(712OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));713714return OutlineRegionRelFreq;715}716717bool PartialInlinerImpl::shouldPartialInline(718CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,719OptimizationRemarkEmitter &ORE) const {720using namespace ore;721722Function *Callee = CB.getCalledFunction();723assert(Callee == Cloner.ClonedFunc);724725if (SkipCostAnalysis)726return isInlineViable(*Callee).isSuccess();727728Function *Caller = CB.getCaller();729auto &CalleeTTI = GetTTI(*Callee);730bool RemarksEnabled =731Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(732DEBUG_TYPE);733InlineCost IC =734getInlineCost(CB, getInlineParams(), CalleeTTI, GetAssumptionCache,735GetTLI, GetBFI, &PSI, RemarksEnabled ? &ORE : nullptr);736737if (IC.isAlways()) {738ORE.emit([&]() {739return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", &CB)740<< NV("Callee", Cloner.OrigFunc)741<< " should always be fully inlined, not partially";742});743return false;744}745746if (IC.isNever()) {747ORE.emit([&]() {748return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", &CB)749<< NV("Callee", Cloner.OrigFunc) << " not partially inlined into "750<< NV("Caller", Caller)751<< " because it should never be inlined (cost=never)";752});753return false;754}755756if (!IC) {757ORE.emit([&]() {758return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", &CB)759<< NV("Callee", Cloner.OrigFunc) << " not partially inlined into "760<< NV("Caller", Caller) << " because too costly to inline (cost="761<< NV("Cost", IC.getCost()) << ", threshold="762<< NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";763});764return false;765}766const DataLayout &DL = Caller->getDataLayout();767768// The savings of eliminating the call:769int NonWeightedSavings = getCallsiteCost(CalleeTTI, CB, DL);770BlockFrequency NormWeightedSavings(NonWeightedSavings);771772// Weighted saving is smaller than weighted cost, return false773if (NormWeightedSavings < WeightedOutliningRcost) {774ORE.emit([&]() {775return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",776&CB)777<< NV("Callee", Cloner.OrigFunc) << " not partially inlined into "778<< NV("Caller", Caller) << " runtime overhead (overhead="779<< NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())780<< ", savings="781<< NV("Savings", (unsigned)NormWeightedSavings.getFrequency())782<< ")"783<< " of making the outlined call is too high";784});785786return false;787}788789ORE.emit([&]() {790return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", &CB)791<< NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "792<< NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())793<< " (threshold="794<< NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";795});796return true;797}798799// TODO: Ideally we should share Inliner's InlineCost Analysis code.800// For now use a simplified version. The returned 'InlineCost' will be used801// to esimate the size cost as well as runtime cost of the BB.802InstructionCost803PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB,804TargetTransformInfo *TTI) {805InstructionCost InlineCost = 0;806const DataLayout &DL = BB->getDataLayout();807int InstrCost = InlineConstants::getInstrCost();808for (Instruction &I : BB->instructionsWithoutDebug()) {809// Skip free instructions.810switch (I.getOpcode()) {811case Instruction::BitCast:812case Instruction::PtrToInt:813case Instruction::IntToPtr:814case Instruction::Alloca:815case Instruction::PHI:816continue;817case Instruction::GetElementPtr:818if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())819continue;820break;821default:822break;823}824825if (I.isLifetimeStartOrEnd())826continue;827828if (auto *II = dyn_cast<IntrinsicInst>(&I)) {829Intrinsic::ID IID = II->getIntrinsicID();830SmallVector<Type *, 4> Tys;831FastMathFlags FMF;832for (Value *Val : II->args())833Tys.push_back(Val->getType());834835if (auto *FPMO = dyn_cast<FPMathOperator>(II))836FMF = FPMO->getFastMathFlags();837838IntrinsicCostAttributes ICA(IID, II->getType(), Tys, FMF);839InlineCost += TTI->getIntrinsicInstrCost(ICA, TTI::TCK_SizeAndLatency);840continue;841}842843if (CallInst *CI = dyn_cast<CallInst>(&I)) {844InlineCost += getCallsiteCost(*TTI, *CI, DL);845continue;846}847848if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {849InlineCost += getCallsiteCost(*TTI, *II, DL);850continue;851}852853if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {854InlineCost += (SI->getNumCases() + 1) * InstrCost;855continue;856}857InlineCost += InstrCost;858}859860return InlineCost;861}862863std::tuple<InstructionCost, InstructionCost>864PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) const {865InstructionCost OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;866for (auto FuncBBPair : Cloner.OutlinedFunctions) {867Function *OutlinedFunc = FuncBBPair.first;868BasicBlock* OutliningCallBB = FuncBBPair.second;869// Now compute the cost of the call sequence to the outlined function870// 'OutlinedFunction' in BB 'OutliningCallBB':871auto *OutlinedFuncTTI = &GetTTI(*OutlinedFunc);872OutliningFuncCallCost +=873computeBBInlineCost(OutliningCallBB, OutlinedFuncTTI);874875// Now compute the cost of the extracted/outlined function itself:876for (BasicBlock &BB : *OutlinedFunc)877OutlinedFunctionCost += computeBBInlineCost(&BB, OutlinedFuncTTI);878}879assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&880"Outlined function cost should be no less than the outlined region");881882// The code extractor introduces a new root and exit stub blocks with883// additional unconditional branches. Those branches will be eliminated884// later with bb layout. The cost should be adjusted accordingly:885OutlinedFunctionCost -=8862 * InlineConstants::getInstrCost() * Cloner.OutlinedFunctions.size();887888InstructionCost OutliningRuntimeOverhead =889OutliningFuncCallCost +890(OutlinedFunctionCost - Cloner.OutlinedRegionCost) +891ExtraOutliningPenalty.getValue();892893return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);894}895896// Create the callsite to profile count map which is897// used to update the original function's entry count,898// after the function is partially inlined into the callsite.899void PartialInlinerImpl::computeCallsiteToProfCountMap(900Function *DuplicateFunction,901DenseMap<User *, uint64_t> &CallSiteToProfCountMap) const {902std::vector<User *> Users(DuplicateFunction->user_begin(),903DuplicateFunction->user_end());904Function *CurrentCaller = nullptr;905std::unique_ptr<BlockFrequencyInfo> TempBFI;906BlockFrequencyInfo *CurrentCallerBFI = nullptr;907908auto ComputeCurrBFI = [&,this](Function *Caller) {909// For the old pass manager:910if (!GetBFI) {911DominatorTree DT(*Caller);912LoopInfo LI(DT);913BranchProbabilityInfo BPI(*Caller, LI);914TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));915CurrentCallerBFI = TempBFI.get();916} else {917// New pass manager:918CurrentCallerBFI = &(GetBFI(*Caller));919}920};921922for (User *User : Users) {923// Don't bother with BlockAddress used by CallBr for asm goto.924if (isa<BlockAddress>(User))925continue;926CallBase *CB = getSupportedCallBase(User);927Function *Caller = CB->getCaller();928if (CurrentCaller != Caller) {929CurrentCaller = Caller;930ComputeCurrBFI(Caller);931} else {932assert(CurrentCallerBFI && "CallerBFI is not set");933}934BasicBlock *CallBB = CB->getParent();935auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);936if (Count)937CallSiteToProfCountMap[User] = *Count;938else939CallSiteToProfCountMap[User] = 0;940}941}942943PartialInlinerImpl::FunctionCloner::FunctionCloner(944Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,945function_ref<AssumptionCache *(Function &)> LookupAC,946function_ref<TargetTransformInfo &(Function &)> GetTTI)947: OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {948ClonedOI = std::make_unique<FunctionOutliningInfo>();949950// Clone the function, so that we can hack away on it.951ValueToValueMapTy VMap;952ClonedFunc = CloneFunction(F, VMap);953954ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);955ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);956for (BasicBlock *BB : OI->Entries)957ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));958959for (BasicBlock *E : OI->ReturnBlockPreds) {960BasicBlock *NewE = cast<BasicBlock>(VMap[E]);961ClonedOI->ReturnBlockPreds.push_back(NewE);962}963// Go ahead and update all uses to the duplicate, so that we can just964// use the inliner functionality when we're done hacking.965F->replaceAllUsesWith(ClonedFunc);966}967968PartialInlinerImpl::FunctionCloner::FunctionCloner(969Function *F, FunctionOutliningMultiRegionInfo *OI,970OptimizationRemarkEmitter &ORE,971function_ref<AssumptionCache *(Function &)> LookupAC,972function_ref<TargetTransformInfo &(Function &)> GetTTI)973: OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {974ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>();975976// Clone the function, so that we can hack away on it.977ValueToValueMapTy VMap;978ClonedFunc = CloneFunction(F, VMap);979980// Go through all Outline Candidate Regions and update all BasicBlock981// information.982for (const FunctionOutliningMultiRegionInfo::OutlineRegionInfo &RegionInfo :983OI->ORI) {984SmallVector<BasicBlock *, 8> Region;985for (BasicBlock *BB : RegionInfo.Region)986Region.push_back(cast<BasicBlock>(VMap[BB]));987988BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);989BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);990BasicBlock *NewReturnBlock = nullptr;991if (RegionInfo.ReturnBlock)992NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);993FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(994Region, NewEntryBlock, NewExitBlock, NewReturnBlock);995ClonedOMRI->ORI.push_back(MappedRegionInfo);996}997// Go ahead and update all uses to the duplicate, so that we can just998// use the inliner functionality when we're done hacking.999F->replaceAllUsesWith(ClonedFunc);1000}10011002void PartialInlinerImpl::FunctionCloner::normalizeReturnBlock() const {1003auto GetFirstPHI = [](BasicBlock *BB) {1004BasicBlock::iterator I = BB->begin();1005PHINode *FirstPhi = nullptr;1006while (I != BB->end()) {1007PHINode *Phi = dyn_cast<PHINode>(I);1008if (!Phi)1009break;1010if (!FirstPhi) {1011FirstPhi = Phi;1012break;1013}1014}1015return FirstPhi;1016};10171018// Shouldn't need to normalize PHIs if we're not outlining non-early return1019// blocks.1020if (!ClonedOI)1021return;10221023// Special hackery is needed with PHI nodes that have inputs from more than1024// one extracted block. For simplicity, just split the PHIs into a two-level1025// sequence of PHIs, some of which will go in the extracted region, and some1026// of which will go outside.1027BasicBlock *PreReturn = ClonedOI->ReturnBlock;1028// only split block when necessary:1029PHINode *FirstPhi = GetFirstPHI(PreReturn);1030unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();10311032if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)1033return;10341035auto IsTrivialPhi = [](PHINode *PN) -> Value * {1036if (llvm::all_equal(PN->incoming_values()))1037return PN->getIncomingValue(0);1038return nullptr;1039};10401041ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(1042ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());1043BasicBlock::iterator I = PreReturn->begin();1044BasicBlock::iterator Ins = ClonedOI->ReturnBlock->begin();1045SmallVector<Instruction *, 4> DeadPhis;1046while (I != PreReturn->end()) {1047PHINode *OldPhi = dyn_cast<PHINode>(I);1048if (!OldPhi)1049break;10501051PHINode *RetPhi =1052PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "");1053RetPhi->insertBefore(Ins);1054OldPhi->replaceAllUsesWith(RetPhi);1055Ins = ClonedOI->ReturnBlock->getFirstNonPHIIt();10561057RetPhi->addIncoming(&*I, PreReturn);1058for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {1059RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);1060OldPhi->removeIncomingValue(E);1061}10621063// After incoming values splitting, the old phi may become trivial.1064// Keeping the trivial phi can introduce definition inside the outline1065// region which is live-out, causing necessary overhead (load, store1066// arg passing etc).1067if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {1068OldPhi->replaceAllUsesWith(OldPhiVal);1069DeadPhis.push_back(OldPhi);1070}1071++I;1072}1073for (auto *DP : DeadPhis)1074DP->eraseFromParent();10751076for (auto *E : ClonedOI->ReturnBlockPreds)1077E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);1078}10791080bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {10811082auto ComputeRegionCost =1083[&](SmallVectorImpl<BasicBlock *> &Region) -> InstructionCost {1084InstructionCost Cost = 0;1085for (BasicBlock* BB : Region)1086Cost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));1087return Cost;1088};10891090assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");10911092if (ClonedOMRI->ORI.empty())1093return false;10941095// The CodeExtractor needs a dominator tree.1096DominatorTree DT;1097DT.recalculate(*ClonedFunc);10981099// Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.1100LoopInfo LI(DT);1101BranchProbabilityInfo BPI(*ClonedFunc, LI);1102ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));11031104// Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.1105CodeExtractorAnalysisCache CEAC(*ClonedFunc);11061107SetVector<Value *> Inputs, Outputs, Sinks;1108for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :1109ClonedOMRI->ORI) {1110InstructionCost CurrentOutlinedRegionCost =1111ComputeRegionCost(RegionInfo.Region);11121113CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,1114ClonedFuncBFI.get(), &BPI,1115LookupAC(*RegionInfo.EntryBlock->getParent()),1116/* AllowVarargs */ false);11171118CE.findInputsOutputs(Inputs, Outputs, Sinks);11191120LLVM_DEBUG({1121dbgs() << "inputs: " << Inputs.size() << "\n";1122dbgs() << "outputs: " << Outputs.size() << "\n";1123for (Value *value : Inputs)1124dbgs() << "value used in func: " << *value << "\n";1125for (Value *output : Outputs)1126dbgs() << "instr used in func: " << *output << "\n";1127});11281129// Do not extract regions that have live exit variables.1130if (Outputs.size() > 0 && !ForceLiveExit)1131continue;11321133if (Function *OutlinedFunc = CE.extractCodeRegion(CEAC)) {1134CallBase *OCS = PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc);1135BasicBlock *OutliningCallBB = OCS->getParent();1136assert(OutliningCallBB->getParent() == ClonedFunc);1137OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));1138NumColdRegionsOutlined++;1139OutlinedRegionCost += CurrentOutlinedRegionCost;11401141if (MarkOutlinedColdCC) {1142OutlinedFunc->setCallingConv(CallingConv::Cold);1143OCS->setCallingConv(CallingConv::Cold);1144}1145} else1146ORE.emit([&]() {1147return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",1148&RegionInfo.Region.front()->front())1149<< "Failed to extract region at block "1150<< ore::NV("Block", RegionInfo.Region.front());1151});1152}11531154return !OutlinedFunctions.empty();1155}11561157Function *1158PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {1159// Returns true if the block is to be partial inlined into the caller1160// (i.e. not to be extracted to the out of line function)1161auto ToBeInlined = [&, this](BasicBlock *BB) {1162return BB == ClonedOI->ReturnBlock ||1163llvm::is_contained(ClonedOI->Entries, BB);1164};11651166assert(ClonedOI && "Expecting OutlineInfo for single region outline");1167// The CodeExtractor needs a dominator tree.1168DominatorTree DT;1169DT.recalculate(*ClonedFunc);11701171// Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.1172LoopInfo LI(DT);1173BranchProbabilityInfo BPI(*ClonedFunc, LI);1174ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));11751176// Gather up the blocks that we're going to extract.1177std::vector<BasicBlock *> ToExtract;1178auto *ClonedFuncTTI = &GetTTI(*ClonedFunc);1179ToExtract.push_back(ClonedOI->NonReturnBlock);1180OutlinedRegionCost += PartialInlinerImpl::computeBBInlineCost(1181ClonedOI->NonReturnBlock, ClonedFuncTTI);1182for (BasicBlock *BB : depth_first(&ClonedFunc->getEntryBlock()))1183if (!ToBeInlined(BB) && BB != ClonedOI->NonReturnBlock) {1184ToExtract.push_back(BB);1185// FIXME: the code extractor may hoist/sink more code1186// into the outlined function which may make the outlining1187// overhead (the difference of the outlined function cost1188// and OutliningRegionCost) look larger.1189OutlinedRegionCost += computeBBInlineCost(BB, ClonedFuncTTI);1190}11911192// Extract the body of the if.1193CodeExtractorAnalysisCache CEAC(*ClonedFunc);1194Function *OutlinedFunc =1195CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,1196ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc),1197/* AllowVarargs */ true)1198.extractCodeRegion(CEAC);11991200if (OutlinedFunc) {1201BasicBlock *OutliningCallBB =1202PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc)->getParent();1203assert(OutliningCallBB->getParent() == ClonedFunc);1204OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));1205} else1206ORE.emit([&]() {1207return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",1208&ToExtract.front()->front())1209<< "Failed to extract region at block "1210<< ore::NV("Block", ToExtract.front());1211});12121213return OutlinedFunc;1214}12151216PartialInlinerImpl::FunctionCloner::~FunctionCloner() {1217// Ditch the duplicate, since we're done with it, and rewrite all remaining1218// users (function pointers, etc.) back to the original function.1219ClonedFunc->replaceAllUsesWith(OrigFunc);1220ClonedFunc->eraseFromParent();1221if (!IsFunctionInlined) {1222// Remove each function that was speculatively created if there is no1223// reference.1224for (auto FuncBBPair : OutlinedFunctions) {1225Function *Func = FuncBBPair.first;1226Func->eraseFromParent();1227}1228}1229}12301231std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function &F) {1232if (F.hasAddressTaken())1233return {false, nullptr};12341235// Let inliner handle it1236if (F.hasFnAttribute(Attribute::AlwaysInline))1237return {false, nullptr};12381239if (F.hasFnAttribute(Attribute::NoInline))1240return {false, nullptr};12411242if (PSI.isFunctionEntryCold(&F))1243return {false, nullptr};12441245if (F.users().empty())1246return {false, nullptr};12471248OptimizationRemarkEmitter ORE(&F);12491250// Only try to outline cold regions if we have a profile summary, which1251// implies we have profiling information.1252if (PSI.hasProfileSummary() && F.hasProfileData() &&1253!DisableMultiRegionPartialInline) {1254std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =1255computeOutliningColdRegionsInfo(F, ORE);1256if (OMRI) {1257FunctionCloner Cloner(&F, OMRI.get(), ORE, LookupAssumptionCache, GetTTI);12581259LLVM_DEBUG({1260dbgs() << "HotCountThreshold = " << PSI.getHotCountThreshold() << "\n";1261dbgs() << "ColdCountThreshold = " << PSI.getColdCountThreshold()1262<< "\n";1263});12641265bool DidOutline = Cloner.doMultiRegionFunctionOutlining();12661267if (DidOutline) {1268LLVM_DEBUG({1269dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";1270Cloner.ClonedFunc->print(dbgs());1271dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";1272});12731274if (tryPartialInline(Cloner))1275return {true, nullptr};1276}1277}1278}12791280// Fall-thru to regular partial inlining if we:1281// i) can't find any cold regions to outline, or1282// ii) can't inline the outlined function anywhere.1283std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);1284if (!OI)1285return {false, nullptr};12861287FunctionCloner Cloner(&F, OI.get(), ORE, LookupAssumptionCache, GetTTI);1288Cloner.normalizeReturnBlock();12891290Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();12911292if (!OutlinedFunction)1293return {false, nullptr};12941295if (tryPartialInline(Cloner))1296return {true, OutlinedFunction};12971298return {false, nullptr};1299}13001301bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {1302if (Cloner.OutlinedFunctions.empty())1303return false;13041305auto OutliningCosts = computeOutliningCosts(Cloner);13061307InstructionCost SizeCost = std::get<0>(OutliningCosts);1308InstructionCost NonWeightedRcost = std::get<1>(OutliningCosts);13091310assert(SizeCost.isValid() && NonWeightedRcost.isValid() &&1311"Expected valid costs");13121313// Only calculate RelativeToEntryFreq when we are doing single region1314// outlining.1315BranchProbability RelativeToEntryFreq;1316if (Cloner.ClonedOI)1317RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);1318else1319// RelativeToEntryFreq doesn't make sense when we have more than one1320// outlined call because each call will have a different relative frequency1321// to the entry block. We can consider using the average, but the1322// usefulness of that information is questionable. For now, assume we never1323// execute the calls to outlined functions.1324RelativeToEntryFreq = BranchProbability(0, 1);13251326BlockFrequency WeightedRcost =1327BlockFrequency(*NonWeightedRcost.getValue()) * RelativeToEntryFreq;13281329// The call sequence(s) to the outlined function(s) are larger than the sum of1330// the original outlined region size(s), it does not increase the chances of1331// inlining the function with outlining (The inliner uses the size increase to1332// model the cost of inlining a callee).1333if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {1334OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);1335DebugLoc DLoc;1336BasicBlock *Block;1337std::tie(DLoc, Block) = getOneDebugLoc(*Cloner.ClonedFunc);1338OrigFuncORE.emit([&]() {1339return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",1340DLoc, Block)1341<< ore::NV("Function", Cloner.OrigFunc)1342<< " not partially inlined into callers (Original Size = "1343<< ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)1344<< ", Size of call sequence to outlined function = "1345<< ore::NV("NewSize", SizeCost) << ")";1346});1347return false;1348}13491350assert(Cloner.OrigFunc->users().empty() &&1351"F's users should all be replaced!");13521353std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),1354Cloner.ClonedFunc->user_end());13551356DenseMap<User *, uint64_t> CallSiteToProfCountMap;1357auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();1358if (CalleeEntryCount)1359computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);13601361uint64_t CalleeEntryCountV =1362(CalleeEntryCount ? CalleeEntryCount->getCount() : 0);13631364bool AnyInline = false;1365for (User *User : Users) {1366// Don't bother with BlockAddress used by CallBr for asm goto.1367if (isa<BlockAddress>(User))1368continue;13691370CallBase *CB = getSupportedCallBase(User);13711372if (isLimitReached())1373continue;13741375OptimizationRemarkEmitter CallerORE(CB->getCaller());1376if (!shouldPartialInline(*CB, Cloner, WeightedRcost, CallerORE))1377continue;13781379// Construct remark before doing the inlining, as after successful inlining1380// the callsite is removed.1381OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CB);1382OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "1383<< ore::NV("Caller", CB->getCaller());13841385InlineFunctionInfo IFI(GetAssumptionCache, &PSI);1386// We can only forward varargs when we outlined a single region, else we1387// bail on vararg functions.1388if (!InlineFunction(*CB, IFI, /*MergeAttributes=*/false, nullptr, true,1389(Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first1390: nullptr))1391.isSuccess())1392continue;13931394CallerORE.emit(OR);13951396// Now update the entry count:1397if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {1398uint64_t CallSiteCount = CallSiteToProfCountMap[User];1399CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);1400}14011402AnyInline = true;1403NumPartialInlining++;1404// Update the stats1405if (Cloner.ClonedOI)1406NumPartialInlined++;1407else1408NumColdOutlinePartialInlined++;1409}14101411if (AnyInline) {1412Cloner.IsFunctionInlined = true;1413if (CalleeEntryCount)1414Cloner.OrigFunc->setEntryCount(Function::ProfileCount(1415CalleeEntryCountV, CalleeEntryCount->getType()));1416OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);1417OrigFuncORE.emit([&]() {1418return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)1419<< "Partially inlined into at least one caller";1420});1421}14221423return AnyInline;1424}14251426bool PartialInlinerImpl::run(Module &M) {1427if (DisablePartialInlining)1428return false;14291430std::vector<Function *> Worklist;1431Worklist.reserve(M.size());1432for (Function &F : M)1433if (!F.use_empty() && !F.isDeclaration())1434Worklist.push_back(&F);14351436bool Changed = false;1437while (!Worklist.empty()) {1438Function *CurrFunc = Worklist.back();1439Worklist.pop_back();14401441if (CurrFunc->use_empty())1442continue;14431444std::pair<bool, Function *> Result = unswitchFunction(*CurrFunc);1445if (Result.second)1446Worklist.push_back(Result.second);1447Changed |= Result.first;1448}14491450return Changed;1451}14521453PreservedAnalyses PartialInlinerPass::run(Module &M,1454ModuleAnalysisManager &AM) {1455auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();14561457auto GetAssumptionCache = [&FAM](Function &F) -> AssumptionCache & {1458return FAM.getResult<AssumptionAnalysis>(F);1459};14601461auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * {1462return FAM.getCachedResult<AssumptionAnalysis>(F);1463};14641465auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {1466return FAM.getResult<BlockFrequencyAnalysis>(F);1467};14681469auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {1470return FAM.getResult<TargetIRAnalysis>(F);1471};14721473auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {1474return FAM.getResult<TargetLibraryAnalysis>(F);1475};14761477ProfileSummaryInfo &PSI = AM.getResult<ProfileSummaryAnalysis>(M);14781479if (PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,1480GetTLI, PSI, GetBFI)1481.run(M))1482return PreservedAnalyses::none();1483return PreservedAnalyses::all();1484}148514861487