Path: blob/main/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp
35266 views
//===- SampleProfile.cpp - Incorporate sample profiles into the IR --------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7//8// This file implements the SampleProfileLoader transformation. This pass9// reads a profile file generated by a sampling profiler (e.g. Linux Perf -10// http://perf.wiki.kernel.org/) and generates IR metadata to reflect the11// profile information in the given profile.12//13// This pass generates branch weight annotations on the IR:14//15// - prof: Represents branch weights. This annotation is added to branches16// to indicate the weights of each edge coming out of the branch.17// The weight of each edge is the weight of the target block for18// that edge. The weight of a block B is computed as the maximum19// number of samples found in B.20//21//===----------------------------------------------------------------------===//2223#include "llvm/Transforms/IPO/SampleProfile.h"24#include "llvm/ADT/ArrayRef.h"25#include "llvm/ADT/DenseMap.h"26#include "llvm/ADT/DenseSet.h"27#include "llvm/ADT/MapVector.h"28#include "llvm/ADT/PriorityQueue.h"29#include "llvm/ADT/SCCIterator.h"30#include "llvm/ADT/SmallVector.h"31#include "llvm/ADT/Statistic.h"32#include "llvm/ADT/StringMap.h"33#include "llvm/ADT/StringRef.h"34#include "llvm/ADT/Twine.h"35#include "llvm/Analysis/AssumptionCache.h"36#include "llvm/Analysis/BlockFrequencyInfoImpl.h"37#include "llvm/Analysis/InlineAdvisor.h"38#include "llvm/Analysis/InlineCost.h"39#include "llvm/Analysis/LazyCallGraph.h"40#include "llvm/Analysis/OptimizationRemarkEmitter.h"41#include "llvm/Analysis/ProfileSummaryInfo.h"42#include "llvm/Analysis/ReplayInlineAdvisor.h"43#include "llvm/Analysis/TargetLibraryInfo.h"44#include "llvm/Analysis/TargetTransformInfo.h"45#include "llvm/IR/BasicBlock.h"46#include "llvm/IR/DebugLoc.h"47#include "llvm/IR/DiagnosticInfo.h"48#include "llvm/IR/Function.h"49#include "llvm/IR/GlobalValue.h"50#include "llvm/IR/InstrTypes.h"51#include "llvm/IR/Instruction.h"52#include "llvm/IR/Instructions.h"53#include "llvm/IR/IntrinsicInst.h"54#include "llvm/IR/LLVMContext.h"55#include "llvm/IR/MDBuilder.h"56#include "llvm/IR/Module.h"57#include "llvm/IR/PassManager.h"58#include "llvm/IR/ProfDataUtils.h"59#include "llvm/IR/PseudoProbe.h"60#include "llvm/IR/ValueSymbolTable.h"61#include "llvm/ProfileData/InstrProf.h"62#include "llvm/ProfileData/SampleProf.h"63#include "llvm/ProfileData/SampleProfReader.h"64#include "llvm/Support/Casting.h"65#include "llvm/Support/CommandLine.h"66#include "llvm/Support/Debug.h"67#include "llvm/Support/ErrorOr.h"68#include "llvm/Support/VirtualFileSystem.h"69#include "llvm/Support/raw_ostream.h"70#include "llvm/Transforms/IPO.h"71#include "llvm/Transforms/IPO/ProfiledCallGraph.h"72#include "llvm/Transforms/IPO/SampleContextTracker.h"73#include "llvm/Transforms/IPO/SampleProfileMatcher.h"74#include "llvm/Transforms/IPO/SampleProfileProbe.h"75#include "llvm/Transforms/Instrumentation.h"76#include "llvm/Transforms/Utils/CallPromotionUtils.h"77#include "llvm/Transforms/Utils/Cloning.h"78#include "llvm/Transforms/Utils/MisExpect.h"79#include "llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h"80#include "llvm/Transforms/Utils/SampleProfileLoaderBaseUtil.h"81#include <algorithm>82#include <cassert>83#include <cstdint>84#include <functional>85#include <limits>86#include <map>87#include <memory>88#include <queue>89#include <string>90#include <system_error>91#include <utility>92#include <vector>9394using namespace llvm;95using namespace sampleprof;96using namespace llvm::sampleprofutil;97using ProfileCount = Function::ProfileCount;98#define DEBUG_TYPE "sample-profile"99#define CSINLINE_DEBUG DEBUG_TYPE "-inline"100101STATISTIC(NumCSInlined,102"Number of functions inlined with context sensitive profile");103STATISTIC(NumCSNotInlined,104"Number of functions not inlined with context sensitive profile");105STATISTIC(NumMismatchedProfile,106"Number of functions with CFG mismatched profile");107STATISTIC(NumMatchedProfile, "Number of functions with CFG matched profile");108STATISTIC(NumDuplicatedInlinesite,109"Number of inlined callsites with a partial distribution factor");110111STATISTIC(NumCSInlinedHitMinLimit,112"Number of functions with FDO inline stopped due to min size limit");113STATISTIC(NumCSInlinedHitMaxLimit,114"Number of functions with FDO inline stopped due to max size limit");115STATISTIC(116NumCSInlinedHitGrowthLimit,117"Number of functions with FDO inline stopped due to growth size limit");118119// Command line option to specify the file to read samples from. This is120// mainly used for debugging.121static cl::opt<std::string> SampleProfileFile(122"sample-profile-file", cl::init(""), cl::value_desc("filename"),123cl::desc("Profile file loaded by -sample-profile"), cl::Hidden);124125// The named file contains a set of transformations that may have been applied126// to the symbol names between the program from which the sample data was127// collected and the current program's symbols.128static cl::opt<std::string> SampleProfileRemappingFile(129"sample-profile-remapping-file", cl::init(""), cl::value_desc("filename"),130cl::desc("Profile remapping file loaded by -sample-profile"), cl::Hidden);131132cl::opt<bool> SalvageStaleProfile(133"salvage-stale-profile", cl::Hidden, cl::init(false),134cl::desc("Salvage stale profile by fuzzy matching and use the remapped "135"location for sample profile query."));136cl::opt<bool>137SalvageUnusedProfile("salvage-unused-profile", cl::Hidden, cl::init(false),138cl::desc("Salvage unused profile by matching with new "139"functions on call graph."));140141cl::opt<bool> ReportProfileStaleness(142"report-profile-staleness", cl::Hidden, cl::init(false),143cl::desc("Compute and report stale profile statistical metrics."));144145cl::opt<bool> PersistProfileStaleness(146"persist-profile-staleness", cl::Hidden, cl::init(false),147cl::desc("Compute stale profile statistical metrics and write it into the "148"native object file(.llvm_stats section)."));149150static cl::opt<bool> ProfileSampleAccurate(151"profile-sample-accurate", cl::Hidden, cl::init(false),152cl::desc("If the sample profile is accurate, we will mark all un-sampled "153"callsite and function as having 0 samples. Otherwise, treat "154"un-sampled callsites and functions conservatively as unknown. "));155156static cl::opt<bool> ProfileSampleBlockAccurate(157"profile-sample-block-accurate", cl::Hidden, cl::init(false),158cl::desc("If the sample profile is accurate, we will mark all un-sampled "159"branches and calls as having 0 samples. Otherwise, treat "160"them conservatively as unknown. "));161162static cl::opt<bool> ProfileAccurateForSymsInList(163"profile-accurate-for-symsinlist", cl::Hidden, cl::init(true),164cl::desc("For symbols in profile symbol list, regard their profiles to "165"be accurate. It may be overriden by profile-sample-accurate. "));166167static cl::opt<bool> ProfileMergeInlinee(168"sample-profile-merge-inlinee", cl::Hidden, cl::init(true),169cl::desc("Merge past inlinee's profile to outline version if sample "170"profile loader decided not to inline a call site. It will "171"only be enabled when top-down order of profile loading is "172"enabled. "));173174static cl::opt<bool> ProfileTopDownLoad(175"sample-profile-top-down-load", cl::Hidden, cl::init(true),176cl::desc("Do profile annotation and inlining for functions in top-down "177"order of call graph during sample profile loading. It only "178"works for new pass manager. "));179180static cl::opt<bool>181UseProfiledCallGraph("use-profiled-call-graph", cl::init(true), cl::Hidden,182cl::desc("Process functions in a top-down order "183"defined by the profiled call graph when "184"-sample-profile-top-down-load is on."));185186static cl::opt<bool> ProfileSizeInline(187"sample-profile-inline-size", cl::Hidden, cl::init(false),188cl::desc("Inline cold call sites in profile loader if it's beneficial "189"for code size."));190191// Since profiles are consumed by many passes, turning on this option has192// side effects. For instance, pre-link SCC inliner would see merged profiles193// and inline the hot functions (that are skipped in this pass).194static cl::opt<bool> DisableSampleLoaderInlining(195"disable-sample-loader-inlining", cl::Hidden, cl::init(false),196cl::desc("If true, artifically skip inline transformation in sample-loader "197"pass, and merge (or scale) profiles (as configured by "198"--sample-profile-merge-inlinee)."));199200namespace llvm {201cl::opt<bool>202SortProfiledSCC("sort-profiled-scc-member", cl::init(true), cl::Hidden,203cl::desc("Sort profiled recursion by edge weights."));204205cl::opt<int> ProfileInlineGrowthLimit(206"sample-profile-inline-growth-limit", cl::Hidden, cl::init(12),207cl::desc("The size growth ratio limit for proirity-based sample profile "208"loader inlining."));209210cl::opt<int> ProfileInlineLimitMin(211"sample-profile-inline-limit-min", cl::Hidden, cl::init(100),212cl::desc("The lower bound of size growth limit for "213"proirity-based sample profile loader inlining."));214215cl::opt<int> ProfileInlineLimitMax(216"sample-profile-inline-limit-max", cl::Hidden, cl::init(10000),217cl::desc("The upper bound of size growth limit for "218"proirity-based sample profile loader inlining."));219220cl::opt<int> SampleHotCallSiteThreshold(221"sample-profile-hot-inline-threshold", cl::Hidden, cl::init(3000),222cl::desc("Hot callsite threshold for proirity-based sample profile loader "223"inlining."));224225cl::opt<int> SampleColdCallSiteThreshold(226"sample-profile-cold-inline-threshold", cl::Hidden, cl::init(45),227cl::desc("Threshold for inlining cold callsites"));228} // namespace llvm229230static cl::opt<unsigned> ProfileICPRelativeHotness(231"sample-profile-icp-relative-hotness", cl::Hidden, cl::init(25),232cl::desc(233"Relative hotness percentage threshold for indirect "234"call promotion in proirity-based sample profile loader inlining."));235236static cl::opt<unsigned> ProfileICPRelativeHotnessSkip(237"sample-profile-icp-relative-hotness-skip", cl::Hidden, cl::init(1),238cl::desc(239"Skip relative hotness check for ICP up to given number of targets."));240241static cl::opt<unsigned> HotFuncCutoffForStalenessError(242"hot-func-cutoff-for-staleness-error", cl::Hidden, cl::init(800000),243cl::desc("A function is considered hot for staleness error check if its "244"total sample count is above the specified percentile"));245246static cl::opt<unsigned> MinfuncsForStalenessError(247"min-functions-for-staleness-error", cl::Hidden, cl::init(50),248cl::desc("Skip the check if the number of hot functions is smaller than "249"the specified number."));250251static cl::opt<unsigned> PrecentMismatchForStalenessError(252"precent-mismatch-for-staleness-error", cl::Hidden, cl::init(80),253cl::desc("Reject the profile if the mismatch percent is higher than the "254"given number."));255256static cl::opt<bool> CallsitePrioritizedInline(257"sample-profile-prioritized-inline", cl::Hidden,258cl::desc("Use call site prioritized inlining for sample profile loader."259"Currently only CSSPGO is supported."));260261static cl::opt<bool> UsePreInlinerDecision(262"sample-profile-use-preinliner", cl::Hidden,263cl::desc("Use the preinliner decisions stored in profile context."));264265static cl::opt<bool> AllowRecursiveInline(266"sample-profile-recursive-inline", cl::Hidden,267cl::desc("Allow sample loader inliner to inline recursive calls."));268269static cl::opt<bool> RemoveProbeAfterProfileAnnotation(270"sample-profile-remove-probe", cl::Hidden, cl::init(false),271cl::desc("Remove pseudo-probe after sample profile annotation."));272273static cl::opt<std::string> ProfileInlineReplayFile(274"sample-profile-inline-replay", cl::init(""), cl::value_desc("filename"),275cl::desc(276"Optimization remarks file containing inline remarks to be replayed "277"by inlining from sample profile loader."),278cl::Hidden);279280static cl::opt<ReplayInlinerSettings::Scope> ProfileInlineReplayScope(281"sample-profile-inline-replay-scope",282cl::init(ReplayInlinerSettings::Scope::Function),283cl::values(clEnumValN(ReplayInlinerSettings::Scope::Function, "Function",284"Replay on functions that have remarks associated "285"with them (default)"),286clEnumValN(ReplayInlinerSettings::Scope::Module, "Module",287"Replay on the entire module")),288cl::desc("Whether inline replay should be applied to the entire "289"Module or just the Functions (default) that are present as "290"callers in remarks during sample profile inlining."),291cl::Hidden);292293static cl::opt<ReplayInlinerSettings::Fallback> ProfileInlineReplayFallback(294"sample-profile-inline-replay-fallback",295cl::init(ReplayInlinerSettings::Fallback::Original),296cl::values(297clEnumValN(298ReplayInlinerSettings::Fallback::Original, "Original",299"All decisions not in replay send to original advisor (default)"),300clEnumValN(ReplayInlinerSettings::Fallback::AlwaysInline,301"AlwaysInline", "All decisions not in replay are inlined"),302clEnumValN(ReplayInlinerSettings::Fallback::NeverInline, "NeverInline",303"All decisions not in replay are not inlined")),304cl::desc("How sample profile inline replay treats sites that don't come "305"from the replay. Original: defers to original advisor, "306"AlwaysInline: inline all sites not in replay, NeverInline: "307"inline no sites not in replay"),308cl::Hidden);309310static cl::opt<CallSiteFormat::Format> ProfileInlineReplayFormat(311"sample-profile-inline-replay-format",312cl::init(CallSiteFormat::Format::LineColumnDiscriminator),313cl::values(314clEnumValN(CallSiteFormat::Format::Line, "Line", "<Line Number>"),315clEnumValN(CallSiteFormat::Format::LineColumn, "LineColumn",316"<Line Number>:<Column Number>"),317clEnumValN(CallSiteFormat::Format::LineDiscriminator,318"LineDiscriminator", "<Line Number>.<Discriminator>"),319clEnumValN(CallSiteFormat::Format::LineColumnDiscriminator,320"LineColumnDiscriminator",321"<Line Number>:<Column Number>.<Discriminator> (default)")),322cl::desc("How sample profile inline replay file is formatted"), cl::Hidden);323324static cl::opt<unsigned>325MaxNumPromotions("sample-profile-icp-max-prom", cl::init(3), cl::Hidden,326cl::desc("Max number of promotions for a single indirect "327"call callsite in sample profile loader"));328329static cl::opt<bool> OverwriteExistingWeights(330"overwrite-existing-weights", cl::Hidden, cl::init(false),331cl::desc("Ignore existing branch weights on IR and always overwrite."));332333static cl::opt<bool> AnnotateSampleProfileInlinePhase(334"annotate-sample-profile-inline-phase", cl::Hidden, cl::init(false),335cl::desc("Annotate LTO phase (prelink / postlink), or main (no LTO) for "336"sample-profile inline pass name."));337338namespace llvm {339extern cl::opt<bool> EnableExtTspBlockPlacement;340}341342namespace {343344using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>;345using EquivalenceClassMap = DenseMap<const BasicBlock *, const BasicBlock *>;346using Edge = std::pair<const BasicBlock *, const BasicBlock *>;347using EdgeWeightMap = DenseMap<Edge, uint64_t>;348using BlockEdgeMap =349DenseMap<const BasicBlock *, SmallVector<const BasicBlock *, 8>>;350351class GUIDToFuncNameMapper {352public:353GUIDToFuncNameMapper(Module &M, SampleProfileReader &Reader,354DenseMap<uint64_t, StringRef> &GUIDToFuncNameMap)355: CurrentReader(Reader), CurrentModule(M),356CurrentGUIDToFuncNameMap(GUIDToFuncNameMap) {357if (!CurrentReader.useMD5())358return;359360for (const auto &F : CurrentModule) {361StringRef OrigName = F.getName();362CurrentGUIDToFuncNameMap.insert(363{Function::getGUID(OrigName), OrigName});364365// Local to global var promotion used by optimization like thinlto366// will rename the var and add suffix like ".llvm.xxx" to the367// original local name. In sample profile, the suffixes of function368// names are all stripped. Since it is possible that the mapper is369// built in post-thin-link phase and var promotion has been done,370// we need to add the substring of function name without the suffix371// into the GUIDToFuncNameMap.372StringRef CanonName = FunctionSamples::getCanonicalFnName(F);373if (CanonName != OrigName)374CurrentGUIDToFuncNameMap.insert(375{Function::getGUID(CanonName), CanonName});376}377378// Update GUIDToFuncNameMap for each function including inlinees.379SetGUIDToFuncNameMapForAll(&CurrentGUIDToFuncNameMap);380}381382~GUIDToFuncNameMapper() {383if (!CurrentReader.useMD5())384return;385386CurrentGUIDToFuncNameMap.clear();387388// Reset GUIDToFuncNameMap for of each function as they're no389// longer valid at this point.390SetGUIDToFuncNameMapForAll(nullptr);391}392393private:394void SetGUIDToFuncNameMapForAll(DenseMap<uint64_t, StringRef> *Map) {395std::queue<FunctionSamples *> FSToUpdate;396for (auto &IFS : CurrentReader.getProfiles()) {397FSToUpdate.push(&IFS.second);398}399400while (!FSToUpdate.empty()) {401FunctionSamples *FS = FSToUpdate.front();402FSToUpdate.pop();403FS->GUIDToFuncNameMap = Map;404for (const auto &ICS : FS->getCallsiteSamples()) {405const FunctionSamplesMap &FSMap = ICS.second;406for (const auto &IFS : FSMap) {407FunctionSamples &FS = const_cast<FunctionSamples &>(IFS.second);408FSToUpdate.push(&FS);409}410}411}412}413414SampleProfileReader &CurrentReader;415Module &CurrentModule;416DenseMap<uint64_t, StringRef> &CurrentGUIDToFuncNameMap;417};418419// Inline candidate used by iterative callsite prioritized inliner420struct InlineCandidate {421CallBase *CallInstr;422const FunctionSamples *CalleeSamples;423// Prorated callsite count, which will be used to guide inlining. For example,424// if a callsite is duplicated in LTO prelink, then in LTO postlink the two425// copies will get their own distribution factors and their prorated counts426// will be used to decide if they should be inlined independently.427uint64_t CallsiteCount;428// Call site distribution factor to prorate the profile samples for a429// duplicated callsite. Default value is 1.0.430float CallsiteDistribution;431};432433// Inline candidate comparer using call site weight434struct CandidateComparer {435bool operator()(const InlineCandidate &LHS, const InlineCandidate &RHS) {436if (LHS.CallsiteCount != RHS.CallsiteCount)437return LHS.CallsiteCount < RHS.CallsiteCount;438439const FunctionSamples *LCS = LHS.CalleeSamples;440const FunctionSamples *RCS = RHS.CalleeSamples;441// In inline replay mode, CalleeSamples may be null and the order doesn't442// matter.443if (!LCS || !RCS)444return LCS;445446// Tie breaker using number of samples try to favor smaller functions first447if (LCS->getBodySamples().size() != RCS->getBodySamples().size())448return LCS->getBodySamples().size() > RCS->getBodySamples().size();449450// Tie breaker using GUID so we have stable/deterministic inlining order451return LCS->getGUID() < RCS->getGUID();452}453};454455using CandidateQueue =456PriorityQueue<InlineCandidate, std::vector<InlineCandidate>,457CandidateComparer>;458459/// Sample profile pass.460///461/// This pass reads profile data from the file specified by462/// -sample-profile-file and annotates every affected function with the463/// profile information found in that file.464class SampleProfileLoader final : public SampleProfileLoaderBaseImpl<Function> {465public:466SampleProfileLoader(467StringRef Name, StringRef RemapName, ThinOrFullLTOPhase LTOPhase,468IntrusiveRefCntPtr<vfs::FileSystem> FS,469std::function<AssumptionCache &(Function &)> GetAssumptionCache,470std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo,471std::function<const TargetLibraryInfo &(Function &)> GetTLI,472LazyCallGraph &CG)473: SampleProfileLoaderBaseImpl(std::string(Name), std::string(RemapName),474std::move(FS)),475GetAC(std::move(GetAssumptionCache)),476GetTTI(std::move(GetTargetTransformInfo)), GetTLI(std::move(GetTLI)),477CG(CG), LTOPhase(LTOPhase),478AnnotatedPassName(AnnotateSampleProfileInlinePhase479? llvm::AnnotateInlinePassName(InlineContext{480LTOPhase, InlinePass::SampleProfileInliner})481: CSINLINE_DEBUG) {}482483bool doInitialization(Module &M, FunctionAnalysisManager *FAM = nullptr);484bool runOnModule(Module &M, ModuleAnalysisManager *AM,485ProfileSummaryInfo *_PSI);486487protected:488bool runOnFunction(Function &F, ModuleAnalysisManager *AM);489bool emitAnnotations(Function &F);490ErrorOr<uint64_t> getInstWeight(const Instruction &I) override;491const FunctionSamples *findCalleeFunctionSamples(const CallBase &I) const;492const FunctionSamples *493findFunctionSamples(const Instruction &I) const override;494std::vector<const FunctionSamples *>495findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const;496void findExternalInlineCandidate(CallBase *CB, const FunctionSamples *Samples,497DenseSet<GlobalValue::GUID> &InlinedGUIDs,498uint64_t Threshold);499// Attempt to promote indirect call and also inline the promoted call500bool tryPromoteAndInlineCandidate(501Function &F, InlineCandidate &Candidate, uint64_t SumOrigin,502uint64_t &Sum, SmallVector<CallBase *, 8> *InlinedCallSites = nullptr);503504bool inlineHotFunctions(Function &F,505DenseSet<GlobalValue::GUID> &InlinedGUIDs);506std::optional<InlineCost> getExternalInlineAdvisorCost(CallBase &CB);507bool getExternalInlineAdvisorShouldInline(CallBase &CB);508InlineCost shouldInlineCandidate(InlineCandidate &Candidate);509bool getInlineCandidate(InlineCandidate *NewCandidate, CallBase *CB);510bool511tryInlineCandidate(InlineCandidate &Candidate,512SmallVector<CallBase *, 8> *InlinedCallSites = nullptr);513bool514inlineHotFunctionsWithPriority(Function &F,515DenseSet<GlobalValue::GUID> &InlinedGUIDs);516// Inline cold/small functions in addition to hot ones517bool shouldInlineColdCallee(CallBase &CallInst);518void emitOptimizationRemarksForInlineCandidates(519const SmallVectorImpl<CallBase *> &Candidates, const Function &F,520bool Hot);521void promoteMergeNotInlinedContextSamples(522MapVector<CallBase *, const FunctionSamples *> NonInlinedCallSites,523const Function &F);524std::vector<Function *> buildFunctionOrder(Module &M, LazyCallGraph &CG);525std::unique_ptr<ProfiledCallGraph> buildProfiledCallGraph(Module &M);526void generateMDProfMetadata(Function &F);527bool rejectHighStalenessProfile(Module &M, ProfileSummaryInfo *PSI,528const SampleProfileMap &Profiles);529void removePseudoProbeInsts(Module &M);530531/// Map from function name to Function *. Used to find the function from532/// the function name. If the function name contains suffix, additional533/// entry is added to map from the stripped name to the function if there534/// is one-to-one mapping.535HashKeyMap<std::unordered_map, FunctionId, Function *> SymbolMap;536537/// Map from function name to profile name generated by call-graph based538/// profile fuzzy matching(--salvage-unused-profile).539HashKeyMap<std::unordered_map, FunctionId, FunctionId> FuncNameToProfNameMap;540541std::function<AssumptionCache &(Function &)> GetAC;542std::function<TargetTransformInfo &(Function &)> GetTTI;543std::function<const TargetLibraryInfo &(Function &)> GetTLI;544LazyCallGraph &CG;545546/// Profile tracker for different context.547std::unique_ptr<SampleContextTracker> ContextTracker;548549/// Flag indicating which LTO/ThinLTO phase the pass is invoked in.550///551/// We need to know the LTO phase because for example in ThinLTOPrelink552/// phase, in annotation, we should not promote indirect calls. Instead,553/// we will mark GUIDs that needs to be annotated to the function.554const ThinOrFullLTOPhase LTOPhase;555const std::string AnnotatedPassName;556557/// Profle Symbol list tells whether a function name appears in the binary558/// used to generate the current profile.559std::shared_ptr<ProfileSymbolList> PSL;560561/// Total number of samples collected in this profile.562///563/// This is the sum of all the samples collected in all the functions executed564/// at runtime.565uint64_t TotalCollectedSamples = 0;566567// Information recorded when we declined to inline a call site568// because we have determined it is too cold is accumulated for569// each callee function. Initially this is just the entry count.570struct NotInlinedProfileInfo {571uint64_t entryCount;572};573DenseMap<Function *, NotInlinedProfileInfo> notInlinedCallInfo;574575// GUIDToFuncNameMap saves the mapping from GUID to the symbol name, for576// all the function symbols defined or declared in current module.577DenseMap<uint64_t, StringRef> GUIDToFuncNameMap;578579// All the Names used in FunctionSamples including outline function580// names, inline instance names and call target names.581StringSet<> NamesInProfile;582// MD5 version of NamesInProfile. Either NamesInProfile or GUIDsInProfile is583// populated, depends on whether the profile uses MD5. Because the name table584// generally contains several magnitude more entries than the number of585// functions, we do not want to convert all names from one form to another.586llvm::DenseSet<uint64_t> GUIDsInProfile;587588// For symbol in profile symbol list, whether to regard their profiles589// to be accurate. It is mainly decided by existance of profile symbol590// list and -profile-accurate-for-symsinlist flag, but it can be591// overriden by -profile-sample-accurate or profile-sample-accurate592// attribute.593bool ProfAccForSymsInList;594595// External inline advisor used to replay inline decision from remarks.596std::unique_ptr<InlineAdvisor> ExternalInlineAdvisor;597598// A helper to implement the sample profile matching algorithm.599std::unique_ptr<SampleProfileMatcher> MatchingManager;600601private:602const char *getAnnotatedRemarkPassName() const {603return AnnotatedPassName.c_str();604}605};606} // end anonymous namespace607608namespace llvm {609template <>610inline bool SampleProfileInference<Function>::isExit(const BasicBlock *BB) {611return succ_empty(BB);612}613614template <>615inline void SampleProfileInference<Function>::findUnlikelyJumps(616const std::vector<const BasicBlockT *> &BasicBlocks,617BlockEdgeMap &Successors, FlowFunction &Func) {618for (auto &Jump : Func.Jumps) {619const auto *BB = BasicBlocks[Jump.Source];620const auto *Succ = BasicBlocks[Jump.Target];621const Instruction *TI = BB->getTerminator();622// Check if a block ends with InvokeInst and mark non-taken branch unlikely.623// In that case block Succ should be a landing pad624if (Successors[BB].size() == 2 && Successors[BB].back() == Succ) {625if (isa<InvokeInst>(TI)) {626Jump.IsUnlikely = true;627}628}629const Instruction *SuccTI = Succ->getTerminator();630// Check if the target block contains UnreachableInst and mark it unlikely631if (SuccTI->getNumSuccessors() == 0) {632if (isa<UnreachableInst>(SuccTI)) {633Jump.IsUnlikely = true;634}635}636}637}638639template <>640void SampleProfileLoaderBaseImpl<Function>::computeDominanceAndLoopInfo(641Function &F) {642DT.reset(new DominatorTree);643DT->recalculate(F);644645PDT.reset(new PostDominatorTree(F));646647LI.reset(new LoopInfo);648LI->analyze(*DT);649}650} // namespace llvm651652ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) {653if (FunctionSamples::ProfileIsProbeBased)654return getProbeWeight(Inst);655656const DebugLoc &DLoc = Inst.getDebugLoc();657if (!DLoc)658return std::error_code();659660// Ignore all intrinsics, phinodes and branch instructions.661// Branch and phinodes instruction usually contains debug info from sources662// outside of the residing basic block, thus we ignore them during annotation.663if (isa<BranchInst>(Inst) || isa<IntrinsicInst>(Inst) || isa<PHINode>(Inst))664return std::error_code();665666// For non-CS profile, if a direct call/invoke instruction is inlined in667// profile (findCalleeFunctionSamples returns non-empty result), but not668// inlined here, it means that the inlined callsite has no sample, thus the669// call instruction should have 0 count.670// For CS profile, the callsite count of previously inlined callees is671// populated with the entry count of the callees.672if (!FunctionSamples::ProfileIsCS)673if (const auto *CB = dyn_cast<CallBase>(&Inst))674if (!CB->isIndirectCall() && findCalleeFunctionSamples(*CB))675return 0;676677return getInstWeightImpl(Inst);678}679680/// Get the FunctionSamples for a call instruction.681///682/// The FunctionSamples of a call/invoke instruction \p Inst is the inlined683/// instance in which that call instruction is calling to. It contains684/// all samples that resides in the inlined instance. We first find the685/// inlined instance in which the call instruction is from, then we686/// traverse its children to find the callsite with the matching687/// location.688///689/// \param Inst Call/Invoke instruction to query.690///691/// \returns The FunctionSamples pointer to the inlined instance.692const FunctionSamples *693SampleProfileLoader::findCalleeFunctionSamples(const CallBase &Inst) const {694const DILocation *DIL = Inst.getDebugLoc();695if (!DIL) {696return nullptr;697}698699StringRef CalleeName;700if (Function *Callee = Inst.getCalledFunction())701CalleeName = Callee->getName();702703if (FunctionSamples::ProfileIsCS)704return ContextTracker->getCalleeContextSamplesFor(Inst, CalleeName);705706const FunctionSamples *FS = findFunctionSamples(Inst);707if (FS == nullptr)708return nullptr;709710return FS->findFunctionSamplesAt(FunctionSamples::getCallSiteIdentifier(DIL),711CalleeName, Reader->getRemapper(),712&FuncNameToProfNameMap);713}714715/// Returns a vector of FunctionSamples that are the indirect call targets716/// of \p Inst. The vector is sorted by the total number of samples. Stores717/// the total call count of the indirect call in \p Sum.718std::vector<const FunctionSamples *>719SampleProfileLoader::findIndirectCallFunctionSamples(720const Instruction &Inst, uint64_t &Sum) const {721const DILocation *DIL = Inst.getDebugLoc();722std::vector<const FunctionSamples *> R;723724if (!DIL) {725return R;726}727728auto FSCompare = [](const FunctionSamples *L, const FunctionSamples *R) {729assert(L && R && "Expect non-null FunctionSamples");730if (L->getHeadSamplesEstimate() != R->getHeadSamplesEstimate())731return L->getHeadSamplesEstimate() > R->getHeadSamplesEstimate();732return L->getGUID() < R->getGUID();733};734735if (FunctionSamples::ProfileIsCS) {736auto CalleeSamples =737ContextTracker->getIndirectCalleeContextSamplesFor(DIL);738if (CalleeSamples.empty())739return R;740741// For CSSPGO, we only use target context profile's entry count742// as that already includes both inlined callee and non-inlined ones..743Sum = 0;744for (const auto *const FS : CalleeSamples) {745Sum += FS->getHeadSamplesEstimate();746R.push_back(FS);747}748llvm::sort(R, FSCompare);749return R;750}751752const FunctionSamples *FS = findFunctionSamples(Inst);753if (FS == nullptr)754return R;755756auto CallSite = FunctionSamples::getCallSiteIdentifier(DIL);757Sum = 0;758if (auto T = FS->findCallTargetMapAt(CallSite))759for (const auto &T_C : *T)760Sum += T_C.second;761if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt(CallSite)) {762if (M->empty())763return R;764for (const auto &NameFS : *M) {765Sum += NameFS.second.getHeadSamplesEstimate();766R.push_back(&NameFS.second);767}768llvm::sort(R, FSCompare);769}770return R;771}772773const FunctionSamples *774SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {775if (FunctionSamples::ProfileIsProbeBased) {776std::optional<PseudoProbe> Probe = extractProbe(Inst);777if (!Probe)778return nullptr;779}780781const DILocation *DIL = Inst.getDebugLoc();782if (!DIL)783return Samples;784785auto it = DILocation2SampleMap.try_emplace(DIL,nullptr);786if (it.second) {787if (FunctionSamples::ProfileIsCS)788it.first->second = ContextTracker->getContextSamplesFor(DIL);789else790it.first->second = Samples->findFunctionSamples(791DIL, Reader->getRemapper(), &FuncNameToProfNameMap);792}793return it.first->second;794}795796/// Check whether the indirect call promotion history of \p Inst allows797/// the promotion for \p Candidate.798/// If the profile count for the promotion candidate \p Candidate is799/// NOMORE_ICP_MAGICNUM, it means \p Candidate has already been promoted800/// for \p Inst. If we already have at least MaxNumPromotions801/// NOMORE_ICP_MAGICNUM count values in the value profile of \p Inst, we802/// cannot promote for \p Inst anymore.803static bool doesHistoryAllowICP(const Instruction &Inst, StringRef Candidate) {804uint64_t TotalCount = 0;805auto ValueData = getValueProfDataFromInst(Inst, IPVK_IndirectCallTarget,806MaxNumPromotions, TotalCount, true);807// No valid value profile so no promoted targets have been recorded808// before. Ok to do ICP.809if (ValueData.empty())810return true;811812unsigned NumPromoted = 0;813for (const auto &V : ValueData) {814if (V.Count != NOMORE_ICP_MAGICNUM)815continue;816817// If the promotion candidate has NOMORE_ICP_MAGICNUM count in the818// metadata, it means the candidate has been promoted for this819// indirect call.820if (V.Value == Function::getGUID(Candidate))821return false;822NumPromoted++;823// If already have MaxNumPromotions promotion, don't do it anymore.824if (NumPromoted == MaxNumPromotions)825return false;826}827return true;828}829830/// Update indirect call target profile metadata for \p Inst.831/// Usually \p Sum is the sum of counts of all the targets for \p Inst.832/// If it is 0, it means updateIDTMetaData is used to mark a833/// certain target to be promoted already. If it is not zero,834/// we expect to use it to update the total count in the value profile.835static void836updateIDTMetaData(Instruction &Inst,837const SmallVectorImpl<InstrProfValueData> &CallTargets,838uint64_t Sum) {839// Bail out early if MaxNumPromotions is zero.840// This prevents allocating an array of zero length below.841//842// Note `updateIDTMetaData` is called in two places so check843// `MaxNumPromotions` inside it.844if (MaxNumPromotions == 0)845return;846// OldSum is the existing total count in the value profile data.847uint64_t OldSum = 0;848auto ValueData = getValueProfDataFromInst(Inst, IPVK_IndirectCallTarget,849MaxNumPromotions, OldSum, true);850851DenseMap<uint64_t, uint64_t> ValueCountMap;852if (Sum == 0) {853assert((CallTargets.size() == 1 &&854CallTargets[0].Count == NOMORE_ICP_MAGICNUM) &&855"If sum is 0, assume only one element in CallTargets "856"with count being NOMORE_ICP_MAGICNUM");857// Initialize ValueCountMap with existing value profile data.858for (const auto &V : ValueData)859ValueCountMap[V.Value] = V.Count;860auto Pair =861ValueCountMap.try_emplace(CallTargets[0].Value, CallTargets[0].Count);862// If the target already exists in value profile, decrease the total863// count OldSum and reset the target's count to NOMORE_ICP_MAGICNUM.864if (!Pair.second) {865OldSum -= Pair.first->second;866Pair.first->second = NOMORE_ICP_MAGICNUM;867}868Sum = OldSum;869} else {870// Initialize ValueCountMap with existing NOMORE_ICP_MAGICNUM871// counts in the value profile.872for (const auto &V : ValueData) {873if (V.Count == NOMORE_ICP_MAGICNUM)874ValueCountMap[V.Value] = V.Count;875}876877for (const auto &Data : CallTargets) {878auto Pair = ValueCountMap.try_emplace(Data.Value, Data.Count);879if (Pair.second)880continue;881// The target represented by Data.Value has already been promoted.882// Keep the count as NOMORE_ICP_MAGICNUM in the profile and decrease883// Sum by Data.Count.884assert(Sum >= Data.Count && "Sum should never be less than Data.Count");885Sum -= Data.Count;886}887}888889SmallVector<InstrProfValueData, 8> NewCallTargets;890for (const auto &ValueCount : ValueCountMap) {891NewCallTargets.emplace_back(892InstrProfValueData{ValueCount.first, ValueCount.second});893}894895llvm::sort(NewCallTargets,896[](const InstrProfValueData &L, const InstrProfValueData &R) {897if (L.Count != R.Count)898return L.Count > R.Count;899return L.Value > R.Value;900});901902uint32_t MaxMDCount =903std::min(NewCallTargets.size(), static_cast<size_t>(MaxNumPromotions));904annotateValueSite(*Inst.getParent()->getParent()->getParent(), Inst,905NewCallTargets, Sum, IPVK_IndirectCallTarget, MaxMDCount);906}907908/// Attempt to promote indirect call and also inline the promoted call.909///910/// \param F Caller function.911/// \param Candidate ICP and inline candidate.912/// \param SumOrigin Original sum of target counts for indirect call before913/// promoting given candidate.914/// \param Sum Prorated sum of remaining target counts for indirect call915/// after promoting given candidate.916/// \param InlinedCallSite Output vector for new call sites exposed after917/// inlining.918bool SampleProfileLoader::tryPromoteAndInlineCandidate(919Function &F, InlineCandidate &Candidate, uint64_t SumOrigin, uint64_t &Sum,920SmallVector<CallBase *, 8> *InlinedCallSite) {921// Bail out early if sample-loader inliner is disabled.922if (DisableSampleLoaderInlining)923return false;924925// Bail out early if MaxNumPromotions is zero.926// This prevents allocating an array of zero length in callees below.927if (MaxNumPromotions == 0)928return false;929auto CalleeFunctionName = Candidate.CalleeSamples->getFunction();930auto R = SymbolMap.find(CalleeFunctionName);931if (R == SymbolMap.end() || !R->second)932return false;933934auto &CI = *Candidate.CallInstr;935if (!doesHistoryAllowICP(CI, R->second->getName()))936return false;937938const char *Reason = "Callee function not available";939// R->getValue() != &F is to prevent promoting a recursive call.940// If it is a recursive call, we do not inline it as it could bloat941// the code exponentially. There is way to better handle this, e.g.942// clone the caller first, and inline the cloned caller if it is943// recursive. As llvm does not inline recursive calls, we will944// simply ignore it instead of handling it explicitly.945if (!R->second->isDeclaration() && R->second->getSubprogram() &&946R->second->hasFnAttribute("use-sample-profile") &&947R->second != &F && isLegalToPromote(CI, R->second, &Reason)) {948// For promoted target, set its value with NOMORE_ICP_MAGICNUM count949// in the value profile metadata so the target won't be promoted again.950SmallVector<InstrProfValueData, 1> SortedCallTargets = {InstrProfValueData{951Function::getGUID(R->second->getName()), NOMORE_ICP_MAGICNUM}};952updateIDTMetaData(CI, SortedCallTargets, 0);953954auto *DI = &pgo::promoteIndirectCall(955CI, R->second, Candidate.CallsiteCount, Sum, false, ORE);956if (DI) {957Sum -= Candidate.CallsiteCount;958// Do not prorate the indirect callsite distribution since the original959// distribution will be used to scale down non-promoted profile target960// counts later. By doing this we lose track of the real callsite count961// for the leftover indirect callsite as a trade off for accurate call962// target counts.963// TODO: Ideally we would have two separate factors, one for call site964// counts and one is used to prorate call target counts.965// Do not update the promoted direct callsite distribution at this966// point since the original distribution combined with the callee profile967// will be used to prorate callsites from the callee if inlined. Once not968// inlined, the direct callsite distribution should be prorated so that969// the it will reflect the real callsite counts.970Candidate.CallInstr = DI;971if (isa<CallInst>(DI) || isa<InvokeInst>(DI)) {972bool Inlined = tryInlineCandidate(Candidate, InlinedCallSite);973if (!Inlined) {974// Prorate the direct callsite distribution so that it reflects real975// callsite counts.976setProbeDistributionFactor(977*DI, static_cast<float>(Candidate.CallsiteCount) / SumOrigin);978}979return Inlined;980}981}982} else {983LLVM_DEBUG(dbgs() << "\nFailed to promote indirect call to "984<< FunctionSamples::getCanonicalFnName(985Candidate.CallInstr->getName())<< " because "986<< Reason << "\n");987}988return false;989}990991bool SampleProfileLoader::shouldInlineColdCallee(CallBase &CallInst) {992if (!ProfileSizeInline)993return false;994995Function *Callee = CallInst.getCalledFunction();996if (Callee == nullptr)997return false;998999InlineCost Cost = getInlineCost(CallInst, getInlineParams(), GetTTI(*Callee),1000GetAC, GetTLI);10011002if (Cost.isNever())1003return false;10041005if (Cost.isAlways())1006return true;10071008return Cost.getCost() <= SampleColdCallSiteThreshold;1009}10101011void SampleProfileLoader::emitOptimizationRemarksForInlineCandidates(1012const SmallVectorImpl<CallBase *> &Candidates, const Function &F,1013bool Hot) {1014for (auto *I : Candidates) {1015Function *CalledFunction = I->getCalledFunction();1016if (CalledFunction) {1017ORE->emit(OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(),1018"InlineAttempt", I->getDebugLoc(),1019I->getParent())1020<< "previous inlining reattempted for "1021<< (Hot ? "hotness: '" : "size: '")1022<< ore::NV("Callee", CalledFunction) << "' into '"1023<< ore::NV("Caller", &F) << "'");1024}1025}1026}10271028void SampleProfileLoader::findExternalInlineCandidate(1029CallBase *CB, const FunctionSamples *Samples,1030DenseSet<GlobalValue::GUID> &InlinedGUIDs, uint64_t Threshold) {10311032// If ExternalInlineAdvisor(ReplayInlineAdvisor) wants to inline an external1033// function make sure it's imported1034if (CB && getExternalInlineAdvisorShouldInline(*CB)) {1035// Samples may not exist for replayed function, if so1036// just add the direct GUID and move on1037if (!Samples) {1038InlinedGUIDs.insert(1039Function::getGUID(CB->getCalledFunction()->getName()));1040return;1041}1042// Otherwise, drop the threshold to import everything that we can1043Threshold = 0;1044}10451046// In some rare cases, call instruction could be changed after being pushed1047// into inline candidate queue, this is because earlier inlining may expose1048// constant propagation which can change indirect call to direct call. When1049// this happens, we may fail to find matching function samples for the1050// candidate later, even if a match was found when the candidate was enqueued.1051if (!Samples)1052return;10531054// For AutoFDO profile, retrieve candidate profiles by walking over1055// the nested inlinee profiles.1056if (!FunctionSamples::ProfileIsCS) {1057// Set threshold to zero to honor pre-inliner decision.1058if (UsePreInlinerDecision)1059Threshold = 0;1060Samples->findInlinedFunctions(InlinedGUIDs, SymbolMap, Threshold);1061return;1062}10631064ContextTrieNode *Caller = ContextTracker->getContextNodeForProfile(Samples);1065std::queue<ContextTrieNode *> CalleeList;1066CalleeList.push(Caller);1067while (!CalleeList.empty()) {1068ContextTrieNode *Node = CalleeList.front();1069CalleeList.pop();1070FunctionSamples *CalleeSample = Node->getFunctionSamples();1071// For CSSPGO profile, retrieve candidate profile by walking over the1072// trie built for context profile. Note that also take call targets1073// even if callee doesn't have a corresponding context profile.1074if (!CalleeSample)1075continue;10761077// If pre-inliner decision is used, honor that for importing as well.1078bool PreInline =1079UsePreInlinerDecision &&1080CalleeSample->getContext().hasAttribute(ContextShouldBeInlined);1081if (!PreInline && CalleeSample->getHeadSamplesEstimate() < Threshold)1082continue;10831084Function *Func = SymbolMap.lookup(CalleeSample->getFunction());1085// Add to the import list only when it's defined out of module.1086if (!Func || Func->isDeclaration())1087InlinedGUIDs.insert(CalleeSample->getGUID());10881089// Import hot CallTargets, which may not be available in IR because full1090// profile annotation cannot be done until backend compilation in ThinLTO.1091for (const auto &BS : CalleeSample->getBodySamples())1092for (const auto &TS : BS.second.getCallTargets())1093if (TS.second > Threshold) {1094const Function *Callee = SymbolMap.lookup(TS.first);1095if (!Callee || Callee->isDeclaration())1096InlinedGUIDs.insert(TS.first.getHashCode());1097}10981099// Import hot child context profile associted with callees. Note that this1100// may have some overlap with the call target loop above, but doing this1101// based child context profile again effectively allow us to use the max of1102// entry count and call target count to determine importing.1103for (auto &Child : Node->getAllChildContext()) {1104ContextTrieNode *CalleeNode = &Child.second;1105CalleeList.push(CalleeNode);1106}1107}1108}11091110/// Iteratively inline hot callsites of a function.1111///1112/// Iteratively traverse all callsites of the function \p F, so as to1113/// find out callsites with corresponding inline instances.1114///1115/// For such callsites,1116/// - If it is hot enough, inline the callsites and adds callsites of the callee1117/// into the caller. If the call is an indirect call, first promote1118/// it to direct call. Each indirect call is limited with a single target.1119///1120/// - If a callsite is not inlined, merge the its profile to the outline1121/// version (if --sample-profile-merge-inlinee is true), or scale the1122/// counters of standalone function based on the profile of inlined1123/// instances (if --sample-profile-merge-inlinee is false).1124///1125/// Later passes may consume the updated profiles.1126///1127/// \param F function to perform iterative inlining.1128/// \param InlinedGUIDs a set to be updated to include all GUIDs that are1129/// inlined in the profiled binary.1130///1131/// \returns True if there is any inline happened.1132bool SampleProfileLoader::inlineHotFunctions(1133Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {1134// ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure1135// Profile symbol list is ignored when profile-sample-accurate is on.1136assert((!ProfAccForSymsInList ||1137(!ProfileSampleAccurate &&1138!F.hasFnAttribute("profile-sample-accurate"))) &&1139"ProfAccForSymsInList should be false when profile-sample-accurate "1140"is enabled");11411142MapVector<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites;1143bool Changed = false;1144bool LocalChanged = true;1145while (LocalChanged) {1146LocalChanged = false;1147SmallVector<CallBase *, 10> CIS;1148for (auto &BB : F) {1149bool Hot = false;1150SmallVector<CallBase *, 10> AllCandidates;1151SmallVector<CallBase *, 10> ColdCandidates;1152for (auto &I : BB) {1153const FunctionSamples *FS = nullptr;1154if (auto *CB = dyn_cast<CallBase>(&I)) {1155if (!isa<IntrinsicInst>(I)) {1156if ((FS = findCalleeFunctionSamples(*CB))) {1157assert((!FunctionSamples::UseMD5 || FS->GUIDToFuncNameMap) &&1158"GUIDToFuncNameMap has to be populated");1159AllCandidates.push_back(CB);1160if (FS->getHeadSamplesEstimate() > 0 ||1161FunctionSamples::ProfileIsCS)1162LocalNotInlinedCallSites.insert({CB, FS});1163if (callsiteIsHot(FS, PSI, ProfAccForSymsInList))1164Hot = true;1165else if (shouldInlineColdCallee(*CB))1166ColdCandidates.push_back(CB);1167} else if (getExternalInlineAdvisorShouldInline(*CB)) {1168AllCandidates.push_back(CB);1169}1170}1171}1172}1173if (Hot || ExternalInlineAdvisor) {1174CIS.insert(CIS.begin(), AllCandidates.begin(), AllCandidates.end());1175emitOptimizationRemarksForInlineCandidates(AllCandidates, F, true);1176} else {1177CIS.insert(CIS.begin(), ColdCandidates.begin(), ColdCandidates.end());1178emitOptimizationRemarksForInlineCandidates(ColdCandidates, F, false);1179}1180}1181for (CallBase *I : CIS) {1182Function *CalledFunction = I->getCalledFunction();1183InlineCandidate Candidate = {I, LocalNotInlinedCallSites.lookup(I),11840 /* dummy count */,11851.0 /* dummy distribution factor */};1186// Do not inline recursive calls.1187if (CalledFunction == &F)1188continue;1189if (I->isIndirectCall()) {1190uint64_t Sum;1191for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) {1192uint64_t SumOrigin = Sum;1193if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {1194findExternalInlineCandidate(I, FS, InlinedGUIDs,1195PSI->getOrCompHotCountThreshold());1196continue;1197}1198if (!callsiteIsHot(FS, PSI, ProfAccForSymsInList))1199continue;12001201Candidate = {I, FS, FS->getHeadSamplesEstimate(), 1.0};1202if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum)) {1203LocalNotInlinedCallSites.erase(I);1204LocalChanged = true;1205}1206}1207} else if (CalledFunction && CalledFunction->getSubprogram() &&1208!CalledFunction->isDeclaration()) {1209if (tryInlineCandidate(Candidate)) {1210LocalNotInlinedCallSites.erase(I);1211LocalChanged = true;1212}1213} else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {1214findExternalInlineCandidate(I, findCalleeFunctionSamples(*I),1215InlinedGUIDs,1216PSI->getOrCompHotCountThreshold());1217}1218}1219Changed |= LocalChanged;1220}12211222// For CS profile, profile for not inlined context will be merged when1223// base profile is being retrieved.1224if (!FunctionSamples::ProfileIsCS)1225promoteMergeNotInlinedContextSamples(LocalNotInlinedCallSites, F);1226return Changed;1227}12281229bool SampleProfileLoader::tryInlineCandidate(1230InlineCandidate &Candidate, SmallVector<CallBase *, 8> *InlinedCallSites) {1231// Do not attempt to inline a candidate if1232// --disable-sample-loader-inlining is true.1233if (DisableSampleLoaderInlining)1234return false;12351236CallBase &CB = *Candidate.CallInstr;1237Function *CalledFunction = CB.getCalledFunction();1238assert(CalledFunction && "Expect a callee with definition");1239DebugLoc DLoc = CB.getDebugLoc();1240BasicBlock *BB = CB.getParent();12411242InlineCost Cost = shouldInlineCandidate(Candidate);1243if (Cost.isNever()) {1244ORE->emit(OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(),1245"InlineFail", DLoc, BB)1246<< "incompatible inlining");1247return false;1248}12491250if (!Cost)1251return false;12521253InlineFunctionInfo IFI(GetAC);1254IFI.UpdateProfile = false;1255InlineResult IR = InlineFunction(CB, IFI,1256/*MergeAttributes=*/true);1257if (!IR.isSuccess())1258return false;12591260// The call to InlineFunction erases I, so we can't pass it here.1261emitInlinedIntoBasedOnCost(*ORE, DLoc, BB, *CalledFunction, *BB->getParent(),1262Cost, true, getAnnotatedRemarkPassName());12631264// Now populate the list of newly exposed call sites.1265if (InlinedCallSites) {1266InlinedCallSites->clear();1267for (auto &I : IFI.InlinedCallSites)1268InlinedCallSites->push_back(I);1269}12701271if (FunctionSamples::ProfileIsCS)1272ContextTracker->markContextSamplesInlined(Candidate.CalleeSamples);1273++NumCSInlined;12741275// Prorate inlined probes for a duplicated inlining callsite which probably1276// has a distribution less than 100%. Samples for an inlinee should be1277// distributed among the copies of the original callsite based on each1278// callsite's distribution factor for counts accuracy. Note that an inlined1279// probe may come with its own distribution factor if it has been duplicated1280// in the inlinee body. The two factor are multiplied to reflect the1281// aggregation of duplication.1282if (Candidate.CallsiteDistribution < 1) {1283for (auto &I : IFI.InlinedCallSites) {1284if (std::optional<PseudoProbe> Probe = extractProbe(*I))1285setProbeDistributionFactor(*I, Probe->Factor *1286Candidate.CallsiteDistribution);1287}1288NumDuplicatedInlinesite++;1289}12901291return true;1292}12931294bool SampleProfileLoader::getInlineCandidate(InlineCandidate *NewCandidate,1295CallBase *CB) {1296assert(CB && "Expect non-null call instruction");12971298if (isa<IntrinsicInst>(CB))1299return false;13001301// Find the callee's profile. For indirect call, find hottest target profile.1302const FunctionSamples *CalleeSamples = findCalleeFunctionSamples(*CB);1303// If ExternalInlineAdvisor wants to inline this site, do so even1304// if Samples are not present.1305if (!CalleeSamples && !getExternalInlineAdvisorShouldInline(*CB))1306return false;13071308float Factor = 1.0;1309if (std::optional<PseudoProbe> Probe = extractProbe(*CB))1310Factor = Probe->Factor;13111312uint64_t CallsiteCount =1313CalleeSamples ? CalleeSamples->getHeadSamplesEstimate() * Factor : 0;1314*NewCandidate = {CB, CalleeSamples, CallsiteCount, Factor};1315return true;1316}13171318std::optional<InlineCost>1319SampleProfileLoader::getExternalInlineAdvisorCost(CallBase &CB) {1320std::unique_ptr<InlineAdvice> Advice = nullptr;1321if (ExternalInlineAdvisor) {1322Advice = ExternalInlineAdvisor->getAdvice(CB);1323if (Advice) {1324if (!Advice->isInliningRecommended()) {1325Advice->recordUnattemptedInlining();1326return InlineCost::getNever("not previously inlined");1327}1328Advice->recordInlining();1329return InlineCost::getAlways("previously inlined");1330}1331}13321333return {};1334}13351336bool SampleProfileLoader::getExternalInlineAdvisorShouldInline(CallBase &CB) {1337std::optional<InlineCost> Cost = getExternalInlineAdvisorCost(CB);1338return Cost ? !!*Cost : false;1339}13401341InlineCost1342SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) {1343if (std::optional<InlineCost> ReplayCost =1344getExternalInlineAdvisorCost(*Candidate.CallInstr))1345return *ReplayCost;1346// Adjust threshold based on call site hotness, only do this for callsite1347// prioritized inliner because otherwise cost-benefit check is done earlier.1348int SampleThreshold = SampleColdCallSiteThreshold;1349if (CallsitePrioritizedInline) {1350if (Candidate.CallsiteCount > PSI->getHotCountThreshold())1351SampleThreshold = SampleHotCallSiteThreshold;1352else if (!ProfileSizeInline)1353return InlineCost::getNever("cold callsite");1354}13551356Function *Callee = Candidate.CallInstr->getCalledFunction();1357assert(Callee && "Expect a definition for inline candidate of direct call");13581359InlineParams Params = getInlineParams();1360// We will ignore the threshold from inline cost, so always get full cost.1361Params.ComputeFullInlineCost = true;1362Params.AllowRecursiveCall = AllowRecursiveInline;1363// Checks if there is anything in the reachable portion of the callee at1364// this callsite that makes this inlining potentially illegal. Need to1365// set ComputeFullInlineCost, otherwise getInlineCost may return early1366// when cost exceeds threshold without checking all IRs in the callee.1367// The acutal cost does not matter because we only checks isNever() to1368// see if it is legal to inline the callsite.1369InlineCost Cost = getInlineCost(*Candidate.CallInstr, Callee, Params,1370GetTTI(*Callee), GetAC, GetTLI);13711372// Honor always inline and never inline from call analyzer1373if (Cost.isNever() || Cost.isAlways())1374return Cost;13751376// With CSSPGO, the preinliner in llvm-profgen can estimate global inline1377// decisions based on hotness as well as accurate function byte sizes for1378// given context using function/inlinee sizes from previous build. It1379// stores the decision in profile, and also adjust/merge context profile1380// aiming at better context-sensitive post-inline profile quality, assuming1381// all inline decision estimates are going to be honored by compiler. Here1382// we replay that inline decision under `sample-profile-use-preinliner`.1383// Note that we don't need to handle negative decision from preinliner as1384// context profile for not inlined calls are merged by preinliner already.1385if (UsePreInlinerDecision && Candidate.CalleeSamples) {1386// Once two node are merged due to promotion, we're losing some context1387// so the original context-sensitive preinliner decision should be ignored1388// for SyntheticContext.1389SampleContext &Context = Candidate.CalleeSamples->getContext();1390if (!Context.hasState(SyntheticContext) &&1391Context.hasAttribute(ContextShouldBeInlined))1392return InlineCost::getAlways("preinliner");1393}13941395// For old FDO inliner, we inline the call site if it is below hot threshold,1396// even if the function is hot based on sample profile data. This is to1397// prevent huge functions from being inlined.1398if (!CallsitePrioritizedInline) {1399return InlineCost::get(Cost.getCost(), SampleHotCallSiteThreshold);1400}14011402// Otherwise only use the cost from call analyzer, but overwite threshold with1403// Sample PGO threshold.1404return InlineCost::get(Cost.getCost(), SampleThreshold);1405}14061407bool SampleProfileLoader::inlineHotFunctionsWithPriority(1408Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {1409// ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure1410// Profile symbol list is ignored when profile-sample-accurate is on.1411assert((!ProfAccForSymsInList ||1412(!ProfileSampleAccurate &&1413!F.hasFnAttribute("profile-sample-accurate"))) &&1414"ProfAccForSymsInList should be false when profile-sample-accurate "1415"is enabled");14161417// Populating worklist with initial call sites from root inliner, along1418// with call site weights.1419CandidateQueue CQueue;1420InlineCandidate NewCandidate;1421for (auto &BB : F) {1422for (auto &I : BB) {1423auto *CB = dyn_cast<CallBase>(&I);1424if (!CB)1425continue;1426if (getInlineCandidate(&NewCandidate, CB))1427CQueue.push(NewCandidate);1428}1429}14301431// Cap the size growth from profile guided inlining. This is needed even1432// though cost of each inline candidate already accounts for callee size,1433// because with top-down inlining, we can grow inliner size significantly1434// with large number of smaller inlinees each pass the cost check.1435assert(ProfileInlineLimitMax >= ProfileInlineLimitMin &&1436"Max inline size limit should not be smaller than min inline size "1437"limit.");1438unsigned SizeLimit = F.getInstructionCount() * ProfileInlineGrowthLimit;1439SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);1440SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);1441if (ExternalInlineAdvisor)1442SizeLimit = std::numeric_limits<unsigned>::max();14431444MapVector<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites;14451446// Perform iterative BFS call site prioritized inlining1447bool Changed = false;1448while (!CQueue.empty() && F.getInstructionCount() < SizeLimit) {1449InlineCandidate Candidate = CQueue.top();1450CQueue.pop();1451CallBase *I = Candidate.CallInstr;1452Function *CalledFunction = I->getCalledFunction();14531454if (CalledFunction == &F)1455continue;1456if (I->isIndirectCall()) {1457uint64_t Sum = 0;1458auto CalleeSamples = findIndirectCallFunctionSamples(*I, Sum);1459uint64_t SumOrigin = Sum;1460Sum *= Candidate.CallsiteDistribution;1461unsigned ICPCount = 0;1462for (const auto *FS : CalleeSamples) {1463// TODO: Consider disable pre-lTO ICP for MonoLTO as well1464if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {1465findExternalInlineCandidate(I, FS, InlinedGUIDs,1466PSI->getOrCompHotCountThreshold());1467continue;1468}1469uint64_t EntryCountDistributed =1470FS->getHeadSamplesEstimate() * Candidate.CallsiteDistribution;1471// In addition to regular inline cost check, we also need to make sure1472// ICP isn't introducing excessive speculative checks even if individual1473// target looks beneficial to promote and inline. That means we should1474// only do ICP when there's a small number dominant targets.1475if (ICPCount >= ProfileICPRelativeHotnessSkip &&1476EntryCountDistributed * 100 < SumOrigin * ProfileICPRelativeHotness)1477break;1478// TODO: Fix CallAnalyzer to handle all indirect calls.1479// For indirect call, we don't run CallAnalyzer to get InlineCost1480// before actual inlining. This is because we could see two different1481// types from the same definition, which makes CallAnalyzer choke as1482// it's expecting matching parameter type on both caller and callee1483// side. See example from PR18962 for the triggering cases (the bug was1484// fixed, but we generate different types).1485if (!PSI->isHotCount(EntryCountDistributed))1486break;1487SmallVector<CallBase *, 8> InlinedCallSites;1488// Attach function profile for promoted indirect callee, and update1489// call site count for the promoted inline candidate too.1490Candidate = {I, FS, EntryCountDistributed,1491Candidate.CallsiteDistribution};1492if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum,1493&InlinedCallSites)) {1494for (auto *CB : InlinedCallSites) {1495if (getInlineCandidate(&NewCandidate, CB))1496CQueue.emplace(NewCandidate);1497}1498ICPCount++;1499Changed = true;1500} else if (!ContextTracker) {1501LocalNotInlinedCallSites.insert({I, FS});1502}1503}1504} else if (CalledFunction && CalledFunction->getSubprogram() &&1505!CalledFunction->isDeclaration()) {1506SmallVector<CallBase *, 8> InlinedCallSites;1507if (tryInlineCandidate(Candidate, &InlinedCallSites)) {1508for (auto *CB : InlinedCallSites) {1509if (getInlineCandidate(&NewCandidate, CB))1510CQueue.emplace(NewCandidate);1511}1512Changed = true;1513} else if (!ContextTracker) {1514LocalNotInlinedCallSites.insert({I, Candidate.CalleeSamples});1515}1516} else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {1517findExternalInlineCandidate(I, findCalleeFunctionSamples(*I),1518InlinedGUIDs,1519PSI->getOrCompHotCountThreshold());1520}1521}15221523if (!CQueue.empty()) {1524if (SizeLimit == (unsigned)ProfileInlineLimitMax)1525++NumCSInlinedHitMaxLimit;1526else if (SizeLimit == (unsigned)ProfileInlineLimitMin)1527++NumCSInlinedHitMinLimit;1528else1529++NumCSInlinedHitGrowthLimit;1530}15311532// For CS profile, profile for not inlined context will be merged when1533// base profile is being retrieved.1534if (!FunctionSamples::ProfileIsCS)1535promoteMergeNotInlinedContextSamples(LocalNotInlinedCallSites, F);1536return Changed;1537}15381539void SampleProfileLoader::promoteMergeNotInlinedContextSamples(1540MapVector<CallBase *, const FunctionSamples *> NonInlinedCallSites,1541const Function &F) {1542// Accumulate not inlined callsite information into notInlinedSamples1543for (const auto &Pair : NonInlinedCallSites) {1544CallBase *I = Pair.first;1545Function *Callee = I->getCalledFunction();1546if (!Callee || Callee->isDeclaration())1547continue;15481549ORE->emit(1550OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(), "NotInline",1551I->getDebugLoc(), I->getParent())1552<< "previous inlining not repeated: '" << ore::NV("Callee", Callee)1553<< "' into '" << ore::NV("Caller", &F) << "'");15541555++NumCSNotInlined;1556const FunctionSamples *FS = Pair.second;1557if (FS->getTotalSamples() == 0 && FS->getHeadSamplesEstimate() == 0) {1558continue;1559}15601561// Do not merge a context that is already duplicated into the base profile.1562if (FS->getContext().hasAttribute(sampleprof::ContextDuplicatedIntoBase))1563continue;15641565if (ProfileMergeInlinee) {1566// A function call can be replicated by optimizations like callsite1567// splitting or jump threading and the replicates end up sharing the1568// sample nested callee profile instead of slicing the original1569// inlinee's profile. We want to do merge exactly once by filtering out1570// callee profiles with a non-zero head sample count.1571if (FS->getHeadSamples() == 0) {1572// Use entry samples as head samples during the merge, as inlinees1573// don't have head samples.1574const_cast<FunctionSamples *>(FS)->addHeadSamples(1575FS->getHeadSamplesEstimate());15761577// Note that we have to do the merge right after processing function.1578// This allows OutlineFS's profile to be used for annotation during1579// top-down processing of functions' annotation.1580FunctionSamples *OutlineFS = Reader->getSamplesFor(*Callee);1581// If outlined function does not exist in the profile, add it to a1582// separate map so that it does not rehash the original profile.1583if (!OutlineFS)1584OutlineFS = &OutlineFunctionSamples[1585FunctionId(FunctionSamples::getCanonicalFnName(Callee->getName()))];1586OutlineFS->merge(*FS, 1);1587// Set outlined profile to be synthetic to not bias the inliner.1588OutlineFS->setContextSynthetic();1589}1590} else {1591auto pair =1592notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0});1593pair.first->second.entryCount += FS->getHeadSamplesEstimate();1594}1595}1596}15971598/// Returns the sorted CallTargetMap \p M by count in descending order.1599static SmallVector<InstrProfValueData, 2>1600GetSortedValueDataFromCallTargets(const SampleRecord::CallTargetMap &M) {1601SmallVector<InstrProfValueData, 2> R;1602for (const auto &I : SampleRecord::sortCallTargets(M)) {1603R.emplace_back(1604InstrProfValueData{I.first.getHashCode(), I.second});1605}1606return R;1607}16081609// Generate MD_prof metadata for every branch instruction using the1610// edge weights computed during propagation.1611void SampleProfileLoader::generateMDProfMetadata(Function &F) {1612// Generate MD_prof metadata for every branch instruction using the1613// edge weights computed during propagation.1614LLVM_DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n");1615LLVMContext &Ctx = F.getContext();1616MDBuilder MDB(Ctx);1617for (auto &BI : F) {1618BasicBlock *BB = &BI;16191620if (BlockWeights[BB]) {1621for (auto &I : *BB) {1622if (!isa<CallInst>(I) && !isa<InvokeInst>(I))1623continue;1624if (!cast<CallBase>(I).getCalledFunction()) {1625const DebugLoc &DLoc = I.getDebugLoc();1626if (!DLoc)1627continue;1628const DILocation *DIL = DLoc;1629const FunctionSamples *FS = findFunctionSamples(I);1630if (!FS)1631continue;1632auto CallSite = FunctionSamples::getCallSiteIdentifier(DIL);1633ErrorOr<SampleRecord::CallTargetMap> T =1634FS->findCallTargetMapAt(CallSite);1635if (!T || T.get().empty())1636continue;1637if (FunctionSamples::ProfileIsProbeBased) {1638// Prorate the callsite counts based on the pre-ICP distribution1639// factor to reflect what is already done to the callsite before1640// ICP, such as calliste cloning.1641if (std::optional<PseudoProbe> Probe = extractProbe(I)) {1642if (Probe->Factor < 1)1643T = SampleRecord::adjustCallTargets(T.get(), Probe->Factor);1644}1645}1646SmallVector<InstrProfValueData, 2> SortedCallTargets =1647GetSortedValueDataFromCallTargets(T.get());1648uint64_t Sum = 0;1649for (const auto &C : T.get())1650Sum += C.second;1651// With CSSPGO all indirect call targets are counted torwards the1652// original indirect call site in the profile, including both1653// inlined and non-inlined targets.1654if (!FunctionSamples::ProfileIsCS) {1655if (const FunctionSamplesMap *M =1656FS->findFunctionSamplesMapAt(CallSite)) {1657for (const auto &NameFS : *M)1658Sum += NameFS.second.getHeadSamplesEstimate();1659}1660}1661if (Sum)1662updateIDTMetaData(I, SortedCallTargets, Sum);1663else if (OverwriteExistingWeights)1664I.setMetadata(LLVMContext::MD_prof, nullptr);1665} else if (!isa<IntrinsicInst>(&I)) {1666setBranchWeights(I, {static_cast<uint32_t>(BlockWeights[BB])},1667/*IsExpected=*/false);1668}1669}1670} else if (OverwriteExistingWeights || ProfileSampleBlockAccurate) {1671// Set profile metadata (possibly annotated by LTO prelink) to zero or1672// clear it for cold code.1673for (auto &I : *BB) {1674if (isa<CallInst>(I) || isa<InvokeInst>(I)) {1675if (cast<CallBase>(I).isIndirectCall()) {1676I.setMetadata(LLVMContext::MD_prof, nullptr);1677} else {1678setBranchWeights(I, {uint32_t(0)}, /*IsExpected=*/false);1679}1680}1681}1682}16831684Instruction *TI = BB->getTerminator();1685if (TI->getNumSuccessors() == 1)1686continue;1687if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI) &&1688!isa<IndirectBrInst>(TI))1689continue;16901691DebugLoc BranchLoc = TI->getDebugLoc();1692LLVM_DEBUG(dbgs() << "\nGetting weights for branch at line "1693<< ((BranchLoc) ? Twine(BranchLoc.getLine())1694: Twine("<UNKNOWN LOCATION>"))1695<< ".\n");1696SmallVector<uint32_t, 4> Weights;1697uint32_t MaxWeight = 0;1698Instruction *MaxDestInst;1699// Since profi treats multiple edges (multiway branches) as a single edge,1700// we need to distribute the computed weight among the branches. We do1701// this by evenly splitting the edge weight among destinations.1702DenseMap<const BasicBlock *, uint64_t> EdgeMultiplicity;1703std::vector<uint64_t> EdgeIndex;1704if (SampleProfileUseProfi) {1705EdgeIndex.resize(TI->getNumSuccessors());1706for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {1707const BasicBlock *Succ = TI->getSuccessor(I);1708EdgeIndex[I] = EdgeMultiplicity[Succ];1709EdgeMultiplicity[Succ]++;1710}1711}1712for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {1713BasicBlock *Succ = TI->getSuccessor(I);1714Edge E = std::make_pair(BB, Succ);1715uint64_t Weight = EdgeWeights[E];1716LLVM_DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E));1717// Use uint32_t saturated arithmetic to adjust the incoming weights,1718// if needed. Sample counts in profiles are 64-bit unsigned values,1719// but internally branch weights are expressed as 32-bit values.1720if (Weight > std::numeric_limits<uint32_t>::max()) {1721LLVM_DEBUG(dbgs() << " (saturated due to uint32_t overflow)\n");1722Weight = std::numeric_limits<uint32_t>::max();1723}1724if (!SampleProfileUseProfi) {1725// Weight is added by one to avoid propagation errors introduced by1726// 0 weights.1727Weights.push_back(static_cast<uint32_t>(1728Weight == std::numeric_limits<uint32_t>::max() ? Weight1729: Weight + 1));1730} else {1731// Profi creates proper weights that do not require "+1" adjustments but1732// we evenly split the weight among branches with the same destination.1733uint64_t W = Weight / EdgeMultiplicity[Succ];1734// Rounding up, if needed, so that first branches are hotter.1735if (EdgeIndex[I] < Weight % EdgeMultiplicity[Succ])1736W++;1737Weights.push_back(static_cast<uint32_t>(W));1738}1739if (Weight != 0) {1740if (Weight > MaxWeight) {1741MaxWeight = Weight;1742MaxDestInst = Succ->getFirstNonPHIOrDbgOrLifetime();1743}1744}1745}17461747misexpect::checkExpectAnnotations(*TI, Weights, /*IsFrontend=*/false);17481749uint64_t TempWeight;1750// Only set weights if there is at least one non-zero weight.1751// In any other case, let the analyzer set weights.1752// Do not set weights if the weights are present unless under1753// OverwriteExistingWeights. In ThinLTO, the profile annotation is done1754// twice. If the first annotation already set the weights, the second pass1755// does not need to set it. With OverwriteExistingWeights, Blocks with zero1756// weight should have their existing metadata (possibly annotated by LTO1757// prelink) cleared.1758if (MaxWeight > 0 &&1759(!TI->extractProfTotalWeight(TempWeight) || OverwriteExistingWeights)) {1760LLVM_DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n");1761setBranchWeights(*TI, Weights, /*IsExpected=*/false);1762ORE->emit([&]() {1763return OptimizationRemark(DEBUG_TYPE, "PopularDest", MaxDestInst)1764<< "most popular destination for conditional branches at "1765<< ore::NV("CondBranchesLoc", BranchLoc);1766});1767} else {1768if (OverwriteExistingWeights) {1769TI->setMetadata(LLVMContext::MD_prof, nullptr);1770LLVM_DEBUG(dbgs() << "CLEARED. All branch weights are zero.\n");1771} else {1772LLVM_DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n");1773}1774}1775}1776}17771778/// Once all the branch weights are computed, we emit the MD_prof1779/// metadata on BB using the computed values for each of its branches.1780///1781/// \param F The function to query.1782///1783/// \returns true if \p F was modified. Returns false, otherwise.1784bool SampleProfileLoader::emitAnnotations(Function &F) {1785bool Changed = false;17861787if (FunctionSamples::ProfileIsProbeBased) {1788LLVM_DEBUG({1789if (!ProbeManager->getDesc(F))1790dbgs() << "Probe descriptor missing for Function " << F.getName()1791<< "\n";1792});17931794if (ProbeManager->profileIsValid(F, *Samples)) {1795++NumMatchedProfile;1796} else {1797++NumMismatchedProfile;1798LLVM_DEBUG(1799dbgs() << "Profile is invalid due to CFG mismatch for Function "1800<< F.getName() << "\n");1801if (!SalvageStaleProfile)1802return false;1803}1804} else {1805if (getFunctionLoc(F) == 0)1806return false;18071808LLVM_DEBUG(dbgs() << "Line number for the first instruction in "1809<< F.getName() << ": " << getFunctionLoc(F) << "\n");1810}18111812DenseSet<GlobalValue::GUID> InlinedGUIDs;1813if (CallsitePrioritizedInline)1814Changed |= inlineHotFunctionsWithPriority(F, InlinedGUIDs);1815else1816Changed |= inlineHotFunctions(F, InlinedGUIDs);18171818Changed |= computeAndPropagateWeights(F, InlinedGUIDs);18191820if (Changed)1821generateMDProfMetadata(F);18221823emitCoverageRemarks(F);1824return Changed;1825}18261827std::unique_ptr<ProfiledCallGraph>1828SampleProfileLoader::buildProfiledCallGraph(Module &M) {1829std::unique_ptr<ProfiledCallGraph> ProfiledCG;1830if (FunctionSamples::ProfileIsCS)1831ProfiledCG = std::make_unique<ProfiledCallGraph>(*ContextTracker);1832else1833ProfiledCG = std::make_unique<ProfiledCallGraph>(Reader->getProfiles());18341835// Add all functions into the profiled call graph even if they are not in1836// the profile. This makes sure functions missing from the profile still1837// gets a chance to be processed.1838for (Function &F : M) {1839if (skipProfileForFunction(F))1840continue;1841ProfiledCG->addProfiledFunction(1842getRepInFormat(FunctionSamples::getCanonicalFnName(F)));1843}18441845return ProfiledCG;1846}18471848std::vector<Function *>1849SampleProfileLoader::buildFunctionOrder(Module &M, LazyCallGraph &CG) {1850std::vector<Function *> FunctionOrderList;1851FunctionOrderList.reserve(M.size());18521853if (!ProfileTopDownLoad && UseProfiledCallGraph)1854errs() << "WARNING: -use-profiled-call-graph ignored, should be used "1855"together with -sample-profile-top-down-load.\n";18561857if (!ProfileTopDownLoad) {1858if (ProfileMergeInlinee) {1859// Disable ProfileMergeInlinee if profile is not loaded in top down order,1860// because the profile for a function may be used for the profile1861// annotation of its outline copy before the profile merging of its1862// non-inlined inline instances, and that is not the way how1863// ProfileMergeInlinee is supposed to work.1864ProfileMergeInlinee = false;1865}18661867for (Function &F : M)1868if (!skipProfileForFunction(F))1869FunctionOrderList.push_back(&F);1870return FunctionOrderList;1871}18721873if (UseProfiledCallGraph || (FunctionSamples::ProfileIsCS &&1874!UseProfiledCallGraph.getNumOccurrences())) {1875// Use profiled call edges to augment the top-down order. There are cases1876// that the top-down order computed based on the static call graph doesn't1877// reflect real execution order. For example1878//1879// 1. Incomplete static call graph due to unknown indirect call targets.1880// Adjusting the order by considering indirect call edges from the1881// profile can enable the inlining of indirect call targets by allowing1882// the caller processed before them.1883// 2. Mutual call edges in an SCC. The static processing order computed for1884// an SCC may not reflect the call contexts in the context-sensitive1885// profile, thus may cause potential inlining to be overlooked. The1886// function order in one SCC is being adjusted to a top-down order based1887// on the profile to favor more inlining. This is only a problem with CS1888// profile.1889// 3. Transitive indirect call edges due to inlining. When a callee function1890// (say B) is inlined into a caller function (say A) in LTO prelink,1891// every call edge originated from the callee B will be transferred to1892// the caller A. If any transferred edge (say A->C) is indirect, the1893// original profiled indirect edge B->C, even if considered, would not1894// enforce a top-down order from the caller A to the potential indirect1895// call target C in LTO postlink since the inlined callee B is gone from1896// the static call graph.1897// 4. #3 can happen even for direct call targets, due to functions defined1898// in header files. A header function (say A), when included into source1899// files, is defined multiple times but only one definition survives due1900// to ODR. Therefore, the LTO prelink inlining done on those dropped1901// definitions can be useless based on a local file scope. More1902// importantly, the inlinee (say B), once fully inlined to a1903// to-be-dropped A, will have no profile to consume when its outlined1904// version is compiled. This can lead to a profile-less prelink1905// compilation for the outlined version of B which may be called from1906// external modules. while this isn't easy to fix, we rely on the1907// postlink AutoFDO pipeline to optimize B. Since the survived copy of1908// the A can be inlined in its local scope in prelink, it may not exist1909// in the merged IR in postlink, and we'll need the profiled call edges1910// to enforce a top-down order for the rest of the functions.1911//1912// Considering those cases, a profiled call graph completely independent of1913// the static call graph is constructed based on profile data, where1914// function objects are not even needed to handle case #3 and case 4.1915//1916// Note that static callgraph edges are completely ignored since they1917// can be conflicting with profiled edges for cyclic SCCs and may result in1918// an SCC order incompatible with profile-defined one. Using strictly1919// profile order ensures a maximum inlining experience. On the other hand,1920// static call edges are not so important when they don't correspond to a1921// context in the profile.19221923std::unique_ptr<ProfiledCallGraph> ProfiledCG = buildProfiledCallGraph(M);1924scc_iterator<ProfiledCallGraph *> CGI = scc_begin(ProfiledCG.get());1925while (!CGI.isAtEnd()) {1926auto Range = *CGI;1927if (SortProfiledSCC) {1928// Sort nodes in one SCC based on callsite hotness.1929scc_member_iterator<ProfiledCallGraph *> SI(*CGI);1930Range = *SI;1931}1932for (auto *Node : Range) {1933Function *F = SymbolMap.lookup(Node->Name);1934if (F && !skipProfileForFunction(*F))1935FunctionOrderList.push_back(F);1936}1937++CGI;1938}1939std::reverse(FunctionOrderList.begin(), FunctionOrderList.end());1940} else1941buildTopDownFuncOrder(CG, FunctionOrderList);19421943LLVM_DEBUG({1944dbgs() << "Function processing order:\n";1945for (auto F : FunctionOrderList) {1946dbgs() << F->getName() << "\n";1947}1948});19491950return FunctionOrderList;1951}19521953bool SampleProfileLoader::doInitialization(Module &M,1954FunctionAnalysisManager *FAM) {1955auto &Ctx = M.getContext();19561957auto ReaderOrErr = SampleProfileReader::create(1958Filename, Ctx, *FS, FSDiscriminatorPass::Base, RemappingFilename);1959if (std::error_code EC = ReaderOrErr.getError()) {1960std::string Msg = "Could not open profile: " + EC.message();1961Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));1962return false;1963}1964Reader = std::move(ReaderOrErr.get());1965Reader->setSkipFlatProf(LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink);1966// set module before reading the profile so reader may be able to only1967// read the function profiles which are used by the current module.1968Reader->setModule(&M);1969if (std::error_code EC = Reader->read()) {1970std::string Msg = "profile reading failed: " + EC.message();1971Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));1972return false;1973}19741975PSL = Reader->getProfileSymbolList();19761977// While profile-sample-accurate is on, ignore symbol list.1978ProfAccForSymsInList =1979ProfileAccurateForSymsInList && PSL && !ProfileSampleAccurate;1980if (ProfAccForSymsInList) {1981NamesInProfile.clear();1982GUIDsInProfile.clear();1983if (auto NameTable = Reader->getNameTable()) {1984if (FunctionSamples::UseMD5) {1985for (auto Name : *NameTable)1986GUIDsInProfile.insert(Name.getHashCode());1987} else {1988for (auto Name : *NameTable)1989NamesInProfile.insert(Name.stringRef());1990}1991}1992CoverageTracker.setProfAccForSymsInList(true);1993}19941995if (FAM && !ProfileInlineReplayFile.empty()) {1996ExternalInlineAdvisor = getReplayInlineAdvisor(1997M, *FAM, Ctx, /*OriginalAdvisor=*/nullptr,1998ReplayInlinerSettings{ProfileInlineReplayFile,1999ProfileInlineReplayScope,2000ProfileInlineReplayFallback,2001{ProfileInlineReplayFormat}},2002/*EmitRemarks=*/false, InlineContext{LTOPhase, InlinePass::ReplaySampleProfileInliner});2003}20042005// Apply tweaks if context-sensitive or probe-based profile is available.2006if (Reader->profileIsCS() || Reader->profileIsPreInlined() ||2007Reader->profileIsProbeBased()) {2008if (!UseIterativeBFIInference.getNumOccurrences())2009UseIterativeBFIInference = true;2010if (!SampleProfileUseProfi.getNumOccurrences())2011SampleProfileUseProfi = true;2012if (!EnableExtTspBlockPlacement.getNumOccurrences())2013EnableExtTspBlockPlacement = true;2014// Enable priority-base inliner and size inline by default for CSSPGO.2015if (!ProfileSizeInline.getNumOccurrences())2016ProfileSizeInline = true;2017if (!CallsitePrioritizedInline.getNumOccurrences())2018CallsitePrioritizedInline = true;2019// For CSSPGO, we also allow recursive inline to best use context profile.2020if (!AllowRecursiveInline.getNumOccurrences())2021AllowRecursiveInline = true;20222023if (Reader->profileIsPreInlined()) {2024if (!UsePreInlinerDecision.getNumOccurrences())2025UsePreInlinerDecision = true;2026}20272028// Enable stale profile matching by default for probe-based profile.2029// Currently the matching relies on if the checksum mismatch is detected,2030// which is currently only available for pseudo-probe mode. Removing the2031// checksum check could cause regressions for some cases, so further tuning2032// might be needed if we want to enable it for all cases.2033if (Reader->profileIsProbeBased() &&2034!SalvageStaleProfile.getNumOccurrences()) {2035SalvageStaleProfile = true;2036}20372038if (!Reader->profileIsCS()) {2039// Non-CS profile should be fine without a function size budget for the2040// inliner since the contexts in the profile are either all from inlining2041// in the prevoius build or pre-computed by the preinliner with a size2042// cap, thus they are bounded.2043if (!ProfileInlineLimitMin.getNumOccurrences())2044ProfileInlineLimitMin = std::numeric_limits<unsigned>::max();2045if (!ProfileInlineLimitMax.getNumOccurrences())2046ProfileInlineLimitMax = std::numeric_limits<unsigned>::max();2047}2048}20492050if (Reader->profileIsCS()) {2051// Tracker for profiles under different context2052ContextTracker = std::make_unique<SampleContextTracker>(2053Reader->getProfiles(), &GUIDToFuncNameMap);2054}20552056// Load pseudo probe descriptors for probe-based function samples.2057if (Reader->profileIsProbeBased()) {2058ProbeManager = std::make_unique<PseudoProbeManager>(M);2059if (!ProbeManager->moduleIsProbed(M)) {2060const char *Msg =2061"Pseudo-probe-based profile requires SampleProfileProbePass";2062Ctx.diagnose(DiagnosticInfoSampleProfile(M.getModuleIdentifier(), Msg,2063DS_Warning));2064return false;2065}2066}20672068if (ReportProfileStaleness || PersistProfileStaleness ||2069SalvageStaleProfile) {2070MatchingManager = std::make_unique<SampleProfileMatcher>(2071M, *Reader, CG, ProbeManager.get(), LTOPhase, SymbolMap, PSL,2072FuncNameToProfNameMap);2073}20742075return true;2076}20772078// Note that this is a module-level check. Even if one module is errored out,2079// the entire build will be errored out. However, the user could make big2080// changes to functions in single module but those changes might not be2081// performance significant to the whole binary. Therefore, to avoid those false2082// positives, we select a reasonable big set of hot functions that are supposed2083// to be globally performance significant, only compute and check the mismatch2084// within those functions. The function selection is based on two criteria:2085// 1) The function is hot enough, which is tuned by a hotness-based2086// flag(HotFuncCutoffForStalenessError). 2) The num of function is large enough2087// which is tuned by the MinfuncsForStalenessError flag.2088bool SampleProfileLoader::rejectHighStalenessProfile(2089Module &M, ProfileSummaryInfo *PSI, const SampleProfileMap &Profiles) {2090assert(FunctionSamples::ProfileIsProbeBased &&2091"Only support for probe-based profile");2092uint64_t TotalHotFunc = 0;2093uint64_t NumMismatchedFunc = 0;2094for (const auto &I : Profiles) {2095const auto &FS = I.second;2096const auto *FuncDesc = ProbeManager->getDesc(FS.getGUID());2097if (!FuncDesc)2098continue;20992100// Use a hotness-based threshold to control the function selection.2101if (!PSI->isHotCountNthPercentile(HotFuncCutoffForStalenessError,2102FS.getTotalSamples()))2103continue;21042105TotalHotFunc++;2106if (ProbeManager->profileIsHashMismatched(*FuncDesc, FS))2107NumMismatchedFunc++;2108}2109// Make sure that the num of selected function is not too small to distinguish2110// from the user's benign changes.2111if (TotalHotFunc < MinfuncsForStalenessError)2112return false;21132114// Finally check the mismatch percentage against the threshold.2115if (NumMismatchedFunc * 100 >=2116TotalHotFunc * PrecentMismatchForStalenessError) {2117auto &Ctx = M.getContext();2118const char *Msg =2119"The input profile significantly mismatches current source code. "2120"Please recollect profile to avoid performance regression.";2121Ctx.diagnose(DiagnosticInfoSampleProfile(M.getModuleIdentifier(), Msg));2122return true;2123}2124return false;2125}21262127void SampleProfileLoader::removePseudoProbeInsts(Module &M) {2128for (auto &F : M) {2129std::vector<Instruction *> InstsToDel;2130for (auto &BB : F) {2131for (auto &I : BB) {2132if (isa<PseudoProbeInst>(&I))2133InstsToDel.push_back(&I);2134}2135}2136for (auto *I : InstsToDel)2137I->eraseFromParent();2138}2139}21402141bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,2142ProfileSummaryInfo *_PSI) {2143GUIDToFuncNameMapper Mapper(M, *Reader, GUIDToFuncNameMap);21442145PSI = _PSI;2146if (M.getProfileSummary(/* IsCS */ false) == nullptr) {2147M.setProfileSummary(Reader->getSummary().getMD(M.getContext()),2148ProfileSummary::PSK_Sample);2149PSI->refresh();2150}21512152if (FunctionSamples::ProfileIsProbeBased &&2153rejectHighStalenessProfile(M, PSI, Reader->getProfiles()))2154return false;21552156// Compute the total number of samples collected in this profile.2157for (const auto &I : Reader->getProfiles())2158TotalCollectedSamples += I.second.getTotalSamples();21592160auto Remapper = Reader->getRemapper();2161// Populate the symbol map.2162for (const auto &N_F : M.getValueSymbolTable()) {2163StringRef OrigName = N_F.getKey();2164Function *F = dyn_cast<Function>(N_F.getValue());2165if (F == nullptr || OrigName.empty())2166continue;2167SymbolMap[FunctionId(OrigName)] = F;2168StringRef NewName = FunctionSamples::getCanonicalFnName(*F);2169if (OrigName != NewName && !NewName.empty()) {2170auto r = SymbolMap.emplace(FunctionId(NewName), F);2171// Failiing to insert means there is already an entry in SymbolMap,2172// thus there are multiple functions that are mapped to the same2173// stripped name. In this case of name conflicting, set the value2174// to nullptr to avoid confusion.2175if (!r.second)2176r.first->second = nullptr;2177OrigName = NewName;2178}2179// Insert the remapped names into SymbolMap.2180if (Remapper) {2181if (auto MapName = Remapper->lookUpNameInProfile(OrigName)) {2182if (*MapName != OrigName && !MapName->empty())2183SymbolMap.emplace(FunctionId(*MapName), F);2184}2185}2186}21872188// Stale profile matching.2189if (ReportProfileStaleness || PersistProfileStaleness ||2190SalvageStaleProfile) {2191MatchingManager->runOnModule();2192MatchingManager->clearMatchingData();2193}2194assert(SymbolMap.count(FunctionId()) == 0 &&2195"No empty StringRef should be added in SymbolMap");2196assert((SalvageUnusedProfile || FuncNameToProfNameMap.empty()) &&2197"FuncNameToProfNameMap is not empty when --salvage-unused-profile is "2198"not enabled");21992200bool retval = false;2201for (auto *F : buildFunctionOrder(M, CG)) {2202assert(!F->isDeclaration());2203clearFunctionData();2204retval |= runOnFunction(*F, AM);2205}22062207// Account for cold calls not inlined....2208if (!FunctionSamples::ProfileIsCS)2209for (const std::pair<Function *, NotInlinedProfileInfo> &pair :2210notInlinedCallInfo)2211updateProfileCallee(pair.first, pair.second.entryCount);22122213if (RemoveProbeAfterProfileAnnotation && FunctionSamples::ProfileIsProbeBased)2214removePseudoProbeInsts(M);22152216return retval;2217}22182219bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) {2220LLVM_DEBUG(dbgs() << "\n\nProcessing Function " << F.getName() << "\n");2221DILocation2SampleMap.clear();2222// By default the entry count is initialized to -1, which will be treated2223// conservatively by getEntryCount as the same as unknown (None). This is2224// to avoid newly added code to be treated as cold. If we have samples2225// this will be overwritten in emitAnnotations.2226uint64_t initialEntryCount = -1;22272228ProfAccForSymsInList = ProfileAccurateForSymsInList && PSL;2229if (ProfileSampleAccurate || F.hasFnAttribute("profile-sample-accurate")) {2230// initialize all the function entry counts to 0. It means all the2231// functions without profile will be regarded as cold.2232initialEntryCount = 0;2233// profile-sample-accurate is a user assertion which has a higher precedence2234// than symbol list. When profile-sample-accurate is on, ignore symbol list.2235ProfAccForSymsInList = false;2236}2237CoverageTracker.setProfAccForSymsInList(ProfAccForSymsInList);22382239// PSL -- profile symbol list include all the symbols in sampled binary.2240// If ProfileAccurateForSymsInList is enabled, PSL is used to treat2241// old functions without samples being cold, without having to worry2242// about new and hot functions being mistakenly treated as cold.2243if (ProfAccForSymsInList) {2244// Initialize the entry count to 0 for functions in the list.2245if (PSL->contains(F.getName()))2246initialEntryCount = 0;22472248// Function in the symbol list but without sample will be regarded as2249// cold. To minimize the potential negative performance impact it could2250// have, we want to be a little conservative here saying if a function2251// shows up in the profile, no matter as outline function, inline instance2252// or call targets, treat the function as not being cold. This will handle2253// the cases such as most callsites of a function are inlined in sampled2254// binary but not inlined in current build (because of source code drift,2255// imprecise debug information, or the callsites are all cold individually2256// but not cold accumulatively...), so the outline function showing up as2257// cold in sampled binary will actually not be cold after current build.2258StringRef CanonName = FunctionSamples::getCanonicalFnName(F);2259if ((FunctionSamples::UseMD5 &&2260GUIDsInProfile.count(Function::getGUID(CanonName))) ||2261(!FunctionSamples::UseMD5 && NamesInProfile.count(CanonName)))2262initialEntryCount = -1;2263}22642265// Initialize entry count when the function has no existing entry2266// count value.2267if (!F.getEntryCount())2268F.setEntryCount(ProfileCount(initialEntryCount, Function::PCT_Real));2269std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;2270if (AM) {2271auto &FAM =2272AM->getResult<FunctionAnalysisManagerModuleProxy>(*F.getParent())2273.getManager();2274ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);2275} else {2276OwnedORE = std::make_unique<OptimizationRemarkEmitter>(&F);2277ORE = OwnedORE.get();2278}22792280if (FunctionSamples::ProfileIsCS)2281Samples = ContextTracker->getBaseSamplesFor(F);2282else {2283Samples = Reader->getSamplesFor(F);2284// Try search in previously inlined functions that were split or duplicated2285// into base.2286if (!Samples) {2287StringRef CanonName = FunctionSamples::getCanonicalFnName(F);2288auto It = OutlineFunctionSamples.find(FunctionId(CanonName));2289if (It != OutlineFunctionSamples.end()) {2290Samples = &It->second;2291} else if (auto Remapper = Reader->getRemapper()) {2292if (auto RemppedName = Remapper->lookUpNameInProfile(CanonName)) {2293It = OutlineFunctionSamples.find(FunctionId(*RemppedName));2294if (It != OutlineFunctionSamples.end())2295Samples = &It->second;2296}2297}2298}2299}23002301if (Samples && !Samples->empty())2302return emitAnnotations(F);2303return false;2304}2305SampleProfileLoaderPass::SampleProfileLoaderPass(2306std::string File, std::string RemappingFile, ThinOrFullLTOPhase LTOPhase,2307IntrusiveRefCntPtr<vfs::FileSystem> FS)2308: ProfileFileName(File), ProfileRemappingFileName(RemappingFile),2309LTOPhase(LTOPhase), FS(std::move(FS)) {}23102311PreservedAnalyses SampleProfileLoaderPass::run(Module &M,2312ModuleAnalysisManager &AM) {2313FunctionAnalysisManager &FAM =2314AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();23152316auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {2317return FAM.getResult<AssumptionAnalysis>(F);2318};2319auto GetTTI = [&](Function &F) -> TargetTransformInfo & {2320return FAM.getResult<TargetIRAnalysis>(F);2321};2322auto GetTLI = [&](Function &F) -> const TargetLibraryInfo & {2323return FAM.getResult<TargetLibraryAnalysis>(F);2324};23252326if (!FS)2327FS = vfs::getRealFileSystem();2328LazyCallGraph &CG = AM.getResult<LazyCallGraphAnalysis>(M);23292330SampleProfileLoader SampleLoader(2331ProfileFileName.empty() ? SampleProfileFile : ProfileFileName,2332ProfileRemappingFileName.empty() ? SampleProfileRemappingFile2333: ProfileRemappingFileName,2334LTOPhase, FS, GetAssumptionCache, GetTTI, GetTLI, CG);2335if (!SampleLoader.doInitialization(M, &FAM))2336return PreservedAnalyses::all();23372338ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);2339if (!SampleLoader.runOnModule(M, &AM, PSI))2340return PreservedAnalyses::all();23412342return PreservedAnalyses::none();2343}234423452346