Path: blob/main/contrib/llvm-project/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
35269 views
//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7//8// This file implements support for context disambiguation of allocation9// calls for profile guided heap optimization. Specifically, it uses Memprof10// profiles which indicate context specific allocation behavior (currently11// distinguishing cold vs hot memory allocations). Cloning is performed to12// expose the cold allocation call contexts, and the allocation calls are13// subsequently annotated with an attribute for later transformation.14//15// The transformations can be performed either directly on IR (regular LTO), or16// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).17// Both types of LTO operate on a the same base graph representation, which18// uses CRTP to support either IR or Index formats.19//20//===----------------------------------------------------------------------===//2122#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"23#include "llvm/ADT/DenseMap.h"24#include "llvm/ADT/DenseSet.h"25#include "llvm/ADT/MapVector.h"26#include "llvm/ADT/SetOperations.h"27#include "llvm/ADT/SmallPtrSet.h"28#include "llvm/ADT/SmallSet.h"29#include "llvm/ADT/SmallVector.h"30#include "llvm/ADT/Statistic.h"31#include "llvm/Analysis/MemoryProfileInfo.h"32#include "llvm/Analysis/ModuleSummaryAnalysis.h"33#include "llvm/Analysis/OptimizationRemarkEmitter.h"34#include "llvm/Bitcode/BitcodeReader.h"35#include "llvm/IR/Constants.h"36#include "llvm/IR/Instructions.h"37#include "llvm/IR/Module.h"38#include "llvm/IR/ModuleSummaryIndex.h"39#include "llvm/Pass.h"40#include "llvm/Support/CommandLine.h"41#include "llvm/Support/FileSystem.h"42#include "llvm/Support/GraphWriter.h"43#include "llvm/Support/raw_ostream.h"44#include "llvm/Transforms/IPO.h"45#include "llvm/Transforms/Utils/Cloning.h"46#include <deque>47#include <sstream>48#include <unordered_map>49#include <vector>50using namespace llvm;51using namespace llvm::memprof;5253#define DEBUG_TYPE "memprof-context-disambiguation"5455STATISTIC(FunctionClonesAnalysis,56"Number of function clones created during whole program analysis");57STATISTIC(FunctionClonesThinBackend,58"Number of function clones created during ThinLTO backend");59STATISTIC(FunctionsClonedThinBackend,60"Number of functions that had clones created during ThinLTO backend");61STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "62"cloned) during whole program analysis");63STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "64"during whole program analysis");65STATISTIC(AllocTypeNotColdThinBackend,66"Number of not cold static allocations (possibly cloned) during "67"ThinLTO backend");68STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "69"(possibly cloned) during ThinLTO backend");70STATISTIC(OrigAllocsThinBackend,71"Number of original (not cloned) allocations with memprof profiles "72"during ThinLTO backend");73STATISTIC(74AllocVersionsThinBackend,75"Number of allocation versions (including clones) during ThinLTO backend");76STATISTIC(MaxAllocVersionsThinBackend,77"Maximum number of allocation versions created for an original "78"allocation during ThinLTO backend");79STATISTIC(UnclonableAllocsThinBackend,80"Number of unclonable ambigous allocations during ThinLTO backend");81STATISTIC(RemovedEdgesWithMismatchedCallees,82"Number of edges removed due to mismatched callees (profiled vs IR)");83STATISTIC(FoundProfiledCalleeCount,84"Number of profiled callees found via tail calls");85STATISTIC(FoundProfiledCalleeDepth,86"Aggregate depth of profiled callees found via tail calls");87STATISTIC(FoundProfiledCalleeMaxDepth,88"Maximum depth of profiled callees found via tail calls");89STATISTIC(FoundProfiledCalleeNonUniquelyCount,90"Number of profiled callees found via multiple tail call chains");9192static cl::opt<std::string> DotFilePathPrefix(93"memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,94cl::value_desc("filename"),95cl::desc("Specify the path prefix of the MemProf dot files."));9697static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),98cl::Hidden,99cl::desc("Export graph to dot files."));100101static cl::opt<bool>102DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,103cl::desc("Dump CallingContextGraph to stdout after each stage."));104105static cl::opt<bool>106VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,107cl::desc("Perform verification checks on CallingContextGraph."));108109static cl::opt<bool>110VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,111cl::desc("Perform frequent verification checks on nodes."));112113static cl::opt<std::string> MemProfImportSummary(114"memprof-import-summary",115cl::desc("Import summary to use for testing the ThinLTO backend via opt"),116cl::Hidden);117118static cl::opt<unsigned>119TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),120cl::Hidden,121cl::desc("Max depth to recursively search for missing "122"frames through tail calls."));123124namespace llvm {125cl::opt<bool> EnableMemProfContextDisambiguation(126"enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,127cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));128129// Indicate we are linking with an allocator that supports hot/cold operator130// new interfaces.131cl::opt<bool> SupportsHotColdNew(132"supports-hot-cold-new", cl::init(false), cl::Hidden,133cl::desc("Linking with hot/cold operator new interfaces"));134} // namespace llvm135136extern cl::opt<bool> MemProfReportHintedSizes;137138namespace {139/// CRTP base for graphs built from either IR or ThinLTO summary index.140///141/// The graph represents the call contexts in all memprof metadata on allocation142/// calls, with nodes for the allocations themselves, as well as for the calls143/// in each context. The graph is initially built from the allocation memprof144/// metadata (or summary) MIBs. It is then updated to match calls with callsite145/// metadata onto the nodes, updating it to reflect any inlining performed on146/// those calls.147///148/// Each MIB (representing an allocation's call context with allocation149/// behavior) is assigned a unique context id during the graph build. The edges150/// and nodes in the graph are decorated with the context ids they carry. This151/// is used to correctly update the graph when cloning is performed so that we152/// can uniquify the context for a single (possibly cloned) allocation.153template <typename DerivedCCG, typename FuncTy, typename CallTy>154class CallsiteContextGraph {155public:156CallsiteContextGraph() = default;157CallsiteContextGraph(const CallsiteContextGraph &) = default;158CallsiteContextGraph(CallsiteContextGraph &&) = default;159160/// Main entry point to perform analysis and transformations on graph.161bool process();162163/// Perform cloning on the graph necessary to uniquely identify the allocation164/// behavior of an allocation based on its context.165void identifyClones();166167/// Assign callsite clones to functions, cloning functions as needed to168/// accommodate the combinations of their callsite clones reached by callers.169/// For regular LTO this clones functions and callsites in the IR, but for170/// ThinLTO the cloning decisions are noted in the summaries and later applied171/// in applyImport.172bool assignFunctions();173174void dump() const;175void print(raw_ostream &OS) const;176void printTotalSizes(raw_ostream &OS) const;177178friend raw_ostream &operator<<(raw_ostream &OS,179const CallsiteContextGraph &CCG) {180CCG.print(OS);181return OS;182}183184friend struct GraphTraits<185const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;186friend struct DOTGraphTraits<187const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;188189void exportToDot(std::string Label) const;190191/// Represents a function clone via FuncTy pointer and clone number pair.192struct FuncInfo final193: public std::pair<FuncTy *, unsigned /*Clone number*/> {194using Base = std::pair<FuncTy *, unsigned>;195FuncInfo(const Base &B) : Base(B) {}196FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}197explicit operator bool() const { return this->first != nullptr; }198FuncTy *func() const { return this->first; }199unsigned cloneNo() const { return this->second; }200};201202/// Represents a callsite clone via CallTy and clone number pair.203struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {204using Base = std::pair<CallTy, unsigned>;205CallInfo(const Base &B) : Base(B) {}206CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)207: Base(Call, CloneNo) {}208explicit operator bool() const { return (bool)this->first; }209CallTy call() const { return this->first; }210unsigned cloneNo() const { return this->second; }211void setCloneNo(unsigned N) { this->second = N; }212void print(raw_ostream &OS) const {213if (!operator bool()) {214assert(!cloneNo());215OS << "null Call";216return;217}218call()->print(OS);219OS << "\t(clone " << cloneNo() << ")";220}221void dump() const {222print(dbgs());223dbgs() << "\n";224}225friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {226Call.print(OS);227return OS;228}229};230231struct ContextEdge;232233/// Node in the Callsite Context Graph234struct ContextNode {235// Keep this for now since in the IR case where we have an Instruction* it236// is not as immediately discoverable. Used for printing richer information237// when dumping graph.238bool IsAllocation;239240// Keeps track of when the Call was reset to null because there was241// recursion.242bool Recursive = false;243244// The corresponding allocation or interior call.245CallInfo Call;246247// For alloc nodes this is a unique id assigned when constructed, and for248// callsite stack nodes it is the original stack id when the node is249// constructed from the memprof MIB metadata on the alloc nodes. Note that250// this is only used when matching callsite metadata onto the stack nodes251// created when processing the allocation memprof MIBs, and for labeling252// nodes in the dot graph. Therefore we don't bother to assign a value for253// clones.254uint64_t OrigStackOrAllocId = 0;255256// This will be formed by ORing together the AllocationType enum values257// for contexts including this node.258uint8_t AllocTypes = 0;259260// Edges to all callees in the profiled call stacks.261// TODO: Should this be a map (from Callee node) for more efficient lookup?262std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;263264// Edges to all callers in the profiled call stacks.265// TODO: Should this be a map (from Caller node) for more efficient lookup?266std::vector<std::shared_ptr<ContextEdge>> CallerEdges;267268// Get the list of edges from which we can compute allocation information269// such as the context ids and allocation type of this node.270const std::vector<std::shared_ptr<ContextEdge>> *271getEdgesWithAllocInfo() const {272// If node has any callees, compute from those, otherwise compute from273// callers (i.e. if this is the leaf allocation node).274if (!CalleeEdges.empty())275return &CalleeEdges;276if (!CallerEdges.empty()) {277// A node with caller edges but no callee edges must be the allocation278// node.279assert(IsAllocation);280return &CallerEdges;281}282return nullptr;283}284285// Compute the context ids for this node from the union of its edge context286// ids.287DenseSet<uint32_t> getContextIds() const {288DenseSet<uint32_t> ContextIds;289auto *Edges = getEdgesWithAllocInfo();290if (!Edges)291return {};292unsigned Count = 0;293for (auto &Edge : *Edges)294Count += Edge->getContextIds().size();295ContextIds.reserve(Count);296for (auto &Edge : *Edges)297ContextIds.insert(Edge->getContextIds().begin(),298Edge->getContextIds().end());299return ContextIds;300}301302// Compute the allocation type for this node from the OR of its edge303// allocation types.304uint8_t computeAllocType() const {305auto *Edges = getEdgesWithAllocInfo();306if (!Edges)307return (uint8_t)AllocationType::None;308uint8_t BothTypes =309(uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;310uint8_t AllocType = (uint8_t)AllocationType::None;311for (auto &Edge : *Edges) {312AllocType |= Edge->AllocTypes;313// Bail early if alloc type reached both, no further refinement.314if (AllocType == BothTypes)315return AllocType;316}317return AllocType;318}319320// The context ids set for this node is empty if its edge context ids are321// also all empty.322bool emptyContextIds() const {323auto *Edges = getEdgesWithAllocInfo();324if (!Edges)325return true;326for (auto &Edge : *Edges) {327if (!Edge->getContextIds().empty())328return false;329}330return true;331}332333// List of clones of this ContextNode, initially empty.334std::vector<ContextNode *> Clones;335336// If a clone, points to the original uncloned node.337ContextNode *CloneOf = nullptr;338339ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}340341ContextNode(bool IsAllocation, CallInfo C)342: IsAllocation(IsAllocation), Call(C) {}343344void addClone(ContextNode *Clone) {345if (CloneOf) {346CloneOf->Clones.push_back(Clone);347Clone->CloneOf = CloneOf;348} else {349Clones.push_back(Clone);350assert(!Clone->CloneOf);351Clone->CloneOf = this;352}353}354355ContextNode *getOrigNode() {356if (!CloneOf)357return this;358return CloneOf;359}360361void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,362unsigned int ContextId);363364ContextEdge *findEdgeFromCallee(const ContextNode *Callee);365ContextEdge *findEdgeFromCaller(const ContextNode *Caller);366void eraseCalleeEdge(const ContextEdge *Edge);367void eraseCallerEdge(const ContextEdge *Edge);368369void setCall(CallInfo C) { Call = C; }370371bool hasCall() const { return (bool)Call.call(); }372373void printCall(raw_ostream &OS) const { Call.print(OS); }374375// True if this node was effectively removed from the graph, in which case376// it should have an allocation type of None and empty context ids.377bool isRemoved() const {378assert((AllocTypes == (uint8_t)AllocationType::None) ==379emptyContextIds());380return AllocTypes == (uint8_t)AllocationType::None;381}382383void dump() const;384void print(raw_ostream &OS) const;385386friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {387Node.print(OS);388return OS;389}390};391392/// Edge in the Callsite Context Graph from a ContextNode N to a caller or393/// callee.394struct ContextEdge {395ContextNode *Callee;396ContextNode *Caller;397398// This will be formed by ORing together the AllocationType enum values399// for contexts including this edge.400uint8_t AllocTypes = 0;401402// The set of IDs for contexts including this edge.403DenseSet<uint32_t> ContextIds;404405ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,406DenseSet<uint32_t> ContextIds)407: Callee(Callee), Caller(Caller), AllocTypes(AllocType),408ContextIds(std::move(ContextIds)) {}409410DenseSet<uint32_t> &getContextIds() { return ContextIds; }411412void dump() const;413void print(raw_ostream &OS) const;414415friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {416Edge.print(OS);417return OS;418}419};420421/// Helpers to remove callee edges that have allocation type None (due to not422/// carrying any context ids) after transformations.423void removeNoneTypeCalleeEdges(ContextNode *Node);424void425recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,426DenseSet<const ContextNode *> &Visited);427428protected:429/// Get a list of nodes corresponding to the stack ids in the given callsite430/// context.431template <class NodeT, class IteratorT>432std::vector<uint64_t>433getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);434435/// Adds nodes for the given allocation and any stack ids on its memprof MIB436/// metadata (or summary).437ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);438439/// Adds nodes for the given MIB stack ids.440template <class NodeT, class IteratorT>441void addStackNodesForMIB(ContextNode *AllocNode,442CallStack<NodeT, IteratorT> &StackContext,443CallStack<NodeT, IteratorT> &CallsiteContext,444AllocationType AllocType, uint64_t TotalSize);445446/// Matches all callsite metadata (or summary) to the nodes created for447/// allocation memprof MIB metadata, synthesizing new nodes to reflect any448/// inlining performed on those callsite instructions.449void updateStackNodes();450451/// Update graph to conservatively handle any callsite stack nodes that target452/// multiple different callee target functions.453void handleCallsitesWithMultipleTargets();454455/// Save lists of calls with MemProf metadata in each function, for faster456/// iteration.457MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;458459/// Map from callsite node to the enclosing caller function.460std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;461462private:463using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;464465using CallContextInfo = std::tuple<CallTy, std::vector<uint64_t>,466const FuncTy *, DenseSet<uint32_t>>;467468/// Assigns the given Node to calls at or inlined into the location with469/// the Node's stack id, after post order traversing and processing its470/// caller nodes. Uses the call information recorded in the given471/// StackIdToMatchingCalls map, and creates new nodes for inlined sequences472/// as needed. Called by updateStackNodes which sets up the given473/// StackIdToMatchingCalls map.474void assignStackNodesPostOrder(475ContextNode *Node, DenseSet<const ContextNode *> &Visited,476DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls);477478/// Duplicates the given set of context ids, updating the provided479/// map from each original id with the newly generated context ids,480/// and returning the new duplicated id set.481DenseSet<uint32_t> duplicateContextIds(482const DenseSet<uint32_t> &StackSequenceContextIds,483DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);484485/// Propagates all duplicated context ids across the graph.486void propagateDuplicateContextIds(487const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);488489/// Connect the NewNode to OrigNode's callees if TowardsCallee is true,490/// else to its callers. Also updates OrigNode's edges to remove any context491/// ids moved to the newly created edge.492void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,493bool TowardsCallee,494DenseSet<uint32_t> RemainingContextIds);495496/// Get the stack id corresponding to the given Id or Index (for IR this will497/// return itself, for a summary index this will return the id recorded in the498/// index for that stack id index value).499uint64_t getStackId(uint64_t IdOrIndex) const {500return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);501}502503/// Returns true if the given call targets the callee of the given edge, or if504/// we were able to identify the call chain through intermediate tail calls.505/// In the latter case new context nodes are added to the graph for the506/// identified tail calls, and their synthesized nodes are added to507/// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for508/// the updated edges and to prepare it for an increment in the caller.509bool510calleesMatch(CallTy Call, EdgeIter &EI,511MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);512513/// Returns true if the given call targets the given function, or if we were514/// able to identify the call chain through intermediate tail calls (in which515/// case FoundCalleeChain will be populated).516bool calleeMatchesFunc(517CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,518std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {519return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(520Call, Func, CallerFunc, FoundCalleeChain);521}522523/// Get a list of nodes corresponding to the stack ids in the given524/// callsite's context.525std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {526return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(527Call);528}529530/// Get the last stack id in the context for callsite.531uint64_t getLastStackId(CallTy Call) {532return static_cast<DerivedCCG *>(this)->getLastStackId(Call);533}534535/// Update the allocation call to record type of allocated memory.536void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {537AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;538static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);539}540541/// Update non-allocation call to invoke (possibly cloned) function542/// CalleeFunc.543void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {544static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);545}546547/// Clone the given function for the given callsite, recording mapping of all548/// of the functions tracked calls to their new versions in the CallMap.549/// Assigns new clones to clone number CloneNo.550FuncInfo cloneFunctionForCallsite(551FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,552std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {553return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(554Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);555}556557/// Gets a label to use in the dot graph for the given call clone in the given558/// function.559std::string getLabel(const FuncTy *Func, const CallTy Call,560unsigned CloneNo) const {561return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);562}563564/// Helpers to find the node corresponding to the given call or stackid.565ContextNode *getNodeForInst(const CallInfo &C);566ContextNode *getNodeForAlloc(const CallInfo &C);567ContextNode *getNodeForStackId(uint64_t StackId);568569/// Computes the alloc type corresponding to the given context ids, by570/// unioning their recorded alloc types.571uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds);572573/// Returns the allocation type of the intersection of the contexts of two574/// nodes (based on their provided context id sets), optimized for the case575/// when Node1Ids is smaller than Node2Ids.576uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,577const DenseSet<uint32_t> &Node2Ids);578579/// Returns the allocation type of the intersection of the contexts of two580/// nodes (based on their provided context id sets).581uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,582const DenseSet<uint32_t> &Node2Ids);583584/// Create a clone of Edge's callee and move Edge to that new callee node,585/// performing the necessary context id and allocation type updates.586/// If callee's caller edge iterator is supplied, it is updated when removing587/// the edge from that list. If ContextIdsToMove is non-empty, only that588/// subset of Edge's ids are moved to an edge to the new callee.589ContextNode *590moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,591EdgeIter *CallerEdgeI = nullptr,592DenseSet<uint32_t> ContextIdsToMove = {});593594/// Change the callee of Edge to existing callee clone NewCallee, performing595/// the necessary context id and allocation type updates.596/// If callee's caller edge iterator is supplied, it is updated when removing597/// the edge from that list. If ContextIdsToMove is non-empty, only that598/// subset of Edge's ids are moved to an edge to the new callee.599void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,600ContextNode *NewCallee,601EdgeIter *CallerEdgeI = nullptr,602bool NewClone = false,603DenseSet<uint32_t> ContextIdsToMove = {});604605/// Recursively perform cloning on the graph for the given Node and its606/// callers, in order to uniquely identify the allocation behavior of an607/// allocation given its context. The context ids of the allocation being608/// processed are given in AllocContextIds.609void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,610const DenseSet<uint32_t> &AllocContextIds);611612/// Map from each context ID to the AllocationType assigned to that context.613DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;614615/// Map from each contextID to the profiled aggregate allocation size,616/// optionally populated when requested (via MemProfReportHintedSizes).617DenseMap<uint32_t, uint64_t> ContextIdToTotalSize;618619/// Identifies the context node created for a stack id when adding the MIB620/// contexts to the graph. This is used to locate the context nodes when621/// trying to assign the corresponding callsites with those stack ids to these622/// nodes.623DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;624625/// Maps to track the calls to their corresponding nodes in the graph.626MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;627MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;628629/// Owner of all ContextNode unique_ptrs.630std::vector<std::unique_ptr<ContextNode>> NodeOwner;631632/// Perform sanity checks on graph when requested.633void check() const;634635/// Keeps track of the last unique context id assigned.636unsigned int LastContextId = 0;637};638639template <typename DerivedCCG, typename FuncTy, typename CallTy>640using ContextNode =641typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;642template <typename DerivedCCG, typename FuncTy, typename CallTy>643using ContextEdge =644typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;645template <typename DerivedCCG, typename FuncTy, typename CallTy>646using FuncInfo =647typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;648template <typename DerivedCCG, typename FuncTy, typename CallTy>649using CallInfo =650typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;651652/// CRTP derived class for graphs built from IR (regular LTO).653class ModuleCallsiteContextGraph654: public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,655Instruction *> {656public:657ModuleCallsiteContextGraph(658Module &M,659llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);660661private:662friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,663Instruction *>;664665uint64_t getStackId(uint64_t IdOrIndex) const;666bool calleeMatchesFunc(667Instruction *Call, const Function *Func, const Function *CallerFunc,668std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);669bool findProfiledCalleeThroughTailCalls(670const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,671std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,672bool &FoundMultipleCalleeChains);673uint64_t getLastStackId(Instruction *Call);674std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);675void updateAllocationCall(CallInfo &Call, AllocationType AllocType);676void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);677CallsiteContextGraph<ModuleCallsiteContextGraph, Function,678Instruction *>::FuncInfo679cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,680std::map<CallInfo, CallInfo> &CallMap,681std::vector<CallInfo> &CallsWithMetadataInFunc,682unsigned CloneNo);683std::string getLabel(const Function *Func, const Instruction *Call,684unsigned CloneNo) const;685686const Module &Mod;687llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;688};689690/// Represents a call in the summary index graph, which can either be an691/// allocation or an interior callsite node in an allocation's context.692/// Holds a pointer to the corresponding data structure in the index.693struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {694IndexCall() : PointerUnion() {}695IndexCall(std::nullptr_t) : IndexCall() {}696IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}697IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}698IndexCall(PointerUnion PT) : PointerUnion(PT) {}699700IndexCall *operator->() { return this; }701702PointerUnion<CallsiteInfo *, AllocInfo *> getBase() const { return *this; }703704void print(raw_ostream &OS) const {705if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(getBase())) {706OS << *AI;707} else {708auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(getBase());709assert(CI);710OS << *CI;711}712}713};714715/// CRTP derived class for graphs built from summary index (ThinLTO).716class IndexCallsiteContextGraph717: public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,718IndexCall> {719public:720IndexCallsiteContextGraph(721ModuleSummaryIndex &Index,722llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>723isPrevailing);724725~IndexCallsiteContextGraph() {726// Now that we are done with the graph it is safe to add the new727// CallsiteInfo structs to the function summary vectors. The graph nodes728// point into locations within these vectors, so we don't want to add them729// any earlier.730for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {731auto *FS = I.first;732for (auto &Callsite : I.second)733FS->addCallsite(*Callsite.second);734}735}736737private:738friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,739IndexCall>;740741uint64_t getStackId(uint64_t IdOrIndex) const;742bool calleeMatchesFunc(743IndexCall &Call, const FunctionSummary *Func,744const FunctionSummary *CallerFunc,745std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);746bool findProfiledCalleeThroughTailCalls(747ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,748std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,749bool &FoundMultipleCalleeChains);750uint64_t getLastStackId(IndexCall &Call);751std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);752void updateAllocationCall(CallInfo &Call, AllocationType AllocType);753void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);754CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,755IndexCall>::FuncInfo756cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,757std::map<CallInfo, CallInfo> &CallMap,758std::vector<CallInfo> &CallsWithMetadataInFunc,759unsigned CloneNo);760std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,761unsigned CloneNo) const;762763// Saves mapping from function summaries containing memprof records back to764// its VI, for use in checking and debugging.765std::map<const FunctionSummary *, ValueInfo> FSToVIMap;766767const ModuleSummaryIndex &Index;768llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>769isPrevailing;770771// Saves/owns the callsite info structures synthesized for missing tail call772// frames that we discover while building the graph.773// It maps from the summary of the function making the tail call, to a map774// of callee ValueInfo to corresponding synthesized callsite info.775std::unordered_map<FunctionSummary *,776std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>777FunctionCalleesToSynthesizedCallsiteInfos;778};779} // namespace780781namespace llvm {782template <>783struct DenseMapInfo<typename CallsiteContextGraph<784ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>785: public DenseMapInfo<std::pair<Instruction *, unsigned>> {};786template <>787struct DenseMapInfo<typename CallsiteContextGraph<788IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>789: public DenseMapInfo<std::pair<IndexCall, unsigned>> {};790template <>791struct DenseMapInfo<IndexCall>792: public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};793} // end namespace llvm794795namespace {796797struct FieldSeparator {798bool Skip = true;799const char *Sep;800801FieldSeparator(const char *Sep = ", ") : Sep(Sep) {}802};803804raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) {805if (FS.Skip) {806FS.Skip = false;807return OS;808}809return OS << FS.Sep;810}811812// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc813// type we should actually use on the corresponding allocation.814// If we can't clone a node that has NotCold+Cold alloc type, we will fall815// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold816// from NotCold.817AllocationType allocTypeToUse(uint8_t AllocTypes) {818assert(AllocTypes != (uint8_t)AllocationType::None);819if (AllocTypes ==820((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))821return AllocationType::NotCold;822else823return (AllocationType)AllocTypes;824}825826// Helper to check if the alloc types for all edges recorded in the827// InAllocTypes vector match the alloc types for all edges in the Edges828// vector.829template <typename DerivedCCG, typename FuncTy, typename CallTy>830bool allocTypesMatch(831const std::vector<uint8_t> &InAllocTypes,832const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>833&Edges) {834return std::equal(835InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(),836[](const uint8_t &l,837const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {838// Can share if one of the edges is None type - don't839// care about the type along that edge as it doesn't840// exist for those context ids.841if (l == (uint8_t)AllocationType::None ||842r->AllocTypes == (uint8_t)AllocationType::None)843return true;844return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);845});846}847848} // end anonymous namespace849850template <typename DerivedCCG, typename FuncTy, typename CallTy>851typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *852CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(853const CallInfo &C) {854ContextNode *Node = getNodeForAlloc(C);855if (Node)856return Node;857858return NonAllocationCallToContextNodeMap.lookup(C);859}860861template <typename DerivedCCG, typename FuncTy, typename CallTy>862typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *863CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(864const CallInfo &C) {865return AllocationCallToContextNodeMap.lookup(C);866}867868template <typename DerivedCCG, typename FuncTy, typename CallTy>869typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *870CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(871uint64_t StackId) {872auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);873if (StackEntryNode != StackEntryIdToContextNodeMap.end())874return StackEntryNode->second;875return nullptr;876}877878template <typename DerivedCCG, typename FuncTy, typename CallTy>879void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::880addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,881unsigned int ContextId) {882for (auto &Edge : CallerEdges) {883if (Edge->Caller == Caller) {884Edge->AllocTypes |= (uint8_t)AllocType;885Edge->getContextIds().insert(ContextId);886return;887}888}889std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(890this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));891CallerEdges.push_back(Edge);892Caller->CalleeEdges.push_back(Edge);893}894895template <typename DerivedCCG, typename FuncTy, typename CallTy>896void CallsiteContextGraph<897DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {898for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {899auto Edge = *EI;900if (Edge->AllocTypes == (uint8_t)AllocationType::None) {901assert(Edge->ContextIds.empty());902Edge->Callee->eraseCallerEdge(Edge.get());903EI = Node->CalleeEdges.erase(EI);904} else905++EI;906}907}908909template <typename DerivedCCG, typename FuncTy, typename CallTy>910typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *911CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::912findEdgeFromCallee(const ContextNode *Callee) {913for (const auto &Edge : CalleeEdges)914if (Edge->Callee == Callee)915return Edge.get();916return nullptr;917}918919template <typename DerivedCCG, typename FuncTy, typename CallTy>920typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *921CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::922findEdgeFromCaller(const ContextNode *Caller) {923for (const auto &Edge : CallerEdges)924if (Edge->Caller == Caller)925return Edge.get();926return nullptr;927}928929template <typename DerivedCCG, typename FuncTy, typename CallTy>930void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::931eraseCalleeEdge(const ContextEdge *Edge) {932auto EI = llvm::find_if(933CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {934return CalleeEdge.get() == Edge;935});936assert(EI != CalleeEdges.end());937CalleeEdges.erase(EI);938}939940template <typename DerivedCCG, typename FuncTy, typename CallTy>941void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::942eraseCallerEdge(const ContextEdge *Edge) {943auto EI = llvm::find_if(944CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {945return CallerEdge.get() == Edge;946});947assert(EI != CallerEdges.end());948CallerEdges.erase(EI);949}950951template <typename DerivedCCG, typename FuncTy, typename CallTy>952uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(953DenseSet<uint32_t> &ContextIds) {954uint8_t BothTypes =955(uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;956uint8_t AllocType = (uint8_t)AllocationType::None;957for (auto Id : ContextIds) {958AllocType |= (uint8_t)ContextIdToAllocationType[Id];959// Bail early if alloc type reached both, no further refinement.960if (AllocType == BothTypes)961return AllocType;962}963return AllocType;964}965966template <typename DerivedCCG, typename FuncTy, typename CallTy>967uint8_t968CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(969const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {970uint8_t BothTypes =971(uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;972uint8_t AllocType = (uint8_t)AllocationType::None;973for (auto Id : Node1Ids) {974if (!Node2Ids.count(Id))975continue;976AllocType |= (uint8_t)ContextIdToAllocationType[Id];977// Bail early if alloc type reached both, no further refinement.978if (AllocType == BothTypes)979return AllocType;980}981return AllocType;982}983984template <typename DerivedCCG, typename FuncTy, typename CallTy>985uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(986const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {987if (Node1Ids.size() < Node2Ids.size())988return intersectAllocTypesImpl(Node1Ids, Node2Ids);989else990return intersectAllocTypesImpl(Node2Ids, Node1Ids);991}992993template <typename DerivedCCG, typename FuncTy, typename CallTy>994typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *995CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(996CallInfo Call, const FuncTy *F) {997assert(!getNodeForAlloc(Call));998NodeOwner.push_back(999std::make_unique<ContextNode>(/*IsAllocation=*/true, Call));1000ContextNode *AllocNode = NodeOwner.back().get();1001AllocationCallToContextNodeMap[Call] = AllocNode;1002NodeToCallingFunc[AllocNode] = F;1003// Use LastContextId as a uniq id for MIB allocation nodes.1004AllocNode->OrigStackOrAllocId = LastContextId;1005// Alloc type should be updated as we add in the MIBs. We should assert1006// afterwards that it is not still None.1007AllocNode->AllocTypes = (uint8_t)AllocationType::None;10081009return AllocNode;1010}10111012static std::string getAllocTypeString(uint8_t AllocTypes) {1013if (!AllocTypes)1014return "None";1015std::string Str;1016if (AllocTypes & (uint8_t)AllocationType::NotCold)1017Str += "NotCold";1018if (AllocTypes & (uint8_t)AllocationType::Cold)1019Str += "Cold";1020return Str;1021}10221023template <typename DerivedCCG, typename FuncTy, typename CallTy>1024template <class NodeT, class IteratorT>1025void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(1026ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,1027CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,1028uint64_t TotalSize) {1029assert(!MemProfReportHintedSizes || TotalSize > 0);1030// Treating the hot alloc type as NotCold before the disambiguation for "hot"1031// is done.1032if (AllocType == AllocationType::Hot)1033AllocType = AllocationType::NotCold;10341035ContextIdToAllocationType[++LastContextId] = AllocType;10361037if (MemProfReportHintedSizes) {1038assert(TotalSize);1039ContextIdToTotalSize[LastContextId] = TotalSize;1040}10411042// Update alloc type and context ids for this MIB.1043AllocNode->AllocTypes |= (uint8_t)AllocType;10441045// Now add or update nodes for each stack id in alloc's context.1046// Later when processing the stack ids on non-alloc callsites we will adjust1047// for any inlining in the context.1048ContextNode *PrevNode = AllocNode;1049// Look for recursion (direct recursion should have been collapsed by1050// module summary analysis, here we should just be detecting mutual1051// recursion). Mark these nodes so we don't try to clone.1052SmallSet<uint64_t, 8> StackIdSet;1053// Skip any on the allocation call (inlining).1054for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);1055ContextIter != StackContext.end(); ++ContextIter) {1056auto StackId = getStackId(*ContextIter);1057ContextNode *StackNode = getNodeForStackId(StackId);1058if (!StackNode) {1059NodeOwner.push_back(1060std::make_unique<ContextNode>(/*IsAllocation=*/false));1061StackNode = NodeOwner.back().get();1062StackEntryIdToContextNodeMap[StackId] = StackNode;1063StackNode->OrigStackOrAllocId = StackId;1064}1065auto Ins = StackIdSet.insert(StackId);1066if (!Ins.second)1067StackNode->Recursive = true;1068StackNode->AllocTypes |= (uint8_t)AllocType;1069PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);1070PrevNode = StackNode;1071}1072}10731074template <typename DerivedCCG, typename FuncTy, typename CallTy>1075DenseSet<uint32_t>1076CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(1077const DenseSet<uint32_t> &StackSequenceContextIds,1078DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {1079DenseSet<uint32_t> NewContextIds;1080for (auto OldId : StackSequenceContextIds) {1081NewContextIds.insert(++LastContextId);1082OldToNewContextIds[OldId].insert(LastContextId);1083assert(ContextIdToAllocationType.count(OldId));1084// The new context has the same allocation type as original.1085ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];1086// For now set this to 0 so we don't duplicate sizes. Not clear how to divvy1087// up the size. Assume that if we are able to duplicate context ids that we1088// will be able to disambiguate all copies.1089ContextIdToTotalSize[LastContextId] = 0;1090}1091return NewContextIds;1092}10931094template <typename DerivedCCG, typename FuncTy, typename CallTy>1095void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::1096propagateDuplicateContextIds(1097const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {1098// Build a set of duplicated context ids corresponding to the input id set.1099auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {1100DenseSet<uint32_t> NewIds;1101for (auto Id : ContextIds)1102if (auto NewId = OldToNewContextIds.find(Id);1103NewId != OldToNewContextIds.end())1104NewIds.insert(NewId->second.begin(), NewId->second.end());1105return NewIds;1106};11071108// Recursively update context ids sets along caller edges.1109auto UpdateCallers = [&](ContextNode *Node,1110DenseSet<const ContextEdge *> &Visited,1111auto &&UpdateCallers) -> void {1112for (const auto &Edge : Node->CallerEdges) {1113auto Inserted = Visited.insert(Edge.get());1114if (!Inserted.second)1115continue;1116ContextNode *NextNode = Edge->Caller;1117DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());1118// Only need to recursively iterate to NextNode via this caller edge if1119// it resulted in any added ids to NextNode.1120if (!NewIdsToAdd.empty()) {1121Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end());1122UpdateCallers(NextNode, Visited, UpdateCallers);1123}1124}1125};11261127DenseSet<const ContextEdge *> Visited;1128for (auto &Entry : AllocationCallToContextNodeMap) {1129auto *Node = Entry.second;1130UpdateCallers(Node, Visited, UpdateCallers);1131}1132}11331134template <typename DerivedCCG, typename FuncTy, typename CallTy>1135void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(1136ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,1137// This must be passed by value to make a copy since it will be adjusted1138// as ids are moved.1139DenseSet<uint32_t> RemainingContextIds) {1140auto &OrigEdges =1141TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;1142// Increment iterator in loop so that we can remove edges as needed.1143for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {1144auto Edge = *EI;1145// Remove any matching context ids from Edge, return set that were found and1146// removed, these are the new edge's context ids. Also update the remaining1147// (not found ids).1148DenseSet<uint32_t> NewEdgeContextIds, NotFoundContextIds;1149set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,1150NotFoundContextIds);1151RemainingContextIds.swap(NotFoundContextIds);1152// If no matching context ids for this edge, skip it.1153if (NewEdgeContextIds.empty()) {1154++EI;1155continue;1156}1157if (TowardsCallee) {1158uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);1159auto NewEdge = std::make_shared<ContextEdge>(1160Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));1161NewNode->CalleeEdges.push_back(NewEdge);1162NewEdge->Callee->CallerEdges.push_back(NewEdge);1163} else {1164uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);1165auto NewEdge = std::make_shared<ContextEdge>(1166NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));1167NewNode->CallerEdges.push_back(NewEdge);1168NewEdge->Caller->CalleeEdges.push_back(NewEdge);1169}1170// Remove old edge if context ids empty.1171if (Edge->getContextIds().empty()) {1172if (TowardsCallee) {1173Edge->Callee->eraseCallerEdge(Edge.get());1174EI = OrigNode->CalleeEdges.erase(EI);1175} else {1176Edge->Caller->eraseCalleeEdge(Edge.get());1177EI = OrigNode->CallerEdges.erase(EI);1178}1179continue;1180}1181++EI;1182}1183}11841185template <typename DerivedCCG, typename FuncTy, typename CallTy>1186static void checkEdge(1187const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {1188// Confirm that alloc type is not None and that we have at least one context1189// id.1190assert(Edge->AllocTypes != (uint8_t)AllocationType::None);1191assert(!Edge->ContextIds.empty());1192}11931194template <typename DerivedCCG, typename FuncTy, typename CallTy>1195static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,1196bool CheckEdges = true) {1197if (Node->isRemoved())1198return;1199#ifndef NDEBUG1200// Compute node's context ids once for use in asserts.1201auto NodeContextIds = Node->getContextIds();1202#endif1203// Node's context ids should be the union of both its callee and caller edge1204// context ids.1205if (Node->CallerEdges.size()) {1206DenseSet<uint32_t> CallerEdgeContextIds(1207Node->CallerEdges.front()->ContextIds);1208for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {1209if (CheckEdges)1210checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);1211set_union(CallerEdgeContextIds, Edge->ContextIds);1212}1213// Node can have more context ids than callers if some contexts terminate at1214// node and some are longer.1215assert(NodeContextIds == CallerEdgeContextIds ||1216set_is_subset(CallerEdgeContextIds, NodeContextIds));1217}1218if (Node->CalleeEdges.size()) {1219DenseSet<uint32_t> CalleeEdgeContextIds(1220Node->CalleeEdges.front()->ContextIds);1221for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {1222if (CheckEdges)1223checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);1224set_union(CalleeEdgeContextIds, Edge->getContextIds());1225}1226assert(NodeContextIds == CalleeEdgeContextIds);1227}1228}12291230template <typename DerivedCCG, typename FuncTy, typename CallTy>1231void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::1232assignStackNodesPostOrder(ContextNode *Node,1233DenseSet<const ContextNode *> &Visited,1234DenseMap<uint64_t, std::vector<CallContextInfo>>1235&StackIdToMatchingCalls) {1236auto Inserted = Visited.insert(Node);1237if (!Inserted.second)1238return;1239// Post order traversal. Iterate over a copy since we may add nodes and1240// therefore new callers during the recursive call, invalidating any1241// iterator over the original edge vector. We don't need to process these1242// new nodes as they were already processed on creation.1243auto CallerEdges = Node->CallerEdges;1244for (auto &Edge : CallerEdges) {1245// Skip any that have been removed during the recursion.1246if (!Edge)1247continue;1248assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls);1249}12501251// If this node's stack id is in the map, update the graph to contain new1252// nodes representing any inlining at interior callsites. Note we move the1253// associated context ids over to the new nodes.12541255// Ignore this node if it is for an allocation or we didn't record any1256// stack id lists ending at it.1257if (Node->IsAllocation ||1258!StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))1259return;12601261auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];1262// Handle the simple case first. A single call with a single stack id.1263// In this case there is no need to create any new context nodes, simply1264// assign the context node for stack id to this Call.1265if (Calls.size() == 1) {1266auto &[Call, Ids, Func, SavedContextIds] = Calls[0];1267if (Ids.size() == 1) {1268assert(SavedContextIds.empty());1269// It should be this Node1270assert(Node == getNodeForStackId(Ids[0]));1271if (Node->Recursive)1272return;1273Node->setCall(Call);1274NonAllocationCallToContextNodeMap[Call] = Node;1275NodeToCallingFunc[Node] = Func;1276return;1277}1278}12791280// Find the node for the last stack id, which should be the same1281// across all calls recorded for this id, and is this node's id.1282uint64_t LastId = Node->OrigStackOrAllocId;1283ContextNode *LastNode = getNodeForStackId(LastId);1284// We should only have kept stack ids that had nodes.1285assert(LastNode);12861287for (unsigned I = 0; I < Calls.size(); I++) {1288auto &[Call, Ids, Func, SavedContextIds] = Calls[I];1289// Skip any for which we didn't assign any ids, these don't get a node in1290// the graph.1291if (SavedContextIds.empty())1292continue;12931294assert(LastId == Ids.back());12951296ContextNode *FirstNode = getNodeForStackId(Ids[0]);1297assert(FirstNode);12981299// Recompute the context ids for this stack id sequence (the1300// intersection of the context ids of the corresponding nodes).1301// Start with the ids we saved in the map for this call, which could be1302// duplicated context ids. We have to recompute as we might have overlap1303// overlap between the saved context ids for different last nodes, and1304// removed them already during the post order traversal.1305set_intersect(SavedContextIds, FirstNode->getContextIds());1306ContextNode *PrevNode = nullptr;1307for (auto Id : Ids) {1308ContextNode *CurNode = getNodeForStackId(Id);1309// We should only have kept stack ids that had nodes and weren't1310// recursive.1311assert(CurNode);1312assert(!CurNode->Recursive);1313if (!PrevNode) {1314PrevNode = CurNode;1315continue;1316}1317auto *Edge = CurNode->findEdgeFromCallee(PrevNode);1318if (!Edge) {1319SavedContextIds.clear();1320break;1321}1322PrevNode = CurNode;1323set_intersect(SavedContextIds, Edge->getContextIds());13241325// If we now have no context ids for clone, skip this call.1326if (SavedContextIds.empty())1327break;1328}1329if (SavedContextIds.empty())1330continue;13311332// Create new context node.1333NodeOwner.push_back(1334std::make_unique<ContextNode>(/*IsAllocation=*/false, Call));1335ContextNode *NewNode = NodeOwner.back().get();1336NodeToCallingFunc[NewNode] = Func;1337NonAllocationCallToContextNodeMap[Call] = NewNode;1338NewNode->AllocTypes = computeAllocType(SavedContextIds);13391340// Connect to callees of innermost stack frame in inlined call chain.1341// This updates context ids for FirstNode's callee's to reflect those1342// moved to NewNode.1343connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds);13441345// Connect to callers of outermost stack frame in inlined call chain.1346// This updates context ids for FirstNode's caller's to reflect those1347// moved to NewNode.1348connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds);13491350// Now we need to remove context ids from edges/nodes between First and1351// Last Node.1352PrevNode = nullptr;1353for (auto Id : Ids) {1354ContextNode *CurNode = getNodeForStackId(Id);1355// We should only have kept stack ids that had nodes.1356assert(CurNode);13571358// Remove the context ids moved to NewNode from CurNode, and the1359// edge from the prior node.1360if (PrevNode) {1361auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);1362assert(PrevEdge);1363set_subtract(PrevEdge->getContextIds(), SavedContextIds);1364if (PrevEdge->getContextIds().empty()) {1365PrevNode->eraseCallerEdge(PrevEdge);1366CurNode->eraseCalleeEdge(PrevEdge);1367}1368}1369// Since we update the edges from leaf to tail, only look at the callee1370// edges. This isn't an alloc node, so if there are no callee edges, the1371// alloc type is None.1372CurNode->AllocTypes = CurNode->CalleeEdges.empty()1373? (uint8_t)AllocationType::None1374: CurNode->computeAllocType();1375PrevNode = CurNode;1376}1377if (VerifyNodes) {1378checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);1379for (auto Id : Ids) {1380ContextNode *CurNode = getNodeForStackId(Id);1381// We should only have kept stack ids that had nodes.1382assert(CurNode);1383checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);1384}1385}1386}1387}13881389template <typename DerivedCCG, typename FuncTy, typename CallTy>1390void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {1391// Map of stack id to all calls with that as the last (outermost caller)1392// callsite id that has a context node (some might not due to pruning1393// performed during matching of the allocation profile contexts).1394// The CallContextInfo contains the Call and a list of its stack ids with1395// ContextNodes, the function containing Call, and the set of context ids1396// the analysis will eventually identify for use in any new node created1397// for that callsite.1398DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;1399for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {1400for (auto &Call : CallsWithMetadata) {1401// Ignore allocations, already handled.1402if (AllocationCallToContextNodeMap.count(Call))1403continue;1404auto StackIdsWithContextNodes =1405getStackIdsWithContextNodesForCall(Call.call());1406// If there were no nodes created for MIBs on allocs (maybe this was in1407// the unambiguous part of the MIB stack that was pruned), ignore.1408if (StackIdsWithContextNodes.empty())1409continue;1410// Otherwise, record this Call along with the list of ids for the last1411// (outermost caller) stack id with a node.1412StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(1413{Call.call(), StackIdsWithContextNodes, Func, {}});1414}1415}14161417// First make a pass through all stack ids that correspond to a call,1418// as identified in the above loop. Compute the context ids corresponding to1419// each of these calls when they correspond to multiple stack ids due to1420// due to inlining. Perform any duplication of context ids required when1421// there is more than one call with the same stack ids. Their (possibly newly1422// duplicated) context ids are saved in the StackIdToMatchingCalls map.1423DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;1424for (auto &It : StackIdToMatchingCalls) {1425auto &Calls = It.getSecond();1426// Skip single calls with a single stack id. These don't need a new node.1427if (Calls.size() == 1) {1428auto &Ids = std::get<1>(Calls[0]);1429if (Ids.size() == 1)1430continue;1431}1432// In order to do the best and maximal matching of inlined calls to context1433// node sequences we will sort the vectors of stack ids in descending order1434// of length, and within each length, lexicographically by stack id. The1435// latter is so that we can specially handle calls that have identical stack1436// id sequences (either due to cloning or artificially because of the MIB1437// context pruning).1438std::stable_sort(Calls.begin(), Calls.end(),1439[](const CallContextInfo &A, const CallContextInfo &B) {1440auto &IdsA = std::get<1>(A);1441auto &IdsB = std::get<1>(B);1442return IdsA.size() > IdsB.size() ||1443(IdsA.size() == IdsB.size() && IdsA < IdsB);1444});14451446// Find the node for the last stack id, which should be the same1447// across all calls recorded for this id, and is the id for this1448// entry in the StackIdToMatchingCalls map.1449uint64_t LastId = It.getFirst();1450ContextNode *LastNode = getNodeForStackId(LastId);1451// We should only have kept stack ids that had nodes.1452assert(LastNode);14531454if (LastNode->Recursive)1455continue;14561457// Initialize the context ids with the last node's. We will subsequently1458// refine the context ids by computing the intersection along all edges.1459DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();1460assert(!LastNodeContextIds.empty());14611462for (unsigned I = 0; I < Calls.size(); I++) {1463auto &[Call, Ids, Func, SavedContextIds] = Calls[I];1464assert(SavedContextIds.empty());1465assert(LastId == Ids.back());14661467// First compute the context ids for this stack id sequence (the1468// intersection of the context ids of the corresponding nodes).1469// Start with the remaining saved ids for the last node.1470assert(!LastNodeContextIds.empty());1471DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;14721473ContextNode *PrevNode = LastNode;1474ContextNode *CurNode = LastNode;1475bool Skip = false;14761477// Iterate backwards through the stack Ids, starting after the last Id1478// in the list, which was handled once outside for all Calls.1479for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {1480auto Id = *IdIter;1481CurNode = getNodeForStackId(Id);1482// We should only have kept stack ids that had nodes.1483assert(CurNode);14841485if (CurNode->Recursive) {1486Skip = true;1487break;1488}14891490auto *Edge = CurNode->findEdgeFromCaller(PrevNode);1491// If there is no edge then the nodes belong to different MIB contexts,1492// and we should skip this inlined context sequence. For example, this1493// particular inlined context may include stack ids A->B, and we may1494// indeed have nodes for both A and B, but it is possible that they were1495// never profiled in sequence in a single MIB for any allocation (i.e.1496// we might have profiled an allocation that involves the callsite A,1497// but through a different one of its callee callsites, and we might1498// have profiled an allocation that involves callsite B, but reached1499// from a different caller callsite).1500if (!Edge) {1501Skip = true;1502break;1503}1504PrevNode = CurNode;15051506// Update the context ids, which is the intersection of the ids along1507// all edges in the sequence.1508set_intersect(StackSequenceContextIds, Edge->getContextIds());15091510// If we now have no context ids for clone, skip this call.1511if (StackSequenceContextIds.empty()) {1512Skip = true;1513break;1514}1515}1516if (Skip)1517continue;15181519// If some of this call's stack ids did not have corresponding nodes (due1520// to pruning), don't include any context ids for contexts that extend1521// beyond these nodes. Otherwise we would be matching part of unrelated /1522// not fully matching stack contexts. To do this, subtract any context ids1523// found in caller nodes of the last node found above.1524if (Ids.back() != getLastStackId(Call)) {1525for (const auto &PE : LastNode->CallerEdges) {1526set_subtract(StackSequenceContextIds, PE->getContextIds());1527if (StackSequenceContextIds.empty())1528break;1529}1530// If we now have no context ids for clone, skip this call.1531if (StackSequenceContextIds.empty())1532continue;1533}15341535// Check if the next set of stack ids is the same (since the Calls vector1536// of tuples is sorted by the stack ids we can just look at the next one).1537bool DuplicateContextIds = false;1538if (I + 1 < Calls.size()) {1539auto NextIds = std::get<1>(Calls[I + 1]);1540DuplicateContextIds = Ids == NextIds;1541}15421543// If we don't have duplicate context ids, then we can assign all the1544// context ids computed for the original node sequence to this call.1545// If there are duplicate calls with the same stack ids then we synthesize1546// new context ids that are duplicates of the originals. These are1547// assigned to SavedContextIds, which is a reference into the map entry1548// for this call, allowing us to access these ids later on.1549OldToNewContextIds.reserve(OldToNewContextIds.size() +1550StackSequenceContextIds.size());1551SavedContextIds =1552DuplicateContextIds1553? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)1554: StackSequenceContextIds;1555assert(!SavedContextIds.empty());15561557if (!DuplicateContextIds) {1558// Update saved last node's context ids to remove those that are1559// assigned to other calls, so that it is ready for the next call at1560// this stack id.1561set_subtract(LastNodeContextIds, StackSequenceContextIds);1562if (LastNodeContextIds.empty())1563break;1564}1565}1566}15671568// Propagate the duplicate context ids over the graph.1569propagateDuplicateContextIds(OldToNewContextIds);15701571if (VerifyCCG)1572check();15731574// Now perform a post-order traversal over the graph, starting with the1575// allocation nodes, essentially processing nodes from callers to callees.1576// For any that contains an id in the map, update the graph to contain new1577// nodes representing any inlining at interior callsites. Note we move the1578// associated context ids over to the new nodes.1579DenseSet<const ContextNode *> Visited;1580for (auto &Entry : AllocationCallToContextNodeMap)1581assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls);1582if (VerifyCCG)1583check();1584}15851586uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {1587CallStack<MDNode, MDNode::op_iterator> CallsiteContext(1588Call->getMetadata(LLVMContext::MD_callsite));1589return CallsiteContext.back();1590}15911592uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {1593assert(isa<CallsiteInfo *>(Call.getBase()));1594CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>1595CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));1596// Need to convert index into stack id.1597return Index.getStackIdAtIndex(CallsiteContext.back());1598}15991600static const std::string MemProfCloneSuffix = ".memprof.";16011602static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {1603// We use CloneNo == 0 to refer to the original version, which doesn't get1604// renamed with a suffix.1605if (!CloneNo)1606return Base.str();1607return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();1608}16091610std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,1611const Instruction *Call,1612unsigned CloneNo) const {1613return (Twine(Call->getFunction()->getName()) + " -> " +1614cast<CallBase>(Call)->getCalledFunction()->getName())1615.str();1616}16171618std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,1619const IndexCall &Call,1620unsigned CloneNo) const {1621auto VI = FSToVIMap.find(Func);1622assert(VI != FSToVIMap.end());1623if (isa<AllocInfo *>(Call.getBase()))1624return (VI->second.name() + " -> alloc").str();1625else {1626auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call.getBase());1627return (VI->second.name() + " -> " +1628getMemProfFuncName(Callsite->Callee.name(),1629Callsite->Clones[CloneNo]))1630.str();1631}1632}16331634std::vector<uint64_t>1635ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(1636Instruction *Call) {1637CallStack<MDNode, MDNode::op_iterator> CallsiteContext(1638Call->getMetadata(LLVMContext::MD_callsite));1639return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(1640CallsiteContext);1641}16421643std::vector<uint64_t>1644IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {1645assert(isa<CallsiteInfo *>(Call.getBase()));1646CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>1647CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));1648return getStackIdsWithContextNodes<CallsiteInfo,1649SmallVector<unsigned>::const_iterator>(1650CallsiteContext);1651}16521653template <typename DerivedCCG, typename FuncTy, typename CallTy>1654template <class NodeT, class IteratorT>1655std::vector<uint64_t>1656CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(1657CallStack<NodeT, IteratorT> &CallsiteContext) {1658std::vector<uint64_t> StackIds;1659for (auto IdOrIndex : CallsiteContext) {1660auto StackId = getStackId(IdOrIndex);1661ContextNode *Node = getNodeForStackId(StackId);1662if (!Node)1663break;1664StackIds.push_back(StackId);1665}1666return StackIds;1667}16681669ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(1670Module &M,1671llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)1672: Mod(M), OREGetter(OREGetter) {1673for (auto &F : M) {1674std::vector<CallInfo> CallsWithMetadata;1675for (auto &BB : F) {1676for (auto &I : BB) {1677if (!isa<CallBase>(I))1678continue;1679if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {1680CallsWithMetadata.push_back(&I);1681auto *AllocNode = addAllocNode(&I, &F);1682auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);1683assert(CallsiteMD);1684CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);1685// Add all of the MIBs and their stack nodes.1686for (auto &MDOp : MemProfMD->operands()) {1687auto *MIBMD = cast<const MDNode>(MDOp);1688MDNode *StackNode = getMIBStackNode(MIBMD);1689assert(StackNode);1690CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);1691addStackNodesForMIB<MDNode, MDNode::op_iterator>(1692AllocNode, StackContext, CallsiteContext,1693getMIBAllocType(MIBMD), getMIBTotalSize(MIBMD));1694}1695assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);1696// Memprof and callsite metadata on memory allocations no longer1697// needed.1698I.setMetadata(LLVMContext::MD_memprof, nullptr);1699I.setMetadata(LLVMContext::MD_callsite, nullptr);1700}1701// For callsite metadata, add to list for this function for later use.1702else if (I.getMetadata(LLVMContext::MD_callsite))1703CallsWithMetadata.push_back(&I);1704}1705}1706if (!CallsWithMetadata.empty())1707FuncToCallsWithMetadata[&F] = CallsWithMetadata;1708}17091710if (DumpCCG) {1711dbgs() << "CCG before updating call stack chains:\n";1712dbgs() << *this;1713}17141715if (ExportToDot)1716exportToDot("prestackupdate");17171718updateStackNodes();17191720handleCallsitesWithMultipleTargets();17211722// Strip off remaining callsite metadata, no longer needed.1723for (auto &FuncEntry : FuncToCallsWithMetadata)1724for (auto &Call : FuncEntry.second)1725Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);1726}17271728IndexCallsiteContextGraph::IndexCallsiteContextGraph(1729ModuleSummaryIndex &Index,1730llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>1731isPrevailing)1732: Index(Index), isPrevailing(isPrevailing) {1733for (auto &I : Index) {1734auto VI = Index.getValueInfo(I);1735for (auto &S : VI.getSummaryList()) {1736// We should only add the prevailing nodes. Otherwise we may try to clone1737// in a weak copy that won't be linked (and may be different than the1738// prevailing version).1739// We only keep the memprof summary on the prevailing copy now when1740// building the combined index, as a space optimization, however don't1741// rely on this optimization. The linker doesn't resolve local linkage1742// values so don't check whether those are prevailing.1743if (!GlobalValue::isLocalLinkage(S->linkage()) &&1744!isPrevailing(VI.getGUID(), S.get()))1745continue;1746auto *FS = dyn_cast<FunctionSummary>(S.get());1747if (!FS)1748continue;1749std::vector<CallInfo> CallsWithMetadata;1750if (!FS->allocs().empty()) {1751for (auto &AN : FS->mutableAllocs()) {1752// This can happen because of recursion elimination handling that1753// currently exists in ModuleSummaryAnalysis. Skip these for now.1754// We still added them to the summary because we need to be able to1755// correlate properly in applyImport in the backends.1756if (AN.MIBs.empty())1757continue;1758CallsWithMetadata.push_back({&AN});1759auto *AllocNode = addAllocNode({&AN}, FS);1760// Pass an empty CallStack to the CallsiteContext (second)1761// parameter, since for ThinLTO we already collapsed out the inlined1762// stack ids on the allocation call during ModuleSummaryAnalysis.1763CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>1764EmptyContext;1765unsigned I = 0;1766assert(!MemProfReportHintedSizes ||1767AN.TotalSizes.size() == AN.MIBs.size());1768// Now add all of the MIBs and their stack nodes.1769for (auto &MIB : AN.MIBs) {1770CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>1771StackContext(&MIB);1772uint64_t TotalSize = 0;1773if (MemProfReportHintedSizes)1774TotalSize = AN.TotalSizes[I];1775addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(1776AllocNode, StackContext, EmptyContext, MIB.AllocType,1777TotalSize);1778I++;1779}1780assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);1781// Initialize version 0 on the summary alloc node to the current alloc1782// type, unless it has both types in which case make it default, so1783// that in the case where we aren't able to clone the original version1784// always ends up with the default allocation behavior.1785AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);1786}1787}1788// For callsite metadata, add to list for this function for later use.1789if (!FS->callsites().empty())1790for (auto &SN : FS->mutableCallsites())1791CallsWithMetadata.push_back({&SN});17921793if (!CallsWithMetadata.empty())1794FuncToCallsWithMetadata[FS] = CallsWithMetadata;17951796if (!FS->allocs().empty() || !FS->callsites().empty())1797FSToVIMap[FS] = VI;1798}1799}18001801if (DumpCCG) {1802dbgs() << "CCG before updating call stack chains:\n";1803dbgs() << *this;1804}18051806if (ExportToDot)1807exportToDot("prestackupdate");18081809updateStackNodes();18101811handleCallsitesWithMultipleTargets();1812}18131814template <typename DerivedCCG, typename FuncTy, typename CallTy>1815void CallsiteContextGraph<DerivedCCG, FuncTy,1816CallTy>::handleCallsitesWithMultipleTargets() {1817// Look for and workaround callsites that call multiple functions.1818// This can happen for indirect calls, which needs better handling, and in1819// more rare cases (e.g. macro expansion).1820// TODO: To fix this for indirect calls we will want to perform speculative1821// devirtualization using either the normal PGO info with ICP, or using the1822// information in the profiled MemProf contexts. We can do this prior to1823// this transformation for regular LTO, and for ThinLTO we can simulate that1824// effect in the summary and perform the actual speculative devirtualization1825// while cloning in the ThinLTO backend.18261827// Keep track of the new nodes synthesized for discovered tail calls missing1828// from the profiled contexts.1829MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;18301831for (auto &Entry : NonAllocationCallToContextNodeMap) {1832auto *Node = Entry.second;1833assert(Node->Clones.empty());1834// Check all node callees and see if in the same function.1835auto Call = Node->Call.call();1836for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();1837++EI) {1838auto Edge = *EI;1839if (!Edge->Callee->hasCall())1840continue;1841assert(NodeToCallingFunc.count(Edge->Callee));1842// Check if the called function matches that of the callee node.1843if (calleesMatch(Call, EI, TailCallToContextNodeMap))1844continue;1845RemovedEdgesWithMismatchedCallees++;1846// Work around by setting Node to have a null call, so it gets1847// skipped during cloning. Otherwise assignFunctions will assert1848// because its data structures are not designed to handle this case.1849Node->setCall(CallInfo());1850break;1851}1852}18531854// Remove all mismatched nodes identified in the above loop from the node map1855// (checking whether they have a null call which is set above). For a1856// MapVector like NonAllocationCallToContextNodeMap it is much more efficient1857// to do the removal via remove_if than by individually erasing entries above.1858NonAllocationCallToContextNodeMap.remove_if(1859[](const auto &it) { return !it.second->hasCall(); });18601861// Add the new nodes after the above loop so that the iteration is not1862// invalidated.1863for (auto &[Call, Node] : TailCallToContextNodeMap)1864NonAllocationCallToContextNodeMap[Call] = Node;1865}18661867uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {1868// In the Module (IR) case this is already the Id.1869return IdOrIndex;1870}18711872uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {1873// In the Index case this is an index into the stack id list in the summary1874// index, convert it to an Id.1875return Index.getStackIdAtIndex(IdOrIndex);1876}18771878template <typename DerivedCCG, typename FuncTy, typename CallTy>1879bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(1880CallTy Call, EdgeIter &EI,1881MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {1882auto Edge = *EI;1883const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];1884const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];1885// Will be populated in order of callee to caller if we find a chain of tail1886// calls between the profiled caller and callee.1887std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;1888if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,1889FoundCalleeChain))1890return false;18911892// The usual case where the profiled callee matches that of the IR/summary.1893if (FoundCalleeChain.empty())1894return true;18951896auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {1897auto *CurEdge = Callee->findEdgeFromCaller(Caller);1898// If there is already an edge between these nodes, simply update it and1899// return.1900if (CurEdge) {1901CurEdge->ContextIds.insert(Edge->ContextIds.begin(),1902Edge->ContextIds.end());1903CurEdge->AllocTypes |= Edge->AllocTypes;1904return;1905}1906// Otherwise, create a new edge and insert it into the caller and callee1907// lists.1908auto NewEdge = std::make_shared<ContextEdge>(1909Callee, Caller, Edge->AllocTypes, Edge->ContextIds);1910Callee->CallerEdges.push_back(NewEdge);1911if (Caller == Edge->Caller) {1912// If we are inserting the new edge into the current edge's caller, insert1913// the new edge before the current iterator position, and then increment1914// back to the current edge.1915EI = Caller->CalleeEdges.insert(EI, NewEdge);1916++EI;1917assert(*EI == Edge &&1918"Iterator position not restored after insert and increment");1919} else1920Caller->CalleeEdges.push_back(NewEdge);1921};19221923// Create new nodes for each found callee and connect in between the profiled1924// caller and callee.1925auto *CurCalleeNode = Edge->Callee;1926for (auto &[NewCall, Func] : FoundCalleeChain) {1927ContextNode *NewNode = nullptr;1928// First check if we have already synthesized a node for this tail call.1929if (TailCallToContextNodeMap.count(NewCall)) {1930NewNode = TailCallToContextNodeMap[NewCall];1931NewNode->AllocTypes |= Edge->AllocTypes;1932} else {1933FuncToCallsWithMetadata[Func].push_back({NewCall});1934// Create Node and record node info.1935NodeOwner.push_back(1936std::make_unique<ContextNode>(/*IsAllocation=*/false, NewCall));1937NewNode = NodeOwner.back().get();1938NodeToCallingFunc[NewNode] = Func;1939TailCallToContextNodeMap[NewCall] = NewNode;1940NewNode->AllocTypes = Edge->AllocTypes;1941}19421943// Hook up node to its callee node1944AddEdge(NewNode, CurCalleeNode);19451946CurCalleeNode = NewNode;1947}19481949// Hook up edge's original caller to new callee node.1950AddEdge(Edge->Caller, CurCalleeNode);19511952// Remove old edge1953Edge->Callee->eraseCallerEdge(Edge.get());1954EI = Edge->Caller->CalleeEdges.erase(EI);19551956// To simplify the increment of EI in the caller, subtract one from EI.1957// In the final AddEdge call we would have either added a new callee edge,1958// to Edge->Caller, or found an existing one. Either way we are guaranteed1959// that there is at least one callee edge.1960assert(!Edge->Caller->CalleeEdges.empty());1961--EI;19621963return true;1964}19651966bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(1967const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,1968std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,1969bool &FoundMultipleCalleeChains) {1970// Stop recursive search if we have already explored the maximum specified1971// depth.1972if (Depth > TailCallSearchDepth)1973return false;19741975auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {1976FoundCalleeChain.push_back({Callsite, F});1977};19781979auto *CalleeFunc = dyn_cast<Function>(CurCallee);1980if (!CalleeFunc) {1981auto *Alias = dyn_cast<GlobalAlias>(CurCallee);1982assert(Alias);1983CalleeFunc = dyn_cast<Function>(Alias->getAliasee());1984assert(CalleeFunc);1985}19861987// Look for tail calls in this function, and check if they either call the1988// profiled callee directly, or indirectly (via a recursive search).1989// Only succeed if there is a single unique tail call chain found between the1990// profiled caller and callee, otherwise we could perform incorrect cloning.1991bool FoundSingleCalleeChain = false;1992for (auto &BB : *CalleeFunc) {1993for (auto &I : BB) {1994auto *CB = dyn_cast<CallBase>(&I);1995if (!CB || !CB->isTailCall())1996continue;1997auto *CalledValue = CB->getCalledOperand();1998auto *CalledFunction = CB->getCalledFunction();1999if (CalledValue && !CalledFunction) {2000CalledValue = CalledValue->stripPointerCasts();2001// Stripping pointer casts can reveal a called function.2002CalledFunction = dyn_cast<Function>(CalledValue);2003}2004// Check if this is an alias to a function. If so, get the2005// called aliasee for the checks below.2006if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {2007assert(!CalledFunction &&2008"Expected null called function in callsite for alias");2009CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());2010}2011if (!CalledFunction)2012continue;2013if (CalledFunction == ProfiledCallee) {2014if (FoundSingleCalleeChain) {2015FoundMultipleCalleeChains = true;2016return false;2017}2018FoundSingleCalleeChain = true;2019FoundProfiledCalleeCount++;2020FoundProfiledCalleeDepth += Depth;2021if (Depth > FoundProfiledCalleeMaxDepth)2022FoundProfiledCalleeMaxDepth = Depth;2023SaveCallsiteInfo(&I, CalleeFunc);2024} else if (findProfiledCalleeThroughTailCalls(2025ProfiledCallee, CalledFunction, Depth + 1,2026FoundCalleeChain, FoundMultipleCalleeChains)) {2027// findProfiledCalleeThroughTailCalls should not have returned2028// true if FoundMultipleCalleeChains.2029assert(!FoundMultipleCalleeChains);2030if (FoundSingleCalleeChain) {2031FoundMultipleCalleeChains = true;2032return false;2033}2034FoundSingleCalleeChain = true;2035SaveCallsiteInfo(&I, CalleeFunc);2036} else if (FoundMultipleCalleeChains)2037return false;2038}2039}20402041return FoundSingleCalleeChain;2042}20432044bool ModuleCallsiteContextGraph::calleeMatchesFunc(2045Instruction *Call, const Function *Func, const Function *CallerFunc,2046std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {2047auto *CB = dyn_cast<CallBase>(Call);2048if (!CB->getCalledOperand())2049return false;2050auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();2051auto *CalleeFunc = dyn_cast<Function>(CalleeVal);2052if (CalleeFunc == Func)2053return true;2054auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);2055if (Alias && Alias->getAliasee() == Func)2056return true;20572058// Recursively search for the profiled callee through tail calls starting with2059// the actual Callee. The discovered tail call chain is saved in2060// FoundCalleeChain, and we will fixup the graph to include these callsites2061// after returning.2062// FIXME: We will currently redo the same recursive walk if we find the same2063// mismatched callee from another callsite. We can improve this with more2064// bookkeeping of the created chain of new nodes for each mismatch.2065unsigned Depth = 1;2066bool FoundMultipleCalleeChains = false;2067if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,2068FoundCalleeChain,2069FoundMultipleCalleeChains)) {2070LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "2071<< Func->getName() << " from " << CallerFunc->getName()2072<< " that actually called " << CalleeVal->getName()2073<< (FoundMultipleCalleeChains2074? " (found multiple possible chains)"2075: "")2076<< "\n");2077if (FoundMultipleCalleeChains)2078FoundProfiledCalleeNonUniquelyCount++;2079return false;2080}20812082return true;2083}20842085bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(2086ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,2087std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,2088bool &FoundMultipleCalleeChains) {2089// Stop recursive search if we have already explored the maximum specified2090// depth.2091if (Depth > TailCallSearchDepth)2092return false;20932094auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {2095// Make a CallsiteInfo for each discovered callee, if one hasn't already2096// been synthesized.2097if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||2098!FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))2099// StackIds is empty (we don't have debug info available in the index for2100// these callsites)2101FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =2102std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());2103CallsiteInfo *NewCallsiteInfo =2104FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();2105FoundCalleeChain.push_back({NewCallsiteInfo, FS});2106};21072108// Look for tail calls in this function, and check if they either call the2109// profiled callee directly, or indirectly (via a recursive search).2110// Only succeed if there is a single unique tail call chain found between the2111// profiled caller and callee, otherwise we could perform incorrect cloning.2112bool FoundSingleCalleeChain = false;2113for (auto &S : CurCallee.getSummaryList()) {2114if (!GlobalValue::isLocalLinkage(S->linkage()) &&2115!isPrevailing(CurCallee.getGUID(), S.get()))2116continue;2117auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());2118if (!FS)2119continue;2120auto FSVI = CurCallee;2121auto *AS = dyn_cast<AliasSummary>(S.get());2122if (AS)2123FSVI = AS->getAliaseeVI();2124for (auto &CallEdge : FS->calls()) {2125if (!CallEdge.second.hasTailCall())2126continue;2127if (CallEdge.first == ProfiledCallee) {2128if (FoundSingleCalleeChain) {2129FoundMultipleCalleeChains = true;2130return false;2131}2132FoundSingleCalleeChain = true;2133FoundProfiledCalleeCount++;2134FoundProfiledCalleeDepth += Depth;2135if (Depth > FoundProfiledCalleeMaxDepth)2136FoundProfiledCalleeMaxDepth = Depth;2137CreateAndSaveCallsiteInfo(CallEdge.first, FS);2138// Add FS to FSToVIMap in case it isn't already there.2139assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);2140FSToVIMap[FS] = FSVI;2141} else if (findProfiledCalleeThroughTailCalls(2142ProfiledCallee, CallEdge.first, Depth + 1,2143FoundCalleeChain, FoundMultipleCalleeChains)) {2144// findProfiledCalleeThroughTailCalls should not have returned2145// true if FoundMultipleCalleeChains.2146assert(!FoundMultipleCalleeChains);2147if (FoundSingleCalleeChain) {2148FoundMultipleCalleeChains = true;2149return false;2150}2151FoundSingleCalleeChain = true;2152CreateAndSaveCallsiteInfo(CallEdge.first, FS);2153// Add FS to FSToVIMap in case it isn't already there.2154assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);2155FSToVIMap[FS] = FSVI;2156} else if (FoundMultipleCalleeChains)2157return false;2158}2159}21602161return FoundSingleCalleeChain;2162}21632164bool IndexCallsiteContextGraph::calleeMatchesFunc(2165IndexCall &Call, const FunctionSummary *Func,2166const FunctionSummary *CallerFunc,2167std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {2168ValueInfo Callee =2169dyn_cast_if_present<CallsiteInfo *>(Call.getBase())->Callee;2170// If there is no summary list then this is a call to an externally defined2171// symbol.2172AliasSummary *Alias =2173Callee.getSummaryList().empty()2174? nullptr2175: dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());2176assert(FSToVIMap.count(Func));2177auto FuncVI = FSToVIMap[Func];2178if (Callee == FuncVI ||2179// If callee is an alias, check the aliasee, since only function2180// summary base objects will contain the stack node summaries and thus2181// get a context node.2182(Alias && Alias->getAliaseeVI() == FuncVI))2183return true;21842185// Recursively search for the profiled callee through tail calls starting with2186// the actual Callee. The discovered tail call chain is saved in2187// FoundCalleeChain, and we will fixup the graph to include these callsites2188// after returning.2189// FIXME: We will currently redo the same recursive walk if we find the same2190// mismatched callee from another callsite. We can improve this with more2191// bookkeeping of the created chain of new nodes for each mismatch.2192unsigned Depth = 1;2193bool FoundMultipleCalleeChains = false;2194if (!findProfiledCalleeThroughTailCalls(2195FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {2196LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI2197<< " from " << FSToVIMap[CallerFunc]2198<< " that actually called " << Callee2199<< (FoundMultipleCalleeChains2200? " (found multiple possible chains)"2201: "")2202<< "\n");2203if (FoundMultipleCalleeChains)2204FoundProfiledCalleeNonUniquelyCount++;2205return false;2206}22072208return true;2209}22102211template <typename DerivedCCG, typename FuncTy, typename CallTy>2212void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()2213const {2214print(dbgs());2215dbgs() << "\n";2216}22172218template <typename DerivedCCG, typename FuncTy, typename CallTy>2219void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(2220raw_ostream &OS) const {2221OS << "Node " << this << "\n";2222OS << "\t";2223printCall(OS);2224if (Recursive)2225OS << " (recursive)";2226OS << "\n";2227OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";2228OS << "\tContextIds:";2229// Make a copy of the computed context ids that we can sort for stability.2230auto ContextIds = getContextIds();2231std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());2232std::sort(SortedIds.begin(), SortedIds.end());2233for (auto Id : SortedIds)2234OS << " " << Id;2235OS << "\n";2236OS << "\tCalleeEdges:\n";2237for (auto &Edge : CalleeEdges)2238OS << "\t\t" << *Edge << "\n";2239OS << "\tCallerEdges:\n";2240for (auto &Edge : CallerEdges)2241OS << "\t\t" << *Edge << "\n";2242if (!Clones.empty()) {2243OS << "\tClones: ";2244FieldSeparator FS;2245for (auto *Clone : Clones)2246OS << FS << Clone;2247OS << "\n";2248} else if (CloneOf) {2249OS << "\tClone of " << CloneOf << "\n";2250}2251}22522253template <typename DerivedCCG, typename FuncTy, typename CallTy>2254void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()2255const {2256print(dbgs());2257dbgs() << "\n";2258}22592260template <typename DerivedCCG, typename FuncTy, typename CallTy>2261void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(2262raw_ostream &OS) const {2263OS << "Edge from Callee " << Callee << " to Caller: " << Caller2264<< " AllocTypes: " << getAllocTypeString(AllocTypes);2265OS << " ContextIds:";2266std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());2267std::sort(SortedIds.begin(), SortedIds.end());2268for (auto Id : SortedIds)2269OS << " " << Id;2270}22712272template <typename DerivedCCG, typename FuncTy, typename CallTy>2273void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {2274print(dbgs());2275}22762277template <typename DerivedCCG, typename FuncTy, typename CallTy>2278void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(2279raw_ostream &OS) const {2280OS << "Callsite Context Graph:\n";2281using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;2282for (const auto Node : nodes<GraphType>(this)) {2283if (Node->isRemoved())2284continue;2285Node->print(OS);2286OS << "\n";2287}2288}22892290template <typename DerivedCCG, typename FuncTy, typename CallTy>2291void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(2292raw_ostream &OS) const {2293using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;2294for (const auto Node : nodes<GraphType>(this)) {2295if (Node->isRemoved())2296continue;2297if (!Node->IsAllocation)2298continue;2299DenseSet<uint32_t> ContextIds = Node->getContextIds();2300std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());2301std::sort(SortedIds.begin(), SortedIds.end());2302for (auto Id : SortedIds) {2303auto SizeI = ContextIdToTotalSize.find(Id);2304assert(SizeI != ContextIdToTotalSize.end());2305auto TypeI = ContextIdToAllocationType.find(Id);2306assert(TypeI != ContextIdToAllocationType.end());2307OS << getAllocTypeString((uint8_t)TypeI->second) << " context " << Id2308<< " with total size " << SizeI->second << " is "2309<< getAllocTypeString(Node->AllocTypes) << " after cloning\n";2310}2311}2312}23132314template <typename DerivedCCG, typename FuncTy, typename CallTy>2315void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {2316using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;2317for (const auto Node : nodes<GraphType>(this)) {2318checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);2319for (auto &Edge : Node->CallerEdges)2320checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);2321}2322}23232324template <typename DerivedCCG, typename FuncTy, typename CallTy>2325struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {2326using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;2327using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;23282329using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;2330static NodeRef getNode(const NodePtrTy &P) { return P.get(); }23312332using nodes_iterator =2333mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,2334decltype(&getNode)>;23352336static nodes_iterator nodes_begin(GraphType G) {2337return nodes_iterator(G->NodeOwner.begin(), &getNode);2338}23392340static nodes_iterator nodes_end(GraphType G) {2341return nodes_iterator(G->NodeOwner.end(), &getNode);2342}23432344static NodeRef getEntryNode(GraphType G) {2345return G->NodeOwner.begin()->get();2346}23472348using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;2349static const ContextNode<DerivedCCG, FuncTy, CallTy> *2350GetCallee(const EdgePtrTy &P) {2351return P->Callee;2352}23532354using ChildIteratorType =2355mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<2356DerivedCCG, FuncTy, CallTy>>>::const_iterator,2357decltype(&GetCallee)>;23582359static ChildIteratorType child_begin(NodeRef N) {2360return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);2361}23622363static ChildIteratorType child_end(NodeRef N) {2364return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);2365}2366};23672368template <typename DerivedCCG, typename FuncTy, typename CallTy>2369struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>2370: public DefaultDOTGraphTraits {2371DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}23722373using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;2374using GTraits = GraphTraits<GraphType>;2375using NodeRef = typename GTraits::NodeRef;2376using ChildIteratorType = typename GTraits::ChildIteratorType;23772378static std::string getNodeLabel(NodeRef Node, GraphType G) {2379std::string LabelString =2380(Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +2381Twine(Node->OrigStackOrAllocId))2382.str();2383LabelString += "\n";2384if (Node->hasCall()) {2385auto Func = G->NodeToCallingFunc.find(Node);2386assert(Func != G->NodeToCallingFunc.end());2387LabelString +=2388G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());2389} else {2390LabelString += "null call";2391if (Node->Recursive)2392LabelString += " (recursive)";2393else2394LabelString += " (external)";2395}2396return LabelString;2397}23982399static std::string getNodeAttributes(NodeRef Node, GraphType) {2400std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +2401getContextIds(Node->getContextIds()) + "\"")2402.str();2403AttributeString +=2404(Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str();2405AttributeString += ",style=\"filled\"";2406if (Node->CloneOf) {2407AttributeString += ",color=\"blue\"";2408AttributeString += ",style=\"filled,bold,dashed\"";2409} else2410AttributeString += ",style=\"filled\"";2411return AttributeString;2412}24132414static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,2415GraphType) {2416auto &Edge = *(ChildIter.getCurrent());2417return (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +2418Twine(",fillcolor=\"") + getColor(Edge->AllocTypes) + "\"")2419.str();2420}24212422// Since the NodeOwners list includes nodes that are no longer connected to2423// the graph, skip them here.2424static bool isNodeHidden(NodeRef Node, GraphType) {2425return Node->isRemoved();2426}24272428private:2429static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {2430std::string IdString = "ContextIds:";2431if (ContextIds.size() < 100) {2432std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());2433std::sort(SortedIds.begin(), SortedIds.end());2434for (auto Id : SortedIds)2435IdString += (" " + Twine(Id)).str();2436} else {2437IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();2438}2439return IdString;2440}24412442static std::string getColor(uint8_t AllocTypes) {2443if (AllocTypes == (uint8_t)AllocationType::NotCold)2444// Color "brown1" actually looks like a lighter red.2445return "brown1";2446if (AllocTypes == (uint8_t)AllocationType::Cold)2447return "cyan";2448if (AllocTypes ==2449((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))2450// Lighter purple.2451return "mediumorchid1";2452return "gray";2453}24542455static std::string getNodeId(NodeRef Node) {2456std::stringstream SStream;2457SStream << std::hex << "N0x" << (unsigned long long)Node;2458std::string Result = SStream.str();2459return Result;2460}2461};24622463template <typename DerivedCCG, typename FuncTy, typename CallTy>2464void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(2465std::string Label) const {2466WriteGraph(this, "", false, Label,2467DotFilePathPrefix + "ccg." + Label + ".dot");2468}24692470template <typename DerivedCCG, typename FuncTy, typename CallTy>2471typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *2472CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(2473const std::shared_ptr<ContextEdge> &Edge, EdgeIter *CallerEdgeI,2474DenseSet<uint32_t> ContextIdsToMove) {2475ContextNode *Node = Edge->Callee;2476NodeOwner.push_back(2477std::make_unique<ContextNode>(Node->IsAllocation, Node->Call));2478ContextNode *Clone = NodeOwner.back().get();2479Node->addClone(Clone);2480assert(NodeToCallingFunc.count(Node));2481NodeToCallingFunc[Clone] = NodeToCallingFunc[Node];2482moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true,2483ContextIdsToMove);2484return Clone;2485}24862487template <typename DerivedCCG, typename FuncTy, typename CallTy>2488void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::2489moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,2490ContextNode *NewCallee, EdgeIter *CallerEdgeI,2491bool NewClone,2492DenseSet<uint32_t> ContextIdsToMove) {2493// NewCallee and Edge's current callee must be clones of the same original2494// node (Edge's current callee may be the original node too).2495assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());24962497ContextNode *OldCallee = Edge->Callee;24982499// We might already have an edge to the new callee from earlier cloning for a2500// different allocation. If one exists we will reuse it.2501auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);25022503// Callers will pass an empty ContextIdsToMove set when they want to move the2504// edge. Copy in Edge's ids for simplicity.2505if (ContextIdsToMove.empty())2506ContextIdsToMove = Edge->getContextIds();25072508// If we are moving all of Edge's ids, then just move the whole Edge.2509// Otherwise only move the specified subset, to a new edge if needed.2510if (Edge->getContextIds().size() == ContextIdsToMove.size()) {2511// Moving the whole Edge.2512if (CallerEdgeI)2513*CallerEdgeI = OldCallee->CallerEdges.erase(*CallerEdgeI);2514else2515OldCallee->eraseCallerEdge(Edge.get());2516if (ExistingEdgeToNewCallee) {2517// Since we already have an edge to NewCallee, simply move the ids2518// onto it, and remove the existing Edge.2519ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(),2520ContextIdsToMove.end());2521ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;2522assert(Edge->ContextIds == ContextIdsToMove);2523Edge->ContextIds.clear();2524Edge->AllocTypes = (uint8_t)AllocationType::None;2525Edge->Caller->eraseCalleeEdge(Edge.get());2526} else {2527// Otherwise just reconnect Edge to NewCallee.2528Edge->Callee = NewCallee;2529NewCallee->CallerEdges.push_back(Edge);2530// Don't need to update Edge's context ids since we are simply2531// reconnecting it.2532}2533// In either case, need to update the alloc types on New Callee.2534NewCallee->AllocTypes |= Edge->AllocTypes;2535} else {2536// Only moving a subset of Edge's ids.2537if (CallerEdgeI)2538++CallerEdgeI;2539// Compute the alloc type of the subset of ids being moved.2540auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);2541if (ExistingEdgeToNewCallee) {2542// Since we already have an edge to NewCallee, simply move the ids2543// onto it.2544ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(),2545ContextIdsToMove.end());2546ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;2547} else {2548// Otherwise, create a new edge to NewCallee for the ids being moved.2549auto NewEdge = std::make_shared<ContextEdge>(2550NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);2551Edge->Caller->CalleeEdges.push_back(NewEdge);2552NewCallee->CallerEdges.push_back(NewEdge);2553}2554// In either case, need to update the alloc types on NewCallee, and remove2555// those ids and update the alloc type on the original Edge.2556NewCallee->AllocTypes |= CallerEdgeAllocType;2557set_subtract(Edge->ContextIds, ContextIdsToMove);2558Edge->AllocTypes = computeAllocType(Edge->ContextIds);2559}2560// Now walk the old callee node's callee edges and move Edge's context ids2561// over to the corresponding edge into the clone (which is created here if2562// this is a newly created clone).2563for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {2564// The context ids moving to the new callee are the subset of this edge's2565// context ids and the context ids on the caller edge being moved.2566DenseSet<uint32_t> EdgeContextIdsToMove =2567set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);2568set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);2569OldCalleeEdge->AllocTypes =2570computeAllocType(OldCalleeEdge->getContextIds());2571if (!NewClone) {2572// Update context ids / alloc type on corresponding edge to NewCallee.2573// There is a chance this may not exist if we are reusing an existing2574// clone, specifically during function assignment, where we would have2575// removed none type edges after creating the clone. If we can't find2576// a corresponding edge there, fall through to the cloning below.2577if (auto *NewCalleeEdge =2578NewCallee->findEdgeFromCallee(OldCalleeEdge->Callee)) {2579NewCalleeEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),2580EdgeContextIdsToMove.end());2581NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);2582continue;2583}2584}2585auto NewEdge = std::make_shared<ContextEdge>(2586OldCalleeEdge->Callee, NewCallee,2587computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);2588NewCallee->CalleeEdges.push_back(NewEdge);2589NewEdge->Callee->CallerEdges.push_back(NewEdge);2590}2591// Recompute the node alloc type now that its callee edges have been2592// updated (since we will compute from those edges).2593OldCallee->AllocTypes = OldCallee->computeAllocType();2594// OldCallee alloc type should be None iff its context id set is now empty.2595assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==2596OldCallee->emptyContextIds());2597if (VerifyCCG) {2598checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);2599checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);2600for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)2601checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,2602/*CheckEdges=*/false);2603for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)2604checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,2605/*CheckEdges=*/false);2606}2607}26082609template <typename DerivedCCG, typename FuncTy, typename CallTy>2610void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::2611recursivelyRemoveNoneTypeCalleeEdges(2612ContextNode *Node, DenseSet<const ContextNode *> &Visited) {2613auto Inserted = Visited.insert(Node);2614if (!Inserted.second)2615return;26162617removeNoneTypeCalleeEdges(Node);26182619for (auto *Clone : Node->Clones)2620recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);26212622// The recursive call may remove some of this Node's caller edges.2623// Iterate over a copy and skip any that were removed.2624auto CallerEdges = Node->CallerEdges;2625for (auto &Edge : CallerEdges) {2626// Skip any that have been removed by an earlier recursive call.2627if (Edge->Callee == nullptr && Edge->Caller == nullptr) {2628assert(!is_contained(Node->CallerEdges, Edge));2629continue;2630}2631recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);2632}2633}26342635template <typename DerivedCCG, typename FuncTy, typename CallTy>2636void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {2637DenseSet<const ContextNode *> Visited;2638for (auto &Entry : AllocationCallToContextNodeMap) {2639Visited.clear();2640identifyClones(Entry.second, Visited, Entry.second->getContextIds());2641}2642Visited.clear();2643for (auto &Entry : AllocationCallToContextNodeMap)2644recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);2645if (VerifyCCG)2646check();2647}26482649// helper function to check an AllocType is cold or notcold or both.2650bool checkColdOrNotCold(uint8_t AllocType) {2651return (AllocType == (uint8_t)AllocationType::Cold) ||2652(AllocType == (uint8_t)AllocationType::NotCold) ||2653(AllocType ==2654((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));2655}26562657template <typename DerivedCCG, typename FuncTy, typename CallTy>2658void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(2659ContextNode *Node, DenseSet<const ContextNode *> &Visited,2660const DenseSet<uint32_t> &AllocContextIds) {2661if (VerifyNodes)2662checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);2663assert(!Node->CloneOf);26642665// If Node as a null call, then either it wasn't found in the module (regular2666// LTO) or summary index (ThinLTO), or there were other conditions blocking2667// cloning (e.g. recursion, calls multiple targets, etc).2668// Do this here so that we don't try to recursively clone callers below, which2669// isn't useful at least for this node.2670if (!Node->hasCall())2671return;26722673#ifndef NDEBUG2674auto Insert =2675#endif2676Visited.insert(Node);2677// We should not have visited this node yet.2678assert(Insert.second);2679// The recursive call to identifyClones may delete the current edge from the2680// CallerEdges vector. Make a copy and iterate on that, simpler than passing2681// in an iterator and having recursive call erase from it. Other edges may2682// also get removed during the recursion, which will have null Callee and2683// Caller pointers (and are deleted later), so we skip those below.2684{2685auto CallerEdges = Node->CallerEdges;2686for (auto &Edge : CallerEdges) {2687// Skip any that have been removed by an earlier recursive call.2688if (Edge->Callee == nullptr && Edge->Caller == nullptr) {2689assert(!llvm::count(Node->CallerEdges, Edge));2690continue;2691}2692// Ignore any caller we previously visited via another edge.2693if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {2694identifyClones(Edge->Caller, Visited, AllocContextIds);2695}2696}2697}26982699// Check if we reached an unambiguous call or have have only a single caller.2700if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)2701return;27022703// We need to clone.27042705// Try to keep the original version as alloc type NotCold. This will make2706// cases with indirect calls or any other situation with an unknown call to2707// the original function get the default behavior. We do this by sorting the2708// CallerEdges of the Node we will clone by alloc type.2709//2710// Give NotCold edge the lowest sort priority so those edges are at the end of2711// the caller edges vector, and stay on the original version (since the below2712// code clones greedily until it finds all remaining edges have the same type2713// and leaves the remaining ones on the original Node).2714//2715// We shouldn't actually have any None type edges, so the sorting priority for2716// that is arbitrary, and we assert in that case below.2717const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,2718/*Cold*/ 1,2719/*NotColdCold*/ 2};2720std::stable_sort(Node->CallerEdges.begin(), Node->CallerEdges.end(),2721[&](const std::shared_ptr<ContextEdge> &A,2722const std::shared_ptr<ContextEdge> &B) {2723// Nodes with non-empty context ids should be sorted before2724// those with empty context ids.2725if (A->ContextIds.empty())2726// Either B ContextIds are non-empty (in which case we2727// should return false because B < A), or B ContextIds2728// are empty, in which case they are equal, and we should2729// maintain the original relative ordering.2730return false;2731if (B->ContextIds.empty())2732return true;27332734if (A->AllocTypes == B->AllocTypes)2735// Use the first context id for each edge as a2736// tie-breaker.2737return *A->ContextIds.begin() < *B->ContextIds.begin();2738return AllocTypeCloningPriority[A->AllocTypes] <2739AllocTypeCloningPriority[B->AllocTypes];2740});27412742assert(Node->AllocTypes != (uint8_t)AllocationType::None);27432744// Iterate until we find no more opportunities for disambiguating the alloc2745// types via cloning. In most cases this loop will terminate once the Node2746// has a single allocation type, in which case no more cloning is needed.2747// We need to be able to remove Edge from CallerEdges, so need to adjust2748// iterator inside the loop.2749for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {2750auto CallerEdge = *EI;27512752// See if cloning the prior caller edge left this node with a single alloc2753// type or a single caller. In that case no more cloning of Node is needed.2754if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)2755break;27562757// Only need to process the ids along this edge pertaining to the given2758// allocation.2759auto CallerEdgeContextsForAlloc =2760set_intersection(CallerEdge->getContextIds(), AllocContextIds);2761if (CallerEdgeContextsForAlloc.empty()) {2762++EI;2763continue;2764}2765auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);27662767// Compute the node callee edge alloc types corresponding to the context ids2768// for this caller edge.2769std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;2770CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());2771for (auto &CalleeEdge : Node->CalleeEdges)2772CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(2773CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));27742775// Don't clone if doing so will not disambiguate any alloc types amongst2776// caller edges (including the callee edges that would be cloned).2777// Otherwise we will simply move all edges to the clone.2778//2779// First check if by cloning we will disambiguate the caller allocation2780// type from node's allocation type. Query allocTypeToUse so that we don't2781// bother cloning to distinguish NotCold+Cold from NotCold. Note that2782// neither of these should be None type.2783//2784// Then check if by cloning node at least one of the callee edges will be2785// disambiguated by splitting out different context ids.2786assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);2787assert(Node->AllocTypes != (uint8_t)AllocationType::None);2788if (allocTypeToUse(CallerAllocTypeForAlloc) ==2789allocTypeToUse(Node->AllocTypes) &&2790allocTypesMatch<DerivedCCG, FuncTy, CallTy>(2791CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {2792++EI;2793continue;2794}27952796// First see if we can use an existing clone. Check each clone and its2797// callee edges for matching alloc types.2798ContextNode *Clone = nullptr;2799for (auto *CurClone : Node->Clones) {2800if (allocTypeToUse(CurClone->AllocTypes) !=2801allocTypeToUse(CallerAllocTypeForAlloc))2802continue;28032804if (!allocTypesMatch<DerivedCCG, FuncTy, CallTy>(2805CalleeEdgeAllocTypesForCallerEdge, CurClone->CalleeEdges))2806continue;2807Clone = CurClone;2808break;2809}28102811// The edge iterator is adjusted when we move the CallerEdge to the clone.2812if (Clone)2813moveEdgeToExistingCalleeClone(CallerEdge, Clone, &EI, /*NewClone=*/false,2814CallerEdgeContextsForAlloc);2815else2816Clone =2817moveEdgeToNewCalleeClone(CallerEdge, &EI, CallerEdgeContextsForAlloc);28182819assert(EI == Node->CallerEdges.end() ||2820Node->AllocTypes != (uint8_t)AllocationType::None);2821// Sanity check that no alloc types on clone or its edges are None.2822assert(Clone->AllocTypes != (uint8_t)AllocationType::None);2823}28242825// We should still have some context ids on the original Node.2826assert(!Node->emptyContextIds());28272828// Sanity check that no alloc types on node or edges are None.2829assert(Node->AllocTypes != (uint8_t)AllocationType::None);28302831if (VerifyNodes)2832checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);2833}28342835void ModuleCallsiteContextGraph::updateAllocationCall(2836CallInfo &Call, AllocationType AllocType) {2837std::string AllocTypeString = getAllocTypeAttributeString(AllocType);2838auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),2839"memprof", AllocTypeString);2840cast<CallBase>(Call.call())->addFnAttr(A);2841OREGetter(Call.call()->getFunction())2842.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())2843<< ore::NV("AllocationCall", Call.call()) << " in clone "2844<< ore::NV("Caller", Call.call()->getFunction())2845<< " marked with memprof allocation attribute "2846<< ore::NV("Attribute", AllocTypeString));2847}28482849void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,2850AllocationType AllocType) {2851auto *AI = Call.call().dyn_cast<AllocInfo *>();2852assert(AI);2853assert(AI->Versions.size() > Call.cloneNo());2854AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;2855}28562857void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,2858FuncInfo CalleeFunc) {2859if (CalleeFunc.cloneNo() > 0)2860cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());2861OREGetter(CallerCall.call()->getFunction())2862.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())2863<< ore::NV("Call", CallerCall.call()) << " in clone "2864<< ore::NV("Caller", CallerCall.call()->getFunction())2865<< " assigned to call function clone "2866<< ore::NV("Callee", CalleeFunc.func()));2867}28682869void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,2870FuncInfo CalleeFunc) {2871auto *CI = CallerCall.call().dyn_cast<CallsiteInfo *>();2872assert(CI &&2873"Caller cannot be an allocation which should not have profiled calls");2874assert(CI->Clones.size() > CallerCall.cloneNo());2875CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();2876}28772878CallsiteContextGraph<ModuleCallsiteContextGraph, Function,2879Instruction *>::FuncInfo2880ModuleCallsiteContextGraph::cloneFunctionForCallsite(2881FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,2882std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {2883// Use existing LLVM facilities for cloning and obtaining Call in clone2884ValueToValueMapTy VMap;2885auto *NewFunc = CloneFunction(Func.func(), VMap);2886std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);2887assert(!Func.func()->getParent()->getFunction(Name));2888NewFunc->setName(Name);2889for (auto &Inst : CallsWithMetadataInFunc) {2890// This map always has the initial version in it.2891assert(Inst.cloneNo() == 0);2892CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};2893}2894OREGetter(Func.func())2895.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())2896<< "created clone " << ore::NV("NewFunction", NewFunc));2897return {NewFunc, CloneNo};2898}28992900CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,2901IndexCall>::FuncInfo2902IndexCallsiteContextGraph::cloneFunctionForCallsite(2903FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,2904std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {2905// Check how many clones we have of Call (and therefore function).2906// The next clone number is the current size of versions array.2907// Confirm this matches the CloneNo provided by the caller, which is based on2908// the number of function clones we have.2909assert(CloneNo ==2910(Call.call().is<AllocInfo *>()2911? Call.call().dyn_cast<AllocInfo *>()->Versions.size()2912: Call.call().dyn_cast<CallsiteInfo *>()->Clones.size()));2913// Walk all the instructions in this function. Create a new version for2914// each (by adding an entry to the Versions/Clones summary array), and copy2915// over the version being called for the function clone being cloned here.2916// Additionally, add an entry to the CallMap for the new function clone,2917// mapping the original call (clone 0, what is in CallsWithMetadataInFunc)2918// to the new call clone.2919for (auto &Inst : CallsWithMetadataInFunc) {2920// This map always has the initial version in it.2921assert(Inst.cloneNo() == 0);2922if (auto *AI = Inst.call().dyn_cast<AllocInfo *>()) {2923assert(AI->Versions.size() == CloneNo);2924// We assign the allocation type later (in updateAllocationCall), just add2925// an entry for it here.2926AI->Versions.push_back(0);2927} else {2928auto *CI = Inst.call().dyn_cast<CallsiteInfo *>();2929assert(CI && CI->Clones.size() == CloneNo);2930// We assign the clone number later (in updateCall), just add an entry for2931// it here.2932CI->Clones.push_back(0);2933}2934CallMap[Inst] = {Inst.call(), CloneNo};2935}2936return {Func.func(), CloneNo};2937}29382939// This method assigns cloned callsites to functions, cloning the functions as2940// needed. The assignment is greedy and proceeds roughly as follows:2941//2942// For each function Func:2943// For each call with graph Node having clones:2944// Initialize ClonesWorklist to Node and its clones2945// Initialize NodeCloneCount to 02946// While ClonesWorklist is not empty:2947// Clone = pop front ClonesWorklist2948// NodeCloneCount++2949// If Func has been cloned less than NodeCloneCount times:2950// If NodeCloneCount is 1:2951// Assign Clone to original Func2952// Continue2953// Create a new function clone2954// If other callers not assigned to call a function clone yet:2955// Assign them to call new function clone2956// Continue2957// Assign any other caller calling the cloned version to new clone2958//2959// For each caller of Clone:2960// If caller is assigned to call a specific function clone:2961// If we cannot assign Clone to that function clone:2962// Create new callsite Clone NewClone2963// Add NewClone to ClonesWorklist2964// Continue2965// Assign Clone to existing caller's called function clone2966// Else:2967// If Clone not already assigned to a function clone:2968// Assign to first function clone without assignment2969// Assign caller to selected function clone2970template <typename DerivedCCG, typename FuncTy, typename CallTy>2971bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {2972bool Changed = false;29732974// Keep track of the assignment of nodes (callsites) to function clones they2975// call.2976DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;29772978// Update caller node to call function version CalleeFunc, by recording the2979// assignment in CallsiteToCalleeFuncCloneMap.2980auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,2981const FuncInfo &CalleeFunc) {2982assert(Caller->hasCall());2983CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;2984};29852986// Walk all functions for which we saw calls with memprof metadata, and handle2987// cloning for each of its calls.2988for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {2989FuncInfo OrigFunc(Func);2990// Map from each clone of OrigFunc to a map of remappings of each call of2991// interest (from original uncloned call to the corresponding cloned call in2992// that function clone).2993std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap;2994for (auto &Call : CallsWithMetadata) {2995ContextNode *Node = getNodeForInst(Call);2996// Skip call if we do not have a node for it (all uses of its stack ids2997// were either on inlined chains or pruned from the MIBs), or if we did2998// not create any clones for it.2999if (!Node || Node->Clones.empty())3000continue;3001assert(Node->hasCall() &&3002"Not having a call should have prevented cloning");30033004// Track the assignment of function clones to clones of the current3005// callsite Node being handled.3006std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;30073008// Assign callsite version CallsiteClone to function version FuncClone,3009// and also assign (possibly cloned) Call to CallsiteClone.3010auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,3011CallInfo &Call,3012ContextNode *CallsiteClone,3013bool IsAlloc) {3014// Record the clone of callsite node assigned to this function clone.3015FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;30163017assert(FuncClonesToCallMap.count(FuncClone));3018std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone];3019CallInfo CallClone(Call);3020if (CallMap.count(Call))3021CallClone = CallMap[Call];3022CallsiteClone->setCall(CallClone);3023};30243025// Keep track of the clones of callsite Node that need to be assigned to3026// function clones. This list may be expanded in the loop body below if we3027// find additional cloning is required.3028std::deque<ContextNode *> ClonesWorklist;3029// Ignore original Node if we moved all of its contexts to clones.3030if (!Node->emptyContextIds())3031ClonesWorklist.push_back(Node);3032ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(),3033Node->Clones.end());30343035// Now walk through all of the clones of this callsite Node that we need,3036// and determine the assignment to a corresponding clone of the current3037// function (creating new function clones as needed).3038unsigned NodeCloneCount = 0;3039while (!ClonesWorklist.empty()) {3040ContextNode *Clone = ClonesWorklist.front();3041ClonesWorklist.pop_front();3042NodeCloneCount++;3043if (VerifyNodes)3044checkNode<DerivedCCG, FuncTy, CallTy>(Clone);30453046// Need to create a new function clone if we have more callsite clones3047// than existing function clones, which would have been assigned to an3048// earlier clone in the list (we assign callsite clones to function3049// clones greedily).3050if (FuncClonesToCallMap.size() < NodeCloneCount) {3051// If this is the first callsite copy, assign to original function.3052if (NodeCloneCount == 1) {3053// Since FuncClonesToCallMap is empty in this case, no clones have3054// been created for this function yet, and no callers should have3055// been assigned a function clone for this callee node yet.3056assert(llvm::none_of(3057Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {3058return CallsiteToCalleeFuncCloneMap.count(E->Caller);3059}));3060// Initialize with empty call map, assign Clone to original function3061// and its callers, and skip to the next clone.3062FuncClonesToCallMap[OrigFunc] = {};3063AssignCallsiteCloneToFuncClone(3064OrigFunc, Call, Clone,3065AllocationCallToContextNodeMap.count(Call));3066for (auto &CE : Clone->CallerEdges) {3067// Ignore any caller that does not have a recorded callsite Call.3068if (!CE->Caller->hasCall())3069continue;3070RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);3071}3072continue;3073}30743075// First locate which copy of OrigFunc to clone again. If a caller3076// of this callsite clone was already assigned to call a particular3077// function clone, we need to redirect all of those callers to the3078// new function clone, and update their other callees within this3079// function.3080FuncInfo PreviousAssignedFuncClone;3081auto EI = llvm::find_if(3082Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {3083return CallsiteToCalleeFuncCloneMap.count(E->Caller);3084});3085bool CallerAssignedToCloneOfFunc = false;3086if (EI != Clone->CallerEdges.end()) {3087const std::shared_ptr<ContextEdge> &Edge = *EI;3088PreviousAssignedFuncClone =3089CallsiteToCalleeFuncCloneMap[Edge->Caller];3090CallerAssignedToCloneOfFunc = true;3091}30923093// Clone function and save it along with the CallInfo map created3094// during cloning in the FuncClonesToCallMap.3095std::map<CallInfo, CallInfo> NewCallMap;3096unsigned CloneNo = FuncClonesToCallMap.size();3097assert(CloneNo > 0 && "Clone 0 is the original function, which "3098"should already exist in the map");3099FuncInfo NewFuncClone = cloneFunctionForCallsite(3100OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);3101FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap));3102FunctionClonesAnalysis++;3103Changed = true;31043105// If no caller callsites were already assigned to a clone of this3106// function, we can simply assign this clone to the new func clone3107// and update all callers to it, then skip to the next clone.3108if (!CallerAssignedToCloneOfFunc) {3109AssignCallsiteCloneToFuncClone(3110NewFuncClone, Call, Clone,3111AllocationCallToContextNodeMap.count(Call));3112for (auto &CE : Clone->CallerEdges) {3113// Ignore any caller that does not have a recorded callsite Call.3114if (!CE->Caller->hasCall())3115continue;3116RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);3117}3118continue;3119}31203121// We may need to do additional node cloning in this case.3122// Reset the CallsiteToCalleeFuncCloneMap entry for any callers3123// that were previously assigned to call PreviousAssignedFuncClone,3124// to record that they now call NewFuncClone.3125for (auto CE : Clone->CallerEdges) {3126// Skip any that have been removed on an earlier iteration.3127if (!CE)3128continue;3129// Ignore any caller that does not have a recorded callsite Call.3130if (!CE->Caller->hasCall())3131continue;31323133if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||3134// We subsequently fall through to later handling that3135// will perform any additional cloning required for3136// callers that were calling other function clones.3137CallsiteToCalleeFuncCloneMap[CE->Caller] !=3138PreviousAssignedFuncClone)3139continue;31403141RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);31423143// If we are cloning a function that was already assigned to some3144// callers, then essentially we are creating new callsite clones3145// of the other callsites in that function that are reached by those3146// callers. Clone the other callees of the current callsite's caller3147// that were already assigned to PreviousAssignedFuncClone3148// accordingly. This is important since we subsequently update the3149// calls from the nodes in the graph and their assignments to callee3150// functions recorded in CallsiteToCalleeFuncCloneMap.3151for (auto CalleeEdge : CE->Caller->CalleeEdges) {3152// Skip any that have been removed on an earlier iteration when3153// cleaning up newly None type callee edges.3154if (!CalleeEdge)3155continue;3156ContextNode *Callee = CalleeEdge->Callee;3157// Skip the current callsite, we are looking for other3158// callsites Caller calls, as well as any that does not have a3159// recorded callsite Call.3160if (Callee == Clone || !Callee->hasCall())3161continue;3162ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);3163removeNoneTypeCalleeEdges(NewClone);3164// Moving the edge may have resulted in some none type3165// callee edges on the original Callee.3166removeNoneTypeCalleeEdges(Callee);3167assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);3168// If the Callee node was already assigned to call a specific3169// function version, make sure its new clone is assigned to call3170// that same function clone.3171if (CallsiteToCalleeFuncCloneMap.count(Callee))3172RecordCalleeFuncOfCallsite(3173NewClone, CallsiteToCalleeFuncCloneMap[Callee]);3174// Update NewClone with the new Call clone of this callsite's Call3175// created for the new function clone created earlier.3176// Recall that we have already ensured when building the graph3177// that each caller can only call callsites within the same3178// function, so we are guaranteed that Callee Call is in the3179// current OrigFunc.3180// CallMap is set up as indexed by original Call at clone 0.3181CallInfo OrigCall(Callee->getOrigNode()->Call);3182OrigCall.setCloneNo(0);3183std::map<CallInfo, CallInfo> &CallMap =3184FuncClonesToCallMap[NewFuncClone];3185assert(CallMap.count(OrigCall));3186CallInfo NewCall(CallMap[OrigCall]);3187assert(NewCall);3188NewClone->setCall(NewCall);3189}3190}3191// Fall through to handling below to perform the recording of the3192// function for this callsite clone. This enables handling of cases3193// where the callers were assigned to different clones of a function.3194}31953196// See if we can use existing function clone. Walk through3197// all caller edges to see if any have already been assigned to3198// a clone of this callsite's function. If we can use it, do so. If not,3199// because that function clone is already assigned to a different clone3200// of this callsite, then we need to clone again.3201// Basically, this checking is needed to handle the case where different3202// caller functions/callsites may need versions of this function3203// containing different mixes of callsite clones across the different3204// callsites within the function. If that happens, we need to create3205// additional function clones to handle the various combinations.3206//3207// Keep track of any new clones of this callsite created by the3208// following loop, as well as any existing clone that we decided to3209// assign this clone to.3210std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;3211FuncInfo FuncCloneAssignedToCurCallsiteClone;3212// We need to be able to remove Edge from CallerEdges, so need to adjust3213// iterator in the loop.3214for (auto EI = Clone->CallerEdges.begin();3215EI != Clone->CallerEdges.end();) {3216auto Edge = *EI;3217// Ignore any caller that does not have a recorded callsite Call.3218if (!Edge->Caller->hasCall()) {3219EI++;3220continue;3221}3222// If this caller already assigned to call a version of OrigFunc, need3223// to ensure we can assign this callsite clone to that function clone.3224if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {3225FuncInfo FuncCloneCalledByCaller =3226CallsiteToCalleeFuncCloneMap[Edge->Caller];3227// First we need to confirm that this function clone is available3228// for use by this callsite node clone.3229//3230// While FuncCloneToCurNodeCloneMap is built only for this Node and3231// its callsite clones, one of those callsite clones X could have3232// been assigned to the same function clone called by Edge's caller3233// - if Edge's caller calls another callsite within Node's original3234// function, and that callsite has another caller reaching clone X.3235// We need to clone Node again in this case.3236if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&3237FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=3238Clone) ||3239// Detect when we have multiple callers of this callsite that3240// have already been assigned to specific, and different, clones3241// of OrigFunc (due to other unrelated callsites in Func they3242// reach via call contexts). Is this Clone of callsite Node3243// assigned to a different clone of OrigFunc? If so, clone Node3244// again.3245(FuncCloneAssignedToCurCallsiteClone &&3246FuncCloneAssignedToCurCallsiteClone !=3247FuncCloneCalledByCaller)) {3248// We need to use a different newly created callsite clone, in3249// order to assign it to another new function clone on a3250// subsequent iteration over the Clones array (adjusted below).3251// Note we specifically do not reset the3252// CallsiteToCalleeFuncCloneMap entry for this caller, so that3253// when this new clone is processed later we know which version of3254// the function to copy (so that other callsite clones we have3255// assigned to that function clone are properly cloned over). See3256// comments in the function cloning handling earlier.32573258// Check if we already have cloned this callsite again while3259// walking through caller edges, for a caller calling the same3260// function clone. If so, we can move this edge to that new clone3261// rather than creating yet another new clone.3262if (FuncCloneToNewCallsiteCloneMap.count(3263FuncCloneCalledByCaller)) {3264ContextNode *NewClone =3265FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];3266moveEdgeToExistingCalleeClone(Edge, NewClone, &EI);3267// Cleanup any none type edges cloned over.3268removeNoneTypeCalleeEdges(NewClone);3269} else {3270// Create a new callsite clone.3271ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge, &EI);3272removeNoneTypeCalleeEdges(NewClone);3273FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =3274NewClone;3275// Add to list of clones and process later.3276ClonesWorklist.push_back(NewClone);3277assert(EI == Clone->CallerEdges.end() ||3278Clone->AllocTypes != (uint8_t)AllocationType::None);3279assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);3280}3281// Moving the caller edge may have resulted in some none type3282// callee edges.3283removeNoneTypeCalleeEdges(Clone);3284// We will handle the newly created callsite clone in a subsequent3285// iteration over this Node's Clones. Continue here since we3286// already adjusted iterator EI while moving the edge.3287continue;3288}32893290// Otherwise, we can use the function clone already assigned to this3291// caller.3292if (!FuncCloneAssignedToCurCallsiteClone) {3293FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;3294// Assign Clone to FuncCloneCalledByCaller3295AssignCallsiteCloneToFuncClone(3296FuncCloneCalledByCaller, Call, Clone,3297AllocationCallToContextNodeMap.count(Call));3298} else3299// Don't need to do anything - callsite is already calling this3300// function clone.3301assert(FuncCloneAssignedToCurCallsiteClone ==3302FuncCloneCalledByCaller);33033304} else {3305// We have not already assigned this caller to a version of3306// OrigFunc. Do the assignment now.33073308// First check if we have already assigned this callsite clone to a3309// clone of OrigFunc for another caller during this iteration over3310// its caller edges.3311if (!FuncCloneAssignedToCurCallsiteClone) {3312// Find first function in FuncClonesToCallMap without an assigned3313// clone of this callsite Node. We should always have one3314// available at this point due to the earlier cloning when the3315// FuncClonesToCallMap size was smaller than the clone number.3316for (auto &CF : FuncClonesToCallMap) {3317if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {3318FuncCloneAssignedToCurCallsiteClone = CF.first;3319break;3320}3321}3322assert(FuncCloneAssignedToCurCallsiteClone);3323// Assign Clone to FuncCloneAssignedToCurCallsiteClone3324AssignCallsiteCloneToFuncClone(3325FuncCloneAssignedToCurCallsiteClone, Call, Clone,3326AllocationCallToContextNodeMap.count(Call));3327} else3328assert(FuncCloneToCurNodeCloneMap3329[FuncCloneAssignedToCurCallsiteClone] == Clone);3330// Update callers to record function version called.3331RecordCalleeFuncOfCallsite(Edge->Caller,3332FuncCloneAssignedToCurCallsiteClone);3333}33343335EI++;3336}3337}3338if (VerifyCCG) {3339checkNode<DerivedCCG, FuncTy, CallTy>(Node);3340for (const auto &PE : Node->CalleeEdges)3341checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);3342for (const auto &CE : Node->CallerEdges)3343checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);3344for (auto *Clone : Node->Clones) {3345checkNode<DerivedCCG, FuncTy, CallTy>(Clone);3346for (const auto &PE : Clone->CalleeEdges)3347checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);3348for (const auto &CE : Clone->CallerEdges)3349checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);3350}3351}3352}3353}33543355auto UpdateCalls = [&](ContextNode *Node,3356DenseSet<const ContextNode *> &Visited,3357auto &&UpdateCalls) {3358auto Inserted = Visited.insert(Node);3359if (!Inserted.second)3360return;33613362for (auto *Clone : Node->Clones)3363UpdateCalls(Clone, Visited, UpdateCalls);33643365for (auto &Edge : Node->CallerEdges)3366UpdateCalls(Edge->Caller, Visited, UpdateCalls);33673368// Skip if either no call to update, or if we ended up with no context ids3369// (we moved all edges onto other clones).3370if (!Node->hasCall() || Node->emptyContextIds())3371return;33723373if (Node->IsAllocation) {3374updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes));3375return;3376}33773378if (!CallsiteToCalleeFuncCloneMap.count(Node))3379return;33803381auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];3382updateCall(Node->Call, CalleeFunc);3383};33843385// Performs DFS traversal starting from allocation nodes to update calls to3386// reflect cloning decisions recorded earlier. For regular LTO this will3387// update the actual calls in the IR to call the appropriate function clone3388// (and add attributes to allocation calls), whereas for ThinLTO the decisions3389// are recorded in the summary entries.3390DenseSet<const ContextNode *> Visited;3391for (auto &Entry : AllocationCallToContextNodeMap)3392UpdateCalls(Entry.second, Visited, UpdateCalls);33933394return Changed;3395}33963397static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(3398Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,3399std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>3400&FuncToAliasMap) {3401// The first "clone" is the original copy, we should only call this if we3402// needed to create new clones.3403assert(NumClones > 1);3404SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;3405VMaps.reserve(NumClones - 1);3406FunctionsClonedThinBackend++;3407for (unsigned I = 1; I < NumClones; I++) {3408VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());3409auto *NewF = CloneFunction(&F, *VMaps.back());3410FunctionClonesThinBackend++;3411// Strip memprof and callsite metadata from clone as they are no longer3412// needed.3413for (auto &BB : *NewF) {3414for (auto &Inst : BB) {3415Inst.setMetadata(LLVMContext::MD_memprof, nullptr);3416Inst.setMetadata(LLVMContext::MD_callsite, nullptr);3417}3418}3419std::string Name = getMemProfFuncName(F.getName(), I);3420auto *PrevF = M.getFunction(Name);3421if (PrevF) {3422// We might have created this when adjusting callsite in another3423// function. It should be a declaration.3424assert(PrevF->isDeclaration());3425NewF->takeName(PrevF);3426PrevF->replaceAllUsesWith(NewF);3427PrevF->eraseFromParent();3428} else3429NewF->setName(Name);3430ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)3431<< "created clone " << ore::NV("NewFunction", NewF));34323433// Now handle aliases to this function, and clone those as well.3434if (!FuncToAliasMap.count(&F))3435continue;3436for (auto *A : FuncToAliasMap[&F]) {3437std::string Name = getMemProfFuncName(A->getName(), I);3438auto *PrevA = M.getNamedAlias(Name);3439auto *NewA = GlobalAlias::create(A->getValueType(),3440A->getType()->getPointerAddressSpace(),3441A->getLinkage(), Name, NewF);3442NewA->copyAttributesFrom(A);3443if (PrevA) {3444// We might have created this when adjusting callsite in another3445// function. It should be a declaration.3446assert(PrevA->isDeclaration());3447NewA->takeName(PrevA);3448PrevA->replaceAllUsesWith(NewA);3449PrevA->eraseFromParent();3450}3451}3452}3453return VMaps;3454}34553456// Locate the summary for F. This is complicated by the fact that it might3457// have been internalized or promoted.3458static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,3459const ModuleSummaryIndex *ImportSummary) {3460// FIXME: Ideally we would retain the original GUID in some fashion on the3461// function (e.g. as metadata), but for now do our best to locate the3462// summary without that information.3463ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());3464if (!TheFnVI)3465// See if theFn was internalized, by checking index directly with3466// original name (this avoids the name adjustment done by getGUID() for3467// internal symbols).3468TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName()));3469if (TheFnVI)3470return TheFnVI;3471// Now query with the original name before any promotion was performed.3472StringRef OrigName =3473ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());3474std::string OrigId = GlobalValue::getGlobalIdentifier(3475OrigName, GlobalValue::InternalLinkage, M.getSourceFileName());3476TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId));3477if (TheFnVI)3478return TheFnVI;3479// Could be a promoted local imported from another module. We need to pass3480// down more info here to find the original module id. For now, try with3481// the OrigName which might have been stored in the OidGuidMap in the3482// index. This would not work if there were same-named locals in multiple3483// modules, however.3484auto OrigGUID =3485ImportSummary->getGUIDFromOriginalID(GlobalValue::getGUID(OrigName));3486if (OrigGUID)3487TheFnVI = ImportSummary->getValueInfo(OrigGUID);3488return TheFnVI;3489}34903491bool MemProfContextDisambiguation::applyImport(Module &M) {3492assert(ImportSummary);3493bool Changed = false;34943495auto IsMemProfClone = [](const Function &F) {3496return F.getName().contains(MemProfCloneSuffix);3497};34983499// We also need to clone any aliases that reference cloned functions, because3500// the modified callsites may invoke via the alias. Keep track of the aliases3501// for each function.3502std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>3503FuncToAliasMap;3504for (auto &A : M.aliases()) {3505auto *Aliasee = A.getAliaseeObject();3506if (auto *F = dyn_cast<Function>(Aliasee))3507FuncToAliasMap[F].insert(&A);3508}35093510for (auto &F : M) {3511if (F.isDeclaration() || IsMemProfClone(F))3512continue;35133514OptimizationRemarkEmitter ORE(&F);35153516SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;3517bool ClonesCreated = false;3518unsigned NumClonesCreated = 0;3519auto CloneFuncIfNeeded = [&](unsigned NumClones) {3520// We should at least have version 0 which is the original copy.3521assert(NumClones > 0);3522// If only one copy needed use original.3523if (NumClones == 1)3524return;3525// If we already performed cloning of this function, confirm that the3526// requested number of clones matches (the thin link should ensure the3527// number of clones for each constituent callsite is consistent within3528// each function), before returning.3529if (ClonesCreated) {3530assert(NumClonesCreated == NumClones);3531return;3532}3533VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);3534// The first "clone" is the original copy, which doesn't have a VMap.3535assert(VMaps.size() == NumClones - 1);3536Changed = true;3537ClonesCreated = true;3538NumClonesCreated = NumClones;3539};35403541auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,3542Function *CalledFunction) {3543// Perform cloning if not yet done.3544CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());35453546// Should have skipped indirect calls via mayHaveMemprofSummary.3547assert(CalledFunction);3548assert(!IsMemProfClone(*CalledFunction));35493550// Update the calls per the summary info.3551// Save orig name since it gets updated in the first iteration3552// below.3553auto CalleeOrigName = CalledFunction->getName();3554for (unsigned J = 0; J < StackNode.Clones.size(); J++) {3555// Do nothing if this version calls the original version of its3556// callee.3557if (!StackNode.Clones[J])3558continue;3559auto NewF = M.getOrInsertFunction(3560getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),3561CalledFunction->getFunctionType());3562CallBase *CBClone;3563// Copy 0 is the original function.3564if (!J)3565CBClone = CB;3566else3567CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);3568CBClone->setCalledFunction(NewF);3569ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)3570<< ore::NV("Call", CBClone) << " in clone "3571<< ore::NV("Caller", CBClone->getFunction())3572<< " assigned to call function clone "3573<< ore::NV("Callee", NewF.getCallee()));3574}3575};35763577// Locate the summary for F.3578ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);3579// If not found, this could be an imported local (see comment in3580// findValueInfoForFunc). Skip for now as it will be cloned in its original3581// module (where it would have been promoted to global scope so should3582// satisfy any reference in this module).3583if (!TheFnVI)3584continue;35853586auto *GVSummary =3587ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());3588if (!GVSummary) {3589// Must have been imported, use the summary which matches the definition。3590// (might be multiple if this was a linkonce_odr).3591auto SrcModuleMD = F.getMetadata("thinlto_src_module");3592assert(SrcModuleMD &&3593"enable-import-metadata is needed to emit thinlto_src_module");3594StringRef SrcModule =3595dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();3596for (auto &GVS : TheFnVI.getSummaryList()) {3597if (GVS->modulePath() == SrcModule) {3598GVSummary = GVS.get();3599break;3600}3601}3602assert(GVSummary && GVSummary->modulePath() == SrcModule);3603}36043605// If this was an imported alias skip it as we won't have the function3606// summary, and it should be cloned in the original module.3607if (isa<AliasSummary>(GVSummary))3608continue;36093610auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());36113612if (FS->allocs().empty() && FS->callsites().empty())3613continue;36143615auto SI = FS->callsites().begin();3616auto AI = FS->allocs().begin();36173618// To handle callsite infos synthesized for tail calls which have missing3619// frames in the profiled context, map callee VI to the synthesized callsite3620// info.3621DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;3622// Iterate the callsites for this function in reverse, since we place all3623// those synthesized for tail calls at the end.3624for (auto CallsiteIt = FS->callsites().rbegin();3625CallsiteIt != FS->callsites().rend(); CallsiteIt++) {3626auto &Callsite = *CallsiteIt;3627// Stop as soon as we see a non-synthesized callsite info (see comment3628// above loop). All the entries added for discovered tail calls have empty3629// stack ids.3630if (!Callsite.StackIdIndices.empty())3631break;3632MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});3633}36343635// Assume for now that the instructions are in the exact same order3636// as when the summary was created, but confirm this is correct by3637// matching the stack ids.3638for (auto &BB : F) {3639for (auto &I : BB) {3640auto *CB = dyn_cast<CallBase>(&I);3641// Same handling as when creating module summary.3642if (!mayHaveMemprofSummary(CB))3643continue;36443645auto *CalledValue = CB->getCalledOperand();3646auto *CalledFunction = CB->getCalledFunction();3647if (CalledValue && !CalledFunction) {3648CalledValue = CalledValue->stripPointerCasts();3649// Stripping pointer casts can reveal a called function.3650CalledFunction = dyn_cast<Function>(CalledValue);3651}3652// Check if this is an alias to a function. If so, get the3653// called aliasee for the checks below.3654if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {3655assert(!CalledFunction &&3656"Expected null called function in callsite for alias");3657CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());3658}36593660CallStack<MDNode, MDNode::op_iterator> CallsiteContext(3661I.getMetadata(LLVMContext::MD_callsite));3662auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);36633664// Include allocs that were already assigned a memprof function3665// attribute in the statistics.3666if (CB->getAttributes().hasFnAttr("memprof")) {3667assert(!MemProfMD);3668CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"3669? AllocTypeColdThinBackend++3670: AllocTypeNotColdThinBackend++;3671OrigAllocsThinBackend++;3672AllocVersionsThinBackend++;3673if (!MaxAllocVersionsThinBackend)3674MaxAllocVersionsThinBackend = 1;3675// Remove any remaining callsite metadata and we can skip the rest of3676// the handling for this instruction, since no cloning needed.3677I.setMetadata(LLVMContext::MD_callsite, nullptr);3678continue;3679}36803681if (MemProfMD) {3682// Consult the next alloc node.3683assert(AI != FS->allocs().end());3684auto &AllocNode = *(AI++);36853686// Sanity check that the MIB stack ids match between the summary and3687// instruction metadata.3688auto MIBIter = AllocNode.MIBs.begin();3689for (auto &MDOp : MemProfMD->operands()) {3690assert(MIBIter != AllocNode.MIBs.end());3691LLVM_ATTRIBUTE_UNUSED auto StackIdIndexIter =3692MIBIter->StackIdIndices.begin();3693auto *MIBMD = cast<const MDNode>(MDOp);3694MDNode *StackMDNode = getMIBStackNode(MIBMD);3695assert(StackMDNode);3696CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);3697auto ContextIterBegin =3698StackContext.beginAfterSharedPrefix(CallsiteContext);3699// Skip the checking on the first iteration.3700uint64_t LastStackContextId =3701(ContextIterBegin != StackContext.end() &&3702*ContextIterBegin == 0)3703? 13704: 0;3705for (auto ContextIter = ContextIterBegin;3706ContextIter != StackContext.end(); ++ContextIter) {3707// If this is a direct recursion, simply skip the duplicate3708// entries, to be consistent with how the summary ids were3709// generated during ModuleSummaryAnalysis.3710if (LastStackContextId == *ContextIter)3711continue;3712LastStackContextId = *ContextIter;3713assert(StackIdIndexIter != MIBIter->StackIdIndices.end());3714assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==3715*ContextIter);3716StackIdIndexIter++;3717}3718MIBIter++;3719}37203721// Perform cloning if not yet done.3722CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());37233724OrigAllocsThinBackend++;3725AllocVersionsThinBackend += AllocNode.Versions.size();3726if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())3727MaxAllocVersionsThinBackend = AllocNode.Versions.size();37283729// If there is only one version that means we didn't end up3730// considering this function for cloning, and in that case the alloc3731// will still be none type or should have gotten the default NotCold.3732// Skip that after calling clone helper since that does some sanity3733// checks that confirm we haven't decided yet that we need cloning.3734if (AllocNode.Versions.size() == 1) {3735assert((AllocationType)AllocNode.Versions[0] ==3736AllocationType::NotCold ||3737(AllocationType)AllocNode.Versions[0] ==3738AllocationType::None);3739UnclonableAllocsThinBackend++;3740continue;3741}37423743// All versions should have a singular allocation type.3744assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {3745return Type == ((uint8_t)AllocationType::NotCold |3746(uint8_t)AllocationType::Cold);3747}));37483749// Update the allocation types per the summary info.3750for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {3751// Ignore any that didn't get an assigned allocation type.3752if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)3753continue;3754AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];3755AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++3756: AllocTypeNotColdThinBackend++;3757std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);3758auto A = llvm::Attribute::get(F.getContext(), "memprof",3759AllocTypeString);3760CallBase *CBClone;3761// Copy 0 is the original function.3762if (!J)3763CBClone = CB;3764else3765// Since VMaps are only created for new clones, we index with3766// clone J-1 (J==0 is the original clone and does not have a VMaps3767// entry).3768CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);3769CBClone->addFnAttr(A);3770ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)3771<< ore::NV("AllocationCall", CBClone) << " in clone "3772<< ore::NV("Caller", CBClone->getFunction())3773<< " marked with memprof allocation attribute "3774<< ore::NV("Attribute", AllocTypeString));3775}3776} else if (!CallsiteContext.empty()) {3777// Consult the next callsite node.3778assert(SI != FS->callsites().end());3779auto &StackNode = *(SI++);37803781#ifndef NDEBUG3782// Sanity check that the stack ids match between the summary and3783// instruction metadata.3784auto StackIdIndexIter = StackNode.StackIdIndices.begin();3785for (auto StackId : CallsiteContext) {3786assert(StackIdIndexIter != StackNode.StackIdIndices.end());3787assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==3788StackId);3789StackIdIndexIter++;3790}3791#endif37923793CloneCallsite(StackNode, CB, CalledFunction);3794} else if (CB->isTailCall()) {3795// Locate the synthesized callsite info for the callee VI, if any was3796// created, and use that for cloning.3797ValueInfo CalleeVI =3798findValueInfoForFunc(*CalledFunction, M, ImportSummary);3799if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {3800auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);3801assert(Callsite != MapTailCallCalleeVIToCallsite.end());3802CloneCallsite(Callsite->second, CB, CalledFunction);3803}3804}3805// Memprof and callsite metadata on memory allocations no longer needed.3806I.setMetadata(LLVMContext::MD_memprof, nullptr);3807I.setMetadata(LLVMContext::MD_callsite, nullptr);3808}3809}3810}38113812return Changed;3813}38143815template <typename DerivedCCG, typename FuncTy, typename CallTy>3816bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {3817if (DumpCCG) {3818dbgs() << "CCG before cloning:\n";3819dbgs() << *this;3820}3821if (ExportToDot)3822exportToDot("postbuild");38233824if (VerifyCCG) {3825check();3826}38273828identifyClones();38293830if (VerifyCCG) {3831check();3832}38333834if (DumpCCG) {3835dbgs() << "CCG after cloning:\n";3836dbgs() << *this;3837}3838if (ExportToDot)3839exportToDot("cloned");38403841bool Changed = assignFunctions();38423843if (DumpCCG) {3844dbgs() << "CCG after assigning function clones:\n";3845dbgs() << *this;3846}3847if (ExportToDot)3848exportToDot("clonefuncassign");38493850if (MemProfReportHintedSizes)3851printTotalSizes(errs());38523853return Changed;3854}38553856bool MemProfContextDisambiguation::processModule(3857Module &M,3858llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {38593860// If we have an import summary, then the cloning decisions were made during3861// the thin link on the index. Apply them and return.3862if (ImportSummary)3863return applyImport(M);38643865// TODO: If/when other types of memprof cloning are enabled beyond just for3866// hot and cold, we will need to change this to individually control the3867// AllocationType passed to addStackNodesForMIB during CCG construction.3868// Note that we specifically check this after applying imports above, so that3869// the option isn't needed to be passed to distributed ThinLTO backend3870// clang processes, which won't necessarily have visibility into the linker3871// dependences. Instead the information is communicated from the LTO link to3872// the backends via the combined summary index.3873if (!SupportsHotColdNew)3874return false;38753876ModuleCallsiteContextGraph CCG(M, OREGetter);3877return CCG.process();3878}38793880MemProfContextDisambiguation::MemProfContextDisambiguation(3881const ModuleSummaryIndex *Summary)3882: ImportSummary(Summary) {3883if (ImportSummary) {3884// The MemProfImportSummary should only be used for testing ThinLTO3885// distributed backend handling via opt, in which case we don't have a3886// summary from the pass pipeline.3887assert(MemProfImportSummary.empty());3888return;3889}3890if (MemProfImportSummary.empty())3891return;38923893auto ReadSummaryFile =3894errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));3895if (!ReadSummaryFile) {3896logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),3897"Error loading file '" + MemProfImportSummary +3898"': ");3899return;3900}3901auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);3902if (!ImportSummaryForTestingOrErr) {3903logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),3904"Error parsing file '" + MemProfImportSummary +3905"': ");3906return;3907}3908ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);3909ImportSummary = ImportSummaryForTesting.get();3910}39113912PreservedAnalyses MemProfContextDisambiguation::run(Module &M,3913ModuleAnalysisManager &AM) {3914auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();3915auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {3916return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);3917};3918if (!processModule(M, OREGetter))3919return PreservedAnalyses::all();3920return PreservedAnalyses::none();3921}39223923void MemProfContextDisambiguation::run(3924ModuleSummaryIndex &Index,3925llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>3926isPrevailing) {3927// TODO: If/when other types of memprof cloning are enabled beyond just for3928// hot and cold, we will need to change this to individually control the3929// AllocationType passed to addStackNodesForMIB during CCG construction.3930// The index was set from the option, so these should be in sync.3931assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);3932if (!SupportsHotColdNew)3933return;39343935IndexCallsiteContextGraph CCG(Index, isPrevailing);3936CCG.process();3937}393839393940