GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
1
//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// This file implements support for context disambiguation of allocation
10
// calls for profile guided heap optimization. Specifically, it uses Memprof
11
// profiles which indicate context specific allocation behavior (currently
12
// distinguishing cold vs hot memory allocations). Cloning is performed to
13
// expose the cold allocation call contexts, and the allocation calls are
14
// subsequently annotated with an attribute for later transformation.
15
//
16
// The transformations can be performed either directly on IR (regular LTO), or
17
// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
18
// Both types of LTO operate on the same base graph representation, which
19
// uses CRTP to support either IR or Index formats.
20
//
21
//===----------------------------------------------------------------------===//
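//
// Illustrative sketch (hypothetical example, not taken from any particular
// profile): given an allocation in foo() reached from two profiled contexts,
//
//   A() -> foo() -> new   (cold)
//   B() -> foo() -> new   (not cold)
//
// the pass clones foo() so that the allocation reached via A() can be
// annotated cold while the copy reached via B() remains not cold.
//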
22
23
#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
24
#include "llvm/ADT/DenseMap.h"
25
#include "llvm/ADT/DenseSet.h"
26
#include "llvm/ADT/MapVector.h"
27
#include "llvm/ADT/SetOperations.h"
28
#include "llvm/ADT/SmallPtrSet.h"
29
#include "llvm/ADT/SmallSet.h"
30
#include "llvm/ADT/SmallVector.h"
31
#include "llvm/ADT/Statistic.h"
32
#include "llvm/Analysis/MemoryProfileInfo.h"
33
#include "llvm/Analysis/ModuleSummaryAnalysis.h"
34
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
35
#include "llvm/Bitcode/BitcodeReader.h"
36
#include "llvm/IR/Constants.h"
37
#include "llvm/IR/Instructions.h"
38
#include "llvm/IR/Module.h"
39
#include "llvm/IR/ModuleSummaryIndex.h"
40
#include "llvm/Pass.h"
41
#include "llvm/Support/CommandLine.h"
42
#include "llvm/Support/FileSystem.h"
43
#include "llvm/Support/GraphWriter.h"
44
#include "llvm/Support/raw_ostream.h"
45
#include "llvm/Transforms/IPO.h"
46
#include "llvm/Transforms/Utils/Cloning.h"
47
#include <deque>
48
#include <sstream>
49
#include <unordered_map>
50
#include <vector>
51
using namespace llvm;
52
using namespace llvm::memprof;
53
54
#define DEBUG_TYPE "memprof-context-disambiguation"
55
56
STATISTIC(FunctionClonesAnalysis,
57
"Number of function clones created during whole program analysis");
58
STATISTIC(FunctionClonesThinBackend,
59
"Number of function clones created during ThinLTO backend");
60
STATISTIC(FunctionsClonedThinBackend,
61
"Number of functions that had clones created during ThinLTO backend");
62
STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
63
"cloned) during whole program analysis");
64
STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
65
"during whole program analysis");
66
STATISTIC(AllocTypeNotColdThinBackend,
67
"Number of not cold static allocations (possibly cloned) during "
68
"ThinLTO backend");
69
STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
70
"(possibly cloned) during ThinLTO backend");
71
STATISTIC(OrigAllocsThinBackend,
72
"Number of original (not cloned) allocations with memprof profiles "
73
"during ThinLTO backend");
74
STATISTIC(
75
AllocVersionsThinBackend,
76
"Number of allocation versions (including clones) during ThinLTO backend");
77
STATISTIC(MaxAllocVersionsThinBackend,
78
"Maximum number of allocation versions created for an original "
79
"allocation during ThinLTO backend");
80
STATISTIC(UnclonableAllocsThinBackend,
81
"Number of unclonable ambigous allocations during ThinLTO backend");
82
STATISTIC(RemovedEdgesWithMismatchedCallees,
83
"Number of edges removed due to mismatched callees (profiled vs IR)");
84
STATISTIC(FoundProfiledCalleeCount,
85
"Number of profiled callees found via tail calls");
86
STATISTIC(FoundProfiledCalleeDepth,
87
"Aggregate depth of profiled callees found via tail calls");
88
STATISTIC(FoundProfiledCalleeMaxDepth,
89
"Maximum depth of profiled callees found via tail calls");
90
STATISTIC(FoundProfiledCalleeNonUniquelyCount,
91
"Number of profiled callees found via multiple tail call chains");
92
93
static cl::opt<std::string> DotFilePathPrefix(
94
"memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
95
cl::value_desc("filename"),
96
cl::desc("Specify the path prefix of the MemProf dot files."));
97
98
static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
99
cl::Hidden,
100
cl::desc("Export graph to dot files."));
101
102
static cl::opt<bool>
103
DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
104
cl::desc("Dump CallingContextGraph to stdout after each stage."));
105
106
static cl::opt<bool>
107
VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
108
cl::desc("Perform verification checks on CallingContextGraph."));
109
110
static cl::opt<bool>
111
VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
112
cl::desc("Perform frequent verification checks on nodes."));
113
114
static cl::opt<std::string> MemProfImportSummary(
115
"memprof-import-summary",
116
cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
117
cl::Hidden);
118
119
static cl::opt<unsigned>
120
TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
121
cl::Hidden,
122
cl::desc("Max depth to recursively search for missing "
123
"frames through tail calls."));
124
125
namespace llvm {
126
cl::opt<bool> EnableMemProfContextDisambiguation(
127
"enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
128
cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));
129
130
// Indicate we are linking with an allocator that supports hot/cold operator
131
// new interfaces.
132
cl::opt<bool> SupportsHotColdNew(
133
"supports-hot-cold-new", cl::init(false), cl::Hidden,
134
cl::desc("Linking with hot/cold operator new interfaces"));
135
} // namespace llvm
136
137
extern cl::opt<bool> MemProfReportHintedSizes;
138
139
namespace {
140
/// CRTP base for graphs built from either IR or ThinLTO summary index.
141
///
142
/// The graph represents the call contexts in all memprof metadata on allocation
143
/// calls, with nodes for the allocations themselves, as well as for the calls
144
/// in each context. The graph is initially built from the allocation memprof
145
/// metadata (or summary) MIBs. It is then updated to match calls with callsite
146
/// metadata onto the nodes, updating it to reflect any inlining performed on
147
/// those calls.
148
///
149
/// Each MIB (representing an allocation's call context with allocation
150
/// behavior) is assigned a unique context id during the graph build. The edges
151
/// and nodes in the graph are decorated with the context ids they carry. This
152
/// is used to correctly update the graph when cloning is performed so that we
153
/// can uniquify the context for a single (possibly cloned) allocation.
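///
/// For example (hypothetical ids): if context 1 is A -> foo -> alloc and
/// context 2 is B -> foo -> alloc, the edge between the foo and alloc nodes
/// carries ids {1, 2}, while the A -> foo and B -> foo edges carry {1} and {2}
/// respectively. Cloning foo for context 1 moves id 1 onto the clone's edges,
/// keeping the two contexts distinguishable.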
154
template <typename DerivedCCG, typename FuncTy, typename CallTy>
155
class CallsiteContextGraph {
156
public:
157
CallsiteContextGraph() = default;
158
CallsiteContextGraph(const CallsiteContextGraph &) = default;
159
CallsiteContextGraph(CallsiteContextGraph &&) = default;
160
161
/// Main entry point to perform analysis and transformations on graph.
162
bool process();
163
164
/// Perform cloning on the graph necessary to uniquely identify the allocation
165
/// behavior of an allocation based on its context.
166
void identifyClones();
167
168
/// Assign callsite clones to functions, cloning functions as needed to
169
/// accommodate the combinations of their callsite clones reached by callers.
170
/// For regular LTO this clones functions and callsites in the IR, but for
171
/// ThinLTO the cloning decisions are noted in the summaries and later applied
172
/// in applyImport.
173
bool assignFunctions();
174
175
void dump() const;
176
void print(raw_ostream &OS) const;
177
void printTotalSizes(raw_ostream &OS) const;
178
179
friend raw_ostream &operator<<(raw_ostream &OS,
180
const CallsiteContextGraph &CCG) {
181
CCG.print(OS);
182
return OS;
183
}
184
185
friend struct GraphTraits<
186
const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
187
friend struct DOTGraphTraits<
188
const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
189
190
void exportToDot(std::string Label) const;
191
192
/// Represents a function clone via FuncTy pointer and clone number pair.
193
struct FuncInfo final
194
: public std::pair<FuncTy *, unsigned /*Clone number*/> {
195
using Base = std::pair<FuncTy *, unsigned>;
196
FuncInfo(const Base &B) : Base(B) {}
197
FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
198
explicit operator bool() const { return this->first != nullptr; }
199
FuncTy *func() const { return this->first; }
200
unsigned cloneNo() const { return this->second; }
201
};
202
203
/// Represents a callsite clone via CallTy and clone number pair.
204
struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
205
using Base = std::pair<CallTy, unsigned>;
206
CallInfo(const Base &B) : Base(B) {}
207
CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
208
: Base(Call, CloneNo) {}
209
explicit operator bool() const { return (bool)this->first; }
210
CallTy call() const { return this->first; }
211
unsigned cloneNo() const { return this->second; }
212
void setCloneNo(unsigned N) { this->second = N; }
213
void print(raw_ostream &OS) const {
214
if (!operator bool()) {
215
assert(!cloneNo());
216
OS << "null Call";
217
return;
218
}
219
call()->print(OS);
220
OS << "\t(clone " << cloneNo() << ")";
221
}
222
void dump() const {
223
print(dbgs());
224
dbgs() << "\n";
225
}
226
friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
227
Call.print(OS);
228
return OS;
229
}
230
};
231
232
struct ContextEdge;
233
234
/// Node in the Callsite Context Graph
235
struct ContextNode {
236
// Keep this for now since in the IR case, where the call is an Instruction*,
// whether it is an allocation is not as immediately discoverable. Used for
// printing richer information when dumping the graph.
239
bool IsAllocation;
240
241
// Keeps track of when the Call was reset to null because there was
242
// recursion.
243
bool Recursive = false;
244
245
// The corresponding allocation or interior call.
246
CallInfo Call;
247
248
// For alloc nodes this is a unique id assigned when constructed, and for
249
// callsite stack nodes it is the original stack id when the node is
250
// constructed from the memprof MIB metadata on the alloc nodes. Note that
251
// this is only used when matching callsite metadata onto the stack nodes
252
// created when processing the allocation memprof MIBs, and for labeling
253
// nodes in the dot graph. Therefore we don't bother to assign a value for
254
// clones.
255
uint64_t OrigStackOrAllocId = 0;
256
257
// This will be formed by ORing together the AllocationType enum values
258
// for contexts including this node.
259
uint8_t AllocTypes = 0;
260
261
// Edges to all callees in the profiled call stacks.
262
// TODO: Should this be a map (from Callee node) for more efficient lookup?
263
std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
264
265
// Edges to all callers in the profiled call stacks.
266
// TODO: Should this be a map (from Caller node) for more efficient lookup?
267
std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
268
269
// Get the list of edges from which we can compute allocation information
270
// such as the context ids and allocation type of this node.
271
const std::vector<std::shared_ptr<ContextEdge>> *
272
getEdgesWithAllocInfo() const {
273
// If node has any callees, compute from those, otherwise compute from
274
// callers (i.e. if this is the leaf allocation node).
275
if (!CalleeEdges.empty())
276
return &CalleeEdges;
277
if (!CallerEdges.empty()) {
278
// A node with caller edges but no callee edges must be the allocation
279
// node.
280
assert(IsAllocation);
281
return &CallerEdges;
282
}
283
return nullptr;
284
}
285
286
// Compute the context ids for this node from the union of its edge context
287
// ids.
288
DenseSet<uint32_t> getContextIds() const {
289
DenseSet<uint32_t> ContextIds;
290
auto *Edges = getEdgesWithAllocInfo();
291
if (!Edges)
292
return {};
293
unsigned Count = 0;
294
for (auto &Edge : *Edges)
295
Count += Edge->getContextIds().size();
296
ContextIds.reserve(Count);
297
for (auto &Edge : *Edges)
298
ContextIds.insert(Edge->getContextIds().begin(),
299
Edge->getContextIds().end());
300
return ContextIds;
301
}
302
303
// Compute the allocation type for this node from the OR of its edge
304
// allocation types.
305
uint8_t computeAllocType() const {
306
auto *Edges = getEdgesWithAllocInfo();
307
if (!Edges)
308
return (uint8_t)AllocationType::None;
309
uint8_t BothTypes =
310
(uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
311
uint8_t AllocType = (uint8_t)AllocationType::None;
312
for (auto &Edge : *Edges) {
313
AllocType |= Edge->AllocTypes;
314
// Bail early if alloc type reached both, no further refinement.
315
if (AllocType == BothTypes)
316
return AllocType;
317
}
318
return AllocType;
319
}
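// Example for computeAllocType() above (hypothetical): an edge carrying only
// Cold contexts contributes the Cold bit, an edge carrying only NotCold
// contexts contributes the NotCold bit, and a node reached by both yields
// Cold | NotCold, at which point the loop bails early.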
320
321
// The context ids set for this node is empty if its edge context ids are
322
// also all empty.
323
bool emptyContextIds() const {
324
auto *Edges = getEdgesWithAllocInfo();
325
if (!Edges)
326
return true;
327
for (auto &Edge : *Edges) {
328
if (!Edge->getContextIds().empty())
329
return false;
330
}
331
return true;
332
}
333
334
// List of clones of this ContextNode, initially empty.
335
std::vector<ContextNode *> Clones;
336
337
// If a clone, points to the original uncloned node.
338
ContextNode *CloneOf = nullptr;
339
340
ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
341
342
ContextNode(bool IsAllocation, CallInfo C)
343
: IsAllocation(IsAllocation), Call(C) {}
344
345
void addClone(ContextNode *Clone) {
346
if (CloneOf) {
347
CloneOf->Clones.push_back(Clone);
348
Clone->CloneOf = CloneOf;
349
} else {
350
Clones.push_back(Clone);
351
assert(!Clone->CloneOf);
352
Clone->CloneOf = this;
353
}
354
}
355
356
ContextNode *getOrigNode() {
357
if (!CloneOf)
358
return this;
359
return CloneOf;
360
}
361
362
void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
363
unsigned int ContextId);
364
365
ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
366
ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
367
void eraseCalleeEdge(const ContextEdge *Edge);
368
void eraseCallerEdge(const ContextEdge *Edge);
369
370
void setCall(CallInfo C) { Call = C; }
371
372
bool hasCall() const { return (bool)Call.call(); }
373
374
void printCall(raw_ostream &OS) const { Call.print(OS); }
375
376
// True if this node was effectively removed from the graph, in which case
377
// it should have an allocation type of None and empty context ids.
378
bool isRemoved() const {
379
assert((AllocTypes == (uint8_t)AllocationType::None) ==
380
emptyContextIds());
381
return AllocTypes == (uint8_t)AllocationType::None;
382
}
383
384
void dump() const;
385
void print(raw_ostream &OS) const;
386
387
friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
388
Node.print(OS);
389
return OS;
390
}
391
};
392
393
/// Edge in the Callsite Context Graph from a ContextNode N to a caller or
394
/// callee.
395
struct ContextEdge {
396
ContextNode *Callee;
397
ContextNode *Caller;
398
399
// This will be formed by ORing together the AllocationType enum values
400
// for contexts including this edge.
401
uint8_t AllocTypes = 0;
402
403
// The set of IDs for contexts including this edge.
404
DenseSet<uint32_t> ContextIds;
405
406
ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
407
DenseSet<uint32_t> ContextIds)
408
: Callee(Callee), Caller(Caller), AllocTypes(AllocType),
409
ContextIds(std::move(ContextIds)) {}
410
411
DenseSet<uint32_t> &getContextIds() { return ContextIds; }
412
413
void dump() const;
414
void print(raw_ostream &OS) const;
415
416
friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
417
Edge.print(OS);
418
return OS;
419
}
420
};
421
422
/// Helpers to remove callee edges that have allocation type None (due to not
423
/// carrying any context ids) after transformations.
424
void removeNoneTypeCalleeEdges(ContextNode *Node);
425
void
426
recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
427
DenseSet<const ContextNode *> &Visited);
428
429
protected:
430
/// Get a list of nodes corresponding to the stack ids in the given callsite
431
/// context.
432
template <class NodeT, class IteratorT>
433
std::vector<uint64_t>
434
getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
435
436
/// Adds nodes for the given allocation and any stack ids on its memprof MIB
437
/// metadata (or summary).
438
ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
439
440
/// Adds nodes for the given MIB stack ids.
441
template <class NodeT, class IteratorT>
442
void addStackNodesForMIB(ContextNode *AllocNode,
443
CallStack<NodeT, IteratorT> &StackContext,
444
CallStack<NodeT, IteratorT> &CallsiteContext,
445
AllocationType AllocType, uint64_t TotalSize);
446
447
/// Matches all callsite metadata (or summary) to the nodes created for
448
/// allocation memprof MIB metadata, synthesizing new nodes to reflect any
449
/// inlining performed on those callsite instructions.
450
void updateStackNodes();
451
452
/// Update graph to conservatively handle any callsite stack nodes that target
453
/// multiple different target functions.
454
void handleCallsitesWithMultipleTargets();
455
456
/// Save lists of calls with MemProf metadata in each function, for faster
457
/// iteration.
458
MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
459
460
/// Map from callsite node to the enclosing caller function.
461
std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
462
463
private:
464
using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
465
466
using CallContextInfo = std::tuple<CallTy, std::vector<uint64_t>,
467
const FuncTy *, DenseSet<uint32_t>>;
468
469
/// Assigns the given Node to calls at or inlined into the location with
470
/// the Node's stack id, after post order traversing and processing its
471
/// caller nodes. Uses the call information recorded in the given
472
/// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
473
/// as needed. Called by updateStackNodes which sets up the given
474
/// StackIdToMatchingCalls map.
475
void assignStackNodesPostOrder(
476
ContextNode *Node, DenseSet<const ContextNode *> &Visited,
477
DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls);
478
479
/// Duplicates the given set of context ids, updating the provided
480
/// map from each original id with the newly generated context ids,
481
/// and returning the new duplicated id set.
482
DenseSet<uint32_t> duplicateContextIds(
483
const DenseSet<uint32_t> &StackSequenceContextIds,
484
DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
485
486
/// Propagates all duplicated context ids across the graph.
487
void propagateDuplicateContextIds(
488
const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
489
490
/// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
491
/// else to its callers. Also updates OrigNode's edges to remove any context
492
/// ids moved to the newly created edge.
493
void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
494
bool TowardsCallee,
495
DenseSet<uint32_t> RemainingContextIds);
496
497
/// Get the stack id corresponding to the given Id or Index (for IR this will
498
/// return itself, for a summary index this will return the id recorded in the
499
/// index for that stack id index value).
500
uint64_t getStackId(uint64_t IdOrIndex) const {
501
return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
502
}
503
504
/// Returns true if the given call targets the callee of the given edge, or if
505
/// we were able to identify the call chain through intermediate tail calls.
506
/// In the latter case new context nodes are added to the graph for the
507
/// identified tail calls, and their synthesized nodes are added to
508
/// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for
509
/// the updated edges and to prepare it for an increment in the caller.
510
bool
511
calleesMatch(CallTy Call, EdgeIter &EI,
512
MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
513
514
/// Returns true if the given call targets the given function, or if we were
515
/// able to identify the call chain through intermediate tail calls (in which
516
/// case FoundCalleeChain will be populated).
517
bool calleeMatchesFunc(
518
CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
519
std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
520
return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
521
Call, Func, CallerFunc, FoundCalleeChain);
522
}
523
524
/// Get a list of nodes corresponding to the stack ids in the given
525
/// callsite's context.
526
std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
527
return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
528
Call);
529
}
530
531
/// Get the last stack id in the context for callsite.
532
uint64_t getLastStackId(CallTy Call) {
533
return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
534
}
535
536
/// Update the allocation call to record type of allocated memory.
537
void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
538
AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
539
static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
540
}
541
542
/// Update non-allocation call to invoke (possibly cloned) function
543
/// CalleeFunc.
544
void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
545
static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
546
}
547
548
/// Clone the given function for the given callsite, recording mapping of all
549
/// of the function's tracked calls to their new versions in the CallMap.
550
/// Assigns new clones to clone number CloneNo.
551
FuncInfo cloneFunctionForCallsite(
552
FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
553
std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
554
return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
555
Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
556
}
557
558
/// Gets a label to use in the dot graph for the given call clone in the given
559
/// function.
560
std::string getLabel(const FuncTy *Func, const CallTy Call,
561
unsigned CloneNo) const {
562
return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
563
}
564
565
/// Helpers to find the node corresponding to the given call or stackid.
566
ContextNode *getNodeForInst(const CallInfo &C);
567
ContextNode *getNodeForAlloc(const CallInfo &C);
568
ContextNode *getNodeForStackId(uint64_t StackId);
569
570
/// Computes the alloc type corresponding to the given context ids, by
571
/// unioning their recorded alloc types.
572
uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds);
573
574
/// Returns the allocation type of the intersection of the contexts of two
575
/// nodes (based on their provided context id sets), optimized for the case
576
/// when Node1Ids is smaller than Node2Ids.
577
uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
578
const DenseSet<uint32_t> &Node2Ids);
579
580
/// Returns the allocation type of the intersection of the contexts of two
581
/// nodes (based on their provided context id sets).
582
uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
583
const DenseSet<uint32_t> &Node2Ids);
584
585
/// Create a clone of Edge's callee and move Edge to that new callee node,
586
/// performing the necessary context id and allocation type updates.
587
/// If callee's caller edge iterator is supplied, it is updated when removing
588
/// the edge from that list. If ContextIdsToMove is non-empty, only that
589
/// subset of Edge's ids are moved to an edge to the new callee.
590
ContextNode *
591
moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
592
EdgeIter *CallerEdgeI = nullptr,
593
DenseSet<uint32_t> ContextIdsToMove = {});
594
595
/// Change the callee of Edge to existing callee clone NewCallee, performing
596
/// the necessary context id and allocation type updates.
597
/// If callee's caller edge iterator is supplied, it is updated when removing
598
/// the edge from that list. If ContextIdsToMove is non-empty, only that
599
/// subset of Edge's ids are moved to an edge to the new callee.
600
void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
601
ContextNode *NewCallee,
602
EdgeIter *CallerEdgeI = nullptr,
603
bool NewClone = false,
604
DenseSet<uint32_t> ContextIdsToMove = {});
605
606
/// Recursively perform cloning on the graph for the given Node and its
607
/// callers, in order to uniquely identify the allocation behavior of an
608
/// allocation given its context. The context ids of the allocation being
609
/// processed are given in AllocContextIds.
610
void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
611
const DenseSet<uint32_t> &AllocContextIds);
612
613
/// Map from each context ID to the AllocationType assigned to that context.
614
DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;
615
616
/// Map from each contextID to the profiled aggregate allocation size,
617
/// optionally populated when requested (via MemProfReportHintedSizes).
618
DenseMap<uint32_t, uint64_t> ContextIdToTotalSize;
619
620
/// Identifies the context node created for a stack id when adding the MIB
621
/// contexts to the graph. This is used to locate the context nodes when
622
/// trying to assign the corresponding callsites with those stack ids to these
623
/// nodes.
624
DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
625
626
/// Maps to track the calls to their corresponding nodes in the graph.
627
MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
628
MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
629
630
/// Owner of all ContextNode unique_ptrs.
631
std::vector<std::unique_ptr<ContextNode>> NodeOwner;
632
633
/// Perform sanity checks on graph when requested.
634
void check() const;
635
636
/// Keeps track of the last unique context id assigned.
637
unsigned int LastContextId = 0;
638
};
639
640
template <typename DerivedCCG, typename FuncTy, typename CallTy>
641
using ContextNode =
642
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
643
template <typename DerivedCCG, typename FuncTy, typename CallTy>
644
using ContextEdge =
645
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
646
template <typename DerivedCCG, typename FuncTy, typename CallTy>
647
using FuncInfo =
648
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
649
template <typename DerivedCCG, typename FuncTy, typename CallTy>
650
using CallInfo =
651
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
652
653
/// CRTP derived class for graphs built from IR (regular LTO).
654
class ModuleCallsiteContextGraph
655
: public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
656
Instruction *> {
657
public:
658
ModuleCallsiteContextGraph(
659
Module &M,
660
llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
661
662
private:
663
friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
664
Instruction *>;
665
666
uint64_t getStackId(uint64_t IdOrIndex) const;
667
bool calleeMatchesFunc(
668
Instruction *Call, const Function *Func, const Function *CallerFunc,
669
std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
670
bool findProfiledCalleeThroughTailCalls(
671
const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
672
std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
673
bool &FoundMultipleCalleeChains);
674
uint64_t getLastStackId(Instruction *Call);
675
std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
676
void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
677
void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
678
CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
679
Instruction *>::FuncInfo
680
cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
681
std::map<CallInfo, CallInfo> &CallMap,
682
std::vector<CallInfo> &CallsWithMetadataInFunc,
683
unsigned CloneNo);
684
std::string getLabel(const Function *Func, const Instruction *Call,
685
unsigned CloneNo) const;
686
687
const Module &Mod;
688
llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
689
};
690
691
/// Represents a call in the summary index graph, which can either be an
692
/// allocation or an interior callsite node in an allocation's context.
693
/// Holds a pointer to the corresponding data structure in the index.
694
struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
695
IndexCall() : PointerUnion() {}
696
IndexCall(std::nullptr_t) : IndexCall() {}
697
IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
698
IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
699
IndexCall(PointerUnion PT) : PointerUnion(PT) {}
700
701
IndexCall *operator->() { return this; }
702
703
PointerUnion<CallsiteInfo *, AllocInfo *> getBase() const { return *this; }
704
705
void print(raw_ostream &OS) const {
706
if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(getBase())) {
707
OS << *AI;
708
} else {
709
auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(getBase());
710
assert(CI);
711
OS << *CI;
712
}
713
}
714
};
715
716
/// CRTP derived class for graphs built from summary index (ThinLTO).
717
class IndexCallsiteContextGraph
718
: public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
719
IndexCall> {
720
public:
721
IndexCallsiteContextGraph(
722
ModuleSummaryIndex &Index,
723
llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
724
isPrevailing);
725
726
~IndexCallsiteContextGraph() {
727
// Now that we are done with the graph it is safe to add the new
728
// CallsiteInfo structs to the function summary vectors. The graph nodes
729
// point into locations within these vectors, so we don't want to add them
730
// any earlier.
731
for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
732
auto *FS = I.first;
733
for (auto &Callsite : I.second)
734
FS->addCallsite(*Callsite.second);
735
}
736
}
737
738
private:
739
friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
740
IndexCall>;
741
742
uint64_t getStackId(uint64_t IdOrIndex) const;
743
bool calleeMatchesFunc(
744
IndexCall &Call, const FunctionSummary *Func,
745
const FunctionSummary *CallerFunc,
746
std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
747
bool findProfiledCalleeThroughTailCalls(
748
ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
749
std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
750
bool &FoundMultipleCalleeChains);
751
uint64_t getLastStackId(IndexCall &Call);
752
std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
753
void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
754
void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
755
CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
756
IndexCall>::FuncInfo
757
cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
758
std::map<CallInfo, CallInfo> &CallMap,
759
std::vector<CallInfo> &CallsWithMetadataInFunc,
760
unsigned CloneNo);
761
std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
762
unsigned CloneNo) const;
763
764
// Saves mapping from function summaries containing memprof records back to
765
// their VIs, for use in checking and debugging.
766
std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
767
768
const ModuleSummaryIndex &Index;
769
llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
770
isPrevailing;
771
772
// Saves/owns the callsite info structures synthesized for missing tail call
773
// frames that we discover while building the graph.
774
// It maps from the summary of the function making the tail call, to a map
775
// of callee ValueInfo to corresponding synthesized callsite info.
776
std::unordered_map<FunctionSummary *,
777
std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
778
FunctionCalleesToSynthesizedCallsiteInfos;
779
};
780
} // namespace
781
782
namespace llvm {
783
template <>
784
struct DenseMapInfo<typename CallsiteContextGraph<
785
ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
786
: public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
787
template <>
788
struct DenseMapInfo<typename CallsiteContextGraph<
789
IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
790
: public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
791
template <>
792
struct DenseMapInfo<IndexCall>
793
: public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
794
} // end namespace llvm
795
796
namespace {
797
798
struct FieldSeparator {
799
bool Skip = true;
800
const char *Sep;
801
802
FieldSeparator(const char *Sep = ", ") : Sep(Sep) {}
803
};
804
805
raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) {
806
if (FS.Skip) {
807
FS.Skip = false;
808
return OS;
809
}
810
return OS << FS.Sep;
811
}
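// Usage sketch for FieldSeparator (hypothetical): with FieldSeparator FS,
// writing OS << FS << "a" << FS << "b" prints "a, b" -- the first insertion of
// FS prints nothing and later insertions print the separator.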
812
813
// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
814
// type we should actually use on the corresponding allocation.
815
// If we can't clone a node that has NotCold+Cold alloc type, we will fall
816
// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
817
// from NotCold.
818
AllocationType allocTypeToUse(uint8_t AllocTypes) {
819
assert(AllocTypes != (uint8_t)AllocationType::None);
820
if (AllocTypes ==
821
((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
822
return AllocationType::NotCold;
823
else
824
return (AllocationType)AllocTypes;
825
}
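// For example (hypothetical caller): allocTypesMatch() below treats a node
// whose edges OR to NotCold | Cold the same as a plain NotCold node, since
// allocTypeToUse() maps both to NotCold; per the comment above, cloning cannot
// usefully separate them.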
826
827
// Helper to check if the alloc types for all edges recorded in the
828
// InAllocTypes vector match the alloc types for all edges in the Edges
829
// vector.
830
template <typename DerivedCCG, typename FuncTy, typename CallTy>
831
bool allocTypesMatch(
832
const std::vector<uint8_t> &InAllocTypes,
833
const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
834
&Edges) {
835
return std::equal(
836
InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(),
837
[](const uint8_t &l,
838
const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
839
// Can share if one of the edges is None type - don't
840
// care about the type along that edge as it doesn't
841
// exist for those context ids.
842
if (l == (uint8_t)AllocationType::None ||
843
r->AllocTypes == (uint8_t)AllocationType::None)
844
return true;
845
return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
846
});
847
}
848
849
} // end anonymous namespace
850
851
template <typename DerivedCCG, typename FuncTy, typename CallTy>
852
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
853
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
854
const CallInfo &C) {
855
ContextNode *Node = getNodeForAlloc(C);
856
if (Node)
857
return Node;
858
859
return NonAllocationCallToContextNodeMap.lookup(C);
860
}
861
862
template <typename DerivedCCG, typename FuncTy, typename CallTy>
863
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
864
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
865
const CallInfo &C) {
866
return AllocationCallToContextNodeMap.lookup(C);
867
}
868
869
template <typename DerivedCCG, typename FuncTy, typename CallTy>
870
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
871
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
872
uint64_t StackId) {
873
auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
874
if (StackEntryNode != StackEntryIdToContextNodeMap.end())
875
return StackEntryNode->second;
876
return nullptr;
877
}
878
879
template <typename DerivedCCG, typename FuncTy, typename CallTy>
880
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
881
addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
882
unsigned int ContextId) {
883
for (auto &Edge : CallerEdges) {
884
if (Edge->Caller == Caller) {
885
Edge->AllocTypes |= (uint8_t)AllocType;
886
Edge->getContextIds().insert(ContextId);
887
return;
888
}
889
}
890
std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
891
this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
892
CallerEdges.push_back(Edge);
893
Caller->CalleeEdges.push_back(Edge);
894
}
895
896
template <typename DerivedCCG, typename FuncTy, typename CallTy>
897
void CallsiteContextGraph<
898
DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
899
for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
900
auto Edge = *EI;
901
if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
902
assert(Edge->ContextIds.empty());
903
Edge->Callee->eraseCallerEdge(Edge.get());
904
EI = Node->CalleeEdges.erase(EI);
905
} else
906
++EI;
907
}
908
}
909
910
template <typename DerivedCCG, typename FuncTy, typename CallTy>
911
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
912
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
913
findEdgeFromCallee(const ContextNode *Callee) {
914
for (const auto &Edge : CalleeEdges)
915
if (Edge->Callee == Callee)
916
return Edge.get();
917
return nullptr;
918
}
919
920
template <typename DerivedCCG, typename FuncTy, typename CallTy>
921
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
922
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
923
findEdgeFromCaller(const ContextNode *Caller) {
924
for (const auto &Edge : CallerEdges)
925
if (Edge->Caller == Caller)
926
return Edge.get();
927
return nullptr;
928
}
929
930
template <typename DerivedCCG, typename FuncTy, typename CallTy>
931
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
932
eraseCalleeEdge(const ContextEdge *Edge) {
933
auto EI = llvm::find_if(
934
CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
935
return CalleeEdge.get() == Edge;
936
});
937
assert(EI != CalleeEdges.end());
938
CalleeEdges.erase(EI);
939
}
940
941
template <typename DerivedCCG, typename FuncTy, typename CallTy>
942
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
943
eraseCallerEdge(const ContextEdge *Edge) {
944
auto EI = llvm::find_if(
945
CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
946
return CallerEdge.get() == Edge;
947
});
948
assert(EI != CallerEdges.end());
949
CallerEdges.erase(EI);
950
}
951
952
template <typename DerivedCCG, typename FuncTy, typename CallTy>
953
uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
954
DenseSet<uint32_t> &ContextIds) {
955
uint8_t BothTypes =
956
(uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
957
uint8_t AllocType = (uint8_t)AllocationType::None;
958
for (auto Id : ContextIds) {
959
AllocType |= (uint8_t)ContextIdToAllocationType[Id];
960
// Bail early if alloc type reached both, no further refinement.
961
if (AllocType == BothTypes)
962
return AllocType;
963
}
964
return AllocType;
965
}
966
967
template <typename DerivedCCG, typename FuncTy, typename CallTy>
968
uint8_t
969
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
970
const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
971
uint8_t BothTypes =
972
(uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
973
uint8_t AllocType = (uint8_t)AllocationType::None;
974
for (auto Id : Node1Ids) {
975
if (!Node2Ids.count(Id))
976
continue;
977
AllocType |= (uint8_t)ContextIdToAllocationType[Id];
978
// Bail early if alloc type reached both, no further refinement.
979
if (AllocType == BothTypes)
980
return AllocType;
981
}
982
return AllocType;
983
}
984
985
template <typename DerivedCCG, typename FuncTy, typename CallTy>
986
uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
987
const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
988
if (Node1Ids.size() < Node2Ids.size())
989
return intersectAllocTypesImpl(Node1Ids, Node2Ids);
990
else
991
return intersectAllocTypesImpl(Node2Ids, Node1Ids);
992
}
993
994
template <typename DerivedCCG, typename FuncTy, typename CallTy>
995
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
996
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
997
CallInfo Call, const FuncTy *F) {
998
assert(!getNodeForAlloc(Call));
999
NodeOwner.push_back(
1000
std::make_unique<ContextNode>(/*IsAllocation=*/true, Call));
1001
ContextNode *AllocNode = NodeOwner.back().get();
1002
AllocationCallToContextNodeMap[Call] = AllocNode;
1003
NodeToCallingFunc[AllocNode] = F;
1004
// Use LastContextId as a unique id for MIB allocation nodes.
1005
AllocNode->OrigStackOrAllocId = LastContextId;
1006
// Alloc type should be updated as we add in the MIBs. We should assert
1007
// afterwards that it is not still None.
1008
AllocNode->AllocTypes = (uint8_t)AllocationType::None;
1009
1010
return AllocNode;
1011
}
1012
1013
static std::string getAllocTypeString(uint8_t AllocTypes) {
1014
if (!AllocTypes)
1015
return "None";
1016
std::string Str;
1017
if (AllocTypes & (uint8_t)AllocationType::NotCold)
1018
Str += "NotCold";
1019
if (AllocTypes & (uint8_t)AllocationType::Cold)
1020
Str += "Cold";
1021
return Str;
1022
}
1023
1024
template <typename DerivedCCG, typename FuncTy, typename CallTy>
1025
template <class NodeT, class IteratorT>
1026
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
1027
ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
1028
CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
1029
uint64_t TotalSize) {
1030
assert(!MemProfReportHintedSizes || TotalSize > 0);
1031
// Treat the hot alloc type as NotCold until the disambiguation for "hot" is
// done.
1033
if (AllocType == AllocationType::Hot)
1034
AllocType = AllocationType::NotCold;
1035
1036
ContextIdToAllocationType[++LastContextId] = AllocType;
1037
1038
if (MemProfReportHintedSizes) {
1039
assert(TotalSize);
1040
ContextIdToTotalSize[LastContextId] = TotalSize;
1041
}
1042
1043
// Update alloc type and context ids for this MIB.
1044
AllocNode->AllocTypes |= (uint8_t)AllocType;
1045
1046
// Now add or update nodes for each stack id in alloc's context.
1047
// Later when processing the stack ids on non-alloc callsites we will adjust
1048
// for any inlining in the context.
1049
ContextNode *PrevNode = AllocNode;
1050
// Look for recursion (direct recursion should have been collapsed by
1051
// module summary analysis, here we should just be detecting mutual
1052
// recursion). Mark these nodes so we don't try to clone.
1053
SmallSet<uint64_t, 8> StackIdSet;
1054
// Skip any stack ids on the allocation call itself (i.e. inlined into it).
1055
for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
1056
ContextIter != StackContext.end(); ++ContextIter) {
1057
auto StackId = getStackId(*ContextIter);
1058
ContextNode *StackNode = getNodeForStackId(StackId);
1059
if (!StackNode) {
1060
NodeOwner.push_back(
1061
std::make_unique<ContextNode>(/*IsAllocation=*/false));
1062
StackNode = NodeOwner.back().get();
1063
StackEntryIdToContextNodeMap[StackId] = StackNode;
1064
StackNode->OrigStackOrAllocId = StackId;
1065
}
1066
auto Ins = StackIdSet.insert(StackId);
1067
if (!Ins.second)
1068
StackNode->Recursive = true;
1069
StackNode->AllocTypes |= (uint8_t)AllocType;
1070
PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
1071
PrevNode = StackNode;
1072
}
1073
}
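// Illustrative sketch of the above (hypothetical stack ids): an MIB with stack
// ids [S1, S2] for allocation node N gets a fresh context id C and produces
// caller edges N <- node(S1) <- node(S2), with each edge and node tagged with
// C and the MIB's alloc type. A later MIB sharing a prefix reuses the existing
// stack nodes and simply adds its own context id to the shared edges.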
1074
1075
template <typename DerivedCCG, typename FuncTy, typename CallTy>
1076
DenseSet<uint32_t>
1077
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
1078
const DenseSet<uint32_t> &StackSequenceContextIds,
1079
DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1080
DenseSet<uint32_t> NewContextIds;
1081
for (auto OldId : StackSequenceContextIds) {
1082
NewContextIds.insert(++LastContextId);
1083
OldToNewContextIds[OldId].insert(LastContextId);
1084
assert(ContextIdToAllocationType.count(OldId));
1085
// The new context has the same allocation type as original.
1086
ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1087
// For now set this to 0 so we don't duplicate sizes. Not clear how to divvy
1088
// up the size. Assume that if we are able to duplicate context ids, we
1089
// will be able to disambiguate all copies.
1090
ContextIdToTotalSize[LastContextId] = 0;
1091
}
1092
return NewContextIds;
1093
}
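// For example (hypothetical ids): duplicating {3, 7} when LastContextId is 10
// yields two fresh ids {11, 12}, records one new id per old id in
// OldToNewContextIds, and assigns each new id the allocation type of the old
// id it was copied from.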
1094
1095
template <typename DerivedCCG, typename FuncTy, typename CallTy>
1096
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1097
propagateDuplicateContextIds(
1098
const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1099
// Build a set of duplicated context ids corresponding to the input id set.
1100
auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
1101
DenseSet<uint32_t> NewIds;
1102
for (auto Id : ContextIds)
1103
if (auto NewId = OldToNewContextIds.find(Id);
1104
NewId != OldToNewContextIds.end())
1105
NewIds.insert(NewId->second.begin(), NewId->second.end());
1106
return NewIds;
1107
};
1108
1109
// Recursively update context ids sets along caller edges.
1110
auto UpdateCallers = [&](ContextNode *Node,
1111
DenseSet<const ContextEdge *> &Visited,
1112
auto &&UpdateCallers) -> void {
1113
for (const auto &Edge : Node->CallerEdges) {
1114
auto Inserted = Visited.insert(Edge.get());
1115
if (!Inserted.second)
1116
continue;
1117
ContextNode *NextNode = Edge->Caller;
1118
DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
1119
// Only need to recursively iterate to NextNode via this caller edge if
1120
// it resulted in any added ids to NextNode.
1121
if (!NewIdsToAdd.empty()) {
1122
Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
1123
UpdateCallers(NextNode, Visited, UpdateCallers);
1124
}
1125
}
1126
};
1127
1128
DenseSet<const ContextEdge *> Visited;
1129
for (auto &Entry : AllocationCallToContextNodeMap) {
1130
auto *Node = Entry.second;
1131
UpdateCallers(Node, Visited, UpdateCallers);
1132
}
1133
}
1134
1135
template <typename DerivedCCG, typename FuncTy, typename CallTy>
1136
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
1137
ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
1138
// This must be passed by value to make a copy since it will be adjusted
1139
// as ids are moved.
1140
DenseSet<uint32_t> RemainingContextIds) {
1141
auto &OrigEdges =
1142
TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
1143
// Increment iterator in loop so that we can remove edges as needed.
1144
for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
1145
auto Edge = *EI;
1146
// Remove any matching context ids from Edge; the set that was found and
// removed becomes the new edge's context ids. Also update the remaining
// (not found) ids.
1149
DenseSet<uint32_t> NewEdgeContextIds, NotFoundContextIds;
1150
set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1151
NotFoundContextIds);
1152
RemainingContextIds.swap(NotFoundContextIds);
1153
// If no matching context ids for this edge, skip it.
1154
if (NewEdgeContextIds.empty()) {
1155
++EI;
1156
continue;
1157
}
1158
if (TowardsCallee) {
1159
uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1160
auto NewEdge = std::make_shared<ContextEdge>(
1161
Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
1162
NewNode->CalleeEdges.push_back(NewEdge);
1163
NewEdge->Callee->CallerEdges.push_back(NewEdge);
1164
} else {
1165
uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1166
auto NewEdge = std::make_shared<ContextEdge>(
1167
NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
1168
NewNode->CallerEdges.push_back(NewEdge);
1169
NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1170
}
1171
// Remove old edge if context ids empty.
1172
if (Edge->getContextIds().empty()) {
1173
if (TowardsCallee) {
1174
Edge->Callee->eraseCallerEdge(Edge.get());
1175
EI = OrigNode->CalleeEdges.erase(EI);
1176
} else {
1177
Edge->Caller->eraseCalleeEdge(Edge.get());
1178
EI = OrigNode->CallerEdges.erase(EI);
1179
}
1180
continue;
1181
}
1182
++EI;
1183
}
1184
}
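// Illustrative note (hypothetical ids): if NewNode is created for an inlined
// sequence covering context ids {5, 6}, calling connectNewNode() towards
// OrigNode's callees moves ids 5 and 6 from OrigNode's callee edges onto new
// edges from NewNode, and erases any original edge whose id set becomes empty.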
1185
1186
template <typename DerivedCCG, typename FuncTy, typename CallTy>
1187
static void checkEdge(
1188
const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
1189
// Confirm that alloc type is not None and that we have at least one context
1190
// id.
1191
assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
1192
assert(!Edge->ContextIds.empty());
1193
}
1194
1195
template <typename DerivedCCG, typename FuncTy, typename CallTy>
1196
static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
1197
bool CheckEdges = true) {
1198
if (Node->isRemoved())
1199
return;
1200
#ifndef NDEBUG
1201
// Compute node's context ids once for use in asserts.
1202
auto NodeContextIds = Node->getContextIds();
1203
#endif
1204
// Node's context ids should be the union of both its callee and caller edge
1205
// context ids.
1206
if (Node->CallerEdges.size()) {
1207
DenseSet<uint32_t> CallerEdgeContextIds(
1208
Node->CallerEdges.front()->ContextIds);
1209
for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
1210
if (CheckEdges)
1211
checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1212
set_union(CallerEdgeContextIds, Edge->ContextIds);
1213
}
1214
// Node can have more context ids than callers if some contexts terminate at
1215
// node and some are longer.
1216
assert(NodeContextIds == CallerEdgeContextIds ||
1217
set_is_subset(CallerEdgeContextIds, NodeContextIds));
1218
}
1219
if (Node->CalleeEdges.size()) {
1220
DenseSet<uint32_t> CalleeEdgeContextIds(
1221
Node->CalleeEdges.front()->ContextIds);
1222
for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
1223
if (CheckEdges)
1224
checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1225
set_union(CalleeEdgeContextIds, Edge->getContextIds());
1226
}
1227
assert(NodeContextIds == CalleeEdgeContextIds);
1228
}
1229
}
1230
1231
template <typename DerivedCCG, typename FuncTy, typename CallTy>
1232
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1233
assignStackNodesPostOrder(ContextNode *Node,
1234
DenseSet<const ContextNode *> &Visited,
1235
DenseMap<uint64_t, std::vector<CallContextInfo>>
1236
&StackIdToMatchingCalls) {
1237
auto Inserted = Visited.insert(Node);
1238
if (!Inserted.second)
1239
return;
1240
// Post order traversal. Iterate over a copy since we may add nodes and
1241
// therefore new callers during the recursive call, invalidating any
1242
// iterator over the original edge vector. We don't need to process these
1243
// new nodes as they were already processed on creation.
1244
auto CallerEdges = Node->CallerEdges;
1245
for (auto &Edge : CallerEdges) {
1246
// Skip any that have been removed during the recursion.
1247
if (!Edge)
1248
continue;
1249
assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls);
1250
}
1251
1252
// If this node's stack id is in the map, update the graph to contain new
1253
// nodes representing any inlining at interior callsites. Note we move the
1254
// associated context ids over to the new nodes.
1255
1256
// Ignore this node if it is for an allocation or we didn't record any
1257
// stack id lists ending at it.
1258
if (Node->IsAllocation ||
1259
!StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1260
return;
1261
1262
auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1263
// Handle the simple case first. A single call with a single stack id.
1264
// In this case there is no need to create any new context nodes, simply
1265
// assign the context node for stack id to this Call.
1266
if (Calls.size() == 1) {
1267
auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1268
if (Ids.size() == 1) {
1269
assert(SavedContextIds.empty());
1270
// It should be this Node
1271
assert(Node == getNodeForStackId(Ids[0]));
1272
if (Node->Recursive)
1273
return;
1274
Node->setCall(Call);
1275
NonAllocationCallToContextNodeMap[Call] = Node;
1276
NodeToCallingFunc[Node] = Func;
1277
return;
1278
}
1279
}
1280
1281
// Find the node for the last stack id, which should be the same
1282
// across all calls recorded for this id, and is this node's id.
1283
uint64_t LastId = Node->OrigStackOrAllocId;
1284
ContextNode *LastNode = getNodeForStackId(LastId);
1285
// We should only have kept stack ids that had nodes.
1286
assert(LastNode);
1287
1288
for (unsigned I = 0; I < Calls.size(); I++) {
1289
auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1290
// Skip any for which we didn't assign any ids, these don't get a node in
1291
// the graph.
1292
if (SavedContextIds.empty())
1293
continue;
1294
1295
assert(LastId == Ids.back());
1296
1297
ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1298
assert(FirstNode);
1299
1300
// Recompute the context ids for this stack id sequence (the
1301
// intersection of the context ids of the corresponding nodes).
1302
// Start with the ids we saved in the map for this call, which could be
1303
// duplicated context ids. We have to recompute as we might have overlap
// between the saved context ids for different last nodes, and may have
// already removed some of them during the post order traversal.
1306
set_intersect(SavedContextIds, FirstNode->getContextIds());
1307
ContextNode *PrevNode = nullptr;
1308
for (auto Id : Ids) {
1309
ContextNode *CurNode = getNodeForStackId(Id);
1310
// We should only have kept stack ids that had nodes and weren't
1311
// recursive.
1312
assert(CurNode);
1313
assert(!CurNode->Recursive);
1314
if (!PrevNode) {
1315
PrevNode = CurNode;
1316
continue;
1317
}
1318
auto *Edge = CurNode->findEdgeFromCallee(PrevNode);
1319
if (!Edge) {
1320
SavedContextIds.clear();
1321
break;
1322
}
1323
PrevNode = CurNode;
1324
set_intersect(SavedContextIds, Edge->getContextIds());
1325
1326
// If we now have no context ids for clone, skip this call.
1327
if (SavedContextIds.empty())
1328
break;
1329
}
1330
if (SavedContextIds.empty())
1331
continue;
1332
1333
// Create new context node.
1334
NodeOwner.push_back(
1335
std::make_unique<ContextNode>(/*IsAllocation=*/false, Call));
1336
ContextNode *NewNode = NodeOwner.back().get();
1337
NodeToCallingFunc[NewNode] = Func;
1338
NonAllocationCallToContextNodeMap[Call] = NewNode;
1339
NewNode->AllocTypes = computeAllocType(SavedContextIds);
1340
1341
// Connect to callees of innermost stack frame in inlined call chain.
1342
// This updates context ids for FirstNode's callees to reflect those
1343
// moved to NewNode.
1344
connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds);
1345
1346
// Connect to callers of outermost stack frame in inlined call chain.
1347
// This updates context ids for LastNode's callers to reflect those
1348
// moved to NewNode.
1349
connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds);
1350
1351
// Now we need to remove context ids from edges/nodes between First and
1352
// Last Node.
1353
PrevNode = nullptr;
1354
for (auto Id : Ids) {
1355
ContextNode *CurNode = getNodeForStackId(Id);
1356
// We should only have kept stack ids that had nodes.
1357
assert(CurNode);
1358
1359
// Remove the context ids moved to NewNode from CurNode, and the
1360
// edge from the prior node.
1361
if (PrevNode) {
1362
auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1363
assert(PrevEdge);
1364
set_subtract(PrevEdge->getContextIds(), SavedContextIds);
1365
if (PrevEdge->getContextIds().empty()) {
1366
PrevNode->eraseCallerEdge(PrevEdge);
1367
CurNode->eraseCalleeEdge(PrevEdge);
1368
}
1369
}
1370
// Since we update the edges from leaf to tail, only look at the callee
1371
// edges. This isn't an alloc node, so if there are no callee edges, the
1372
// alloc type is None.
1373
CurNode->AllocTypes = CurNode->CalleeEdges.empty()
1374
? (uint8_t)AllocationType::None
1375
: CurNode->computeAllocType();
1376
PrevNode = CurNode;
1377
}
1378
if (VerifyNodes) {
1379
checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
1380
for (auto Id : Ids) {
1381
ContextNode *CurNode = getNodeForStackId(Id);
1382
// We should only have kept stack ids that had nodes.
1383
assert(CurNode);
1384
checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
1385
}
1386
}
1387
}
1388
}
1389
1390
template <typename DerivedCCG, typename FuncTy, typename CallTy>
1391
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
1392
// Map of stack id to all calls with that as the last (outermost caller)
1393
// callsite id that has a context node (some might not due to pruning
1394
// performed during matching of the allocation profile contexts).
1395
// The CallContextInfo contains the Call and a list of its stack ids with
1396
// ContextNodes, the function containing Call, and the set of context ids
1397
// the analysis will eventually identify for use in any new node created
1398
// for that callsite.
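// For example, a call that was inlined from another function carries the
// stack ids of both the inlined frame and its caller frame in its callsite
// metadata; such calls are the ones that may need a new node representing
// the inlined sequence below.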
1399
DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
1400
for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
1401
for (auto &Call : CallsWithMetadata) {
1402
// Ignore allocations, already handled.
1403
if (AllocationCallToContextNodeMap.count(Call))
1404
continue;
1405
auto StackIdsWithContextNodes =
1406
getStackIdsWithContextNodesForCall(Call.call());
1407
// If there were no nodes created for MIBs on allocs (maybe this was in
1408
// the unambiguous part of the MIB stack that was pruned), ignore.
1409
if (StackIdsWithContextNodes.empty())
1410
continue;
1411
// Otherwise, record this Call along with the list of ids for the last
1412
// (outermost caller) stack id with a node.
1413
StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
1414
{Call.call(), StackIdsWithContextNodes, Func, {}});
1415
}
1416
}
1417
1418
// First make a pass through all stack ids that correspond to a call,
1419
// as identified in the above loop. Compute the context ids corresponding to
1420
// each of these calls when they correspond to multiple stack ids due to
// inlining. Perform any duplication of context ids required when
1422
// there is more than one call with the same stack ids. Their (possibly newly
1423
// duplicated) context ids are saved in the StackIdToMatchingCalls map.
1424
DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
1425
for (auto &It : StackIdToMatchingCalls) {
1426
auto &Calls = It.getSecond();
1427
// Skip single calls with a single stack id. These don't need a new node.
1428
if (Calls.size() == 1) {
1429
auto &Ids = std::get<1>(Calls[0]);
1430
if (Ids.size() == 1)
1431
continue;
1432
}
1433
// In order to do the best and maximal matching of inlined calls to context
1434
// node sequences we will sort the vectors of stack ids in descending order
1435
// of length, and within each length, lexicographically by stack id. The
1436
// latter is so that we can specially handle calls that have identical stack
1437
// id sequences (either due to cloning or artificially because of the MIB
1438
// context pruning).
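// For example, the id sequences {1, 2, 3}, {2, 3} and {1, 3} would be
// ordered {1, 2, 3}, {1, 3}, {2, 3}: longest first, then by stack id.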
1439
std::stable_sort(Calls.begin(), Calls.end(),
1440
[](const CallContextInfo &A, const CallContextInfo &B) {
1441
auto &IdsA = std::get<1>(A);
1442
auto &IdsB = std::get<1>(B);
1443
return IdsA.size() > IdsB.size() ||
1444
(IdsA.size() == IdsB.size() && IdsA < IdsB);
1445
});
1446
1447
// Find the node for the last stack id, which should be the same
1448
// across all calls recorded for this id, and is the id for this
1449
// entry in the StackIdToMatchingCalls map.
1450
uint64_t LastId = It.getFirst();
1451
ContextNode *LastNode = getNodeForStackId(LastId);
1452
// We should only have kept stack ids that had nodes.
1453
assert(LastNode);
1454
1455
if (LastNode->Recursive)
1456
continue;
1457
1458
// Initialize the context ids with the last node's. We will subsequently
1459
// refine the context ids by computing the intersection along all edges.
1460
DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1461
assert(!LastNodeContextIds.empty());
1462
1463
for (unsigned I = 0; I < Calls.size(); I++) {
1464
auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1465
assert(SavedContextIds.empty());
1466
assert(LastId == Ids.back());
1467
1468
// First compute the context ids for this stack id sequence (the
1469
// intersection of the context ids of the corresponding nodes).
1470
// Start with the remaining saved ids for the last node.
1471
assert(!LastNodeContextIds.empty());
1472
DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
1473
1474
ContextNode *PrevNode = LastNode;
1475
ContextNode *CurNode = LastNode;
1476
bool Skip = false;
1477
1478
// Iterate backwards through the stack Ids, starting after the last Id
1479
// in the list, which was handled once outside for all Calls.
1480
for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1481
auto Id = *IdIter;
1482
CurNode = getNodeForStackId(Id);
1483
// We should only have kept stack ids that had nodes.
1484
assert(CurNode);
1485
1486
if (CurNode->Recursive) {
1487
Skip = true;
1488
break;
1489
}
1490
1491
auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1492
// If there is no edge then the nodes belong to different MIB contexts,
1493
// and we should skip this inlined context sequence. For example, this
1494
// particular inlined context may include stack ids A->B, and we may
1495
// indeed have nodes for both A and B, but it is possible that they were
1496
// never profiled in sequence in a single MIB for any allocation (i.e.
1497
// we might have profiled an allocation that involves the callsite A,
1498
// but through a different one of its callee callsites, and we might
1499
// have profiled an allocation that involves callsite B, but reached
1500
// from a different caller callsite).
1501
if (!Edge) {
1502
Skip = true;
1503
break;
1504
}
1505
PrevNode = CurNode;
1506
1507
// Update the context ids, which is the intersection of the ids along
1508
// all edges in the sequence.
1509
set_intersect(StackSequenceContextIds, Edge->getContextIds());
1510
1511
// If we now have no context ids for the clone, skip this call.
1512
if (StackSequenceContextIds.empty()) {
1513
Skip = true;
1514
break;
1515
}
1516
}
1517
if (Skip)
1518
continue;
1519
1520
// If some of this call's stack ids did not have corresponding nodes (due
1521
// to pruning), don't include any context ids for contexts that extend
1522
// beyond these nodes. Otherwise we would be matching part of unrelated /
1523
// not fully matching stack contexts. To do this, subtract any context ids
1524
// found in caller nodes of the last node found above.
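// For example, if the call's metadata had stack ids A (innermost) -> B -> C
// (outermost) but only A and B have nodes, contexts that continue from B into
// its callers extend beyond the matched ids and must not be assigned here.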
1525
if (Ids.back() != getLastStackId(Call)) {
1526
for (const auto &PE : LastNode->CallerEdges) {
1527
set_subtract(StackSequenceContextIds, PE->getContextIds());
1528
if (StackSequenceContextIds.empty())
1529
break;
1530
}
1531
// If we now have no context ids for clone, skip this call.
1532
if (StackSequenceContextIds.empty())
1533
continue;
1534
}
1535
1536
// Check if the next set of stack ids is the same (since the Calls vector
1537
// of tuples is sorted by the stack ids we can just look at the next one).
1538
bool DuplicateContextIds = false;
1539
if (I + 1 < Calls.size()) {
1540
auto NextIds = std::get<1>(Calls[I + 1]);
1541
DuplicateContextIds = Ids == NextIds;
1542
}
1543
1544
// If we don't have duplicate context ids, then we can assign all the
1545
// context ids computed for the original node sequence to this call.
1546
// If there are duplicate calls with the same stack ids then we synthesize
1547
// new context ids that are duplicates of the originals. These are
1548
// assigned to SavedContextIds, which is a reference into the map entry
1549
// for this call, allowing us to access these ids later on.
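// For example, two distinct calls can share an identical stack id sequence
// (e.g. due to cloning or the MIB context pruning); the later call gets
// freshly duplicated context ids so that each call can be handled
// independently.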
1550
OldToNewContextIds.reserve(OldToNewContextIds.size() +
1551
StackSequenceContextIds.size());
1552
SavedContextIds =
1553
DuplicateContextIds
1554
? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
1555
: StackSequenceContextIds;
1556
assert(!SavedContextIds.empty());
1557
1558
if (!DuplicateContextIds) {
1559
// Update saved last node's context ids to remove those that are
1560
// assigned to other calls, so that it is ready for the next call at
1561
// this stack id.
1562
set_subtract(LastNodeContextIds, StackSequenceContextIds);
1563
if (LastNodeContextIds.empty())
1564
break;
1565
}
1566
}
1567
}
1568
1569
// Propagate the duplicate context ids over the graph.
1570
propagateDuplicateContextIds(OldToNewContextIds);
1571
1572
if (VerifyCCG)
1573
check();
1574
1575
// Now perform a post-order traversal over the graph, starting with the
1576
// allocation nodes, essentially processing nodes from callers to callees.
1577
// For any that contains an id in the map, update the graph to contain new
1578
// nodes representing any inlining at interior callsites. Note we move the
1579
// associated context ids over to the new nodes.
1580
DenseSet<const ContextNode *> Visited;
1581
for (auto &Entry : AllocationCallToContextNodeMap)
1582
assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls);
1583
if (VerifyCCG)
1584
check();
1585
}
1586
1587
uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
1588
CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1589
Call->getMetadata(LLVMContext::MD_callsite));
1590
return CallsiteContext.back();
1591
}
1592
1593
uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
1594
assert(isa<CallsiteInfo *>(Call.getBase()));
1595
CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1596
CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1597
// Need to convert index into stack id.
1598
return Index.getStackIdAtIndex(CallsiteContext.back());
1599
}
1600
1601
static const std::string MemProfCloneSuffix = ".memprof.";
1602
1603
static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
1604
// We use CloneNo == 0 to refer to the original version, which doesn't get
1605
// renamed with a suffix.
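// For example, clone 1 of a function foo would be named foo.memprof.1.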
1606
if (!CloneNo)
1607
return Base.str();
1608
return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
1609
}
1610
1611
std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
1612
const Instruction *Call,
1613
unsigned CloneNo) const {
1614
return (Twine(Call->getFunction()->getName()) + " -> " +
1615
cast<CallBase>(Call)->getCalledFunction()->getName())
1616
.str();
1617
}
1618
1619
std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
1620
const IndexCall &Call,
1621
unsigned CloneNo) const {
1622
auto VI = FSToVIMap.find(Func);
1623
assert(VI != FSToVIMap.end());
1624
if (isa<AllocInfo *>(Call.getBase()))
1625
return (VI->second.name() + " -> alloc").str();
1626
else {
1627
auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call.getBase());
1628
return (VI->second.name() + " -> " +
1629
getMemProfFuncName(Callsite->Callee.name(),
1630
Callsite->Clones[CloneNo]))
1631
.str();
1632
}
1633
}
1634
1635
std::vector<uint64_t>
1636
ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
1637
Instruction *Call) {
1638
CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1639
Call->getMetadata(LLVMContext::MD_callsite));
1640
return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
1641
CallsiteContext);
1642
}
1643
1644
std::vector<uint64_t>
1645
IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
1646
assert(isa<CallsiteInfo *>(Call.getBase()));
1647
CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1648
CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1649
return getStackIdsWithContextNodes<CallsiteInfo,
1650
SmallVector<unsigned>::const_iterator>(
1651
CallsiteContext);
1652
}
1653
1654
template <typename DerivedCCG, typename FuncTy, typename CallTy>
1655
template <class NodeT, class IteratorT>
1656
std::vector<uint64_t>
1657
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
1658
CallStack<NodeT, IteratorT> &CallsiteContext) {
1659
std::vector<uint64_t> StackIds;
1660
for (auto IdOrIndex : CallsiteContext) {
1661
auto StackId = getStackId(IdOrIndex);
1662
ContextNode *Node = getNodeForStackId(StackId);
1663
if (!Node)
1664
break;
1665
StackIds.push_back(StackId);
1666
}
1667
return StackIds;
1668
}
1669
1670
ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
1671
Module &M,
1672
llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
1673
: Mod(M), OREGetter(OREGetter) {
1674
for (auto &F : M) {
1675
std::vector<CallInfo> CallsWithMetadata;
1676
for (auto &BB : F) {
1677
for (auto &I : BB) {
1678
if (!isa<CallBase>(I))
1679
continue;
1680
if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
1681
CallsWithMetadata.push_back(&I);
1682
auto *AllocNode = addAllocNode(&I, &F);
1683
auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
1684
assert(CallsiteMD);
1685
CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
1686
// Add all of the MIBs and their stack nodes.
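// Each MIB operand describes one profiled context for this allocation: its
// call stack and its allocation type (e.g. cold vs notcold).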
1687
for (auto &MDOp : MemProfMD->operands()) {
1688
auto *MIBMD = cast<const MDNode>(MDOp);
1689
MDNode *StackNode = getMIBStackNode(MIBMD);
1690
assert(StackNode);
1691
CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
1692
addStackNodesForMIB<MDNode, MDNode::op_iterator>(
1693
AllocNode, StackContext, CallsiteContext,
1694
getMIBAllocType(MIBMD), getMIBTotalSize(MIBMD));
1695
}
1696
assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
1697
// Memprof and callsite metadata on memory allocations no longer
1698
// needed.
1699
I.setMetadata(LLVMContext::MD_memprof, nullptr);
1700
I.setMetadata(LLVMContext::MD_callsite, nullptr);
1701
}
1702
// For callsite metadata, add to list for this function for later use.
1703
else if (I.getMetadata(LLVMContext::MD_callsite))
1704
CallsWithMetadata.push_back(&I);
1705
}
1706
}
1707
if (!CallsWithMetadata.empty())
1708
FuncToCallsWithMetadata[&F] = CallsWithMetadata;
1709
}
1710
1711
if (DumpCCG) {
1712
dbgs() << "CCG before updating call stack chains:\n";
1713
dbgs() << *this;
1714
}
1715
1716
if (ExportToDot)
1717
exportToDot("prestackupdate");
1718
1719
updateStackNodes();
1720
1721
handleCallsitesWithMultipleTargets();
1722
1723
// Strip off remaining callsite metadata, no longer needed.
1724
for (auto &FuncEntry : FuncToCallsWithMetadata)
1725
for (auto &Call : FuncEntry.second)
1726
Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
1727
}
1728
1729
IndexCallsiteContextGraph::IndexCallsiteContextGraph(
1730
ModuleSummaryIndex &Index,
1731
llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1732
isPrevailing)
1733
: Index(Index), isPrevailing(isPrevailing) {
1734
for (auto &I : Index) {
1735
auto VI = Index.getValueInfo(I);
1736
for (auto &S : VI.getSummaryList()) {
1737
// We should only add the prevailing nodes. Otherwise we may try to clone
1738
// in a weak copy that won't be linked (and may be different than the
1739
// prevailing version).
1740
// We only keep the memprof summary on the prevailing copy now when
1741
// building the combined index, as a space optimization; however, don't
// rely on this optimization. The linker doesn't resolve local linkage
// values, so don't check whether those are prevailing.
1744
if (!GlobalValue::isLocalLinkage(S->linkage()) &&
1745
!isPrevailing(VI.getGUID(), S.get()))
1746
continue;
1747
auto *FS = dyn_cast<FunctionSummary>(S.get());
1748
if (!FS)
1749
continue;
1750
std::vector<CallInfo> CallsWithMetadata;
1751
if (!FS->allocs().empty()) {
1752
for (auto &AN : FS->mutableAllocs()) {
1753
// This can happen because of recursion elimination handling that
1754
// currently exists in ModuleSummaryAnalysis. Skip these for now.
1755
// We still added them to the summary because we need to be able to
1756
// correlate properly in applyImport in the backends.
1757
if (AN.MIBs.empty())
1758
continue;
1759
CallsWithMetadata.push_back({&AN});
1760
auto *AllocNode = addAllocNode({&AN}, FS);
1761
// Pass an empty CallStack to the CallsiteContext (second)
1762
// parameter, since for ThinLTO we already collapsed out the inlined
1763
// stack ids on the allocation call during ModuleSummaryAnalysis.
1764
CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
1765
EmptyContext;
1766
unsigned I = 0;
1767
assert(!MemProfReportHintedSizes ||
1768
AN.TotalSizes.size() == AN.MIBs.size());
1769
// Now add all of the MIBs and their stack nodes.
1770
for (auto &MIB : AN.MIBs) {
1771
CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
1772
StackContext(&MIB);
1773
uint64_t TotalSize = 0;
1774
if (MemProfReportHintedSizes)
1775
TotalSize = AN.TotalSizes[I];
1776
addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
1777
AllocNode, StackContext, EmptyContext, MIB.AllocType,
1778
TotalSize);
1779
I++;
1780
}
1781
assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
1782
// Initialize version 0 on the summary alloc node to the current alloc
// type, unless it has both types, in which case make it default. This way,
// if we aren't able to clone, the original version always ends up with the
// default allocation behavior.
1786
AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
1787
}
1788
}
1789
// For callsite metadata, add to list for this function for later use.
1790
if (!FS->callsites().empty())
1791
for (auto &SN : FS->mutableCallsites())
1792
CallsWithMetadata.push_back({&SN});
1793
1794
if (!CallsWithMetadata.empty())
1795
FuncToCallsWithMetadata[FS] = CallsWithMetadata;
1796
1797
if (!FS->allocs().empty() || !FS->callsites().empty())
1798
FSToVIMap[FS] = VI;
1799
}
1800
}
1801
1802
if (DumpCCG) {
1803
dbgs() << "CCG before updating call stack chains:\n";
1804
dbgs() << *this;
1805
}
1806
1807
if (ExportToDot)
1808
exportToDot("prestackupdate");
1809
1810
updateStackNodes();
1811
1812
handleCallsitesWithMultipleTargets();
1813
}
1814
1815
template <typename DerivedCCG, typename FuncTy, typename CallTy>
1816
void CallsiteContextGraph<DerivedCCG, FuncTy,
1817
CallTy>::handleCallsitesWithMultipleTargets() {
1818
// Look for and workaround callsites that call multiple functions.
1819
// This can happen for indirect calls, which need better handling, and in
1820
// more rare cases (e.g. macro expansion).
1821
// TODO: To fix this for indirect calls we will want to perform speculative
1822
// devirtualization using either the normal PGO info with ICP, or using the
1823
// information in the profiled MemProf contexts. We can do this prior to
1824
// this transformation for regular LTO, and for ThinLTO we can simulate that
1825
// effect in the summary and perform the actual speculative devirtualization
1826
// while cloning in the ThinLTO backend.
1827
1828
// Keep track of the new nodes synthesized for discovered tail calls missing
1829
// from the profiled contexts.
1830
MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
1831
1832
for (auto &Entry : NonAllocationCallToContextNodeMap) {
1833
auto *Node = Entry.second;
1834
assert(Node->Clones.empty());
1835
// Check all node callees and see if in the same function.
1836
auto Call = Node->Call.call();
1837
for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();
1838
++EI) {
1839
auto Edge = *EI;
1840
if (!Edge->Callee->hasCall())
1841
continue;
1842
assert(NodeToCallingFunc.count(Edge->Callee));
1843
// Check if the called function matches that of the callee node.
1844
if (calleesMatch(Call, EI, TailCallToContextNodeMap))
1845
continue;
1846
RemovedEdgesWithMismatchedCallees++;
1847
// Work around by setting Node to have a null call, so it gets
1848
// skipped during cloning. Otherwise assignFunctions will assert
1849
// because its data structures are not designed to handle this case.
1850
Node->setCall(CallInfo());
1851
break;
1852
}
1853
}
1854
1855
// Remove all mismatched nodes identified in the above loop from the node map
1856
// (checking whether they have a null call which is set above). For a
1857
// MapVector like NonAllocationCallToContextNodeMap it is much more efficient
1858
// to do the removal via remove_if than by individually erasing entries above.
1859
NonAllocationCallToContextNodeMap.remove_if(
1860
[](const auto &it) { return !it.second->hasCall(); });
1861
1862
// Add the new nodes after the above loop so that the iteration is not
1863
// invalidated.
1864
for (auto &[Call, Node] : TailCallToContextNodeMap)
1865
NonAllocationCallToContextNodeMap[Call] = Node;
1866
}
1867
1868
uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
1869
// In the Module (IR) case this is already the Id.
1870
return IdOrIndex;
1871
}
1872
1873
uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
1874
// In the Index case this is an index into the stack id list in the summary
1875
// index, convert it to an Id.
1876
return Index.getStackIdAtIndex(IdOrIndex);
1877
}
1878
1879
template <typename DerivedCCG, typename FuncTy, typename CallTy>
1880
bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
1881
CallTy Call, EdgeIter &EI,
1882
MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
1883
auto Edge = *EI;
1884
const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
1885
const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
1886
// Will be populated in order of callee to caller if we find a chain of tail
1887
// calls between the profiled caller and callee.
1888
std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
1889
if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
1890
FoundCalleeChain))
1891
return false;
1892
1893
// The usual case where the profiled callee matches that of the IR/summary.
1894
if (FoundCalleeChain.empty())
1895
return true;
1896
1897
auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
1898
auto *CurEdge = Callee->findEdgeFromCaller(Caller);
1899
// If there is already an edge between these nodes, simply update it and
1900
// return.
1901
if (CurEdge) {
1902
CurEdge->ContextIds.insert(Edge->ContextIds.begin(),
1903
Edge->ContextIds.end());
1904
CurEdge->AllocTypes |= Edge->AllocTypes;
1905
return;
1906
}
1907
// Otherwise, create a new edge and insert it into the caller and callee
1908
// lists.
1909
auto NewEdge = std::make_shared<ContextEdge>(
1910
Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
1911
Callee->CallerEdges.push_back(NewEdge);
1912
if (Caller == Edge->Caller) {
1913
// If we are inserting the new edge into the current edge's caller, insert
1914
// the new edge before the current iterator position, and then increment
1915
// back to the current edge.
1916
EI = Caller->CalleeEdges.insert(EI, NewEdge);
1917
++EI;
1918
assert(*EI == Edge &&
1919
"Iterator position not restored after insert and increment");
1920
} else
1921
Caller->CalleeEdges.push_back(NewEdge);
1922
};
1923
1924
// Create new nodes for each found callee and connect in between the profiled
1925
// caller and callee.
1926
auto *CurCalleeNode = Edge->Callee;
1927
for (auto &[NewCall, Func] : FoundCalleeChain) {
1928
ContextNode *NewNode = nullptr;
1929
// First check if we have already synthesized a node for this tail call.
1930
if (TailCallToContextNodeMap.count(NewCall)) {
1931
NewNode = TailCallToContextNodeMap[NewCall];
1932
NewNode->AllocTypes |= Edge->AllocTypes;
1933
} else {
1934
FuncToCallsWithMetadata[Func].push_back({NewCall});
1935
// Create Node and record node info.
1936
NodeOwner.push_back(
1937
std::make_unique<ContextNode>(/*IsAllocation=*/false, NewCall));
1938
NewNode = NodeOwner.back().get();
1939
NodeToCallingFunc[NewNode] = Func;
1940
TailCallToContextNodeMap[NewCall] = NewNode;
1941
NewNode->AllocTypes = Edge->AllocTypes;
1942
}
1943
1944
// Hook up node to its callee node
1945
AddEdge(NewNode, CurCalleeNode);
1946
1947
CurCalleeNode = NewNode;
1948
}
1949
1950
// Hook up edge's original caller to new callee node.
1951
AddEdge(Edge->Caller, CurCalleeNode);
1952
1953
// Remove old edge
1954
Edge->Callee->eraseCallerEdge(Edge.get());
1955
EI = Edge->Caller->CalleeEdges.erase(EI);
1956
1957
// To simplify the increment of EI in the caller, subtract one from EI.
1958
// In the final AddEdge call we would have either added a new callee edge
1959
// to Edge->Caller, or found an existing one. Either way we are guaranteed
1960
// that there is at least one callee edge.
1961
assert(!Edge->Caller->CalleeEdges.empty());
1962
--EI;
1963
1964
return true;
1965
}
1966
1967
bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
1968
const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
1969
std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
1970
bool &FoundMultipleCalleeChains) {
1971
// Stop recursive search if we have already explored the maximum specified
1972
// depth.
1973
if (Depth > TailCallSearchDepth)
1974
return false;
1975
1976
auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
1977
FoundCalleeChain.push_back({Callsite, F});
1978
};
1979
1980
auto *CalleeFunc = dyn_cast<Function>(CurCallee);
1981
if (!CalleeFunc) {
1982
auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
1983
assert(Alias);
1984
CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
1985
assert(CalleeFunc);
1986
}
1987
1988
// Look for tail calls in this function, and check if they either call the
1989
// profiled callee directly, or indirectly (via a recursive search).
1990
// Only succeed if there is a single unique tail call chain found between the
1991
// profiled caller and callee, otherwise we could perform incorrect cloning.
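// For example, the profile may show a call in A whose callee frame is in C,
// while in the IR the call in A targets B and B tail calls C (B's frame being
// absent from the profiled stack). The tail callsite in B is recorded in
// FoundCalleeChain so the caller can splice a node for it between A and C.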
1992
bool FoundSingleCalleeChain = false;
1993
for (auto &BB : *CalleeFunc) {
1994
for (auto &I : BB) {
1995
auto *CB = dyn_cast<CallBase>(&I);
1996
if (!CB || !CB->isTailCall())
1997
continue;
1998
auto *CalledValue = CB->getCalledOperand();
1999
auto *CalledFunction = CB->getCalledFunction();
2000
if (CalledValue && !CalledFunction) {
2001
CalledValue = CalledValue->stripPointerCasts();
2002
// Stripping pointer casts can reveal a called function.
2003
CalledFunction = dyn_cast<Function>(CalledValue);
2004
}
2005
// Check if this is an alias to a function. If so, get the
2006
// called aliasee for the checks below.
2007
if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
2008
assert(!CalledFunction &&
2009
"Expected null called function in callsite for alias");
2010
CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
2011
}
2012
if (!CalledFunction)
2013
continue;
2014
if (CalledFunction == ProfiledCallee) {
2015
if (FoundSingleCalleeChain) {
2016
FoundMultipleCalleeChains = true;
2017
return false;
2018
}
2019
FoundSingleCalleeChain = true;
2020
FoundProfiledCalleeCount++;
2021
FoundProfiledCalleeDepth += Depth;
2022
if (Depth > FoundProfiledCalleeMaxDepth)
2023
FoundProfiledCalleeMaxDepth = Depth;
2024
SaveCallsiteInfo(&I, CalleeFunc);
2025
} else if (findProfiledCalleeThroughTailCalls(
2026
ProfiledCallee, CalledFunction, Depth + 1,
2027
FoundCalleeChain, FoundMultipleCalleeChains)) {
2028
// findProfiledCalleeThroughTailCalls should not have returned
2029
// true if FoundMultipleCalleeChains.
2030
assert(!FoundMultipleCalleeChains);
2031
if (FoundSingleCalleeChain) {
2032
FoundMultipleCalleeChains = true;
2033
return false;
2034
}
2035
FoundSingleCalleeChain = true;
2036
SaveCallsiteInfo(&I, CalleeFunc);
2037
} else if (FoundMultipleCalleeChains)
2038
return false;
2039
}
2040
}
2041
2042
return FoundSingleCalleeChain;
2043
}
2044
2045
bool ModuleCallsiteContextGraph::calleeMatchesFunc(
2046
Instruction *Call, const Function *Func, const Function *CallerFunc,
2047
std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
2048
auto *CB = dyn_cast<CallBase>(Call);
2049
if (!CB->getCalledOperand())
2050
return false;
2051
auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2052
auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
2053
if (CalleeFunc == Func)
2054
return true;
2055
auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2056
if (Alias && Alias->getAliasee() == Func)
2057
return true;
2058
2059
// Recursively search for the profiled callee through tail calls starting with
2060
// the actual Callee. The discovered tail call chain is saved in
2061
// FoundCalleeChain, and we will fixup the graph to include these callsites
2062
// after returning.
2063
// FIXME: We will currently redo the same recursive walk if we find the same
2064
// mismatched callee from another callsite. We can improve this with more
2065
// bookkeeping of the created chain of new nodes for each mismatch.
2066
unsigned Depth = 1;
2067
bool FoundMultipleCalleeChains = false;
2068
if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
2069
FoundCalleeChain,
2070
FoundMultipleCalleeChains)) {
2071
LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
2072
<< Func->getName() << " from " << CallerFunc->getName()
2073
<< " that actually called " << CalleeVal->getName()
2074
<< (FoundMultipleCalleeChains
2075
? " (found multiple possible chains)"
2076
: "")
2077
<< "\n");
2078
if (FoundMultipleCalleeChains)
2079
FoundProfiledCalleeNonUniquelyCount++;
2080
return false;
2081
}
2082
2083
return true;
2084
}
2085
2086
bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2087
ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
2088
std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
2089
bool &FoundMultipleCalleeChains) {
2090
// Stop recursive search if we have already explored the maximum specified
2091
// depth.
2092
if (Depth > TailCallSearchDepth)
2093
return false;
2094
2095
auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
2096
// Make a CallsiteInfo for each discovered callee, if one hasn't already
2097
// been synthesized.
2098
if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
2099
!FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
2100
// StackIds is empty (we don't have debug info available in the index for
2101
// these callsites)
2102
FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
2103
std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
2104
CallsiteInfo *NewCallsiteInfo =
2105
FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
2106
FoundCalleeChain.push_back({NewCallsiteInfo, FS});
2107
};
2108
2109
// Look for tail calls in this function, and check if they either call the
2110
// profiled callee directly, or indirectly (via a recursive search).
2111
// Only succeed if there is a single unique tail call chain found between the
2112
// profiled caller and callee, otherwise we could perform incorrect cloning.
2113
bool FoundSingleCalleeChain = false;
2114
for (auto &S : CurCallee.getSummaryList()) {
2115
if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2116
!isPrevailing(CurCallee.getGUID(), S.get()))
2117
continue;
2118
auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
2119
if (!FS)
2120
continue;
2121
auto FSVI = CurCallee;
2122
auto *AS = dyn_cast<AliasSummary>(S.get());
2123
if (AS)
2124
FSVI = AS->getAliaseeVI();
2125
for (auto &CallEdge : FS->calls()) {
2126
if (!CallEdge.second.hasTailCall())
2127
continue;
2128
if (CallEdge.first == ProfiledCallee) {
2129
if (FoundSingleCalleeChain) {
2130
FoundMultipleCalleeChains = true;
2131
return false;
2132
}
2133
FoundSingleCalleeChain = true;
2134
FoundProfiledCalleeCount++;
2135
FoundProfiledCalleeDepth += Depth;
2136
if (Depth > FoundProfiledCalleeMaxDepth)
2137
FoundProfiledCalleeMaxDepth = Depth;
2138
CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2139
// Add FS to FSToVIMap in case it isn't already there.
2140
assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2141
FSToVIMap[FS] = FSVI;
2142
} else if (findProfiledCalleeThroughTailCalls(
2143
ProfiledCallee, CallEdge.first, Depth + 1,
2144
FoundCalleeChain, FoundMultipleCalleeChains)) {
2145
// findProfiledCalleeThroughTailCalls should not have returned
2146
// true if FoundMultipleCalleeChains.
2147
assert(!FoundMultipleCalleeChains);
2148
if (FoundSingleCalleeChain) {
2149
FoundMultipleCalleeChains = true;
2150
return false;
2151
}
2152
FoundSingleCalleeChain = true;
2153
CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2154
// Add FS to FSToVIMap in case it isn't already there.
2155
assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2156
FSToVIMap[FS] = FSVI;
2157
} else if (FoundMultipleCalleeChains)
2158
return false;
2159
}
2160
}
2161
2162
return FoundSingleCalleeChain;
2163
}
2164
2165
bool IndexCallsiteContextGraph::calleeMatchesFunc(
2166
IndexCall &Call, const FunctionSummary *Func,
2167
const FunctionSummary *CallerFunc,
2168
std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
2169
ValueInfo Callee =
2170
dyn_cast_if_present<CallsiteInfo *>(Call.getBase())->Callee;
2171
// If there is no summary list then this is a call to an externally defined
2172
// symbol.
2173
AliasSummary *Alias =
2174
Callee.getSummaryList().empty()
2175
? nullptr
2176
: dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
2177
assert(FSToVIMap.count(Func));
2178
auto FuncVI = FSToVIMap[Func];
2179
if (Callee == FuncVI ||
2180
// If callee is an alias, check the aliasee, since only function
2181
// summary base objects will contain the stack node summaries and thus
2182
// get a context node.
2183
(Alias && Alias->getAliaseeVI() == FuncVI))
2184
return true;
2185
2186
// Recursively search for the profiled callee through tail calls starting with
2187
// the actual Callee. The discovered tail call chain is saved in
2188
// FoundCalleeChain, and we will fixup the graph to include these callsites
2189
// after returning.
2190
// FIXME: We will currently redo the same recursive walk if we find the same
2191
// mismatched callee from another callsite. We can improve this with more
2192
// bookkeeping of the created chain of new nodes for each mismatch.
2193
unsigned Depth = 1;
2194
bool FoundMultipleCalleeChains = false;
2195
if (!findProfiledCalleeThroughTailCalls(
2196
FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
2197
LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
2198
<< " from " << FSToVIMap[CallerFunc]
2199
<< " that actually called " << Callee
2200
<< (FoundMultipleCalleeChains
2201
? " (found multiple possible chains)"
2202
: "")
2203
<< "\n");
2204
if (FoundMultipleCalleeChains)
2205
FoundProfiledCalleeNonUniquelyCount++;
2206
return false;
2207
}
2208
2209
return true;
2210
}
2211
2212
template <typename DerivedCCG, typename FuncTy, typename CallTy>
2213
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
2214
const {
2215
print(dbgs());
2216
dbgs() << "\n";
2217
}
2218
2219
template <typename DerivedCCG, typename FuncTy, typename CallTy>
2220
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
2221
raw_ostream &OS) const {
2222
OS << "Node " << this << "\n";
2223
OS << "\t";
2224
printCall(OS);
2225
if (Recursive)
2226
OS << " (recursive)";
2227
OS << "\n";
2228
OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
2229
OS << "\tContextIds:";
2230
// Make a copy of the computed context ids that we can sort for stability.
2231
auto ContextIds = getContextIds();
2232
std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2233
std::sort(SortedIds.begin(), SortedIds.end());
2234
for (auto Id : SortedIds)
2235
OS << " " << Id;
2236
OS << "\n";
2237
OS << "\tCalleeEdges:\n";
2238
for (auto &Edge : CalleeEdges)
2239
OS << "\t\t" << *Edge << "\n";
2240
OS << "\tCallerEdges:\n";
2241
for (auto &Edge : CallerEdges)
2242
OS << "\t\t" << *Edge << "\n";
2243
if (!Clones.empty()) {
2244
OS << "\tClones: ";
2245
FieldSeparator FS;
2246
for (auto *Clone : Clones)
2247
OS << FS << Clone;
2248
OS << "\n";
2249
} else if (CloneOf) {
2250
OS << "\tClone of " << CloneOf << "\n";
2251
}
2252
}
2253
2254
template <typename DerivedCCG, typename FuncTy, typename CallTy>
2255
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
2256
const {
2257
print(dbgs());
2258
dbgs() << "\n";
2259
}
2260
2261
template <typename DerivedCCG, typename FuncTy, typename CallTy>
2262
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
2263
raw_ostream &OS) const {
2264
OS << "Edge from Callee " << Callee << " to Caller: " << Caller
2265
<< " AllocTypes: " << getAllocTypeString(AllocTypes);
2266
OS << " ContextIds:";
2267
std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2268
std::sort(SortedIds.begin(), SortedIds.end());
2269
for (auto Id : SortedIds)
2270
OS << " " << Id;
2271
}
2272
2273
template <typename DerivedCCG, typename FuncTy, typename CallTy>
2274
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
2275
print(dbgs());
2276
}
2277
2278
template <typename DerivedCCG, typename FuncTy, typename CallTy>
2279
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
2280
raw_ostream &OS) const {
2281
OS << "Callsite Context Graph:\n";
2282
using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2283
for (const auto Node : nodes<GraphType>(this)) {
2284
if (Node->isRemoved())
2285
continue;
2286
Node->print(OS);
2287
OS << "\n";
2288
}
2289
}
2290
2291
template <typename DerivedCCG, typename FuncTy, typename CallTy>
2292
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
2293
raw_ostream &OS) const {
2294
using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2295
for (const auto Node : nodes<GraphType>(this)) {
2296
if (Node->isRemoved())
2297
continue;
2298
if (!Node->IsAllocation)
2299
continue;
2300
DenseSet<uint32_t> ContextIds = Node->getContextIds();
2301
std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2302
std::sort(SortedIds.begin(), SortedIds.end());
2303
for (auto Id : SortedIds) {
2304
auto SizeI = ContextIdToTotalSize.find(Id);
2305
assert(SizeI != ContextIdToTotalSize.end());
2306
auto TypeI = ContextIdToAllocationType.find(Id);
2307
assert(TypeI != ContextIdToAllocationType.end());
2308
OS << getAllocTypeString((uint8_t)TypeI->second) << " context " << Id
2309
<< " with total size " << SizeI->second << " is "
2310
<< getAllocTypeString(Node->AllocTypes) << " after cloning\n";
2311
}
2312
}
2313
}
2314
2315
template <typename DerivedCCG, typename FuncTy, typename CallTy>
2316
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
2317
using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2318
for (const auto Node : nodes<GraphType>(this)) {
2319
checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
2320
for (auto &Edge : Node->CallerEdges)
2321
checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
2322
}
2323
}
2324
2325
template <typename DerivedCCG, typename FuncTy, typename CallTy>
2326
struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
2327
using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2328
using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
2329
2330
using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
2331
static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
2332
2333
using nodes_iterator =
2334
mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
2335
decltype(&getNode)>;
2336
2337
static nodes_iterator nodes_begin(GraphType G) {
2338
return nodes_iterator(G->NodeOwner.begin(), &getNode);
2339
}
2340
2341
static nodes_iterator nodes_end(GraphType G) {
2342
return nodes_iterator(G->NodeOwner.end(), &getNode);
2343
}
2344
2345
static NodeRef getEntryNode(GraphType G) {
2346
return G->NodeOwner.begin()->get();
2347
}
2348
2349
using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
2350
static const ContextNode<DerivedCCG, FuncTy, CallTy> *
2351
GetCallee(const EdgePtrTy &P) {
2352
return P->Callee;
2353
}
2354
2355
using ChildIteratorType =
2356
mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
2357
DerivedCCG, FuncTy, CallTy>>>::const_iterator,
2358
decltype(&GetCallee)>;
2359
2360
static ChildIteratorType child_begin(NodeRef N) {
2361
return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
2362
}
2363
2364
static ChildIteratorType child_end(NodeRef N) {
2365
return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
2366
}
2367
};
2368
2369
template <typename DerivedCCG, typename FuncTy, typename CallTy>
2370
struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
2371
: public DefaultDOTGraphTraits {
2372
DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
2373
2374
using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2375
using GTraits = GraphTraits<GraphType>;
2376
using NodeRef = typename GTraits::NodeRef;
2377
using ChildIteratorType = typename GTraits::ChildIteratorType;
2378
2379
static std::string getNodeLabel(NodeRef Node, GraphType G) {
2380
std::string LabelString =
2381
(Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
2382
Twine(Node->OrigStackOrAllocId))
2383
.str();
2384
LabelString += "\n";
2385
if (Node->hasCall()) {
2386
auto Func = G->NodeToCallingFunc.find(Node);
2387
assert(Func != G->NodeToCallingFunc.end());
2388
LabelString +=
2389
G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
2390
} else {
2391
LabelString += "null call";
2392
if (Node->Recursive)
2393
LabelString += " (recursive)";
2394
else
2395
LabelString += " (external)";
2396
}
2397
return LabelString;
2398
}
2399
2400
static std::string getNodeAttributes(NodeRef Node, GraphType) {
2401
std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
2402
getContextIds(Node->getContextIds()) + "\"")
2403
.str();
2404
AttributeString +=
2405
(Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str();
2406
AttributeString += ",style=\"filled\"";
2407
if (Node->CloneOf) {
2408
AttributeString += ",color=\"blue\"";
2409
AttributeString += ",style=\"filled,bold,dashed\"";
2410
} else
2411
AttributeString += ",style=\"filled\"";
2412
return AttributeString;
2413
}
2414
2415
static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
2416
GraphType) {
2417
auto &Edge = *(ChildIter.getCurrent());
2418
return (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
2419
Twine(",fillcolor=\"") + getColor(Edge->AllocTypes) + "\"")
2420
.str();
2421
}
2422
2423
// Since the NodeOwner list includes nodes that are no longer connected to
2424
// the graph, skip them here.
2425
static bool isNodeHidden(NodeRef Node, GraphType) {
2426
return Node->isRemoved();
2427
}
2428
2429
private:
2430
static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
2431
std::string IdString = "ContextIds:";
2432
if (ContextIds.size() < 100) {
2433
std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2434
std::sort(SortedIds.begin(), SortedIds.end());
2435
for (auto Id : SortedIds)
2436
IdString += (" " + Twine(Id)).str();
2437
} else {
2438
IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
2439
}
2440
return IdString;
2441
}
2442
2443
static std::string getColor(uint8_t AllocTypes) {
2444
if (AllocTypes == (uint8_t)AllocationType::NotCold)
2445
// Color "brown1" actually looks like a lighter red.
2446
return "brown1";
2447
if (AllocTypes == (uint8_t)AllocationType::Cold)
2448
return "cyan";
2449
if (AllocTypes ==
2450
((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
2451
// Lighter purple.
2452
return "mediumorchid1";
2453
return "gray";
2454
}
2455
2456
static std::string getNodeId(NodeRef Node) {
2457
std::stringstream SStream;
2458
SStream << std::hex << "N0x" << (unsigned long long)Node;
2459
std::string Result = SStream.str();
2460
return Result;
2461
}
2462
};
2463
2464
template <typename DerivedCCG, typename FuncTy, typename CallTy>
2465
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
2466
std::string Label) const {
2467
WriteGraph(this, "", false, Label,
2468
DotFilePathPrefix + "ccg." + Label + ".dot");
2469
}
2470
2471
template <typename DerivedCCG, typename FuncTy, typename CallTy>
2472
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
2473
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
2474
const std::shared_ptr<ContextEdge> &Edge, EdgeIter *CallerEdgeI,
2475
DenseSet<uint32_t> ContextIdsToMove) {
2476
ContextNode *Node = Edge->Callee;
2477
NodeOwner.push_back(
2478
std::make_unique<ContextNode>(Node->IsAllocation, Node->Call));
2479
ContextNode *Clone = NodeOwner.back().get();
2480
Node->addClone(Clone);
2481
assert(NodeToCallingFunc.count(Node));
2482
NodeToCallingFunc[Clone] = NodeToCallingFunc[Node];
2483
moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true,
2484
ContextIdsToMove);
2485
return Clone;
2486
}
2487
2488
template <typename DerivedCCG, typename FuncTy, typename CallTy>
2489
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
2490
moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
2491
ContextNode *NewCallee, EdgeIter *CallerEdgeI,
2492
bool NewClone,
2493
DenseSet<uint32_t> ContextIdsToMove) {
2494
// NewCallee and Edge's current callee must be clones of the same original
2495
// node (Edge's current callee may be the original node too).
2496
assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
2497
2498
ContextNode *OldCallee = Edge->Callee;
2499
2500
// We might already have an edge to the new callee from earlier cloning for a
2501
// different allocation. If one exists we will reuse it.
2502
auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);
2503
2504
// Callers will pass an empty ContextIdsToMove set when they want to move the
2505
// edge. Copy in Edge's ids for simplicity.
2506
if (ContextIdsToMove.empty())
2507
ContextIdsToMove = Edge->getContextIds();
2508
2509
// If we are moving all of Edge's ids, then just move the whole Edge.
2510
// Otherwise only move the specified subset, to a new edge if needed.
2511
if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
2512
// Moving the whole Edge.
2513
if (CallerEdgeI)
2514
*CallerEdgeI = OldCallee->CallerEdges.erase(*CallerEdgeI);
2515
else
2516
OldCallee->eraseCallerEdge(Edge.get());
2517
if (ExistingEdgeToNewCallee) {
2518
// Since we already have an edge to NewCallee, simply move the ids
2519
// onto it, and remove the existing Edge.
2520
ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(),
2521
ContextIdsToMove.end());
2522
ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
2523
assert(Edge->ContextIds == ContextIdsToMove);
2524
Edge->ContextIds.clear();
2525
Edge->AllocTypes = (uint8_t)AllocationType::None;
2526
Edge->Caller->eraseCalleeEdge(Edge.get());
2527
} else {
2528
// Otherwise just reconnect Edge to NewCallee.
2529
Edge->Callee = NewCallee;
2530
NewCallee->CallerEdges.push_back(Edge);
2531
// Don't need to update Edge's context ids since we are simply
2532
// reconnecting it.
2533
}
2534
// In either case, need to update the alloc types on NewCallee.
2535
NewCallee->AllocTypes |= Edge->AllocTypes;
2536
} else {
2537
// Only moving a subset of Edge's ids.
2538
if (CallerEdgeI)
2539
++CallerEdgeI;
2540
// Compute the alloc type of the subset of ids being moved.
2541
auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);
2542
if (ExistingEdgeToNewCallee) {
2543
// Since we already have an edge to NewCallee, simply move the ids
2544
// onto it.
2545
ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(),
2546
ContextIdsToMove.end());
2547
ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
2548
} else {
2549
// Otherwise, create a new edge to NewCallee for the ids being moved.
2550
auto NewEdge = std::make_shared<ContextEdge>(
2551
NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
2552
Edge->Caller->CalleeEdges.push_back(NewEdge);
2553
NewCallee->CallerEdges.push_back(NewEdge);
2554
}
2555
// In either case, need to update the alloc types on NewCallee, and remove
2556
// those ids and update the alloc type on the original Edge.
2557
NewCallee->AllocTypes |= CallerEdgeAllocType;
2558
set_subtract(Edge->ContextIds, ContextIdsToMove);
2559
Edge->AllocTypes = computeAllocType(Edge->ContextIds);
2560
}
2561
// Now walk the old callee node's callee edges and move Edge's context ids
2562
// over to the corresponding edge into the clone (which is created here if
2563
// this is a newly created clone).
2564
for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
2565
// The context ids moving to the new callee are the subset of this edge's
2566
// context ids and the context ids on the caller edge being moved.
2567
DenseSet<uint32_t> EdgeContextIdsToMove =
2568
set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
2569
set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
2570
OldCalleeEdge->AllocTypes =
2571
computeAllocType(OldCalleeEdge->getContextIds());
2572
if (!NewClone) {
2573
// Update context ids / alloc type on corresponding edge to NewCallee.
2574
// There is a chance this may not exist if we are reusing an existing
2575
// clone, specifically during function assignment, where we would have
2576
// removed none type edges after creating the clone. If we can't find
2577
// a corresponding edge there, fall through to the cloning below.
2578
if (auto *NewCalleeEdge =
2579
NewCallee->findEdgeFromCallee(OldCalleeEdge->Callee)) {
2580
NewCalleeEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
2581
EdgeContextIdsToMove.end());
2582
NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
2583
continue;
2584
}
2585
}
2586
auto NewEdge = std::make_shared<ContextEdge>(
2587
OldCalleeEdge->Callee, NewCallee,
2588
computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
2589
NewCallee->CalleeEdges.push_back(NewEdge);
2590
NewEdge->Callee->CallerEdges.push_back(NewEdge);
2591
}
2592
// Recompute the node alloc type now that its callee edges have been
2593
// updated (since we will compute from those edges).
2594
OldCallee->AllocTypes = OldCallee->computeAllocType();
2595
// OldCallee alloc type should be None iff its context id set is now empty.
2596
assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
2597
OldCallee->emptyContextIds());
2598
if (VerifyCCG) {
2599
checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
2600
checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
2601
for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
2602
checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
2603
/*CheckEdges=*/false);
2604
for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
2605
checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
2606
/*CheckEdges=*/false);
2607
}
2608
}
2609
2610
template <typename DerivedCCG, typename FuncTy, typename CallTy>
2611
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
2612
recursivelyRemoveNoneTypeCalleeEdges(
2613
ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
2614
auto Inserted = Visited.insert(Node);
2615
if (!Inserted.second)
2616
return;
2617
2618
removeNoneTypeCalleeEdges(Node);
2619
2620
for (auto *Clone : Node->Clones)
2621
recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
2622
2623
// The recursive call may remove some of this Node's caller edges.
2624
// Iterate over a copy and skip any that were removed.
2625
auto CallerEdges = Node->CallerEdges;
2626
for (auto &Edge : CallerEdges) {
2627
// Skip any that have been removed by an earlier recursive call.
2628
if (Edge->Callee == nullptr && Edge->Caller == nullptr) {
2629
assert(!is_contained(Node->CallerEdges, Edge));
2630
continue;
2631
}
2632
recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
2633
}
2634
}
2635
2636
template <typename DerivedCCG, typename FuncTy, typename CallTy>
2637
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
2638
DenseSet<const ContextNode *> Visited;
2639
for (auto &Entry : AllocationCallToContextNodeMap) {
2640
Visited.clear();
2641
identifyClones(Entry.second, Visited, Entry.second->getContextIds());
2642
}
2643
Visited.clear();
2644
for (auto &Entry : AllocationCallToContextNodeMap)
2645
recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
2646
if (VerifyCCG)
2647
check();
2648
}
2649
2650
// helper function to check an AllocType is cold or notcold or both.
2651
bool checkColdOrNotCold(uint8_t AllocType) {
2652
return (AllocType == (uint8_t)AllocationType::Cold) ||
2653
(AllocType == (uint8_t)AllocationType::NotCold) ||
2654
(AllocType ==
2655
((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
2656
}
2657
2658
template <typename DerivedCCG, typename FuncTy, typename CallTy>
2659
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
2660
ContextNode *Node, DenseSet<const ContextNode *> &Visited,
2661
const DenseSet<uint32_t> &AllocContextIds) {
2662
if (VerifyNodes)
2663
checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
2664
assert(!Node->CloneOf);
2665
2666
// If Node has a null call, then either it wasn't found in the module (regular
2667
// LTO) or summary index (ThinLTO), or there were other conditions blocking
2668
// cloning (e.g. recursion, calls multiple targets, etc).
2669
// Do this here so that we don't try to recursively clone callers below, which
2670
// isn't useful at least for this node.
2671
if (!Node->hasCall())
2672
return;
2673
2674
#ifndef NDEBUG
2675
auto Insert =
2676
#endif
2677
Visited.insert(Node);
2678
// We should not have visited this node yet.
2679
assert(Insert.second);
2680
// The recursive call to identifyClones may delete the current edge from the
2681
// CallerEdges vector. Make a copy and iterate on that, simpler than passing
2682
// in an iterator and having recursive call erase from it. Other edges may
2683
// also get removed during the recursion, which will have null Callee and
2684
// Caller pointers (and are deleted later), so we skip those below.
2685
{
2686
auto CallerEdges = Node->CallerEdges;
2687
for (auto &Edge : CallerEdges) {
2688
// Skip any that have been removed by an earlier recursive call.
2689
if (Edge->Callee == nullptr && Edge->Caller == nullptr) {
2690
assert(!llvm::count(Node->CallerEdges, Edge));
2691
continue;
2692
}
2693
// Ignore any caller we previously visited via another edge.
2694
if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
2695
identifyClones(Edge->Caller, Visited, AllocContextIds);
2696
}
2697
}
2698
}
2699
2700
// Check if we reached an unambiguous call or have only a single caller.
2701
if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2702
return;
2703
2704
// We need to clone.
2705
2706
// Try to keep the original version as alloc type NotCold. This will make
2707
// cases with indirect calls or any other situation with an unknown call to
2708
// the original function get the default behavior. We do this by sorting the
2709
// CallerEdges of the Node we will clone by alloc type.
2710
//
2711
// Give NotCold edge the lowest sort priority so those edges are at the end of
2712
// the caller edges vector, and stay on the original version (since the below
2713
// code clones greedily until it finds all remaining edges have the same type
2714
// and leaves the remaining ones on the original Node).
2715
//
2716
// We shouldn't actually have any None type edges, so the sorting priority for
2717
// that is arbitrary, and we assert in that case below.
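// With these priorities, caller edges sort as Cold, then NotCold+Cold, then
// NotCold, so cold-containing edges are peeled off to clones first and the
// NotCold edges remain on the original Node.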
2718
const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
2719
/*Cold*/ 1,
2720
/*NotColdCold*/ 2};
2721
std::stable_sort(Node->CallerEdges.begin(), Node->CallerEdges.end(),
2722
[&](const std::shared_ptr<ContextEdge> &A,
2723
const std::shared_ptr<ContextEdge> &B) {
2724
// Edges with non-empty context ids should be sorted before
2725
// those with empty context ids.
2726
if (A->ContextIds.empty())
2727
// Either B ContextIds are non-empty (in which case we
2728
// should return false because B < A), or B ContextIds
2729
// are empty, in which case they are equal, and we should
2730
// maintain the original relative ordering.
2731
return false;
2732
if (B->ContextIds.empty())
2733
return true;
2734
2735
if (A->AllocTypes == B->AllocTypes)
2736
// Use the first context id for each edge as a
2737
// tie-breaker.
2738
return *A->ContextIds.begin() < *B->ContextIds.begin();
2739
return AllocTypeCloningPriority[A->AllocTypes] <
2740
AllocTypeCloningPriority[B->AllocTypes];
2741
});
2742
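// Illustrative ordering (worked example, not from any particular profile):
// caller edges with alloc types {NotCold, Cold, NotColdCold} sort to
// {Cold, NotColdCold, NotCold} under the priorities above, so the greedy
// cloning below peels the Cold-only contexts off first and the NotCold
// edges are the ones left behind on the original node.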
2743
assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2744
2745
// Iterate until we find no more opportunities for disambiguating the alloc
2746
// types via cloning. In most cases this loop will terminate once the Node
2747
// has a single allocation type, in which case no more cloning is needed.
2748
// We need to be able to remove Edge from CallerEdges, so need to adjust
2749
// iterator inside the loop.
2750
for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
2751
auto CallerEdge = *EI;
2752
2753
// See if cloning the prior caller edge left this node with a single alloc
2754
// type or a single caller. In that case no more cloning of Node is needed.
2755
if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2756
break;
2757
2758
// Only need to process the ids along this edge pertaining to the given
2759
// allocation.
2760
auto CallerEdgeContextsForAlloc =
2761
set_intersection(CallerEdge->getContextIds(), AllocContextIds);
2762
if (CallerEdgeContextsForAlloc.empty()) {
2763
++EI;
2764
continue;
2765
}
2766
auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
2767
2768
// Compute the node callee edge alloc types corresponding to the context ids
2769
// for this caller edge.
2770
std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
2771
CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
2772
for (auto &CalleeEdge : Node->CalleeEdges)
2773
CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
2774
CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
2775
2776
// Don't clone if doing so will not disambiguate any alloc types amongst
2777
// caller edges (including the callee edges that would be cloned).
2778
// Otherwise we will simply move all edges to the clone.
2779
//
2780
// First check if by cloning we will disambiguate the caller allocation
2781
// type from node's allocation type. Query allocTypeToUse so that we don't
2782
// bother cloning to distinguish NotCold+Cold from NotCold. Note that
2783
// neither of these should be None type.
2784
//
2785
// Then check if by cloning node at least one of the callee edges will be
2786
// disambiguated by splitting out different context ids.
2787
assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
2788
assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2789
if (allocTypeToUse(CallerAllocTypeForAlloc) ==
2790
allocTypeToUse(Node->AllocTypes) &&
2791
allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
2792
CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
2793
++EI;
2794
continue;
2795
}
2796
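// Illustrative case: if this caller edge's contexts are NotCold+Cold and the
// node is NotCold+Cold (or plain NotCold), allocTypeToUse maps both sides to
// NotCold, so unless the callee edge context ids would also be split we take
// the continue above rather than creating a clone that changes nothing.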
2797
// First see if we can use an existing clone. Check each clone and its
2798
// callee edges for matching alloc types.
2799
ContextNode *Clone = nullptr;
2800
for (auto *CurClone : Node->Clones) {
2801
if (allocTypeToUse(CurClone->AllocTypes) !=
2802
allocTypeToUse(CallerAllocTypeForAlloc))
2803
continue;
2804
2805
if (!allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
2806
CalleeEdgeAllocTypesForCallerEdge, CurClone->CalleeEdges))
2807
continue;
2808
Clone = CurClone;
2809
break;
2810
}
2811
2812
// The edge iterator is adjusted when we move the CallerEdge to the clone.
2813
if (Clone)
2814
moveEdgeToExistingCalleeClone(CallerEdge, Clone, &EI, /*NewClone=*/false,
2815
CallerEdgeContextsForAlloc);
2816
else
2817
Clone =
2818
moveEdgeToNewCalleeClone(CallerEdge, &EI, CallerEdgeContextsForAlloc);
2819
2820
assert(EI == Node->CallerEdges.end() ||
2821
Node->AllocTypes != (uint8_t)AllocationType::None);
2822
// Sanity check that no alloc types on clone or its edges are None.
2823
assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
2824
}
2825
2826
// We should still have some context ids on the original Node.
2827
assert(!Node->emptyContextIds());
2828
2829
// Sanity check that no alloc types on node or edges are None.
2830
assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2831
2832
if (VerifyNodes)
2833
checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
2834
}
2835
2836
void ModuleCallsiteContextGraph::updateAllocationCall(
2837
CallInfo &Call, AllocationType AllocType) {
2838
std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
2839
auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
2840
"memprof", AllocTypeString);
2841
cast<CallBase>(Call.call())->addFnAttr(A);
2842
OREGetter(Call.call()->getFunction())
2843
.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
2844
<< ore::NV("AllocationCall", Call.call()) << " in clone "
2845
<< ore::NV("Caller", Call.call()->getFunction())
2846
<< " marked with memprof allocation attribute "
2847
<< ore::NV("Attribute", AllocTypeString));
2848
}
2849
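// For illustration, a cold allocation annotated by the IR path above ends up
// looking roughly like (hypothetical IR, attribute group number arbitrary):
//   %call = call ptr @_Znwm(i64 8) #3
//   attributes #3 = { "memprof"="cold" }
// The attribute string comes from getAllocTypeAttributeString; the ThinLTO
// backend handling below checks for the "cold" value.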
2850
void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
2851
AllocationType AllocType) {
2852
auto *AI = Call.call().dyn_cast<AllocInfo *>();
2853
assert(AI);
2854
assert(AI->Versions.size() > Call.cloneNo());
2855
AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
2856
}
2857
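// E.g. (assuming the usual AllocationType encoding, NotCold == 1, Cold == 2):
// Versions == {1, 2} records NotCold for the original copy of the function
// and Cold for clone 1; the ThinLTO backend in applyImport turns those into
// the corresponding "memprof" attributes on each clone's allocation call.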
2858
void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
2859
FuncInfo CalleeFunc) {
2860
if (CalleeFunc.cloneNo() > 0)
2861
cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
2862
OREGetter(CallerCall.call()->getFunction())
2863
.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
2864
<< ore::NV("Call", CallerCall.call()) << " in clone "
2865
<< ore::NV("Caller", CallerCall.call()->getFunction())
2866
<< " assigned to call function clone "
2867
<< ore::NV("Callee", CalleeFunc.func()));
2868
}
2869
2870
void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
2871
FuncInfo CalleeFunc) {
2872
auto *CI = CallerCall.call().dyn_cast<CallsiteInfo *>();
2873
assert(CI &&
2874
"Caller cannot be an allocation which should not have profiled calls");
2875
assert(CI->Clones.size() > CallerCall.cloneNo());
2876
CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
2877
}
2878
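// E.g. Clones == {0, 2} records that version 0 of this caller keeps calling
// the callee's original copy while caller clone 1 should call callee clone 2;
// applyImport later rewrites the cloned call to the correspondingly named
// function clone.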
2879
CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
2880
Instruction *>::FuncInfo
2881
ModuleCallsiteContextGraph::cloneFunctionForCallsite(
2882
FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
2883
std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
2884
// Use existing LLVM facilities for cloning and obtaining Call in clone
2885
ValueToValueMapTy VMap;
2886
auto *NewFunc = CloneFunction(Func.func(), VMap);
2887
std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
2888
assert(!Func.func()->getParent()->getFunction(Name));
2889
NewFunc->setName(Name);
2890
for (auto &Inst : CallsWithMetadataInFunc) {
2891
// This map always has the initial version in it.
2892
assert(Inst.cloneNo() == 0);
2893
CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
2894
}
2895
OREGetter(Func.func())
2896
.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
2897
<< "created clone " << ore::NV("NewFunction", NewFunc));
2898
return {NewFunc, CloneNo};
2899
}
2900
2901
CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
2902
IndexCall>::FuncInfo
2903
IndexCallsiteContextGraph::cloneFunctionForCallsite(
2904
FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
2905
std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
2906
// Check how many clones we have of Call (and therefore function).
2907
// The next clone number is the current size of versions array.
2908
// Confirm this matches the CloneNo provided by the caller, which is based on
2909
// the number of function clones we have.
2910
assert(CloneNo ==
2911
(Call.call().is<AllocInfo *>()
2912
? Call.call().dyn_cast<AllocInfo *>()->Versions.size()
2913
: Call.call().dyn_cast<CallsiteInfo *>()->Clones.size()));
2914
// Walk all the instructions in this function. Create a new version for
2915
// each (by adding an entry to the Versions/Clones summary array), and copy
2916
// over the version being called for the function clone being cloned here.
2917
// Additionally, add an entry to the CallMap for the new function clone,
2918
// mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
2919
// to the new call clone.
2920
for (auto &Inst : CallsWithMetadataInFunc) {
2921
// This map always has the initial version in it.
2922
assert(Inst.cloneNo() == 0);
2923
if (auto *AI = Inst.call().dyn_cast<AllocInfo *>()) {
2924
assert(AI->Versions.size() == CloneNo);
2925
// We assign the allocation type later (in updateAllocationCall), just add
2926
// an entry for it here.
2927
AI->Versions.push_back(0);
2928
} else {
2929
auto *CI = Inst.call().dyn_cast<CallsiteInfo *>();
2930
assert(CI && CI->Clones.size() == CloneNo);
2931
// We assign the clone number later (in updateCall), just add an entry for
2932
// it here.
2933
CI->Clones.push_back(0);
2934
}
2935
CallMap[Inst] = {Inst.call(), CloneNo};
2936
}
2937
return {Func.func(), CloneNo};
2938
}
2939
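// Sketch of the effect on the summary: if an AllocInfo had Versions == {1}
// (NotCold) and CloneNo == 1, it leaves here as {1, 0}, with slot 1 filled in
// later by updateAllocationCall. A CallsiteInfo's Clones similarly grows from
// {0} to {0, 0} until updateCall records the real callee clone number.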
2940
// This method assigns cloned callsites to functions, cloning the functions as
2941
// needed. The assignment is greedy and proceeds roughly as follows:
2942
//
2943
// For each function Func:
2944
// For each call with graph Node having clones:
2945
// Initialize ClonesWorklist to Node and its clones
2946
// Initialize NodeCloneCount to 0
2947
// While ClonesWorklist is not empty:
2948
// Clone = pop front ClonesWorklist
2949
// NodeCloneCount++
2950
// If Func has been cloned less than NodeCloneCount times:
2951
// If NodeCloneCount is 1:
2952
// Assign Clone to original Func
2953
// Continue
2954
// Create a new function clone
2955
// If other callers not assigned to call a function clone yet:
2956
// Assign them to call new function clone
2957
// Continue
2958
// Assign any other caller calling the cloned version to new clone
2959
//
2960
// For each caller of Clone:
2961
// If caller is assigned to call a specific function clone:
2962
// If we cannot assign Clone to that function clone:
2963
// Create new callsite Clone NewClone
2964
// Add NewClone to ClonesWorklist
2965
// Continue
2966
// Assign Clone to existing caller's called function clone
2967
// Else:
2968
// If Clone not already assigned to a function clone:
2969
// Assign to first function clone without assignment
2970
// Assign caller to selected function clone
2971
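// Worked example (hypothetical call graph): suppose foo() has one callsite
// whose node was split into {Node, Clone1}. Node is assigned to the original
// foo; Clone1 then forces creation of a function clone of foo, and every
// caller whose context reaches Clone1 is recorded in
// CallsiteToCalleeFuncCloneMap as calling that new clone, so its own call can
// be redirected when UpdateCalls runs at the end.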
template <typename DerivedCCG, typename FuncTy, typename CallTy>
2972
bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
2973
bool Changed = false;
2974
2975
// Keep track of the assignment of nodes (callsites) to function clones they
2976
// call.
2977
DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
2978
2979
// Update caller node to call function version CalleeFunc, by recording the
2980
// assignment in CallsiteToCalleeFuncCloneMap.
2981
auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
2982
const FuncInfo &CalleeFunc) {
2983
assert(Caller->hasCall());
2984
CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
2985
};
2986
2987
// Walk all functions for which we saw calls with memprof metadata, and handle
2988
// cloning for each of their calls.
2989
for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
2990
FuncInfo OrigFunc(Func);
2991
// Map from each clone of OrigFunc to a map of remappings of each call of
2992
// interest (from original uncloned call to the corresponding cloned call in
2993
// that function clone).
2994
std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap;
2995
for (auto &Call : CallsWithMetadata) {
2996
ContextNode *Node = getNodeForInst(Call);
2997
// Skip call if we do not have a node for it (all uses of its stack ids
2998
// were either on inlined chains or pruned from the MIBs), or if we did
2999
// not create any clones for it.
3000
if (!Node || Node->Clones.empty())
3001
continue;
3002
assert(Node->hasCall() &&
3003
"Not having a call should have prevented cloning");
3004
3005
// Track the assignment of function clones to clones of the current
3006
// callsite Node being handled.
3007
std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
3008
3009
// Assign callsite version CallsiteClone to function version FuncClone,
3010
// and also assign (possibly cloned) Call to CallsiteClone.
3011
auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
3012
CallInfo &Call,
3013
ContextNode *CallsiteClone,
3014
bool IsAlloc) {
3015
// Record the clone of callsite node assigned to this function clone.
3016
FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
3017
3018
assert(FuncClonesToCallMap.count(FuncClone));
3019
std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone];
3020
CallInfo CallClone(Call);
3021
if (CallMap.count(Call))
3022
CallClone = CallMap[Call];
3023
CallsiteClone->setCall(CallClone);
3024
};
3025
3026
// Keep track of the clones of callsite Node that need to be assigned to
3027
// function clones. This list may be expanded in the loop body below if we
3028
// find additional cloning is required.
3029
std::deque<ContextNode *> ClonesWorklist;
3030
// Ignore original Node if we moved all of its contexts to clones.
3031
if (!Node->emptyContextIds())
3032
ClonesWorklist.push_back(Node);
3033
ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(),
3034
Node->Clones.end());
3035
3036
// Now walk through all of the clones of this callsite Node that we need,
3037
// and determine the assignment to a corresponding clone of the current
3038
// function (creating new function clones as needed).
3039
unsigned NodeCloneCount = 0;
3040
while (!ClonesWorklist.empty()) {
3041
ContextNode *Clone = ClonesWorklist.front();
3042
ClonesWorklist.pop_front();
3043
NodeCloneCount++;
3044
if (VerifyNodes)
3045
checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
3046
3047
// Need to create a new function clone if we have more callsite clones
3048
// than existing function clones, which would have been assigned to an
3049
// earlier clone in the list (we assign callsite clones to function
3050
// clones greedily).
3051
if (FuncClonesToCallMap.size() < NodeCloneCount) {
3052
// If this is the first callsite copy, assign to original function.
3053
if (NodeCloneCount == 1) {
3054
// Since FuncClonesToCallMap is empty in this case, no clones have
3055
// been created for this function yet, and no callers should have
3056
// been assigned a function clone for this callee node yet.
3057
assert(llvm::none_of(
3058
Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
3059
return CallsiteToCalleeFuncCloneMap.count(E->Caller);
3060
}));
3061
// Initialize with empty call map, assign Clone to original function
3062
// and its callers, and skip to the next clone.
3063
FuncClonesToCallMap[OrigFunc] = {};
3064
AssignCallsiteCloneToFuncClone(
3065
OrigFunc, Call, Clone,
3066
AllocationCallToContextNodeMap.count(Call));
3067
for (auto &CE : Clone->CallerEdges) {
3068
// Ignore any caller that does not have a recorded callsite Call.
3069
if (!CE->Caller->hasCall())
3070
continue;
3071
RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
3072
}
3073
continue;
3074
}
3075
3076
// First locate which copy of OrigFunc to clone again. If a caller
3077
// of this callsite clone was already assigned to call a particular
3078
// function clone, we need to redirect all of those callers to the
3079
// new function clone, and update their other callees within this
3080
// function.
3081
FuncInfo PreviousAssignedFuncClone;
3082
auto EI = llvm::find_if(
3083
Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
3084
return CallsiteToCalleeFuncCloneMap.count(E->Caller);
3085
});
3086
bool CallerAssignedToCloneOfFunc = false;
3087
if (EI != Clone->CallerEdges.end()) {
3088
const std::shared_ptr<ContextEdge> &Edge = *EI;
3089
PreviousAssignedFuncClone =
3090
CallsiteToCalleeFuncCloneMap[Edge->Caller];
3091
CallerAssignedToCloneOfFunc = true;
3092
}
3093
3094
// Clone function and save it along with the CallInfo map created
3095
// during cloning in the FuncClonesToCallMap.
3096
std::map<CallInfo, CallInfo> NewCallMap;
3097
unsigned CloneNo = FuncClonesToCallMap.size();
3098
assert(CloneNo > 0 && "Clone 0 is the original function, which "
3099
"should already exist in the map");
3100
FuncInfo NewFuncClone = cloneFunctionForCallsite(
3101
OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
3102
FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap));
3103
FunctionClonesAnalysis++;
3104
Changed = true;
3105
3106
// If no caller callsites were already assigned to a clone of this
3107
// function, we can simply assign this clone to the new func clone
3108
// and update all callers to it, then skip to the next clone.
3109
if (!CallerAssignedToCloneOfFunc) {
3110
AssignCallsiteCloneToFuncClone(
3111
NewFuncClone, Call, Clone,
3112
AllocationCallToContextNodeMap.count(Call));
3113
for (auto &CE : Clone->CallerEdges) {
3114
// Ignore any caller that does not have a recorded callsite Call.
3115
if (!CE->Caller->hasCall())
3116
continue;
3117
RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
3118
}
3119
continue;
3120
}
3121
3122
// We may need to do additional node cloning in this case.
3123
// Reset the CallsiteToCalleeFuncCloneMap entry for any callers
3124
// that were previously assigned to call PreviousAssignedFuncClone,
3125
// to record that they now call NewFuncClone.
3126
for (auto CE : Clone->CallerEdges) {
3127
// Skip any that have been removed on an earlier iteration.
3128
if (!CE)
3129
continue;
3130
// Ignore any caller that does not have a recorded callsite Call.
3131
if (!CE->Caller->hasCall())
3132
continue;
3133
3134
if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
3135
// We subsequently fall through to later handling that
3136
// will perform any additional cloning required for
3137
// callers that were calling other function clones.
3138
CallsiteToCalleeFuncCloneMap[CE->Caller] !=
3139
PreviousAssignedFuncClone)
3140
continue;
3141
3142
RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
3143
3144
// If we are cloning a function that was already assigned to some
3145
// callers, then essentially we are creating new callsite clones
3146
// of the other callsites in that function that are reached by those
3147
// callers. Clone the other callees of the current callsite's caller
3148
// that were already assigned to PreviousAssignedFuncClone
3149
// accordingly. This is important since we subsequently update the
3150
// calls from the nodes in the graph and their assignments to callee
3151
// functions recorded in CallsiteToCalleeFuncCloneMap.
3152
for (auto CalleeEdge : CE->Caller->CalleeEdges) {
3153
// Skip any that have been removed on an earlier iteration when
3154
// cleaning up newly None type callee edges.
3155
if (!CalleeEdge)
3156
continue;
3157
ContextNode *Callee = CalleeEdge->Callee;
3158
// Skip the current callsite, we are looking for other
3159
// callsites Caller calls, as well as any that does not have a
3160
// recorded callsite Call.
3161
if (Callee == Clone || !Callee->hasCall())
3162
continue;
3163
ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
3164
removeNoneTypeCalleeEdges(NewClone);
3165
// Moving the edge may have resulted in some none type
3166
// callee edges on the original Callee.
3167
removeNoneTypeCalleeEdges(Callee);
3168
assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
3169
// If the Callee node was already assigned to call a specific
3170
// function version, make sure its new clone is assigned to call
3171
// that same function clone.
3172
if (CallsiteToCalleeFuncCloneMap.count(Callee))
3173
RecordCalleeFuncOfCallsite(
3174
NewClone, CallsiteToCalleeFuncCloneMap[Callee]);
3175
// Update NewClone with the new Call clone of this callsite's Call
3176
// created for the new function clone created earlier.
3177
// Recall that we have already ensured when building the graph
3178
// that each caller can only call callsites within the same
3179
// function, so we are guaranteed that Callee Call is in the
3180
// current OrigFunc.
3181
// CallMap is set up as indexed by original Call at clone 0.
3182
CallInfo OrigCall(Callee->getOrigNode()->Call);
3183
OrigCall.setCloneNo(0);
3184
std::map<CallInfo, CallInfo> &CallMap =
3185
FuncClonesToCallMap[NewFuncClone];
3186
assert(CallMap.count(OrigCall));
3187
CallInfo NewCall(CallMap[OrigCall]);
3188
assert(NewCall);
3189
NewClone->setCall(NewCall);
3190
}
3191
}
3192
// Fall through to handling below to perform the recording of the
3193
// function for this callsite clone. This enables handling of cases
3194
// where the callers were assigned to different clones of a function.
3195
}
3196
3197
// See if we can use existing function clone. Walk through
3198
// all caller edges to see if any have already been assigned to
3199
// a clone of this callsite's function. If we can use it, do so. If not,
3200
// because that function clone is already assigned to a different clone
3201
// of this callsite, then we need to clone again.
3202
// Basically, this checking is needed to handle the case where different
3203
// caller functions/callsites may need versions of this function
3204
// containing different mixes of callsite clones across the different
3205
// callsites within the function. If that happens, we need to create
3206
// additional function clones to handle the various combinations.
3207
//
3208
// Keep track of any new clones of this callsite created by the
3209
// following loop, as well as any existing clone that we decided to
3210
// assign this clone to.
3211
std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
3212
FuncInfo FuncCloneAssignedToCurCallsiteClone;
3213
// We need to be able to remove Edge from CallerEdges, so need to adjust
3214
// iterator in the loop.
3215
for (auto EI = Clone->CallerEdges.begin();
3216
EI != Clone->CallerEdges.end();) {
3217
auto Edge = *EI;
3218
// Ignore any caller that does not have a recorded callsite Call.
3219
if (!Edge->Caller->hasCall()) {
3220
EI++;
3221
continue;
3222
}
3223
// If this caller already assigned to call a version of OrigFunc, need
3224
// to ensure we can assign this callsite clone to that function clone.
3225
if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
3226
FuncInfo FuncCloneCalledByCaller =
3227
CallsiteToCalleeFuncCloneMap[Edge->Caller];
3228
// First we need to confirm that this function clone is available
3229
// for use by this callsite node clone.
3230
//
3231
// While FuncCloneToCurNodeCloneMap is built only for this Node and
3232
// its callsite clones, one of those callsite clones X could have
3233
// been assigned to the same function clone called by Edge's caller
3234
// - if Edge's caller calls another callsite within Node's original
3235
// function, and that callsite has another caller reaching clone X.
3236
// We need to clone Node again in this case.
3237
if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
3238
FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
3239
Clone) ||
3240
// Detect when we have multiple callers of this callsite that
3241
// have already been assigned to specific, and different, clones
3242
// of OrigFunc (due to other unrelated callsites in Func they
3243
// reach via call contexts). Is this Clone of callsite Node
3244
// assigned to a different clone of OrigFunc? If so, clone Node
3245
// again.
3246
(FuncCloneAssignedToCurCallsiteClone &&
3247
FuncCloneAssignedToCurCallsiteClone !=
3248
FuncCloneCalledByCaller)) {
3249
// We need to use a different newly created callsite clone, in
3250
// order to assign it to another new function clone on a
3251
// subsequent iteration over the Clones array (adjusted below).
3252
// Note we specifically do not reset the
3253
// CallsiteToCalleeFuncCloneMap entry for this caller, so that
3254
// when this new clone is processed later we know which version of
3255
// the function to copy (so that other callsite clones we have
3256
// assigned to that function clone are properly cloned over). See
3257
// comments in the function cloning handling earlier.
3258
3259
// Check if we already have cloned this callsite again while
3260
// walking through caller edges, for a caller calling the same
3261
// function clone. If so, we can move this edge to that new clone
3262
// rather than creating yet another new clone.
3263
if (FuncCloneToNewCallsiteCloneMap.count(
3264
FuncCloneCalledByCaller)) {
3265
ContextNode *NewClone =
3266
FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
3267
moveEdgeToExistingCalleeClone(Edge, NewClone, &EI);
3268
// Cleanup any none type edges cloned over.
3269
removeNoneTypeCalleeEdges(NewClone);
3270
} else {
3271
// Create a new callsite clone.
3272
ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge, &EI);
3273
removeNoneTypeCalleeEdges(NewClone);
3274
FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
3275
NewClone;
3276
// Add to list of clones and process later.
3277
ClonesWorklist.push_back(NewClone);
3278
assert(EI == Clone->CallerEdges.end() ||
3279
Clone->AllocTypes != (uint8_t)AllocationType::None);
3280
assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
3281
}
3282
// Moving the caller edge may have resulted in some none type
3283
// callee edges.
3284
removeNoneTypeCalleeEdges(Clone);
3285
// We will handle the newly created callsite clone in a subsequent
3286
// iteration over this Node's Clones. Continue here since we
3287
// already adjusted iterator EI while moving the edge.
3288
continue;
3289
}
3290
3291
// Otherwise, we can use the function clone already assigned to this
3292
// caller.
3293
if (!FuncCloneAssignedToCurCallsiteClone) {
3294
FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
3295
// Assign Clone to FuncCloneCalledByCaller
3296
AssignCallsiteCloneToFuncClone(
3297
FuncCloneCalledByCaller, Call, Clone,
3298
AllocationCallToContextNodeMap.count(Call));
3299
} else
3300
// Don't need to do anything - callsite is already calling this
3301
// function clone.
3302
assert(FuncCloneAssignedToCurCallsiteClone ==
3303
FuncCloneCalledByCaller);
3304
3305
} else {
3306
// We have not already assigned this caller to a version of
3307
// OrigFunc. Do the assignment now.
3308
3309
// First check if we have already assigned this callsite clone to a
3310
// clone of OrigFunc for another caller during this iteration over
3311
// its caller edges.
3312
if (!FuncCloneAssignedToCurCallsiteClone) {
3313
// Find first function in FuncClonesToCallMap without an assigned
3314
// clone of this callsite Node. We should always have one
3315
// available at this point due to the earlier cloning when the
3316
// FuncClonesToCallMap size was smaller than the clone number.
3317
for (auto &CF : FuncClonesToCallMap) {
3318
if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
3319
FuncCloneAssignedToCurCallsiteClone = CF.first;
3320
break;
3321
}
3322
}
3323
assert(FuncCloneAssignedToCurCallsiteClone);
3324
// Assign Clone to FuncCloneAssignedToCurCallsiteClone
3325
AssignCallsiteCloneToFuncClone(
3326
FuncCloneAssignedToCurCallsiteClone, Call, Clone,
3327
AllocationCallToContextNodeMap.count(Call));
3328
} else
3329
assert(FuncCloneToCurNodeCloneMap
3330
[FuncCloneAssignedToCurCallsiteClone] == Clone);
3331
// Update callers to record function version called.
3332
RecordCalleeFuncOfCallsite(Edge->Caller,
3333
FuncCloneAssignedToCurCallsiteClone);
3334
}
3335
3336
EI++;
3337
}
3338
}
3339
if (VerifyCCG) {
3340
checkNode<DerivedCCG, FuncTy, CallTy>(Node);
3341
for (const auto &PE : Node->CalleeEdges)
3342
checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
3343
for (const auto &CE : Node->CallerEdges)
3344
checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
3345
for (auto *Clone : Node->Clones) {
3346
checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
3347
for (const auto &PE : Clone->CalleeEdges)
3348
checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
3349
for (const auto &CE : Clone->CallerEdges)
3350
checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
3351
}
3352
}
3353
}
3354
}
3355
3356
auto UpdateCalls = [&](ContextNode *Node,
3357
DenseSet<const ContextNode *> &Visited,
3358
auto &&UpdateCalls) {
3359
auto Inserted = Visited.insert(Node);
3360
if (!Inserted.second)
3361
return;
3362
3363
for (auto *Clone : Node->Clones)
3364
UpdateCalls(Clone, Visited, UpdateCalls);
3365
3366
for (auto &Edge : Node->CallerEdges)
3367
UpdateCalls(Edge->Caller, Visited, UpdateCalls);
3368
3369
// Skip if either no call to update, or if we ended up with no context ids
3370
// (we moved all edges onto other clones).
3371
if (!Node->hasCall() || Node->emptyContextIds())
3372
return;
3373
3374
if (Node->IsAllocation) {
3375
updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes));
3376
return;
3377
}
3378
3379
if (!CallsiteToCalleeFuncCloneMap.count(Node))
3380
return;
3381
3382
auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
3383
updateCall(Node->Call, CalleeFunc);
3384
};
3385
3386
// Performs DFS traversal starting from allocation nodes to update calls to
3387
// reflect cloning decisions recorded earlier. For regular LTO this will
3388
// update the actual calls in the IR to call the appropriate function clone
3389
// (and add attributes to allocation calls), whereas for ThinLTO the decisions
3390
// are recorded in the summary entries.
3391
DenseSet<const ContextNode *> Visited;
3392
for (auto &Entry : AllocationCallToContextNodeMap)
3393
UpdateCalls(Entry.second, Visited, UpdateCalls);
3394
3395
return Changed;
3396
}
3397
3398
static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
3399
Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
3400
std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
3401
&FuncToAliasMap) {
3402
// The first "clone" is the original copy, we should only call this if we
3403
// needed to create new clones.
3404
assert(NumClones > 1);
3405
SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
3406
VMaps.reserve(NumClones - 1);
3407
FunctionsClonedThinBackend++;
3408
for (unsigned I = 1; I < NumClones; I++) {
3409
VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
3410
auto *NewF = CloneFunction(&F, *VMaps.back());
3411
FunctionClonesThinBackend++;
3412
// Strip memprof and callsite metadata from clone as they are no longer
3413
// needed.
3414
for (auto &BB : *NewF) {
3415
for (auto &Inst : BB) {
3416
Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
3417
Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
3418
}
3419
}
3420
std::string Name = getMemProfFuncName(F.getName(), I);
3421
auto *PrevF = M.getFunction(Name);
3422
if (PrevF) {
3423
// We might have created this when adjusting a callsite in another
3424
// function. It should be a declaration.
3425
assert(PrevF->isDeclaration());
3426
NewF->takeName(PrevF);
3427
PrevF->replaceAllUsesWith(NewF);
3428
PrevF->eraseFromParent();
3429
} else
3430
NewF->setName(Name);
3431
ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
3432
<< "created clone " << ore::NV("NewFunction", NewF));
3433
3434
// Now handle aliases to this function, and clone those as well.
3435
if (!FuncToAliasMap.count(&F))
3436
continue;
3437
for (auto *A : FuncToAliasMap[&F]) {
3438
std::string Name = getMemProfFuncName(A->getName(), I);
3439
auto *PrevA = M.getNamedAlias(Name);
3440
auto *NewA = GlobalAlias::create(A->getValueType(),
3441
A->getType()->getPointerAddressSpace(),
3442
A->getLinkage(), Name, NewF);
3443
NewA->copyAttributesFrom(A);
3444
if (PrevA) {
3445
// We might have created this when adjusting a callsite in another
3446
// function. It should be a declaration.
3447
assert(PrevA->isDeclaration());
3448
NewA->takeName(PrevA);
3449
PrevA->replaceAllUsesWith(NewA);
3450
PrevA->eraseFromParent();
3451
}
3452
}
3453
}
3454
return VMaps;
3455
}
3456
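// Illustrative outcome (assuming the ".memprof.<N>" suffix produced by
// getMemProfFuncName): NumClones == 3 for foo creates foo.memprof.1 and
// foo.memprof.2, each stripped of !memprof/!callsite metadata, and an alias
// bar -> foo additionally gets bar.memprof.1 / bar.memprof.2 pointing at the
// corresponding function clones.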
3457
// Locate the summary for F. This is complicated by the fact that it might
3458
// have been internalized or promoted.
3459
static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
3460
const ModuleSummaryIndex *ImportSummary) {
3461
// FIXME: Ideally we would retain the original GUID in some fashion on the
3462
// function (e.g. as metadata), but for now do our best to locate the
3463
// summary without that information.
3464
ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
3465
if (!TheFnVI)
3466
// See if theFn was internalized, by checking index directly with
3467
// original name (this avoids the name adjustment done by getGUID() for
3468
// internal symbols).
3469
TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName()));
3470
if (TheFnVI)
3471
return TheFnVI;
3472
// Now query with the original name before any promotion was performed.
3473
StringRef OrigName =
3474
ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
3475
std::string OrigId = GlobalValue::getGlobalIdentifier(
3476
OrigName, GlobalValue::InternalLinkage, M.getSourceFileName());
3477
TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId));
3478
if (TheFnVI)
3479
return TheFnVI;
3480
// Could be a promoted local imported from another module. We need to pass
3481
// down more info here to find the original module id. For now, try with
3482
// the OrigName which might have been stored in the OidGuidMap in the
3483
// index. This would not work if there were same-named locals in multiple
3484
// modules, however.
3485
auto OrigGUID =
3486
ImportSummary->getGUIDFromOriginalID(GlobalValue::getGUID(OrigName));
3487
if (OrigGUID)
3488
TheFnVI = ImportSummary->getValueInfo(OrigGUID);
3489
return TheFnVI;
3490
}
3491
3492
bool MemProfContextDisambiguation::applyImport(Module &M) {
3493
assert(ImportSummary);
3494
bool Changed = false;
3495
3496
auto IsMemProfClone = [](const Function &F) {
3497
return F.getName().contains(MemProfCloneSuffix);
3498
};
3499
3500
// We also need to clone any aliases that reference cloned functions, because
3501
// the modified callsites may invoke via the alias. Keep track of the aliases
3502
// for each function.
3503
std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
3504
FuncToAliasMap;
3505
for (auto &A : M.aliases()) {
3506
auto *Aliasee = A.getAliaseeObject();
3507
if (auto *F = dyn_cast<Function>(Aliasee))
3508
FuncToAliasMap[F].insert(&A);
3509
}
3510
3511
for (auto &F : M) {
3512
if (F.isDeclaration() || IsMemProfClone(F))
3513
continue;
3514
3515
OptimizationRemarkEmitter ORE(&F);
3516
3517
SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
3518
bool ClonesCreated = false;
3519
unsigned NumClonesCreated = 0;
3520
auto CloneFuncIfNeeded = [&](unsigned NumClones) {
3521
// We should at least have version 0 which is the original copy.
3522
assert(NumClones > 0);
3523
// If only one copy needed use original.
3524
if (NumClones == 1)
3525
return;
3526
// If we already performed cloning of this function, confirm that the
3527
// requested number of clones matches (the thin link should ensure the
3528
// number of clones for each constituent callsite is consistent within
3529
// each function), before returning.
3530
if (ClonesCreated) {
3531
assert(NumClonesCreated == NumClones);
3532
return;
3533
}
3534
VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
3535
// The first "clone" is the original copy, which doesn't have a VMap.
3536
assert(VMaps.size() == NumClones - 1);
3537
Changed = true;
3538
ClonesCreated = true;
3539
NumClonesCreated = NumClones;
3540
};
3541
3542
auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
3543
Function *CalledFunction) {
3544
// Perform cloning if not yet done.
3545
CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
3546
3547
// Should have skipped indirect calls via mayHaveMemprofSummary.
3548
assert(CalledFunction);
3549
assert(!IsMemProfClone(*CalledFunction));
3550
3551
// Update the calls per the summary info.
3552
// Save orig name since it gets updated in the first iteration
3553
// below.
3554
auto CalleeOrigName = CalledFunction->getName();
3555
for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
3556
// Do nothing if this version calls the original version of its
3557
// callee.
3558
if (!StackNode.Clones[J])
3559
continue;
3560
auto NewF = M.getOrInsertFunction(
3561
getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
3562
CalledFunction->getFunctionType());
3563
CallBase *CBClone;
3564
// Copy 0 is the original function.
3565
if (!J)
3566
CBClone = CB;
3567
else
3568
CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
3569
CBClone->setCalledFunction(NewF);
3570
ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
3571
<< ore::NV("Call", CBClone) << " in clone "
3572
<< ore::NV("Caller", CBClone->getFunction())
3573
<< " assigned to call function clone "
3574
<< ore::NV("Callee", NewF.getCallee()));
3575
}
3576
};
3577
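// Worked example (hypothetical summary): StackNode.Clones == {0, 1} for a
// call to bar() means version 0 of this function keeps calling bar(), while
// clone 1 (found through VMaps[0]) is redirected to bar's clone 1, inserting
// a declaration for it via getOrInsertFunction if the definition lives in
// another module.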
3578
// Locate the summary for F.
3579
ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
3580
// If not found, this could be an imported local (see comment in
3581
// findValueInfoForFunc). Skip for now as it will be cloned in its original
3582
// module (where it would have been promoted to global scope so should
3583
// satisfy any reference in this module).
3584
if (!TheFnVI)
3585
continue;
3586
3587
auto *GVSummary =
3588
ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
3589
if (!GVSummary) {
3590
// Must have been imported, use the summary which matches the definition
3591
// (might be multiple if this was a linkonce_odr).
3592
auto SrcModuleMD = F.getMetadata("thinlto_src_module");
3593
assert(SrcModuleMD &&
3594
"enable-import-metadata is needed to emit thinlto_src_module");
3595
StringRef SrcModule =
3596
dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
3597
for (auto &GVS : TheFnVI.getSummaryList()) {
3598
if (GVS->modulePath() == SrcModule) {
3599
GVSummary = GVS.get();
3600
break;
3601
}
3602
}
3603
assert(GVSummary && GVSummary->modulePath() == SrcModule);
3604
}
3605
3606
// If this was an imported alias skip it as we won't have the function
3607
// summary, and it should be cloned in the original module.
3608
if (isa<AliasSummary>(GVSummary))
3609
continue;
3610
3611
auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
3612
3613
if (FS->allocs().empty() && FS->callsites().empty())
3614
continue;
3615
3616
auto SI = FS->callsites().begin();
3617
auto AI = FS->allocs().begin();
3618
3619
// To handle callsite infos synthesized for tail calls which have missing
3620
// frames in the profiled context, map callee VI to the synthesized callsite
3621
// info.
3622
DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
3623
// Iterate the callsites for this function in reverse, since we place all
3624
// those synthesized for tail calls at the end.
3625
for (auto CallsiteIt = FS->callsites().rbegin();
3626
CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
3627
auto &Callsite = *CallsiteIt;
3628
// Stop as soon as we see a non-synthesized callsite info (see comment
3629
// above loop). All the entries added for discovered tail calls have empty
3630
// stack ids.
3631
if (!Callsite.StackIdIndices.empty())
3632
break;
3633
MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
3634
}
3635
3636
// Assume for now that the instructions are in the exact same order
3637
// as when the summary was created, but confirm this is correct by
3638
// matching the stack ids.
3639
for (auto &BB : F) {
3640
for (auto &I : BB) {
3641
auto *CB = dyn_cast<CallBase>(&I);
3642
// Same handling as when creating module summary.
3643
if (!mayHaveMemprofSummary(CB))
3644
continue;
3645
3646
auto *CalledValue = CB->getCalledOperand();
3647
auto *CalledFunction = CB->getCalledFunction();
3648
if (CalledValue && !CalledFunction) {
3649
CalledValue = CalledValue->stripPointerCasts();
3650
// Stripping pointer casts can reveal a called function.
3651
CalledFunction = dyn_cast<Function>(CalledValue);
3652
}
3653
// Check if this is an alias to a function. If so, get the
3654
// called aliasee for the checks below.
3655
if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
3656
assert(!CalledFunction &&
3657
"Expected null called function in callsite for alias");
3658
CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
3659
}
3660
3661
CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
3662
I.getMetadata(LLVMContext::MD_callsite));
3663
auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
3664
3665
// Include allocs that were already assigned a memprof function
3666
// attribute in the statistics.
3667
if (CB->getAttributes().hasFnAttr("memprof")) {
3668
assert(!MemProfMD);
3669
CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
3670
? AllocTypeColdThinBackend++
3671
: AllocTypeNotColdThinBackend++;
3672
OrigAllocsThinBackend++;
3673
AllocVersionsThinBackend++;
3674
if (!MaxAllocVersionsThinBackend)
3675
MaxAllocVersionsThinBackend = 1;
3676
// Remove any remaining callsite metadata and we can skip the rest of
3677
// the handling for this instruction, since no cloning needed.
3678
I.setMetadata(LLVMContext::MD_callsite, nullptr);
3679
continue;
3680
}
3681
3682
if (MemProfMD) {
3683
// Consult the next alloc node.
3684
assert(AI != FS->allocs().end());
3685
auto &AllocNode = *(AI++);
3686
3687
// Sanity check that the MIB stack ids match between the summary and
3688
// instruction metadata.
3689
auto MIBIter = AllocNode.MIBs.begin();
3690
for (auto &MDOp : MemProfMD->operands()) {
3691
assert(MIBIter != AllocNode.MIBs.end());
3692
LLVM_ATTRIBUTE_UNUSED auto StackIdIndexIter =
3693
MIBIter->StackIdIndices.begin();
3694
auto *MIBMD = cast<const MDNode>(MDOp);
3695
MDNode *StackMDNode = getMIBStackNode(MIBMD);
3696
assert(StackMDNode);
3697
CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
3698
auto ContextIterBegin =
3699
StackContext.beginAfterSharedPrefix(CallsiteContext);
3700
// Skip the checking on the first iteration.
3701
uint64_t LastStackContextId =
3702
(ContextIterBegin != StackContext.end() &&
3703
*ContextIterBegin == 0)
3704
? 1
3705
: 0;
3706
for (auto ContextIter = ContextIterBegin;
3707
ContextIter != StackContext.end(); ++ContextIter) {
3708
// If this is a direct recursion, simply skip the duplicate
3709
// entries, to be consistent with how the summary ids were
3710
// generated during ModuleSummaryAnalysis.
3711
if (LastStackContextId == *ContextIter)
3712
continue;
3713
LastStackContextId = *ContextIter;
3714
assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
3715
assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
3716
*ContextIter);
3717
StackIdIndexIter++;
3718
}
3719
MIBIter++;
3720
}
3721
3722
// Perform cloning if not yet done.
3723
CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());
3724
3725
OrigAllocsThinBackend++;
3726
AllocVersionsThinBackend += AllocNode.Versions.size();
3727
if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
3728
MaxAllocVersionsThinBackend = AllocNode.Versions.size();
3729
3730
// If there is only one version that means we didn't end up
3731
// considering this function for cloning, and in that case the alloc
3732
// will still be none type or should have gotten the default NotCold.
3733
// Skip that after calling clone helper since that does some sanity
3734
// checks that confirm we haven't decided yet that we need cloning.
3735
if (AllocNode.Versions.size() == 1) {
3736
assert((AllocationType)AllocNode.Versions[0] ==
3737
AllocationType::NotCold ||
3738
(AllocationType)AllocNode.Versions[0] ==
3739
AllocationType::None);
3740
UnclonableAllocsThinBackend++;
3741
continue;
3742
}
3743
3744
// All versions should have a singular allocation type.
3745
assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
3746
return Type == ((uint8_t)AllocationType::NotCold |
3747
(uint8_t)AllocationType::Cold);
3748
}));
3749
3750
// Update the allocation types per the summary info.
3751
for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
3752
// Ignore any that didn't get an assigned allocation type.
3753
if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
3754
continue;
3755
AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
3756
AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
3757
: AllocTypeNotColdThinBackend++;
3758
std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
3759
auto A = llvm::Attribute::get(F.getContext(), "memprof",
3760
AllocTypeString);
3761
CallBase *CBClone;
3762
// Copy 0 is the original function.
3763
if (!J)
3764
CBClone = CB;
3765
else
3766
// Since VMaps are only created for new clones, we index with
3767
// clone J-1 (J==0 is the original clone and does not have a VMaps
3768
// entry).
3769
CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
3770
CBClone->addFnAttr(A);
3771
ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
3772
<< ore::NV("AllocationCall", CBClone) << " in clone "
3773
<< ore::NV("Caller", CBClone->getFunction())
3774
<< " marked with memprof allocation attribute "
3775
<< ore::NV("Attribute", AllocTypeString));
3776
}
3777
} else if (!CallsiteContext.empty()) {
3778
// Consult the next callsite node.
3779
assert(SI != FS->callsites().end());
3780
auto &StackNode = *(SI++);
3781
3782
#ifndef NDEBUG
3783
// Sanity check that the stack ids match between the summary and
3784
// instruction metadata.
3785
auto StackIdIndexIter = StackNode.StackIdIndices.begin();
3786
for (auto StackId : CallsiteContext) {
3787
assert(StackIdIndexIter != StackNode.StackIdIndices.end());
3788
assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
3789
StackId);
3790
StackIdIndexIter++;
3791
}
3792
#endif
3793
3794
CloneCallsite(StackNode, CB, CalledFunction);
3795
} else if (CB->isTailCall()) {
3796
// Locate the synthesized callsite info for the callee VI, if any was
3797
// created, and use that for cloning.
3798
ValueInfo CalleeVI =
3799
findValueInfoForFunc(*CalledFunction, M, ImportSummary);
3800
if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
3801
auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
3802
assert(Callsite != MapTailCallCalleeVIToCallsite.end());
3803
CloneCallsite(Callsite->second, CB, CalledFunction);
3804
}
3805
}
3806
// Memprof and callsite metadata on memory allocations no longer needed.
3807
I.setMetadata(LLVMContext::MD_memprof, nullptr);
3808
I.setMetadata(LLVMContext::MD_callsite, nullptr);
3809
}
3810
}
3811
}
3812
3813
return Changed;
3814
}
3815
3816
template <typename DerivedCCG, typename FuncTy, typename CallTy>
3817
bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
3818
if (DumpCCG) {
3819
dbgs() << "CCG before cloning:\n";
3820
dbgs() << *this;
3821
}
3822
if (ExportToDot)
3823
exportToDot("postbuild");
3824
3825
if (VerifyCCG) {
3826
check();
3827
}
3828
3829
identifyClones();
3830
3831
if (VerifyCCG) {
3832
check();
3833
}
3834
3835
if (DumpCCG) {
3836
dbgs() << "CCG after cloning:\n";
3837
dbgs() << *this;
3838
}
3839
if (ExportToDot)
3840
exportToDot("cloned");
3841
3842
bool Changed = assignFunctions();
3843
3844
if (DumpCCG) {
3845
dbgs() << "CCG after assigning function clones:\n";
3846
dbgs() << *this;
3847
}
3848
if (ExportToDot)
3849
exportToDot("clonefuncassign");
3850
3851
if (MemProfReportHintedSizes)
3852
printTotalSizes(errs());
3853
3854
return Changed;
3855
}
3856
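// Debugging sketch (flag spellings are assumptions; check the cl::opt
// definitions near the top of this file): the stages above can be inspected
// with something like
//   opt -passes=memprof-context-disambiguation \
//       -memprof-dump-ccg -memprof-export-to-dot -memprof-verify-ccg in.bc
// which dumps/exports the graph post-build, post-cloning, and after function
// assignment.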
3857
bool MemProfContextDisambiguation::processModule(
3858
Module &M,
3859
llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
3860
3861
// If we have an import summary, then the cloning decisions were made during
3862
// the thin link on the index. Apply them and return.
3863
if (ImportSummary)
3864
return applyImport(M);
3865
3866
// TODO: If/when other types of memprof cloning are enabled beyond just for
3867
// hot and cold, we will need to change this to individually control the
3868
// AllocationType passed to addStackNodesForMIB during CCG construction.
3869
// Note that we specifically check this after applying imports above, so that
3870
// the option does not need to be passed to distributed ThinLTO backend
3871
// clang processes, which won't necessarily have visibility into the linker
3872
// dependences. Instead the information is communicated from the LTO link to
3873
// the backends via the combined summary index.
3874
if (!SupportsHotColdNew)
3875
return false;
3876
3877
ModuleCallsiteContextGraph CCG(M, OREGetter);
3878
return CCG.process();
3879
}
3880
3881
MemProfContextDisambiguation::MemProfContextDisambiguation(
3882
const ModuleSummaryIndex *Summary)
3883
: ImportSummary(Summary) {
3884
if (ImportSummary) {
3885
// The MemProfImportSummary should only be used for testing ThinLTO
3886
// distributed backend handling via opt, in which case we don't have a
3887
// summary from the pass pipeline.
3888
assert(MemProfImportSummary.empty());
3889
return;
3890
}
3891
if (MemProfImportSummary.empty())
3892
return;
3893
3894
auto ReadSummaryFile =
3895
errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
3896
if (!ReadSummaryFile) {
3897
logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
3898
"Error loading file '" + MemProfImportSummary +
3899
"': ");
3900
return;
3901
}
3902
auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
3903
if (!ImportSummaryForTestingOrErr) {
3904
logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
3905
"Error parsing file '" + MemProfImportSummary +
3906
"': ");
3907
return;
3908
}
3909
ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
3910
ImportSummary = ImportSummaryForTesting.get();
3911
}
3912
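// Testing sketch (option spelling is an assumption; see the
// MemProfImportSummary cl::opt referenced above): distributed ThinLTO backend
// handling can be exercised directly with something like
//   opt -passes=memprof-context-disambiguation \
//       -memprof-import-summary=main.thinlto.bc main.bc
// which takes the ImportSummary path in processModule() instead of building
// and cloning a new graph.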
3913
PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
3914
ModuleAnalysisManager &AM) {
3915
auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
3916
auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
3917
return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
3918
};
3919
if (!processModule(M, OREGetter))
3920
return PreservedAnalyses::all();
3921
return PreservedAnalyses::none();
3922
}
3923
3924
void MemProfContextDisambiguation::run(
3925
ModuleSummaryIndex &Index,
3926
llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
3927
isPrevailing) {
3928
// TODO: If/when other types of memprof cloning are enabled beyond just for
3929
// hot and cold, we will need to change this to individually control the
3930
// AllocationType passed to addStackNodesForMIB during CCG construction.
3931
// The index was set from the option, so these should be in sync.
3932
assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
3933
if (!SupportsHotColdNew)
3934
return;
3935
3936
IndexCallsiteContextGraph CCG(Index, isPrevailing);
3937
CCG.process();
3938
}
3939
3940