Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/compiler-rt/lib/ctx_profile/RootAutoDetector.cpp
213766 views
1
//===- RootAutodetector.cpp - detect contextual profiling roots -----------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
9
#include "RootAutoDetector.h"
10
11
#include "CtxInstrProfiling.h"
12
#include "sanitizer_common/sanitizer_common.h"
13
#include "sanitizer_common/sanitizer_placement_new.h" // IWYU pragma: keep (DenseMap)
14
#include <assert.h>
15
#include <dlfcn.h>
16
#include <pthread.h>
17
18
using namespace __ctx_profile;
19
template <typename T> using Set = DenseMap<T, bool>;
20
21
namespace __sanitizer {
22
void BufferedStackTrace::UnwindImpl(uptr pc, uptr bp, void *context,
23
bool request_fast, u32 max_depth) {
24
// We can't implement the fast variant. The fast variant ends up invoking an
25
// external allocator, because of pthread_attr_getstack. If this happens
26
// during an allocation of the program being instrumented, a non-reentrant
27
// lock may be taken (this was observed). The allocator called by
28
// pthread_attr_getstack will also try to take that lock.
29
UnwindSlow(pc, max_depth);
30
}
31
} // namespace __sanitizer
32
33
RootAutoDetector::PerThreadSamples::PerThreadSamples(RootAutoDetector &Parent) {
34
GenericScopedLock<SpinMutex> L(&Parent.AllSamplesMutex);
35
Parent.AllSamples.PushBack(this);
36
}
37
38
void RootAutoDetector::start() {
39
atomic_store_relaxed(&Self, reinterpret_cast<uintptr_t>(this));
40
pthread_create(
41
&WorkerThread, nullptr,
42
+[](void *Ctx) -> void * {
43
RootAutoDetector *RAD = reinterpret_cast<RootAutoDetector *>(Ctx);
44
SleepForSeconds(RAD->WaitSeconds);
45
// To avoid holding the AllSamplesMutex, make a snapshot of all the
46
// thread samples collected so far
47
Vector<PerThreadSamples *> SamplesSnapshot;
48
{
49
GenericScopedLock<SpinMutex> M(&RAD->AllSamplesMutex);
50
SamplesSnapshot.Resize(RAD->AllSamples.Size());
51
for (uptr I = 0; I < RAD->AllSamples.Size(); ++I)
52
SamplesSnapshot[I] = RAD->AllSamples[I];
53
}
54
DenseMap<uptr, uint64_t> AllRoots;
55
for (uptr I = 0; I < SamplesSnapshot.Size(); ++I) {
56
GenericScopedLock<SpinMutex>(&SamplesSnapshot[I]->M);
57
SamplesSnapshot[I]->TrieRoot.determineRoots().forEach([&](auto &KVP) {
58
auto [FAddr, Count] = KVP;
59
AllRoots[FAddr] += Count;
60
return true;
61
});
62
}
63
// FIXME: as a next step, establish a minimum relative nr of samples
64
// per root that would qualify it as a root.
65
for (auto *FD = reinterpret_cast<FunctionData *>(
66
atomic_load_relaxed(&RAD->FunctionDataListHead));
67
FD; FD = FD->Next) {
68
if (AllRoots.contains(reinterpret_cast<uptr>(FD->EntryAddress))) {
69
if (canBeRoot(FD->CtxRoot)) {
70
FD->getOrAllocateContextRoot();
71
} else {
72
// FIXME: address this by informing the root detection algorithm
73
// to skip over such functions and pick the next down in the
74
// stack. At that point, this becomes an assert.
75
Printf("[ctxprof] Root auto-detector selected a musttail "
76
"function for root (%p). Ignoring\n",
77
FD->EntryAddress);
78
}
79
}
80
}
81
atomic_store_relaxed(&RAD->Self, 0);
82
return nullptr;
83
},
84
this);
85
}
86
87
void RootAutoDetector::join() { pthread_join(WorkerThread, nullptr); }
88
89
void RootAutoDetector::sample() {
90
// tracking reentry in case we want to re-explore fast stack unwind - which
91
// does potentially re-enter the runtime because it calls the instrumented
92
// allocator because of pthread_attr_getstack. See the notes also on
93
// UnwindImpl above.
94
static thread_local bool Entered = false;
95
static thread_local uint64_t Entries = 0;
96
if (Entered || (++Entries % SampleRate))
97
return;
98
Entered = true;
99
collectStack();
100
Entered = false;
101
}
102
103
void RootAutoDetector::collectStack() {
104
GET_CALLER_PC_BP;
105
BufferedStackTrace CurrentStack;
106
CurrentStack.Unwind(pc, bp, /*context=*/nullptr, /*request_fast=*/false);
107
// 2 stack frames would be very unlikely to mean anything, since at least the
108
// compiler-rt frame - which can't be inlined - should be observable, which
109
// counts as 1; we can be even more aggressive with this number.
110
if (CurrentStack.size <= 2)
111
return;
112
static thread_local PerThreadSamples *ThisThreadSamples =
113
new (__sanitizer::InternalAlloc(sizeof(PerThreadSamples)))
114
PerThreadSamples(*this);
115
116
if (!ThisThreadSamples->M.TryLock())
117
return;
118
119
ThisThreadSamples->TrieRoot.insertStack(CurrentStack);
120
ThisThreadSamples->M.Unlock();
121
}
122
123
uptr PerThreadCallsiteTrie::getFctStartAddr(uptr CallsiteAddress) const {
124
// this requires --linkopt=-Wl,--export-dynamic
125
Dl_info Info;
126
if (dladdr(reinterpret_cast<const void *>(CallsiteAddress), &Info) != 0)
127
return reinterpret_cast<uptr>(Info.dli_saddr);
128
return 0;
129
}
130
131
void PerThreadCallsiteTrie::insertStack(const StackTrace &ST) {
132
++TheTrie.Count;
133
auto *Current = &TheTrie;
134
// the stack is backwards - the first callsite is at the top.
135
for (int I = ST.size - 1; I >= 0; --I) {
136
uptr ChildAddr = ST.trace[I];
137
auto [Iter, _] = Current->Children.insert({ChildAddr, Trie(ChildAddr)});
138
++Iter->second.Count;
139
Current = &Iter->second;
140
}
141
}
142
143
DenseMap<uptr, uint64_t> PerThreadCallsiteTrie::determineRoots() const {
144
// Assuming a message pump design, roots are those functions called by the
145
// message pump. The message pump is an infinite loop (for all practical
146
// considerations) fetching data from a queue. The root functions return -
147
// otherwise the message pump doesn't work. This function detects roots as the
148
// first place in the trie (starting from the root) where a function calls 2
149
// or more functions.
150
//
151
// We start with a callsite trie - the nodes are callsites. Different child
152
// nodes may actually correspond to the same function.
153
//
154
// For example: using function(callsite)
155
// f1(csf1_1) -> f2(csf2_1) -> f3
156
// -> f2(csf2_2) -> f4
157
//
158
// would be represented in our trie as:
159
// csf1_1 -> csf2_1 -> f3
160
// -> csf2_2 -> f4
161
//
162
// While we can assert the control flow returns to f2, we don't know if it
163
// ever returns to f1. f2 could be the message pump.
164
//
165
// We need to convert our callsite tree into a function tree. We can also,
166
// more economically, just see how many distinct functions there are at a
167
// certain depth. When that count is greater than 1, we got to potential roots
168
// and everything above should be considered as non-roots.
169
DenseMap<uptr, uint64_t> Result;
170
Set<const Trie *> Worklist;
171
Worklist.insert({&TheTrie, {}});
172
173
while (!Worklist.empty()) {
174
Set<const Trie *> NextWorklist;
175
DenseMap<uptr, uint64_t> Candidates;
176
Worklist.forEach([&](const auto &KVP) {
177
auto [Node, _] = KVP;
178
auto SA = getFctStartAddr(Node->CallsiteAddress);
179
Candidates[SA] += Node->Count;
180
Node->Children.forEach([&](auto &ChildKVP) {
181
NextWorklist.insert({&ChildKVP.second, true});
182
return true;
183
});
184
return true;
185
});
186
if (Candidates.size() > 1) {
187
Result.swap(Candidates);
188
break;
189
}
190
Worklist.swap(NextWorklist);
191
}
192
return Result;
193
}
194
195