Path: blob/main/contrib/llvm-project/compiler-rt/lib/ctx_profile/RootAutoDetector.cpp
213766 views
//===- RootAutodetector.cpp - detect contextual profiling roots -----------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//78#include "RootAutoDetector.h"910#include "CtxInstrProfiling.h"11#include "sanitizer_common/sanitizer_common.h"12#include "sanitizer_common/sanitizer_placement_new.h" // IWYU pragma: keep (DenseMap)13#include <assert.h>14#include <dlfcn.h>15#include <pthread.h>1617using namespace __ctx_profile;18template <typename T> using Set = DenseMap<T, bool>;1920namespace __sanitizer {21void BufferedStackTrace::UnwindImpl(uptr pc, uptr bp, void *context,22bool request_fast, u32 max_depth) {23// We can't implement the fast variant. The fast variant ends up invoking an24// external allocator, because of pthread_attr_getstack. If this happens25// during an allocation of the program being instrumented, a non-reentrant26// lock may be taken (this was observed). The allocator called by27// pthread_attr_getstack will also try to take that lock.28UnwindSlow(pc, max_depth);29}30} // namespace __sanitizer3132RootAutoDetector::PerThreadSamples::PerThreadSamples(RootAutoDetector &Parent) {33GenericScopedLock<SpinMutex> L(&Parent.AllSamplesMutex);34Parent.AllSamples.PushBack(this);35}3637void RootAutoDetector::start() {38atomic_store_relaxed(&Self, reinterpret_cast<uintptr_t>(this));39pthread_create(40&WorkerThread, nullptr,41+[](void *Ctx) -> void * {42RootAutoDetector *RAD = reinterpret_cast<RootAutoDetector *>(Ctx);43SleepForSeconds(RAD->WaitSeconds);44// To avoid holding the AllSamplesMutex, make a snapshot of all the45// thread samples collected so far46Vector<PerThreadSamples *> SamplesSnapshot;47{48GenericScopedLock<SpinMutex> M(&RAD->AllSamplesMutex);49SamplesSnapshot.Resize(RAD->AllSamples.Size());50for (uptr I = 0; I < RAD->AllSamples.Size(); ++I)51SamplesSnapshot[I] = RAD->AllSamples[I];52}53DenseMap<uptr, uint64_t> AllRoots;54for (uptr I = 0; I < SamplesSnapshot.Size(); ++I) {55GenericScopedLock<SpinMutex>(&SamplesSnapshot[I]->M);56SamplesSnapshot[I]->TrieRoot.determineRoots().forEach([&](auto &KVP) {57auto [FAddr, Count] = KVP;58AllRoots[FAddr] += Count;59return true;60});61}62// FIXME: as a next step, establish a minimum relative nr of samples63// per root that would qualify it as a root.64for (auto *FD = reinterpret_cast<FunctionData *>(65atomic_load_relaxed(&RAD->FunctionDataListHead));66FD; FD = FD->Next) {67if (AllRoots.contains(reinterpret_cast<uptr>(FD->EntryAddress))) {68if (canBeRoot(FD->CtxRoot)) {69FD->getOrAllocateContextRoot();70} else {71// FIXME: address this by informing the root detection algorithm72// to skip over such functions and pick the next down in the73// stack. At that point, this becomes an assert.74Printf("[ctxprof] Root auto-detector selected a musttail "75"function for root (%p). Ignoring\n",76FD->EntryAddress);77}78}79}80atomic_store_relaxed(&RAD->Self, 0);81return nullptr;82},83this);84}8586void RootAutoDetector::join() { pthread_join(WorkerThread, nullptr); }8788void RootAutoDetector::sample() {89// tracking reentry in case we want to re-explore fast stack unwind - which90// does potentially re-enter the runtime because it calls the instrumented91// allocator because of pthread_attr_getstack. See the notes also on92// UnwindImpl above.93static thread_local bool Entered = false;94static thread_local uint64_t Entries = 0;95if (Entered || (++Entries % SampleRate))96return;97Entered = true;98collectStack();99Entered = false;100}101102void RootAutoDetector::collectStack() {103GET_CALLER_PC_BP;104BufferedStackTrace CurrentStack;105CurrentStack.Unwind(pc, bp, /*context=*/nullptr, /*request_fast=*/false);106// 2 stack frames would be very unlikely to mean anything, since at least the107// compiler-rt frame - which can't be inlined - should be observable, which108// counts as 1; we can be even more aggressive with this number.109if (CurrentStack.size <= 2)110return;111static thread_local PerThreadSamples *ThisThreadSamples =112new (__sanitizer::InternalAlloc(sizeof(PerThreadSamples)))113PerThreadSamples(*this);114115if (!ThisThreadSamples->M.TryLock())116return;117118ThisThreadSamples->TrieRoot.insertStack(CurrentStack);119ThisThreadSamples->M.Unlock();120}121122uptr PerThreadCallsiteTrie::getFctStartAddr(uptr CallsiteAddress) const {123// this requires --linkopt=-Wl,--export-dynamic124Dl_info Info;125if (dladdr(reinterpret_cast<const void *>(CallsiteAddress), &Info) != 0)126return reinterpret_cast<uptr>(Info.dli_saddr);127return 0;128}129130void PerThreadCallsiteTrie::insertStack(const StackTrace &ST) {131++TheTrie.Count;132auto *Current = &TheTrie;133// the stack is backwards - the first callsite is at the top.134for (int I = ST.size - 1; I >= 0; --I) {135uptr ChildAddr = ST.trace[I];136auto [Iter, _] = Current->Children.insert({ChildAddr, Trie(ChildAddr)});137++Iter->second.Count;138Current = &Iter->second;139}140}141142DenseMap<uptr, uint64_t> PerThreadCallsiteTrie::determineRoots() const {143// Assuming a message pump design, roots are those functions called by the144// message pump. The message pump is an infinite loop (for all practical145// considerations) fetching data from a queue. The root functions return -146// otherwise the message pump doesn't work. This function detects roots as the147// first place in the trie (starting from the root) where a function calls 2148// or more functions.149//150// We start with a callsite trie - the nodes are callsites. Different child151// nodes may actually correspond to the same function.152//153// For example: using function(callsite)154// f1(csf1_1) -> f2(csf2_1) -> f3155// -> f2(csf2_2) -> f4156//157// would be represented in our trie as:158// csf1_1 -> csf2_1 -> f3159// -> csf2_2 -> f4160//161// While we can assert the control flow returns to f2, we don't know if it162// ever returns to f1. f2 could be the message pump.163//164// We need to convert our callsite tree into a function tree. We can also,165// more economically, just see how many distinct functions there are at a166// certain depth. When that count is greater than 1, we got to potential roots167// and everything above should be considered as non-roots.168DenseMap<uptr, uint64_t> Result;169Set<const Trie *> Worklist;170Worklist.insert({&TheTrie, {}});171172while (!Worklist.empty()) {173Set<const Trie *> NextWorklist;174DenseMap<uptr, uint64_t> Candidates;175Worklist.forEach([&](const auto &KVP) {176auto [Node, _] = KVP;177auto SA = getFctStartAddr(Node->CallsiteAddress);178Candidates[SA] += Node->Count;179Node->Children.forEach([&](auto &ChildKVP) {180NextWorklist.insert({&ChildKVP.second, true});181return true;182});183return true;184});185if (Candidates.size() > 1) {186Result.swap(Candidates);187break;188}189Worklist.swap(NextWorklist);190}191return Result;192}193194195