Path: blob/main/contrib/llvm-project/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
//===- CtxInstrProfiling.cpp - contextual instrumented PGO ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "CtxInstrProfiling.h"
#include "sanitizer_common/sanitizer_allocator_internal.h"
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_dense_map.h"
#include "sanitizer_common/sanitizer_libc.h"
#include "sanitizer_common/sanitizer_mutex.h"
#include "sanitizer_common/sanitizer_placement_new.h"
#include "sanitizer_common/sanitizer_thread_safety.h"
#include "sanitizer_common/sanitizer_vector.h"

#include <assert.h>

using namespace __ctx_profile;

namespace {
// Keep track of all the context roots we actually saw, so we can then traverse
// them when the user asks for the profile in __llvm_ctx_profile_fetch
__sanitizer::SpinMutex AllContextsMutex;
SANITIZER_GUARDED_BY(AllContextsMutex)
__sanitizer::Vector<ContextRoot *> AllContextRoots;

// utility to taint a pointer by setting the LSB. There is an assumption
// throughout that the addresses of contexts are even (really, they should be
// align(8), but "even"-ness is the minimum assumption)
// "scratch contexts" are buffers that we return in certain cases - they are
// large enough to allow for memory safe counter access, but they don't link
// subcontexts below them (the runtime recognizes them and enforces that)
ContextNode *markAsScratch(const ContextNode *Ctx) {
  return reinterpret_cast<ContextNode *>(reinterpret_cast<uint64_t>(Ctx) | 1);
}

// Used when getting the data from TLS. We don't *really* need to reset, but
// it's a simpler system if we do.
template <typename T> inline T consume(T &V) {
  auto R = V;
  V = {0};
  return R;
}

// We allocate at least kBuffSize Arena pages. The scratch buffer is also that
// large.
constexpr size_t kPower = 20;
constexpr size_t kBuffSize = 1 << kPower;

// Highly unlikely we need more than kBuffSize for a context.
size_t getArenaAllocSize(size_t Needed) {
  if (Needed >= kBuffSize)
    return 2 * Needed;
  return kBuffSize;
}

// verify the structural integrity of the context
bool validate(const ContextRoot *Root) {
  // all contexts should be laid out in some arena page. Go over each arena
  // allocated for this Root, and jump over contained contexts based on
  // self-reported sizes.
  __sanitizer::DenseMap<uint64_t, bool> ContextStartAddrs;
  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
    const auto *Pos = Mem->start();
    while (Pos < Mem->pos()) {
      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
      if (!ContextStartAddrs.insert({reinterpret_cast<uint64_t>(Ctx), true})
               .second)
        return false;
      Pos += Ctx->size();
    }
  }

  // Now traverse the contexts again the same way, but validate all non-null
  // subcontext addresses appear in the set computed above.
  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
    const auto *Pos = Mem->start();
    while (Pos < Mem->pos()) {
      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
      for (uint32_t I = 0; I < Ctx->callsites_size(); ++I)
        for (auto *Sub = Ctx->subContexts()[I]; Sub; Sub = Sub->next())
          if (!ContextStartAddrs.find(reinterpret_cast<uint64_t>(Sub)))
            return false;

      Pos += Ctx->size();
    }
  }
  return true;
}

inline ContextNode *allocContextNode(char *Place, GUID Guid,
                                     uint32_t NrCounters, uint32_t NrCallsites,
                                     ContextNode *Next = nullptr) {
  assert(reinterpret_cast<uint64_t>(Place) % ExpectedAlignment == 0);
  return new (Place) ContextNode(Guid, NrCounters, NrCallsites, Next);
}

void resetContextNode(ContextNode &Node) {
  // FIXME(mtrofin): this is std::memset, which we can probably use if we
  // drop/reduce the dependency on sanitizer_common.
  for (uint32_t I = 0; I < Node.counters_size(); ++I)
    Node.counters()[I] = 0;
  for (uint32_t I = 0; I < Node.callsites_size(); ++I)
    for (auto *Next = Node.subContexts()[I]; Next; Next = Next->next())
      resetContextNode(*Next);
}

void onContextEnter(ContextNode &Node) { ++Node.counters()[0]; }

} // namespace

// the scratch buffer - what we give when we can't produce a real context (the
// scratch isn't "real" in that it's expected to be clobbered carelessly - we
// don't read it). The other important thing is that the callees from a scratch
// context also get a scratch context.
// Eventually this can be replaced with per-function buffers, a la the typical
// (flat) instrumented FDO buffers. The clobbering aspect won't apply there,
// but the part about determining the nature of the subcontexts does.
__thread char __Buffer[kBuffSize] = {0};

#define TheScratchContext                                                      \
  markAsScratch(reinterpret_cast<ContextNode *>(__Buffer))

// init the TLSes
__thread void *volatile __llvm_ctx_profile_expected_callee[2] = {nullptr,
                                                                 nullptr};
__thread ContextNode **volatile __llvm_ctx_profile_callsite[2] = {0, 0};

__thread ContextRoot *volatile __llvm_ctx_profile_current_context_root =
    nullptr;

Arena::Arena(uint32_t Size) : Size(Size) {
  __sanitizer::internal_memset(start(), 0, Size);
}

// FIXME(mtrofin): use malloc / mmap instead of sanitizer common APIs to reduce
// the dependency on the latter.
Arena *Arena::allocateNewArena(size_t Size, Arena *Prev) {
  assert(!Prev || Prev->Next == nullptr);
  Arena *NewArena = new (__sanitizer::InternalAlloc(
      Size + sizeof(Arena), /*cache=*/nullptr, /*alignment=*/ExpectedAlignment))
      Arena(Size);
  if (Prev)
    Prev->Next = NewArena;
  return NewArena;
}

void Arena::freeArenaList(Arena *&A) {
  assert(A);
  for (auto *I = A; I != nullptr;) {
    auto *Current = I;
    I = I->Next;
    __sanitizer::InternalFree(Current);
  }
  A = nullptr;
}

// If this is the first time we hit a callsite with this (Guid) particular
// callee, we need to allocate.
ContextNode *getCallsiteSlow(GUID Guid, ContextNode **InsertionPoint,
                             uint32_t NrCounters, uint32_t NrCallsites) {
  auto AllocSize = ContextNode::getAllocSize(NrCounters, NrCallsites);
  auto *Mem = __llvm_ctx_profile_current_context_root->CurrentMem;
  char *AllocPlace = Mem->tryBumpAllocate(AllocSize);
  if (!AllocPlace) {
    // if we failed to allocate on the current arena, allocate a new arena,
    // and place it on __llvm_ctx_profile_current_context_root->CurrentMem so
    // we find it from now on for other cases when we need to getCallsiteSlow.
    // Note that allocateNewArena will link the allocated memory in the list of
    // Arenas.
    __llvm_ctx_profile_current_context_root->CurrentMem = Mem =
        Mem->allocateNewArena(getArenaAllocSize(AllocSize), Mem);
    AllocPlace = Mem->tryBumpAllocate(AllocSize);
  }
  auto *Ret = allocContextNode(AllocPlace, Guid, NrCounters, NrCallsites,
                               *InsertionPoint);
  *InsertionPoint = Ret;
  return Ret;
}

ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
                                            uint32_t NrCounters,
                                            uint32_t NrCallsites) {
  // fast "out" if we're not even doing contextual collection.
  if (!__llvm_ctx_profile_current_context_root)
    return TheScratchContext;

  // also fast "out" if the caller is scratch. We can see if it's scratch by
  // looking at the interior pointer into the subcontexts vector that the
  // caller provided, which, if the context is scratch, so is that interior
  // pointer (because all the address calculations are using even values. Or
  // more precisely, aligned-8 values)
  auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
  if (!CallsiteContext || isScratch(CallsiteContext))
    return TheScratchContext;

  // if the callee isn't the expected one, return scratch.
  // Signal handler(s) could have been invoked at any point in the execution.
  // Should that have happened, and had it (the handler) been built with
  // instrumentation, its __llvm_ctx_profile_get_context would have failed
  // here. Its sub call graph would have then populated
  // __llvm_ctx_profile_{expected_callee | callsite} at index 1.
  // The normal call graph may be impacted in that, if the signal handler
  // happened somewhere before we read the TLS here, we'd see the TLS reset and
  // we'd also fail here. That would just mean we would lose counter values for
  // the normal subgraph, this time around. That should be very unlikely, but
  // if it happens too frequently, we should be able to detect discrepancies in
  // entry counts (caller-callee). At the moment, the design goes on the
  // assumption that this is so infrequent, though, that it's not worth doing
  // more for that case.
  auto *ExpectedCallee = consume(__llvm_ctx_profile_expected_callee[0]);
  if (ExpectedCallee != Callee)
    return TheScratchContext;

  auto *Callsite = *CallsiteContext;
  // in the case of indirect calls, we will have all seen targets forming a
  // linked list here. Find the one corresponding to this callee.
  while (Callsite && Callsite->guid() != Guid) {
    Callsite = Callsite->next();
  }
  auto *Ret = Callsite ? Callsite
                       : getCallsiteSlow(Guid, CallsiteContext, NrCounters,
                                         NrCallsites);
  if (Ret->callsites_size() != NrCallsites ||
      Ret->counters_size() != NrCounters)
    __sanitizer::Printf("[ctxprof] Returned ctx differs from what's asked: "
                        "Context: %p, Asked: %lu %u %u, Got: %lu %u %u \n",
                        reinterpret_cast<void *>(Ret), Guid, NrCallsites,
                        NrCounters, Ret->guid(), Ret->callsites_size(),
                        Ret->counters_size());
  onContextEnter(*Ret);
  return Ret;
}

// This should be called once for a Root. Allocate the first arena, set up the
// first context.
void setupContext(ContextRoot *Root, GUID Guid, uint32_t NrCounters,
                  uint32_t NrCallsites) {
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  // Re-check - we got here without having taken a lock.
  if (Root->FirstMemBlock)
    return;
  const auto Needed = ContextNode::getAllocSize(NrCounters, NrCallsites);
  auto *M = Arena::allocateNewArena(getArenaAllocSize(Needed));
  Root->FirstMemBlock = M;
  Root->CurrentMem = M;
  Root->FirstNode = allocContextNode(M->tryBumpAllocate(Needed), Guid,
                                     NrCounters, NrCallsites);
  AllContextRoots.PushBack(Root);
}

ContextNode *__llvm_ctx_profile_start_context(
    ContextRoot *Root, GUID Guid, uint32_t Counters,
    uint32_t Callsites) SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
  if (!Root->FirstMemBlock) {
    setupContext(Root, Guid, Counters, Callsites);
  }
  if (Root->Taken.TryLock()) {
    __llvm_ctx_profile_current_context_root = Root;
    onContextEnter(*Root->FirstNode);
    return Root->FirstNode;
  }
  // If this thread couldn't take the lock, return scratch context.
  __llvm_ctx_profile_current_context_root = nullptr;
  return TheScratchContext;
}

void __llvm_ctx_profile_release_context(ContextRoot *Root)
    SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
  if (__llvm_ctx_profile_current_context_root) {
    __llvm_ctx_profile_current_context_root = nullptr;
    Root->Taken.Unlock();
  }
}

void __llvm_ctx_profile_start_collection() {
  size_t NrMemUnits = 0;
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  for (uint32_t I = 0; I < AllContextRoots.Size(); ++I) {
    auto *Root = AllContextRoots[I];
    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> Lock(
        &Root->Taken);
    for (auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next())
      ++NrMemUnits;

    resetContextNode(*Root->FirstNode);
  }
  __sanitizer::Printf("[ctxprof] Initial NrMemUnits: %zu \n", NrMemUnits);
}

bool __llvm_ctx_profile_fetch(void *Data,
                              bool (*Writer)(void *W, const ContextNode &)) {
  assert(Writer);
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);

  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I) {
    auto *Root = AllContextRoots[I];
    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> TakenLock(
        &Root->Taken);
    if (!validate(Root)) {
      __sanitizer::Printf("[ctxprof] Contextual Profile is %s\n", "invalid");
      return false;
    }
    if (!Writer(Data, *Root->FirstNode))
      return false;
  }
  return true;
}

void __llvm_ctx_profile_free() {
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I)
    for (auto *A = AllContextRoots[I]->FirstMemBlock; A;) {
      auto *C = A;
      A = A->next();
      __sanitizer::InternalFree(C);
    }
  AllContextRoots.Reset();
}
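
For orientation, the sketch below approximates, in hand-written C++, the calls the ctx-profile instrumentation pass is expected to emit around a root function and one of its callsites, pieced together from the comments in __llvm_ctx_profile_start_context and __llvm_ctx_profile_get_context above. It is not part of the file: the function names, GUIDs, and counter/callsite counts are invented for illustration, and the scratch-context (tainted-pointer) path is deliberately glossed over.

// Illustrative only - hand-written approximation of compiler-emitted
// instrumentation; names and numbers below are made up for the example.
#include "CtxInstrProfiling.h"

using namespace __ctx_profile;

static ContextRoot ExampleRoot; // the pass emits one such global per root

void ExampleCallee(); // hypothetical callee; pretend its GUID is 0xBEEF

void ExampleRootFunction() { // hypothetical root; pretend its GUID is 0xFEED
  // Counter/callsite counts would be derived from the function's IR.
  ContextNode *Ctx = __llvm_ctx_profile_start_context(
      &ExampleRoot, /*Guid=*/0xFEED, /*Counters=*/2, /*Callsites=*/1);

  // Before the call, publish the expected callee and the address of this
  // callsite's slot in the subcontexts array; the callee's get_context call
  // consumes both TLS slots to find (or lazily create) its node under this
  // root. (If Ctx is scratch, the computed slot address is scratch too, which
  // the runtime detects via the tainted LSB.)
  __llvm_ctx_profile_expected_callee[0] =
      reinterpret_cast<void *>(&ExampleCallee);
  __llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[0];
  ExampleCallee();

  __llvm_ctx_profile_release_context(&ExampleRoot);
}

void ExampleCallee() {
  ContextNode *Ctx = __llvm_ctx_profile_get_context(
      reinterpret_cast<void *>(&ExampleCallee), /*Guid=*/0xBEEF,
      /*NrCounters=*/1, /*NrCallsites=*/0);
  (void)Ctx; // counters()[0] (the entry count) was already bumped on entry
}

The TLS handoff (expected_callee / callsite) is what lets the runtime cheaply detect mismatches, such as interleaved signal handlers or uninstrumented callers, and fall back to the scratch context, per the comments in __llvm_ctx_profile_get_context.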