//===-- xray_function_call_trie.h ------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file is a part of XRay, a dynamic runtime instrumentation system.
//
// This file defines the interface for a function call trie.
//
//===----------------------------------------------------------------------===//
#ifndef XRAY_FUNCTION_CALL_TRIE_H
#define XRAY_FUNCTION_CALL_TRIE_H

#include "xray_buffer_queue.h"
#include "xray_defs.h"
#include "xray_profiling_flags.h"
#include "xray_segmented_array.h"
#include <limits>
#include <memory> // For placement new.
#include <utility>

namespace __xray {

/// A FunctionCallTrie represents the stack traces of XRay instrumented
/// functions that we've encountered, where a node corresponds to a function
/// and the path from the root to the node represents its stack trace. Each
/// node in the trie will contain some useful values, including:
///
///   * The cumulative amount of time spent in this particular node/stack.
///   * The number of times this stack has appeared.
///   * A histogram of latencies for that particular node.
///
/// Each node in the trie will also contain a list of callees, represented
/// using an Array<NodeIdPair> -- each NodeIdPair instance will contain the
/// function ID of the callee, and a pointer to the node.
///
/// If we visualise this data structure, we'll find the following potential
/// representation:
///
///   [function id node] -> [callees] [cumulative time]
///                         [call counter] [latency histogram]
///
/// As an example, when we have a function in this pseudocode:
///
///   func f(N) {
///     g()
///     h()
///     for i := 1..N { j() }
///   }
///
/// We may end up with a trie of the following form:
///
///   f -> [ g, h, j ] [...] [1] [...]
///   g -> [ ... ] [...] [1] [...]
///   h -> [ ... ] [...] [1] [...]
///   j -> [ ... ] [...] [N] [...]
///
/// If for instance the function g() called j() like so:
///
///   func g() {
///     for i := 1..10 { j() }
///   }
///
/// We'll find the following updated trie:
///
///   f -> [ g, h, j ] [...] [1] [...]
///   g -> [ j' ] [...] [1] [...]
///   h -> [ ... ] [...] [1] [...]
///   j -> [ ... ] [...] [N] [...]
///   j' -> [ ... ] [...] [10] [...]
///
/// Note that we'll have a new node representing the path `f -> g -> j'` with
/// isolated data. This isolation gives us a means of representing the stack
/// traces as a path, as opposed to a key in a table. The alternative
/// implementation here would be to use a separate table for the path, and use
/// hashes of the path as an identifier to accumulate the information. We've
/// moved away from this approach as it takes a lot of time to compute the
/// hash every time we need to update a function's call information while
/// handling the entry and exit events.
///
/// This approach allows us to maintain a shadow stack, which represents the
/// currently executing path, and on function exits quickly compute the amount
/// of time elapsed from the entry, then update the counters for the node
/// already represented in the trie. This necessitates an efficient
/// representation of the various data structures (the list of callees must be
/// cache-aware and efficient to look up, and the histogram must be compact
/// and quick to update) to enable us to keep the overheads of this
/// implementation to the minimum.
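///
/// A minimal usage sketch (illustrative only; the function IDs, TSC and CPU
/// values below are arbitrary and would normally come from the XRay runtime):
///
///   auto A = FunctionCallTrie::InitAllocators();
///   FunctionCallTrie Trie(A);
///   Trie.enterFunction(/*FId=*/1, /*TSC=*/100, /*CPU=*/0);
///   Trie.enterFunction(/*FId=*/2, /*TSC=*/120, /*CPU=*/0);
///   Trie.exitFunction(/*FId=*/2, /*TSC=*/160, /*CPU=*/0);
///   Trie.exitFunction(/*FId=*/1, /*TSC=*/200, /*CPU=*/0);
///   for (const auto Root : Trie.getRoots()) {
///     // Root->FId == 1, Root->CallCount == 1, and Root->Callees holds the
///     // node for function 2.
///   }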
class FunctionCallTrie {
public:
  struct Node;

  // We use a NodeIdPair type instead of a std::pair<...> to not rely on the
  // standard library types in this header.
  struct NodeIdPair {
    Node *NodePtr;
    int32_t FId;
  };

  using NodeIdPairArray = Array<NodeIdPair>;
  using NodeIdPairAllocatorType = NodeIdPairArray::AllocatorType;

  // A Node in the FunctionCallTrie gives us a list of callees, the cumulative
  // number of times this node actually appeared, the cumulative amount of
  // time for this particular node including its children call times, and just
  // the local time spent on this node. Each Node will have the ID of the XRay
  // instrumented function that it is associated to.
  struct Node {
    Node *Parent;
    NodeIdPairArray Callees;
    uint64_t CallCount;
    uint64_t CumulativeLocalTime; // Typically in TSC deltas, not wall-time.
    int32_t FId;

    // TODO: Include the compact histogram.
  };

private:
  struct ShadowStackEntry {
    uint64_t EntryTSC;
    Node *NodePtr;
    uint16_t EntryCPU;
  };

  using NodeArray = Array<Node>;
  using RootArray = Array<Node *>;
  using ShadowStackArray = Array<ShadowStackEntry>;

public:
  // We collate the allocators we need into a single struct, as a convenience
  // to allow us to initialize these as a group.
  struct Allocators {
    using NodeAllocatorType = NodeArray::AllocatorType;
    using RootAllocatorType = RootArray::AllocatorType;
    using ShadowStackAllocatorType = ShadowStackArray::AllocatorType;

    // Use hosted aligned storage members to allow for trivial move and init.
    // This also allows us to sidestep the potential-failing allocation issue.
    alignas(NodeAllocatorType) std::byte
        NodeAllocatorStorage[sizeof(NodeAllocatorType)];
    alignas(RootAllocatorType) std::byte
        RootAllocatorStorage[sizeof(RootAllocatorType)];
    alignas(ShadowStackAllocatorType) std::byte
        ShadowStackAllocatorStorage[sizeof(ShadowStackAllocatorType)];
    alignas(NodeIdPairAllocatorType) std::byte
        NodeIdPairAllocatorStorage[sizeof(NodeIdPairAllocatorType)];

    NodeAllocatorType *NodeAllocator = nullptr;
    RootAllocatorType *RootAllocator = nullptr;
    ShadowStackAllocatorType *ShadowStackAllocator = nullptr;
    NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr;

    Allocators() = default;
    Allocators(const Allocators &) = delete;
    Allocators &operator=(const Allocators &) = delete;

    struct Buffers {
      BufferQueue::Buffer NodeBuffer;
      BufferQueue::Buffer RootsBuffer;
      BufferQueue::Buffer ShadowStackBuffer;
      BufferQueue::Buffer NodeIdPairBuffer;
    };
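
    // The two explicit constructors below build the allocators in place: each
    // placement-new constructs an allocator object in its aligned storage
    // member (from either a provided buffer or a maximum element count), and
    // the corresponding raw pointer above is then pointed at that storage.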
    explicit Allocators(Buffers &B) XRAY_NEVER_INSTRUMENT {
      new (&NodeAllocatorStorage)
          NodeAllocatorType(B.NodeBuffer.Data, B.NodeBuffer.Size);
      NodeAllocator =
          reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage);

      new (&RootAllocatorStorage)
          RootAllocatorType(B.RootsBuffer.Data, B.RootsBuffer.Size);
      RootAllocator =
          reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage);

      new (&ShadowStackAllocatorStorage) ShadowStackAllocatorType(
          B.ShadowStackBuffer.Data, B.ShadowStackBuffer.Size);
      ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>(
          &ShadowStackAllocatorStorage);

      new (&NodeIdPairAllocatorStorage) NodeIdPairAllocatorType(
          B.NodeIdPairBuffer.Data, B.NodeIdPairBuffer.Size);
      NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>(
          &NodeIdPairAllocatorStorage);
    }

    explicit Allocators(uptr Max) XRAY_NEVER_INSTRUMENT {
      new (&NodeAllocatorStorage) NodeAllocatorType(Max);
      NodeAllocator =
          reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage);

      new (&RootAllocatorStorage) RootAllocatorType(Max);
      RootAllocator =
          reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage);

      new (&ShadowStackAllocatorStorage) ShadowStackAllocatorType(Max);
      ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>(
          &ShadowStackAllocatorStorage);

      new (&NodeIdPairAllocatorStorage) NodeIdPairAllocatorType(Max);
      NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>(
          &NodeIdPairAllocatorStorage);
    }

    Allocators(Allocators &&O) XRAY_NEVER_INSTRUMENT {
      // Here we rely on the safety of memcpy'ing contents of the storage
      // members, and then pointing the source pointers to nullptr.
      internal_memcpy(&NodeAllocatorStorage, &O.NodeAllocatorStorage,
                      sizeof(NodeAllocatorType));
      internal_memcpy(&RootAllocatorStorage, &O.RootAllocatorStorage,
                      sizeof(RootAllocatorType));
      internal_memcpy(&ShadowStackAllocatorStorage,
                      &O.ShadowStackAllocatorStorage,
                      sizeof(ShadowStackAllocatorType));
      internal_memcpy(&NodeIdPairAllocatorStorage,
                      &O.NodeIdPairAllocatorStorage,
                      sizeof(NodeIdPairAllocatorType));

      NodeAllocator =
          reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage);
      RootAllocator =
          reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage);
      ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>(
          &ShadowStackAllocatorStorage);
      NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>(
          &NodeIdPairAllocatorStorage);

      O.NodeAllocator = nullptr;
      O.RootAllocator = nullptr;
      O.ShadowStackAllocator = nullptr;
      O.NodeIdPairAllocator = nullptr;
    }

    Allocators &operator=(Allocators &&O) XRAY_NEVER_INSTRUMENT {
      // When moving into an existing instance, we ensure that we clean up the
      // current allocators.
      if (NodeAllocator)
        NodeAllocator->~NodeAllocatorType();
      if (O.NodeAllocator) {
        new (&NodeAllocatorStorage)
            NodeAllocatorType(std::move(*O.NodeAllocator));
        NodeAllocator =
            reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage);
        O.NodeAllocator = nullptr;
      } else {
        NodeAllocator = nullptr;
      }

      if (RootAllocator)
        RootAllocator->~RootAllocatorType();
      if (O.RootAllocator) {
        new (&RootAllocatorStorage)
            RootAllocatorType(std::move(*O.RootAllocator));
        RootAllocator =
            reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage);
        O.RootAllocator = nullptr;
      } else {
        RootAllocator = nullptr;
      }

      if (ShadowStackAllocator)
        ShadowStackAllocator->~ShadowStackAllocatorType();
      if (O.ShadowStackAllocator) {
        new (&ShadowStackAllocatorStorage)
            ShadowStackAllocatorType(std::move(*O.ShadowStackAllocator));
        ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>(
            &ShadowStackAllocatorStorage);
        O.ShadowStackAllocator = nullptr;
      } else {
        ShadowStackAllocator = nullptr;
      }

      if (NodeIdPairAllocator)
        NodeIdPairAllocator->~NodeIdPairAllocatorType();
      if (O.NodeIdPairAllocator) {
        new (&NodeIdPairAllocatorStorage)
            NodeIdPairAllocatorType(std::move(*O.NodeIdPairAllocator));
        NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>(
            &NodeIdPairAllocatorStorage);
        O.NodeIdPairAllocator = nullptr;
      } else {
        NodeIdPairAllocator = nullptr;
      }

      return *this;
    }

    ~Allocators() XRAY_NEVER_INSTRUMENT {
      if (NodeAllocator != nullptr)
        NodeAllocator->~NodeAllocatorType();
      if (RootAllocator != nullptr)
        RootAllocator->~RootAllocatorType();
      if (ShadowStackAllocator != nullptr)
        ShadowStackAllocator->~ShadowStackAllocatorType();
      if (NodeIdPairAllocator != nullptr)
        NodeIdPairAllocator->~NodeIdPairAllocatorType();
    }
  };

  static Allocators InitAllocators() XRAY_NEVER_INSTRUMENT {
    return InitAllocatorsCustom(profilingFlags()->per_thread_allocator_max);
  }

  static Allocators InitAllocatorsCustom(uptr Max) XRAY_NEVER_INSTRUMENT {
    Allocators A(Max);
    return A;
  }

  static Allocators
  InitAllocatorsFromBuffers(Allocators::Buffers &Bufs) XRAY_NEVER_INSTRUMENT {
    Allocators A(Bufs);
    return A;
  }
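
  // An illustrative sketch of buffer-backed initialisation (the buffers are
  // assumed to have been obtained elsewhere, e.g. from a BufferQueue):
  //
  //   FunctionCallTrie::Allocators::Buffers Bufs;
  //   Bufs.NodeBuffer = ...;        // each member is a BufferQueue::Buffer
  //   Bufs.RootsBuffer = ...;
  //   Bufs.ShadowStackBuffer = ...;
  //   Bufs.NodeIdPairBuffer = ...;
  //   auto A = FunctionCallTrie::InitAllocatorsFromBuffers(Bufs);
  //   FunctionCallTrie T(A);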

private:
  NodeArray Nodes;
  RootArray Roots;
  ShadowStackArray ShadowStack;
  NodeIdPairAllocatorType *NodeIdPairAllocator;
  uint32_t OverflowedFunctions;

public:
  explicit FunctionCallTrie(const Allocators &A) XRAY_NEVER_INSTRUMENT
      : Nodes(*A.NodeAllocator),
        Roots(*A.RootAllocator),
        ShadowStack(*A.ShadowStackAllocator),
        NodeIdPairAllocator(A.NodeIdPairAllocator),
        OverflowedFunctions(0) {}

  FunctionCallTrie() = delete;
  FunctionCallTrie(const FunctionCallTrie &) = delete;
  FunctionCallTrie &operator=(const FunctionCallTrie &) = delete;

  FunctionCallTrie(FunctionCallTrie &&O) XRAY_NEVER_INSTRUMENT
      : Nodes(std::move(O.Nodes)),
        Roots(std::move(O.Roots)),
        ShadowStack(std::move(O.ShadowStack)),
        NodeIdPairAllocator(O.NodeIdPairAllocator),
        OverflowedFunctions(O.OverflowedFunctions) {}

  FunctionCallTrie &operator=(FunctionCallTrie &&O) XRAY_NEVER_INSTRUMENT {
    Nodes = std::move(O.Nodes);
    Roots = std::move(O.Roots);
    ShadowStack = std::move(O.ShadowStack);
    NodeIdPairAllocator = O.NodeIdPairAllocator;
    OverflowedFunctions = O.OverflowedFunctions;
    return *this;
  }

  ~FunctionCallTrie() XRAY_NEVER_INSTRUMENT {}

  void enterFunction(const int32_t FId, uint64_t TSC,
                     uint16_t CPU) XRAY_NEVER_INSTRUMENT {
    DCHECK_NE(FId, 0);

    // If we've already overflowed the function call stack, do not bother
    // attempting to record any more function entries.
    if (UNLIKELY(OverflowedFunctions)) {
      ++OverflowedFunctions;
      return;
    }

    // If this is the first function we've encountered, we want to set up the
    // node(s) and treat it as a root.
    if (UNLIKELY(ShadowStack.empty())) {
      auto *NewRoot = Nodes.AppendEmplace(
          nullptr, NodeIdPairArray(*NodeIdPairAllocator), 0u, 0u, FId);
      if (UNLIKELY(NewRoot == nullptr))
        return;
      if (Roots.AppendEmplace(NewRoot) == nullptr) {
        Nodes.trim(1);
        return;
      }
      if (ShadowStack.AppendEmplace(TSC, NewRoot, CPU) == nullptr) {
        Nodes.trim(1);
        Roots.trim(1);
        ++OverflowedFunctions;
        return;
      }
      return;
    }

    // From this point on, we require that the stack is not empty.
    DCHECK(!ShadowStack.empty());
    auto TopNode = ShadowStack.back().NodePtr;
    DCHECK_NE(TopNode, nullptr);

    // If we've seen this callee before, then we access that node and place
    // that on the top of the stack.
    auto *Callee = TopNode->Callees.find_element(
        [FId](const NodeIdPair &NR) { return NR.FId == FId; });
    if (Callee != nullptr) {
      CHECK_NE(Callee->NodePtr, nullptr);
      if (ShadowStack.AppendEmplace(TSC, Callee->NodePtr, CPU) == nullptr)
        ++OverflowedFunctions;
      return;
    }

    // This means we've never seen this stack before, create a new node here.
    auto *NewNode = Nodes.AppendEmplace(
        TopNode, NodeIdPairArray(*NodeIdPairAllocator), 0u, 0u, FId);
    if (UNLIKELY(NewNode == nullptr))
      return;
    DCHECK_NE(NewNode, nullptr);
    TopNode->Callees.AppendEmplace(NewNode, FId);
    if (ShadowStack.AppendEmplace(TSC, NewNode, CPU) == nullptr)
      ++OverflowedFunctions;
    return;
  }

  void exitFunction(int32_t FId, uint64_t TSC,
                    uint16_t CPU) XRAY_NEVER_INSTRUMENT {
    // If we're exiting functions that have "overflowed" or don't fit into the
    // stack due to allocator constraints, we then decrement that count first.
    if (OverflowedFunctions) {
      --OverflowedFunctions;
      return;
    }

    // When we exit a function, we look up the ShadowStack to see whether we've
    // entered this function before. We do as little processing here as we can,
    // since most of the hard work would have already been done at function
    // entry.
    uint64_t CumulativeTreeTime = 0;

    while (!ShadowStack.empty()) {
      const auto &Top = ShadowStack.back();
      auto TopNode = Top.NodePtr;
      DCHECK_NE(TopNode, nullptr);

      // We may encounter overflow on the TSC we're provided, which may end up
      // being less than the TSC when we first entered the function.
      //
      // To get the accurate measurement of cycles, we need to check whether
      // we've overflowed (TSC < Top.EntryTSC) and then account the difference
      // between the entry TSC and the max for the TSC counter (max of
      // uint64_t) then add the value of TSC. We can prove that the maximum
      // delta we will get is at most the 64-bit unsigned value, since the
      // difference between a TSC of 0 and a Top.EntryTSC of 1 is
      // (numeric_limits<uint64_t>::max() - 1) + 1.
      //
      // NOTE: This assumes that TSCs are synchronised across CPUs.
      // TODO: Count the number of times we've seen CPU migrations.
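      //
      // For example (illustrative numbers only): with Top.EntryTSC ==
      // max - 10 and a post-wraparound TSC == 5, the expression below
      // computes (max - (max - 10)) + 5 == 15 elapsed ticks.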
      uint64_t LocalTime =
          Top.EntryTSC > TSC
              ? (std::numeric_limits<uint64_t>::max() - Top.EntryTSC) + TSC
              : TSC - Top.EntryTSC;
      TopNode->CallCount++;
      TopNode->CumulativeLocalTime += LocalTime - CumulativeTreeTime;
      CumulativeTreeTime += LocalTime;
      ShadowStack.trim(1);

      // TODO: Update the histogram for the node.
      if (TopNode->FId == FId)
        break;
    }
  }

  const RootArray &getRoots() const XRAY_NEVER_INSTRUMENT { return Roots; }

  // The deepCopyInto operation will update the provided FunctionCallTrie by
  // re-creating the contents of this particular FunctionCallTrie in the other
  // FunctionCallTrie. It will do this using a Depth First Traversal from the
  // roots, and while doing so recreating the traversal in the provided
  // FunctionCallTrie.
  //
  // This operation will *not* destroy the state in `O`, and thus may cause
  // some duplicate entries in `O` if it is not empty.
  //
  // This function is *not* thread-safe, and may require external
  // synchronisation of both "this" and |O|.
  //
  // This function must *not* be called with a non-empty FunctionCallTrie |O|.
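  //
  // An illustrative sketch (`Source` stands for any existing trie):
  //
  //   auto A2 = FunctionCallTrie::InitAllocators();
  //   FunctionCallTrie Copy(A2);
  //   Source.deepCopyInto(Copy); // Copy now mirrors Source's nodes.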
  void deepCopyInto(FunctionCallTrie &O) const XRAY_NEVER_INSTRUMENT {
    DCHECK(O.getRoots().empty());

    // We then push the root into a stack, to use as the parent marker for new
    // nodes we push in as we're traversing depth-first down the call tree.
    struct NodeAndParent {
      FunctionCallTrie::Node *Node;
      FunctionCallTrie::Node *NewNode;
    };
    using Stack = Array<NodeAndParent>;

    typename Stack::AllocatorType StackAllocator(
        profilingFlags()->stack_allocator_max);
    Stack DFSStack(StackAllocator);

    for (const auto Root : getRoots()) {
      // Add a node in O for this root.
      auto NewRoot = O.Nodes.AppendEmplace(
          nullptr, NodeIdPairArray(*O.NodeIdPairAllocator), Root->CallCount,
          Root->CumulativeLocalTime, Root->FId);

      // Because we cannot allocate more memory we should bail out right away.
      if (UNLIKELY(NewRoot == nullptr))
        return;

      if (UNLIKELY(O.Roots.Append(NewRoot) == nullptr))
        return;

      // TODO: Figure out what to do if we fail to allocate any more stack
      // space. Maybe warn or report once?
      if (DFSStack.AppendEmplace(Root, NewRoot) == nullptr)
        return;
      while (!DFSStack.empty()) {
        NodeAndParent NP = DFSStack.back();
        DCHECK_NE(NP.Node, nullptr);
        DCHECK_NE(NP.NewNode, nullptr);
        DFSStack.trim(1);
        for (const auto Callee : NP.Node->Callees) {
          auto NewNode = O.Nodes.AppendEmplace(
              NP.NewNode, NodeIdPairArray(*O.NodeIdPairAllocator),
              Callee.NodePtr->CallCount, Callee.NodePtr->CumulativeLocalTime,
              Callee.FId);
          if (UNLIKELY(NewNode == nullptr))
            return;
          if (UNLIKELY(NP.NewNode->Callees.AppendEmplace(NewNode, Callee.FId) ==
                       nullptr))
            return;
          if (UNLIKELY(DFSStack.AppendEmplace(Callee.NodePtr, NewNode) ==
                       nullptr))
            return;
        }
      }
    }
  }

  // The mergeInto operation will update the provided FunctionCallTrie by
  // traversing the current trie's roots and updating (i.e. merging) the data
  // in the nodes with the data in the target's nodes. If the node doesn't
  // exist in the provided trie, we add a new one in the right position, and
  // inherit the data from the original (current) trie, along with all its
  // callees.
  //
  // This function is *not* thread-safe, and may require external
  // synchronisation of both "this" and |O|.
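  //
  // An illustrative sketch (`ThreadTrie` and `GlobalTrie` stand for any two
  // tries, e.g. a per-thread trie being folded into an aggregate):
  //
  //   ThreadTrie.mergeInto(GlobalTrie);
  //   // Matching paths have their CallCount and CumulativeLocalTime summed;
  //   // paths missing from GlobalTrie are created first.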
  void mergeInto(FunctionCallTrie &O) const XRAY_NEVER_INSTRUMENT {
    struct NodeAndTarget {
      FunctionCallTrie::Node *OrigNode;
      FunctionCallTrie::Node *TargetNode;
    };
    using Stack = Array<NodeAndTarget>;
    typename Stack::AllocatorType StackAllocator(
        profilingFlags()->stack_allocator_max);
    Stack DFSStack(StackAllocator);

    for (const auto Root : getRoots()) {
      Node *TargetRoot = nullptr;
      auto R = O.Roots.find_element(
          [&](const Node *Node) { return Node->FId == Root->FId; });
      if (R == nullptr) {
        TargetRoot = O.Nodes.AppendEmplace(
            nullptr, NodeIdPairArray(*O.NodeIdPairAllocator), 0u, 0u,
            Root->FId);
        if (UNLIKELY(TargetRoot == nullptr))
          return;

        O.Roots.Append(TargetRoot);
      } else {
        TargetRoot = *R;
      }

      DFSStack.AppendEmplace(Root, TargetRoot);
      while (!DFSStack.empty()) {
        NodeAndTarget NT = DFSStack.back();
        DCHECK_NE(NT.OrigNode, nullptr);
        DCHECK_NE(NT.TargetNode, nullptr);
        DFSStack.trim(1);
        // TODO: Update the histogram as well when we have it ready.
        NT.TargetNode->CallCount += NT.OrigNode->CallCount;
        NT.TargetNode->CumulativeLocalTime += NT.OrigNode->CumulativeLocalTime;
        for (const auto Callee : NT.OrigNode->Callees) {
          auto TargetCallee = NT.TargetNode->Callees.find_element(
              [&](const FunctionCallTrie::NodeIdPair &C) {
                return C.FId == Callee.FId;
              });
          if (TargetCallee == nullptr) {
            auto NewTargetNode = O.Nodes.AppendEmplace(
                NT.TargetNode, NodeIdPairArray(*O.NodeIdPairAllocator), 0u, 0u,
                Callee.FId);

            if (UNLIKELY(NewTargetNode == nullptr))
              return;

            TargetCallee =
                NT.TargetNode->Callees.AppendEmplace(NewTargetNode, Callee.FId);
          }
          DFSStack.AppendEmplace(Callee.NodePtr, TargetCallee->NodePtr);
        }
      }
    }
  }
};

} // namespace __xray

#endif // XRAY_FUNCTION_CALL_TRIE_H