Path: blob/main/contrib/llvm-project/compiler-rt/lib/xray/xray_profile_collector.cpp
//===-- xray_profile_collector.cpp -----------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file is a part of XRay, a dynamic runtime instrumentation system.
//
// This implements the interface for the profileCollectorService.
//
//===----------------------------------------------------------------------===//
#include "xray_profile_collector.h"
#include "sanitizer_common/sanitizer_common.h"
#include "xray_allocator.h"
#include "xray_defs.h"
#include "xray_profiling_flags.h"
#include "xray_segmented_array.h"
#include <memory>
#include <pthread.h>
#include <utility>

namespace __xray {
namespace profileCollectorService {

namespace {

SpinMutex GlobalMutex;
struct ThreadTrie {
  tid_t TId;
  alignas(FunctionCallTrie) std::byte TrieStorage[sizeof(FunctionCallTrie)];
};

struct ProfileBuffer {
  void *Data;
  size_t Size;
};

// Current version of the profile format.
constexpr u64 XRayProfilingVersion = 0x20180424;

// Identifier for XRay profiling files 'xrayprof' in hex.
constexpr u64 XRayMagicBytes = 0x7872617970726f66;

struct XRayProfilingFileHeader {
  const u64 MagicBytes = XRayMagicBytes;
  const u64 Version = XRayProfilingVersion;
  u64 Timestamp = 0; // System time in nanoseconds.
  u64 PID = 0;       // Process ID.
};

struct BlockHeader {
  u32 BlockSize;
  u32 BlockNum;
  u64 ThreadId;
};

struct ThreadData {
  BufferQueue *BQ;
  FunctionCallTrie::Allocators::Buffers Buffers;
  FunctionCallTrie::Allocators Allocators;
  FunctionCallTrie FCT;
  tid_t TId;
};

using ThreadDataArray = Array<ThreadData>;
using ThreadDataAllocator = ThreadDataArray::AllocatorType;

// We use a separate buffer queue for the backing store for the allocator used
// by the ThreadData array. This lets us host the buffers, allocators, and tries
// associated with a thread by moving the data into the array instead of
// attempting to copy the data to a separately backed set of tries.
alignas(BufferQueue) static std::byte BufferQueueStorage[sizeof(BufferQueue)];
static BufferQueue *BQ = nullptr;
static BufferQueue::Buffer Buffer;
alignas(ThreadDataAllocator) static std::byte
    ThreadDataAllocatorStorage[sizeof(ThreadDataAllocator)];
alignas(ThreadDataArray) static std::byte
    ThreadDataArrayStorage[sizeof(ThreadDataArray)];

static ThreadDataAllocator *TDAllocator = nullptr;
static ThreadDataArray *TDArray = nullptr;

using ProfileBufferArray = Array<ProfileBuffer>;
using ProfileBufferArrayAllocator = typename ProfileBufferArray::AllocatorType;

// These need to be global aligned storage to avoid dynamic initialization. We
// need these to be aligned to allow us to placement new objects into the
// storage, and have pointers to those objects be appropriately aligned.
alignas(ProfileBufferArray) static std::byte
    ProfileBuffersStorage[sizeof(ProfileBufferArray)];
alignas(ProfileBufferArrayAllocator) static std::byte
    ProfileBufferArrayAllocatorStorage[sizeof(ProfileBufferArrayAllocator)];

static ProfileBufferArrayAllocator *ProfileBuffersAllocator = nullptr;
static ProfileBufferArray *ProfileBuffers = nullptr;
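
// To make the pattern above concrete: a minimal sketch of how this file pairs
// static aligned storage with placement-new (T here is a hypothetical type
// standing in for BufferQueue, ThreadDataArray, etc.; see reset() below for
// the real call sites):
//
//   alignas(T) static std::byte Storage[sizeof(T)];
//   static T *Ptr = nullptr;
//
//   new (&Storage) T(/* args */);   // construct in-place; no dynamic
//                                   // initializer runs at load time
//   Ptr = reinterpret_cast<T *>(&Storage);
//   ...
//   Ptr->~T();                      // destroy explicitly when tearing down
//   Ptr = nullptr;                  // the storage may be placement-new'ed
//                                   // into again later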

// Use a global flag to determine whether the collector implementation has been
// initialized.
static atomic_uint8_t CollectorInitialized{0};

} // namespace

void post(BufferQueue *Q, FunctionCallTrie &&T,
          FunctionCallTrie::Allocators &&A,
          FunctionCallTrie::Allocators::Buffers &&B,
          tid_t TId) XRAY_NEVER_INSTRUMENT {
  DCHECK_NE(Q, nullptr);

  // Bail out early if the collector has not been initialized.
  if (!atomic_load(&CollectorInitialized, memory_order_acquire)) {
    T.~FunctionCallTrie();
    A.~Allocators();
    Q->releaseBuffer(B.NodeBuffer);
    Q->releaseBuffer(B.RootsBuffer);
    Q->releaseBuffer(B.ShadowStackBuffer);
    Q->releaseBuffer(B.NodeIdPairBuffer);
    B.~Buffers();
    return;
  }

  {
    SpinMutexLock Lock(&GlobalMutex);
    DCHECK_NE(TDAllocator, nullptr);
    DCHECK_NE(TDArray, nullptr);

    if (TDArray->AppendEmplace(Q, std::move(B), std::move(A), std::move(T),
                               TId) == nullptr) {
      // If we fail to add the data to the array, we should destroy the objects
      // handed us.
      T.~FunctionCallTrie();
      A.~Allocators();
      Q->releaseBuffer(B.NodeBuffer);
      Q->releaseBuffer(B.RootsBuffer);
      Q->releaseBuffer(B.ShadowStackBuffer);
      Q->releaseBuffer(B.NodeIdPairBuffer);
      B.~Buffers();
    }
  }
}
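
// For illustration only: a hypothetical caller (e.g. the profiling-mode
// runtime's thread-exit path) would hand over ownership of its trie,
// allocators, and buffers by moving them in; TLD is a made-up name for that
// caller's thread-local state:
//
//   profileCollectorService::post(TLD.BQ, std::move(*TLD.FCT),
//                                 std::move(*TLD.Allocators),
//                                 std::move(*TLD.Buffers), GetTid());
//
// On the failure paths above, post() destroys the moved-in objects and
// returns the buffers to Q itself, so the caller must not touch them again.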

// A PathArray represents the function id's representing a stack trace. In this
// context a path is almost always represented from the leaf function in a call
// stack to a root of the call trie.
using PathArray = Array<int32_t>;

struct ProfileRecord {
  using PathAllocator = typename PathArray::AllocatorType;

  // The Path in this record is the function id's from the leaf to the root of
  // the function call stack as represented from a FunctionCallTrie.
  PathArray Path;
  const FunctionCallTrie::Node *Node;
};

namespace {

using ProfileRecordArray = Array<ProfileRecord>;

// Walk a depth-first traversal of each root of the FunctionCallTrie to
// generate the path(s) and the data associated with the path.
static void
populateRecords(ProfileRecordArray &PRs, ProfileRecord::PathAllocator &PA,
                const FunctionCallTrie &Trie) XRAY_NEVER_INSTRUMENT {
  using StackArray = Array<const FunctionCallTrie::Node *>;
  using StackAllocator = typename StackArray::AllocatorType;
  StackAllocator StackAlloc(profilingFlags()->stack_allocator_max);
  StackArray DFSStack(StackAlloc);
  for (const auto *R : Trie.getRoots()) {
    DFSStack.Append(R);
    while (!DFSStack.empty()) {
      auto *Node = DFSStack.back();
      DFSStack.trim(1);
      if (Node == nullptr)
        continue;
      auto Record = PRs.AppendEmplace(PathArray{PA}, Node);
      if (Record == nullptr)
        return;
      DCHECK_NE(Record, nullptr);

      // Traverse the Node's parents and as we're doing so, get the FIds in
      // the order they appear.
      for (auto N = Node; N != nullptr; N = N->Parent)
        Record->Path.Append(N->FId);
      DCHECK(!Record->Path.empty());

      for (const auto C : Node->Callees)
        DFSStack.Append(C.NodePtr);
    }
  }
}

static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header,
                             const ProfileRecordArray &ProfileRecords)
    XRAY_NEVER_INSTRUMENT {
  auto NextPtr = static_cast<uint8_t *>(
                     internal_memcpy(Buffer->Data, &Header, sizeof(Header))) +
                 sizeof(Header);
  for (const auto &Record : ProfileRecords) {
    // List of IDs follow:
    for (const auto FId : Record.Path)
      NextPtr =
          static_cast<uint8_t *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) +
          sizeof(FId);

    // Add the sentinel here.
    constexpr int32_t SentinelFId = 0;
    NextPtr = static_cast<uint8_t *>(
                  internal_memset(NextPtr, SentinelFId, sizeof(SentinelFId))) +
              sizeof(SentinelFId);

    // Add the node data here.
    NextPtr =
        static_cast<uint8_t *>(internal_memcpy(
            NextPtr, &Record.Node->CallCount, sizeof(Record.Node->CallCount))) +
        sizeof(Record.Node->CallCount);
    NextPtr = static_cast<uint8_t *>(
                  internal_memcpy(NextPtr, &Record.Node->CumulativeLocalTime,
                                  sizeof(Record.Node->CumulativeLocalTime))) +
              sizeof(Record.Node->CumulativeLocalTime);
  }

  DCHECK_EQ(NextPtr - static_cast<uint8_t *>(Buffer->Data), Buffer->Size);
}

} // namespace
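
// To make the wire format concrete, a hypothetical block for one thread with
// a single record whose path is {leaf = 3, root = 1} (values made up; all
// fields are written with native endianness via internal_memcpy above):
//
//   BlockHeader   u32 BlockSize = 44       // 16-byte header + 28-byte record
//                 u32 BlockNum  = 0        // first block in this round
//                 u64 ThreadId  = 1234
//   Record        i32 FId       = 3        // leaf function id first...
//                 i32 FId       = 1        // ...up to the root of the trie
//                 i32 Sentinel  = 0        // 0 terminates the path
//                 u64 CallCount = ...
//                 u64 CumulativeLocalTime = ...
//
// The record contributes 20 + 4 * Path.size() = 28 bytes here, matching the
// CumulativeSizes computation in serialize() below.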

void serialize() XRAY_NEVER_INSTRUMENT {
  if (!atomic_load(&CollectorInitialized, memory_order_acquire))
    return;

  SpinMutexLock Lock(&GlobalMutex);

  // Clear out the global ProfileBuffers, if it's not empty.
  for (auto &B : *ProfileBuffers)
    deallocateBuffer(reinterpret_cast<unsigned char *>(B.Data), B.Size);
  ProfileBuffers->trim(ProfileBuffers->size());

  DCHECK_NE(TDArray, nullptr);
  if (TDArray->empty())
    return;

  // Then repopulate the global ProfileBuffers.
  u32 I = 0;
  auto MaxSize = profilingFlags()->global_allocator_max;
  auto ProfileArena = allocateBuffer(MaxSize);
  if (ProfileArena == nullptr)
    return;

  auto ProfileArenaCleanup = at_scope_exit(
      [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(ProfileArena, MaxSize); });

  auto PathArena = allocateBuffer(profilingFlags()->global_allocator_max);
  if (PathArena == nullptr)
    return;

  auto PathArenaCleanup = at_scope_exit(
      [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(PathArena, MaxSize); });

  for (const auto &ThreadTrie : *TDArray) {
    using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
    ProfileRecordAllocator PRAlloc(ProfileArena,
                                   profilingFlags()->global_allocator_max);
    ProfileRecord::PathAllocator PathAlloc(
        PathArena, profilingFlags()->global_allocator_max);
    ProfileRecordArray ProfileRecords(PRAlloc);

    // First, we want to compute the amount of space we're going to need. We'll
    // use a local allocator and an __xray::Array<...> to store the intermediary
    // data, then compute the size as we're going along. Then we'll allocate the
    // contiguous space to contain the thread buffer data.
    if (ThreadTrie.FCT.getRoots().empty())
      continue;

    populateRecords(ProfileRecords, PathAlloc, ThreadTrie.FCT);
    DCHECK(!ThreadTrie.FCT.getRoots().empty());
    DCHECK(!ProfileRecords.empty());

    // Go through each record, to compute the sizes.
    //
    // header size = block size (4 bytes)
    //   + block number (4 bytes)
    //   + thread id (8 bytes)
    // record size = path ids (4 bytes * number of ids + sentinel 4 bytes)
    //   + call count (8 bytes)
    //   + local time (8 bytes)
    u32 CumulativeSizes = 0;
    for (const auto &Record : ProfileRecords)
      CumulativeSizes += 20 + (4 * Record.Path.size());

    BlockHeader Header{16 + CumulativeSizes, I++, ThreadTrie.TId};
    auto B = ProfileBuffers->Append({});
    B->Size = sizeof(Header) + CumulativeSizes;
    B->Data = allocateBuffer(B->Size);
    DCHECK_NE(B->Data, nullptr);
    serializeRecords(B, Header, ProfileRecords);
  }
}
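
// reset() doubles as the collector's initializer: it tears down whatever the
// previous cycle left behind (serialized buffers, per-thread tries and their
// backing buffers), re-initializes the global BufferQueue, and placement-news
// fresh allocator/array instances into the static storage above. Only once
// all of that succeeds is CollectorInitialized set, which post() and
// serialize() check before touching any of these globals.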

void reset() XRAY_NEVER_INSTRUMENT {
  atomic_store(&CollectorInitialized, 0, memory_order_release);
  SpinMutexLock Lock(&GlobalMutex);

  if (ProfileBuffers != nullptr) {
    // Clear out the profile buffers that have been serialized.
    for (auto &B : *ProfileBuffers)
      deallocateBuffer(reinterpret_cast<uint8_t *>(B.Data), B.Size);
    ProfileBuffers->trim(ProfileBuffers->size());
    ProfileBuffers = nullptr;
  }

  if (TDArray != nullptr) {
    // Release the resources as required.
    for (auto &TD : *TDArray) {
      TD.BQ->releaseBuffer(TD.Buffers.NodeBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.RootsBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.ShadowStackBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.NodeIdPairBuffer);
    }
    // We don't bother destroying the array here because we've already
    // potentially freed the backing store for the array. Instead we're going
    // to reset the pointer to nullptr, and re-use the storage later
    // (placement-new'ing into the storage as-is).
    TDArray = nullptr;
  }

  if (TDAllocator != nullptr) {
    TDAllocator->~Allocator();
    TDAllocator = nullptr;
  }

  if (Buffer.Data != nullptr) {
    BQ->releaseBuffer(Buffer);
  }

  if (BQ == nullptr) {
    bool Success = false;
    new (&BufferQueueStorage)
        BufferQueue(profilingFlags()->global_allocator_max, 1, Success);
    if (!Success)
      return;
    BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage);
  } else {
    BQ->finalize();

    if (BQ->init(profilingFlags()->global_allocator_max, 1) !=
        BufferQueue::ErrorCode::Ok)
      return;
  }

  if (BQ->getBuffer(Buffer) != BufferQueue::ErrorCode::Ok)
    return;

  new (&ProfileBufferArrayAllocatorStorage)
      ProfileBufferArrayAllocator(profilingFlags()->global_allocator_max);
  ProfileBuffersAllocator = reinterpret_cast<ProfileBufferArrayAllocator *>(
      &ProfileBufferArrayAllocatorStorage);

  new (&ProfileBuffersStorage) ProfileBufferArray(*ProfileBuffersAllocator);
  ProfileBuffers =
      reinterpret_cast<ProfileBufferArray *>(&ProfileBuffersStorage);

  new (&ThreadDataAllocatorStorage)
      ThreadDataAllocator(Buffer.Data, Buffer.Size);
  TDAllocator =
      reinterpret_cast<ThreadDataAllocator *>(&ThreadDataAllocatorStorage);
  new (&ThreadDataArrayStorage) ThreadDataArray(*TDAllocator);
  TDArray = reinterpret_cast<ThreadDataArray *>(&ThreadDataArrayStorage);

  atomic_store(&CollectorInitialized, 1, memory_order_release);
}

XRayBuffer nextBuffer(XRayBuffer B) XRAY_NEVER_INSTRUMENT {
  SpinMutexLock Lock(&GlobalMutex);

  if (ProfileBuffers == nullptr || ProfileBuffers->size() == 0)
    return {nullptr, 0};

  static pthread_once_t Once = PTHREAD_ONCE_INIT;
  alignas(XRayProfilingFileHeader) static std::byte
      FileHeaderStorage[sizeof(XRayProfilingFileHeader)];
  pthread_once(
      &Once, +[]() XRAY_NEVER_INSTRUMENT {
        new (&FileHeaderStorage) XRayProfilingFileHeader{};
      });

  if (UNLIKELY(B.Data == nullptr)) {
    // The first buffer should always contain the file header information.
    auto &FileHeader =
        *reinterpret_cast<XRayProfilingFileHeader *>(&FileHeaderStorage);
    FileHeader.Timestamp = NanoTime();
    FileHeader.PID = internal_getpid();
    return {&FileHeaderStorage, sizeof(XRayProfilingFileHeader)};
  }

  if (UNLIKELY(B.Data == &FileHeaderStorage))
    return {(*ProfileBuffers)[0].Data, (*ProfileBuffers)[0].Size};

  BlockHeader Header;
  internal_memcpy(&Header, B.Data, sizeof(BlockHeader));
  auto NextBlock = Header.BlockNum + 1;
  if (NextBlock < ProfileBuffers->size())
    return {(*ProfileBuffers)[NextBlock].Data,
            (*ProfileBuffers)[NextBlock].Size};
  return {nullptr, 0};
}

} // namespace profileCollectorService
} // namespace __xray
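
// For reference, a minimal sketch of draining the collector (in the real
// runtime this is driven through the XRay log-flushing machinery rather than
// called directly):
//
//   using namespace __xray;
//   profileCollectorService::serialize();
//   for (auto B = profileCollectorService::nextBuffer({nullptr, 0});
//        B.Data != nullptr; B = profileCollectorService::nextBuffer(B)) {
//     // The first yield is the XRayProfilingFileHeader; each subsequent
//     // yield is one per-thread block starting with a BlockHeader.
//   }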