Path: blob/main/contrib/llvm-project/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
//===- CtxInstrProfiling.cpp - contextual instrumented PGO ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "CtxInstrProfiling.h"
#include "sanitizer_common/sanitizer_allocator_internal.h"
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_dense_map.h"
#include "sanitizer_common/sanitizer_libc.h"
#include "sanitizer_common/sanitizer_mutex.h"
#include "sanitizer_common/sanitizer_placement_new.h"
#include "sanitizer_common/sanitizer_thread_safety.h"
#include "sanitizer_common/sanitizer_vector.h"

#include <assert.h>

using namespace __ctx_profile;

namespace {
// Keep track of all the context roots we actually saw, so we can then traverse
// them when the user asks for the profile in __llvm_ctx_profile_fetch
__sanitizer::SpinMutex AllContextsMutex;
SANITIZER_GUARDED_BY(AllContextsMutex)
__sanitizer::Vector<ContextRoot *> AllContextRoots;

// utility to taint a pointer by setting the LSB. There is an assumption
// throughout that the addresses of contexts are even (really, they should be
// align(8), but "even"-ness is the minimum assumption)
// "scratch contexts" are buffers that we return in certain cases - they are
// large enough to allow for memory safe counter access, but they don't link
// subcontexts below them (the runtime recognizes them and enforces that)
ContextNode *markAsScratch(const ContextNode *Ctx) {
  return reinterpret_cast<ContextNode *>(reinterpret_cast<uint64_t>(Ctx) | 1);
}

// Used when getting the data from TLS. We don't *really* need to reset, but
// it's a simpler system if we do.
template <typename T> inline T consume(T &V) {
  auto R = V;
  V = {0};
  return R;
}

// We allocate at least kBuffSize Arena pages. The scratch buffer is also that
// large.
constexpr size_t kPower = 20;
constexpr size_t kBuffSize = 1 << kPower;

// Highly unlikely we need more than kBuffSize for a context.
size_t getArenaAllocSize(size_t Needed) {
  if (Needed >= kBuffSize)
    return 2 * Needed;
  return kBuffSize;
}

// verify the structural integrity of the context
bool validate(const ContextRoot *Root) {
  // all contexts should be laid out in some arena page. Go over each arena
  // allocated for this Root, and jump over contained contexts based on
  // self-reported sizes.
  __sanitizer::DenseMap<uint64_t, bool> ContextStartAddrs;
  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
    const auto *Pos = Mem->start();
    while (Pos < Mem->pos()) {
      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
      if (!ContextStartAddrs.insert({reinterpret_cast<uint64_t>(Ctx), true})
               .second)
        return false;
      Pos += Ctx->size();
    }
  }

  // Now traverse the contexts again the same way, but validate all non-null
  // subcontext addresses appear in the set computed above.
  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
    const auto *Pos = Mem->start();
    while (Pos < Mem->pos()) {
      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
      for (uint32_t I = 0; I < Ctx->callsites_size(); ++I)
        for (auto *Sub = Ctx->subContexts()[I]; Sub; Sub = Sub->next())
          if (!ContextStartAddrs.find(reinterpret_cast<uint64_t>(Sub)))
            return false;

      Pos += Ctx->size();
    }
  }
  return true;
}

inline ContextNode *allocContextNode(char *Place, GUID Guid,
                                     uint32_t NrCounters, uint32_t NrCallsites,
                                     ContextNode *Next = nullptr) {
  assert(reinterpret_cast<uint64_t>(Place) % ExpectedAlignment == 0);
  return new (Place) ContextNode(Guid, NrCounters, NrCallsites, Next);
}

void resetContextNode(ContextNode &Node) {
  // FIXME(mtrofin): this is std::memset, which we can probably use if we
  // drop/reduce the dependency on sanitizer_common.
  for (uint32_t I = 0; I < Node.counters_size(); ++I)
    Node.counters()[I] = 0;
  for (uint32_t I = 0; I < Node.callsites_size(); ++I)
    for (auto *Next = Node.subContexts()[I]; Next; Next = Next->next())
      resetContextNode(*Next);
}

void onContextEnter(ContextNode &Node) { ++Node.counters()[0]; }

} // namespace

// the scratch buffer - what we give when we can't produce a real context (the
// scratch isn't "real" in that it's expected to be clobbered carelessly - we
// don't read it). The other important thing is that the callees from a scratch
// context also get a scratch context.
// Eventually this can be replaced with per-function buffers, a la the typical
// (flat) instrumented FDO buffers. The clobbering aspect won't apply there,
// but the part about determining the nature of the subcontexts does.
__thread char __Buffer[kBuffSize] = {0};

#define TheScratchContext                                                      \
  markAsScratch(reinterpret_cast<ContextNode *>(__Buffer))

// init the TLSes
__thread void *volatile __llvm_ctx_profile_expected_callee[2] = {nullptr,
                                                                 nullptr};
__thread ContextNode **volatile __llvm_ctx_profile_callsite[2] = {0, 0};

__thread ContextRoot *volatile __llvm_ctx_profile_current_context_root =
    nullptr;

Arena::Arena(uint32_t Size) : Size(Size) {
  __sanitizer::internal_memset(start(), 0, Size);
}

// FIXME(mtrofin): use malloc / mmap instead of sanitizer common APIs to reduce
// the dependency on the latter.
Arena *Arena::allocateNewArena(size_t Size, Arena *Prev) {
  assert(!Prev || Prev->Next == nullptr);
  Arena *NewArena = new (__sanitizer::InternalAlloc(
      Size + sizeof(Arena), /*cache=*/nullptr, /*alignment=*/ExpectedAlignment))
      Arena(Size);
  if (Prev)
    Prev->Next = NewArena;
  return NewArena;
}

void Arena::freeArenaList(Arena *&A) {
  assert(A);
  for (auto *I = A; I != nullptr;) {
    auto *Current = I;
    I = I->Next;
    __sanitizer::InternalFree(Current);
  }
  A = nullptr;
}

// If this is the first time we hit a callsite with this (Guid) particular
// callee, we need to allocate.
ContextNode *getCallsiteSlow(GUID Guid, ContextNode **InsertionPoint,
                             uint32_t NrCounters, uint32_t NrCallsites) {
  auto AllocSize = ContextNode::getAllocSize(NrCounters, NrCallsites);
  auto *Mem = __llvm_ctx_profile_current_context_root->CurrentMem;
  char *AllocPlace = Mem->tryBumpAllocate(AllocSize);
  if (!AllocPlace) {
    // if we failed to allocate on the current arena, allocate a new arena,
    // and place it on __llvm_ctx_profile_current_context_root->CurrentMem so
    // we find it from now on for other cases when we need to getCallsiteSlow.
    // Note that allocateNewArena will link the allocated memory in the list of
    // Arenas.
    __llvm_ctx_profile_current_context_root->CurrentMem = Mem =
        Mem->allocateNewArena(getArenaAllocSize(AllocSize), Mem);
    AllocPlace = Mem->tryBumpAllocate(AllocSize);
  }
  auto *Ret = allocContextNode(AllocPlace, Guid, NrCounters, NrCallsites,
                               *InsertionPoint);
  *InsertionPoint = Ret;
  return Ret;
}

ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
                                            uint32_t NrCounters,
                                            uint32_t NrCallsites) {
  // fast "out" if we're not even doing contextual collection.
  if (!__llvm_ctx_profile_current_context_root)
    return TheScratchContext;

  // also fast "out" if the caller is scratch. We can see if it's scratch by
  // looking at the interior pointer into the subcontexts vector that the
  // caller provided, which, if the context is scratch, so is that interior
  // pointer (because all the address calculations are using even values. Or
  // more precisely, aligned-8 values)
  auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
  if (!CallsiteContext || isScratch(CallsiteContext))
    return TheScratchContext;

  // if the callee isn't the expected one, return scratch.
  // Signal handler(s) could have been invoked at any point in the execution.
  // Should that have happened, and had it (the handler) been built with
  // instrumentation, its __llvm_ctx_profile_get_context would have failed
  // here. Its sub call graph would have then populated
  // __llvm_ctx_profile_{expected_callee | callsite} at index 1.
  // The normal call graph may be impacted in that, if the signal handler
  // happened somewhere before we read the TLS here, we'd see the TLS reset and
  // we'd also fail here. That would just mean we would lose counter values for
  // the normal subgraph, this time around. That should be very unlikely, but
  // if it happens too frequently, we should be able to detect discrepancies in
  // entry counts (caller-callee). At the moment, the design goes on the
  // assumption that this is so infrequent, though, that it's not worth doing
  // more for that case.
  auto *ExpectedCallee = consume(__llvm_ctx_profile_expected_callee[0]);
  if (ExpectedCallee != Callee)
    return TheScratchContext;

  auto *Callsite = *CallsiteContext;
  // in the case of indirect calls, we will have all seen targets forming a
  // linked list here. Find the one corresponding to this callee.
  while (Callsite && Callsite->guid() != Guid) {
    Callsite = Callsite->next();
  }
  auto *Ret = Callsite ? Callsite
                       : getCallsiteSlow(Guid, CallsiteContext, NrCounters,
                                         NrCallsites);
  if (Ret->callsites_size() != NrCallsites ||
      Ret->counters_size() != NrCounters)
    __sanitizer::Printf("[ctxprof] Returned ctx differs from what's asked: "
                        "Context: %p, Asked: %lu %u %u, Got: %lu %u %u \n",
                        reinterpret_cast<void *>(Ret), Guid, NrCallsites,
                        NrCounters, Ret->guid(), Ret->callsites_size(),
                        Ret->counters_size());
  onContextEnter(*Ret);
  return Ret;
}

// This should be called once for a Root. Allocate the first arena, set up the
// first context.
void setupContext(ContextRoot *Root, GUID Guid, uint32_t NrCounters,
                  uint32_t NrCallsites) {
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  // Re-check - we got here without having taken a lock.
  if (Root->FirstMemBlock)
    return;
  const auto Needed = ContextNode::getAllocSize(NrCounters, NrCallsites);
  auto *M = Arena::allocateNewArena(getArenaAllocSize(Needed));
  Root->FirstMemBlock = M;
  Root->CurrentMem = M;
  Root->FirstNode = allocContextNode(M->tryBumpAllocate(Needed), Guid,
                                     NrCounters, NrCallsites);
  AllContextRoots.PushBack(Root);
}

ContextNode *__llvm_ctx_profile_start_context(
    ContextRoot *Root, GUID Guid, uint32_t Counters,
    uint32_t Callsites) SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
  if (!Root->FirstMemBlock) {
    setupContext(Root, Guid, Counters, Callsites);
  }
  if (Root->Taken.TryLock()) {
    __llvm_ctx_profile_current_context_root = Root;
    onContextEnter(*Root->FirstNode);
    return Root->FirstNode;
  }
  // If this thread couldn't take the lock, return scratch context.
  __llvm_ctx_profile_current_context_root = nullptr;
  return TheScratchContext;
}

void __llvm_ctx_profile_release_context(ContextRoot *Root)
    SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
  if (__llvm_ctx_profile_current_context_root) {
    __llvm_ctx_profile_current_context_root = nullptr;
    Root->Taken.Unlock();
  }
}

void __llvm_ctx_profile_start_collection() {
  size_t NrMemUnits = 0;
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  for (uint32_t I = 0; I < AllContextRoots.Size(); ++I) {
    auto *Root = AllContextRoots[I];
    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> Lock(
        &Root->Taken);
    for (auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next())
      ++NrMemUnits;

    resetContextNode(*Root->FirstNode);
  }
  __sanitizer::Printf("[ctxprof] Initial NrMemUnits: %zu \n", NrMemUnits);
}

bool __llvm_ctx_profile_fetch(void *Data,
                              bool (*Writer)(void *W, const ContextNode &)) {
  assert(Writer);
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);

  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I) {
    auto *Root = AllContextRoots[I];
    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> TakenLock(
        &Root->Taken);
    if (!validate(Root)) {
      __sanitizer::Printf("[ctxprof] Contextual Profile is %s\n", "invalid");
      return false;
    }
    if (!Writer(Data, *Root->FirstNode))
      return false;
  }
  return true;
}

void __llvm_ctx_profile_free() {
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I)
    for (auto *A = AllContextRoots[I]->FirstMemBlock; A;) {
      auto *C = A;
      A = A->next();
      __sanitizer::InternalFree(C);
    }
  AllContextRoots.Reset();
}
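
For orientation, the sketch below approximates, in hand-written C++, the calls the ctx-profile instrumentation pass is expected to emit around a root function and one of its callsites, pieced together from the comments in __llvm_ctx_profile_start_context and __llvm_ctx_profile_get_context above. It is not part of the file: the function names, GUIDs, and counter/callsite counts are invented for illustration, and the scratch-context (tainted-pointer) path is deliberately glossed over.

// Illustrative only - hand-written approximation of compiler-emitted
// instrumentation; names and numbers below are made up for the example.
#include "CtxInstrProfiling.h"

using namespace __ctx_profile;

static ContextRoot ExampleRoot; // the pass emits one such global per root

void ExampleCallee(); // hypothetical callee; pretend its GUID is 0xBEEF

void ExampleRootFunction() { // hypothetical root; pretend its GUID is 0xFEED
  // Counter/callsite counts would be derived from the function's IR.
  ContextNode *Ctx = __llvm_ctx_profile_start_context(
      &ExampleRoot, /*Guid=*/0xFEED, /*Counters=*/2, /*Callsites=*/1);

  // Before the call, publish the expected callee and the address of this
  // callsite's slot in the subcontexts array; the callee's get_context call
  // consumes both TLS slots to find (or lazily create) its node under this
  // root. (If Ctx is scratch, the computed slot address is scratch too, which
  // the runtime detects via the tainted LSB.)
  __llvm_ctx_profile_expected_callee[0] =
      reinterpret_cast<void *>(&ExampleCallee);
  __llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[0];
  ExampleCallee();

  __llvm_ctx_profile_release_context(&ExampleRoot);
}

void ExampleCallee() {
  ContextNode *Ctx = __llvm_ctx_profile_get_context(
      reinterpret_cast<void *>(&ExampleCallee), /*Guid=*/0xBEEF,
      /*NrCounters=*/1, /*NrCallsites=*/0);
  (void)Ctx; // counters()[0] (the entry count) was already bumped on entry
}

The TLS handoff (expected_callee / callsite) is what lets the runtime cheaply detect mismatches, such as interleaved signal handlers or uninstrumented callers, and fall back to the scratch context, per the comments in __llvm_ctx_profile_get_context.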