CoCalc -- BottleneckAnalysis.h

GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h
³⁵²⁹⁰ views
1
//===--------------------- BottleneckAnalysis.h -----------------*- C++ -*-===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
/// \file
9
///
10
/// This file implements the bottleneck analysis view.
11
///
12
/// This view internally observes backend pressure increase events in order to
13
/// identify problematic data dependencies and processor resource interferences.
14
///
15
/// Example of bottleneck analysis report for a dot-product on X86 btver2:
16
///
17
/// Cycles with backend pressure increase [ 40.76% ]
18
/// Throughput Bottlenecks: 
19
///   Resource Pressure       [ 39.34% ]
20
///   - JFPA  [ 39.34% ]
21
///   - JFPU0  [ 39.34% ]
22
///   Data Dependencies:      [ 1.42% ]
23
///   - Register Dependencies [ 1.42% ]
24
///   - Memory Dependencies   [ 0.00% ]
25
///
26
/// According to the example, backend pressure increased during the 40.76% of
27
/// the simulated cycles.  In particular, the major cause of backend pressure
28
/// increases was the contention on floating point adder JFPA accessible from
29
/// pipeline resource JFPU0.
30
///
31
/// At the end of each cycle, if pressure on the simulated out-of-order buffers
32
/// has increased, a backend pressure event is reported.
33
/// In particular, this occurs when there is a delta between the number of uOps
34
/// dispatched and the number of uOps issued to the underlying pipelines.
35
///
36
/// The bottleneck analysis view is also responsible for identifying and
37
/// printing the most "critical" sequence of dependent instructions according to
38
/// the simulated run.
39
///
40
/// Below is the critical sequence computed for the dot-product example on
41
/// btver2:
42
///
43
///              Instruction                     Dependency Information
44
/// +----< 2.    vhaddps %xmm3, %xmm3, %xmm4
45
/// |
46
/// |    < loop carried > 
47
/// |
48
/// |      0.    vmulps	 %xmm0, %xmm0, %xmm2
49
/// +----> 1.    vhaddps %xmm2, %xmm2, %xmm3     ## RESOURCE interference:  JFPA [ probability: 73% ]
50
/// +----> 2.    vhaddps %xmm3, %xmm3, %xmm4     ## REGISTER dependency:  %xmm3
51
/// |
52
/// |    < loop carried > 
53
/// |
54
/// +----> 1.    vhaddps %xmm2, %xmm2, %xmm3     ## RESOURCE interference:  JFPA [ probability: 73% ]
55
///
56
///
57
/// The algorithm that computes the critical sequence is very similar to a
58
/// critical path analysis.
59
/// 
60
/// A dependency graph is used internally to track dependencies between nodes.
61
/// Nodes of the graph represent instructions from the input assembly sequence,
62
/// and edges of the graph represent data dependencies or processor resource
63
/// interferences.
64
///
65
/// Edges are dynamically 'discovered' by observing instruction state
66
/// transitions and backend pressure increase events. Edges are internally
67
/// ranked based on their "criticality". A dependency is considered to be
68
/// critical if it takes a long time to execute, and if it contributes to
69
/// backend pressure increases. Criticality is internally measured in terms of
70
/// cycles; it is computed for every edge in the graph as a function of the edge
71
/// latency and the number of backend pressure increase cycles contributed by
72
/// that edge.
73
///
74
/// At the end of simulation, costs are propagated to nodes through the edges of
75
/// the graph, and the most expensive path connecting the root-set (a
76
/// set of nodes with no predecessors) to a leaf node is reported as critical
77
/// sequence.
78
//
79
//===----------------------------------------------------------------------===//
80

81
#ifndef LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H
82
#define LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H
83

84
#include "Views/InstructionView.h"
85
#include "llvm/ADT/DenseMap.h"
86
#include "llvm/ADT/SmallVector.h"
87
#include "llvm/MC/MCInstPrinter.h"
88
#include "llvm/MC/MCSchedule.h"
89
#include "llvm/MC/MCSubtargetInfo.h"
90
#include "llvm/Support/FormattedStream.h"
91
#include "llvm/Support/raw_ostream.h"
92

93
namespace llvm {
94
namespace mca {
95

96
class PressureTracker {
97
  const MCSchedModel &SM;
98

99
  // Resource pressure distribution. There is an element for every processor
100
  // resource declared by the scheduling model. Quantities are number of cycles.
101
  SmallVector<unsigned, 4> ResourcePressureDistribution;
102

103
  // Each processor resource is associated with a so-called processor resource
104
  // mask. This vector allows to correlate processor resource IDs with processor
105
  // resource masks. There is exactly one element per each processor resource
106
  // declared by the scheduling model.
107
  SmallVector<uint64_t, 4> ProcResID2Mask;
108

109
  // Maps processor resource state indices (returned by calls to
110
  // `getResourceStateIndex(Mask)` to processor resource identifiers.
111
  SmallVector<unsigned, 4> ResIdx2ProcResID;
112

113
  // Maps Processor Resource identifiers to ResourceUsers indices.
114
  SmallVector<unsigned, 4> ProcResID2ResourceUsersIndex;
115

116
  // Identifies the last user of a processor resource unit.
117
  // This vector is updated on every instruction issued event.
118
  // There is one entry for every processor resource unit declared by the
119
  // processor model. An all_ones value is treated like an invalid instruction
120
  // identifier.
121
  using User = std::pair<unsigned, unsigned>;
122
  SmallVector<User, 4> ResourceUsers;
123

124
  struct InstructionPressureInfo {
125
    unsigned RegisterPressureCycles;
126
    unsigned MemoryPressureCycles;
127
    unsigned ResourcePressureCycles;
128
  };
129
  DenseMap<unsigned, InstructionPressureInfo> IPI;
130

131
  void updateResourcePressureDistribution(uint64_t CumulativeMask);
132

133
  User getResourceUser(unsigned ProcResID, unsigned UnitID) const {
134
    unsigned Index = ProcResID2ResourceUsersIndex[ProcResID];
135
    return ResourceUsers[Index + UnitID];
136
  }
137

138
public:
139
  PressureTracker(const MCSchedModel &Model);
140

141
  ArrayRef<unsigned> getResourcePressureDistribution() const {
142
    return ResourcePressureDistribution;
143
  }
144

145
  void getResourceUsers(uint64_t ResourceMask,
146
                        SmallVectorImpl<User> &Users) const;
147

148
  unsigned getRegisterPressureCycles(unsigned IID) const {
149
    assert(IPI.contains(IID) && "Instruction is not tracked!");
150
    const InstructionPressureInfo &Info = IPI.find(IID)->second;
151
    return Info.RegisterPressureCycles;
152
  }
153

154
  unsigned getMemoryPressureCycles(unsigned IID) const {
155
    assert(IPI.contains(IID) && "Instruction is not tracked!");
156
    const InstructionPressureInfo &Info = IPI.find(IID)->second;
157
    return Info.MemoryPressureCycles;
158
  }
159

160
  unsigned getResourcePressureCycles(unsigned IID) const {
161
    assert(IPI.contains(IID) && "Instruction is not tracked!");
162
    const InstructionPressureInfo &Info = IPI.find(IID)->second;
163
    return Info.ResourcePressureCycles;
164
  }
165

166
  const char *resolveResourceName(uint64_t ResourceMask) const {
167
    unsigned Index = getResourceStateIndex(ResourceMask);
168
    unsigned ProcResID = ResIdx2ProcResID[Index];
169
    const MCProcResourceDesc &PRDesc = *SM.getProcResource(ProcResID);
170
    return PRDesc.Name;
171
  }
172

173
  void onInstructionDispatched(unsigned IID);
174
  void onInstructionExecuted(unsigned IID);
175

176
  void handlePressureEvent(const HWPressureEvent &Event);
177
  void handleInstructionIssuedEvent(const HWInstructionIssuedEvent &Event);
178
};
179

180
// A dependency edge.
181
struct DependencyEdge {
182
  enum DependencyType { DT_INVALID, DT_REGISTER, DT_MEMORY, DT_RESOURCE };
183

184
  // Dependency edge descriptor.
185
  //
186
  // It specifies the dependency type, as well as the edge cost in cycles.
187
  struct Dependency {
188
    DependencyType Type;
189
    uint64_t ResourceOrRegID;
190
    uint64_t Cost;
191
  };
192
  Dependency Dep;
193

194
  unsigned FromIID;
195
  unsigned ToIID;
196

197
  // Used by the bottleneck analysis to compute the interference
198
  // probability for processor resources.
199
  unsigned Frequency;
200
};
201

202
// A dependency graph used by the bottleneck analysis to describe data
203
// dependencies and processor resource interferences between instructions.
204
//
205
// There is a node (an instance of struct DGNode) for every instruction in the
206
// input assembly sequence. Edges of the graph represent dependencies between
207
// instructions.
208
//
209
// Each edge of the graph is associated with a cost value which is used
210
// internally to rank dependency based on their impact on the runtime
211
// performance (see field DependencyEdge::Dependency::Cost). In general, the
212
// higher the cost of an edge, the higher the impact on performance.
213
//
214
// The cost of a dependency is a function of both the latency and the number of
215
// cycles where the dependency has been seen as critical (i.e. contributing to
216
// back-pressure increases).
217
//
218
// Loop carried dependencies are carefully expanded by the bottleneck analysis
219
// to guarantee that the graph stays acyclic. To this end, extra nodes are
220
// pre-allocated at construction time to describe instructions from "past and
221
// future" iterations. The graph is kept acyclic mainly because it simplifies
222
// the complexity of the algorithm that computes the critical sequence.
223
class DependencyGraph {
224
  struct DGNode {
225
    unsigned NumPredecessors;
226
    unsigned NumVisitedPredecessors;
227
    uint64_t Cost;
228
    unsigned Depth;
229

230
    DependencyEdge CriticalPredecessor;
231
    SmallVector<DependencyEdge, 8> OutgoingEdges;
232
  };
233
  SmallVector<DGNode, 16> Nodes;
234

235
  DependencyGraph(const DependencyGraph &) = delete;
236
  DependencyGraph &operator=(const DependencyGraph &) = delete;
237

238
  void addDependency(unsigned From, unsigned To,
239
                     DependencyEdge::Dependency &&DE);
240

241
  void pruneEdges(unsigned Iterations);
242
  void initializeRootSet(SmallVectorImpl<unsigned> &RootSet) const;
243
  void propagateThroughEdges(SmallVectorImpl<unsigned> &RootSet,
244
                             unsigned Iterations);
245

246
#ifndef NDEBUG
247
  void dumpDependencyEdge(raw_ostream &OS, const DependencyEdge &DE,
248
                          MCInstPrinter &MCIP) const;
249
#endif
250

251
public:
252
  DependencyGraph(unsigned Size) : Nodes(Size) {}
253

254
  void addRegisterDep(unsigned From, unsigned To, unsigned RegID,
255
                      unsigned Cost) {
256
    addDependency(From, To, {DependencyEdge::DT_REGISTER, RegID, Cost});
257
  }
258

259
  void addMemoryDep(unsigned From, unsigned To, unsigned Cost) {
260
    addDependency(From, To, {DependencyEdge::DT_MEMORY, /* unused */ 0, Cost});
261
  }
262

263
  void addResourceDep(unsigned From, unsigned To, uint64_t Mask,
264
                      unsigned Cost) {
265
    addDependency(From, To, {DependencyEdge::DT_RESOURCE, Mask, Cost});
266
  }
267

268
  // Called by the bottleneck analysis at the end of simulation to propagate
269
  // costs through the edges of the graph, and compute a critical path.
270
  void finalizeGraph(unsigned Iterations) {
271
    SmallVector<unsigned, 16> RootSet;
272
    pruneEdges(Iterations);
273
    initializeRootSet(RootSet);
274
    propagateThroughEdges(RootSet, Iterations);
275
  }
276

277
  // Returns a sequence of edges representing the critical sequence based on the
278
  // simulated run. It assumes that the graph has already been finalized (i.e.
279
  // method `finalizeGraph()` has already been called on this graph).
280
  void getCriticalSequence(SmallVectorImpl<const DependencyEdge *> &Seq) const;
281

282
#ifndef NDEBUG
283
  void dump(raw_ostream &OS, MCInstPrinter &MCIP) const;
284
#endif
285
};
286

287
/// A view that collects and prints a few performance numbers.
288
class BottleneckAnalysis : public InstructionView {
289
  PressureTracker Tracker;
290
  DependencyGraph DG;
291

292
  unsigned Iterations;
293
  unsigned TotalCycles;
294

295
  bool PressureIncreasedBecauseOfResources;
296
  bool PressureIncreasedBecauseOfRegisterDependencies;
297
  bool PressureIncreasedBecauseOfMemoryDependencies;
298
  // True if throughput was affected by dispatch stalls.
299
  bool SeenStallCycles;
300

301
  struct BackPressureInfo {
302
    // Cycles where backpressure increased.
303
    unsigned PressureIncreaseCycles;
304
    // Cycles where backpressure increased because of pipeline pressure.
305
    unsigned ResourcePressureCycles;
306
    // Cycles where backpressure increased because of data dependencies.
307
    unsigned DataDependencyCycles;
308
    // Cycles where backpressure increased because of register dependencies.
309
    unsigned RegisterDependencyCycles;
310
    // Cycles where backpressure increased because of memory dependencies.
311
    unsigned MemoryDependencyCycles;
312
  };
313
  BackPressureInfo BPI;
314

315
  // Used to populate the dependency graph DG.
316
  void addRegisterDep(unsigned From, unsigned To, unsigned RegID, unsigned Cy);
317
  void addMemoryDep(unsigned From, unsigned To, unsigned Cy);
318
  void addResourceDep(unsigned From, unsigned To, uint64_t Mask, unsigned Cy);
319

320
  void printInstruction(formatted_raw_ostream &FOS, const MCInst &MCI,
321
                        bool UseDifferentColor = false) const;
322

323
  // Prints a bottleneck message to OS.
324
  void printBottleneckHints(raw_ostream &OS) const;
325
  void printCriticalSequence(raw_ostream &OS) const;
326

327
public:
328
  BottleneckAnalysis(const MCSubtargetInfo &STI, MCInstPrinter &MCIP,
329
                     ArrayRef<MCInst> Sequence, unsigned Iterations);
330

331
  void onCycleEnd() override;
332
  void onEvent(const HWStallEvent &Event) override { SeenStallCycles = true; }
333
  void onEvent(const HWPressureEvent &Event) override;
334
  void onEvent(const HWInstructionEvent &Event) override;
335

336
  void printView(raw_ostream &OS) const override;
337
  StringRef getNameAsString() const override { return "BottleneckAnalysis"; }
338
  bool isSerializable() const override { return false; }
339

340
#ifndef NDEBUG
341
  void dump(raw_ostream &OS, MCInstPrinter &MCIP) const { DG.dump(OS, MCIP); }
342
#endif
343
};
344

345
} // namespace mca
346
} // namespace llvm
347

348
#endif
349

350
Product

Resources

Company