Path: blob/main/contrib/llvm-project/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h
35290 views
//===--------------------- BottleneckAnalysis.h -----------------*- C++ -*-===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7/// \file8///9/// This file implements the bottleneck analysis view.10///11/// This view internally observes backend pressure increase events in order to12/// identify problematic data dependencies and processor resource interferences.13///14/// Example of bottleneck analysis report for a dot-product on X86 btver2:15///16/// Cycles with backend pressure increase [ 40.76% ]17/// Throughput Bottlenecks:18/// Resource Pressure [ 39.34% ]19/// - JFPA [ 39.34% ]20/// - JFPU0 [ 39.34% ]21/// Data Dependencies: [ 1.42% ]22/// - Register Dependencies [ 1.42% ]23/// - Memory Dependencies [ 0.00% ]24///25/// According to the example, backend pressure increased during the 40.76% of26/// the simulated cycles. In particular, the major cause of backend pressure27/// increases was the contention on floating point adder JFPA accessible from28/// pipeline resource JFPU0.29///30/// At the end of each cycle, if pressure on the simulated out-of-order buffers31/// has increased, a backend pressure event is reported.32/// In particular, this occurs when there is a delta between the number of uOps33/// dispatched and the number of uOps issued to the underlying pipelines.34///35/// The bottleneck analysis view is also responsible for identifying and36/// printing the most "critical" sequence of dependent instructions according to37/// the simulated run.38///39/// Below is the critical sequence computed for the dot-product example on40/// btver2:41///42/// Instruction Dependency Information43/// +----< 2. vhaddps %xmm3, %xmm3, %xmm444/// |45/// | < loop carried >46/// |47/// | 0. vmulps %xmm0, %xmm0, %xmm248/// +----> 1. vhaddps %xmm2, %xmm2, %xmm3 ## RESOURCE interference: JFPA [ probability: 73% ]49/// +----> 2. vhaddps %xmm3, %xmm3, %xmm4 ## REGISTER dependency: %xmm350/// |51/// | < loop carried >52/// |53/// +----> 1. vhaddps %xmm2, %xmm2, %xmm3 ## RESOURCE interference: JFPA [ probability: 73% ]54///55///56/// The algorithm that computes the critical sequence is very similar to a57/// critical path analysis.58///59/// A dependency graph is used internally to track dependencies between nodes.60/// Nodes of the graph represent instructions from the input assembly sequence,61/// and edges of the graph represent data dependencies or processor resource62/// interferences.63///64/// Edges are dynamically 'discovered' by observing instruction state65/// transitions and backend pressure increase events. Edges are internally66/// ranked based on their "criticality". A dependency is considered to be67/// critical if it takes a long time to execute, and if it contributes to68/// backend pressure increases. Criticality is internally measured in terms of69/// cycles; it is computed for every edge in the graph as a function of the edge70/// latency and the number of backend pressure increase cycles contributed by71/// that edge.72///73/// At the end of simulation, costs are propagated to nodes through the edges of74/// the graph, and the most expensive path connecting the root-set (a75/// set of nodes with no predecessors) to a leaf node is reported as critical76/// sequence.77//78//===----------------------------------------------------------------------===//7980#ifndef LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H81#define LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H8283#include "Views/InstructionView.h"84#include "llvm/ADT/DenseMap.h"85#include "llvm/ADT/SmallVector.h"86#include "llvm/MC/MCInstPrinter.h"87#include "llvm/MC/MCSchedule.h"88#include "llvm/MC/MCSubtargetInfo.h"89#include "llvm/Support/FormattedStream.h"90#include "llvm/Support/raw_ostream.h"9192namespace llvm {93namespace mca {9495class PressureTracker {96const MCSchedModel &SM;9798// Resource pressure distribution. There is an element for every processor99// resource declared by the scheduling model. Quantities are number of cycles.100SmallVector<unsigned, 4> ResourcePressureDistribution;101102// Each processor resource is associated with a so-called processor resource103// mask. This vector allows to correlate processor resource IDs with processor104// resource masks. There is exactly one element per each processor resource105// declared by the scheduling model.106SmallVector<uint64_t, 4> ProcResID2Mask;107108// Maps processor resource state indices (returned by calls to109// `getResourceStateIndex(Mask)` to processor resource identifiers.110SmallVector<unsigned, 4> ResIdx2ProcResID;111112// Maps Processor Resource identifiers to ResourceUsers indices.113SmallVector<unsigned, 4> ProcResID2ResourceUsersIndex;114115// Identifies the last user of a processor resource unit.116// This vector is updated on every instruction issued event.117// There is one entry for every processor resource unit declared by the118// processor model. An all_ones value is treated like an invalid instruction119// identifier.120using User = std::pair<unsigned, unsigned>;121SmallVector<User, 4> ResourceUsers;122123struct InstructionPressureInfo {124unsigned RegisterPressureCycles;125unsigned MemoryPressureCycles;126unsigned ResourcePressureCycles;127};128DenseMap<unsigned, InstructionPressureInfo> IPI;129130void updateResourcePressureDistribution(uint64_t CumulativeMask);131132User getResourceUser(unsigned ProcResID, unsigned UnitID) const {133unsigned Index = ProcResID2ResourceUsersIndex[ProcResID];134return ResourceUsers[Index + UnitID];135}136137public:138PressureTracker(const MCSchedModel &Model);139140ArrayRef<unsigned> getResourcePressureDistribution() const {141return ResourcePressureDistribution;142}143144void getResourceUsers(uint64_t ResourceMask,145SmallVectorImpl<User> &Users) const;146147unsigned getRegisterPressureCycles(unsigned IID) const {148assert(IPI.contains(IID) && "Instruction is not tracked!");149const InstructionPressureInfo &Info = IPI.find(IID)->second;150return Info.RegisterPressureCycles;151}152153unsigned getMemoryPressureCycles(unsigned IID) const {154assert(IPI.contains(IID) && "Instruction is not tracked!");155const InstructionPressureInfo &Info = IPI.find(IID)->second;156return Info.MemoryPressureCycles;157}158159unsigned getResourcePressureCycles(unsigned IID) const {160assert(IPI.contains(IID) && "Instruction is not tracked!");161const InstructionPressureInfo &Info = IPI.find(IID)->second;162return Info.ResourcePressureCycles;163}164165const char *resolveResourceName(uint64_t ResourceMask) const {166unsigned Index = getResourceStateIndex(ResourceMask);167unsigned ProcResID = ResIdx2ProcResID[Index];168const MCProcResourceDesc &PRDesc = *SM.getProcResource(ProcResID);169return PRDesc.Name;170}171172void onInstructionDispatched(unsigned IID);173void onInstructionExecuted(unsigned IID);174175void handlePressureEvent(const HWPressureEvent &Event);176void handleInstructionIssuedEvent(const HWInstructionIssuedEvent &Event);177};178179// A dependency edge.180struct DependencyEdge {181enum DependencyType { DT_INVALID, DT_REGISTER, DT_MEMORY, DT_RESOURCE };182183// Dependency edge descriptor.184//185// It specifies the dependency type, as well as the edge cost in cycles.186struct Dependency {187DependencyType Type;188uint64_t ResourceOrRegID;189uint64_t Cost;190};191Dependency Dep;192193unsigned FromIID;194unsigned ToIID;195196// Used by the bottleneck analysis to compute the interference197// probability for processor resources.198unsigned Frequency;199};200201// A dependency graph used by the bottleneck analysis to describe data202// dependencies and processor resource interferences between instructions.203//204// There is a node (an instance of struct DGNode) for every instruction in the205// input assembly sequence. Edges of the graph represent dependencies between206// instructions.207//208// Each edge of the graph is associated with a cost value which is used209// internally to rank dependency based on their impact on the runtime210// performance (see field DependencyEdge::Dependency::Cost). In general, the211// higher the cost of an edge, the higher the impact on performance.212//213// The cost of a dependency is a function of both the latency and the number of214// cycles where the dependency has been seen as critical (i.e. contributing to215// back-pressure increases).216//217// Loop carried dependencies are carefully expanded by the bottleneck analysis218// to guarantee that the graph stays acyclic. To this end, extra nodes are219// pre-allocated at construction time to describe instructions from "past and220// future" iterations. The graph is kept acyclic mainly because it simplifies221// the complexity of the algorithm that computes the critical sequence.222class DependencyGraph {223struct DGNode {224unsigned NumPredecessors;225unsigned NumVisitedPredecessors;226uint64_t Cost;227unsigned Depth;228229DependencyEdge CriticalPredecessor;230SmallVector<DependencyEdge, 8> OutgoingEdges;231};232SmallVector<DGNode, 16> Nodes;233234DependencyGraph(const DependencyGraph &) = delete;235DependencyGraph &operator=(const DependencyGraph &) = delete;236237void addDependency(unsigned From, unsigned To,238DependencyEdge::Dependency &&DE);239240void pruneEdges(unsigned Iterations);241void initializeRootSet(SmallVectorImpl<unsigned> &RootSet) const;242void propagateThroughEdges(SmallVectorImpl<unsigned> &RootSet,243unsigned Iterations);244245#ifndef NDEBUG246void dumpDependencyEdge(raw_ostream &OS, const DependencyEdge &DE,247MCInstPrinter &MCIP) const;248#endif249250public:251DependencyGraph(unsigned Size) : Nodes(Size) {}252253void addRegisterDep(unsigned From, unsigned To, unsigned RegID,254unsigned Cost) {255addDependency(From, To, {DependencyEdge::DT_REGISTER, RegID, Cost});256}257258void addMemoryDep(unsigned From, unsigned To, unsigned Cost) {259addDependency(From, To, {DependencyEdge::DT_MEMORY, /* unused */ 0, Cost});260}261262void addResourceDep(unsigned From, unsigned To, uint64_t Mask,263unsigned Cost) {264addDependency(From, To, {DependencyEdge::DT_RESOURCE, Mask, Cost});265}266267// Called by the bottleneck analysis at the end of simulation to propagate268// costs through the edges of the graph, and compute a critical path.269void finalizeGraph(unsigned Iterations) {270SmallVector<unsigned, 16> RootSet;271pruneEdges(Iterations);272initializeRootSet(RootSet);273propagateThroughEdges(RootSet, Iterations);274}275276// Returns a sequence of edges representing the critical sequence based on the277// simulated run. It assumes that the graph has already been finalized (i.e.278// method `finalizeGraph()` has already been called on this graph).279void getCriticalSequence(SmallVectorImpl<const DependencyEdge *> &Seq) const;280281#ifndef NDEBUG282void dump(raw_ostream &OS, MCInstPrinter &MCIP) const;283#endif284};285286/// A view that collects and prints a few performance numbers.287class BottleneckAnalysis : public InstructionView {288PressureTracker Tracker;289DependencyGraph DG;290291unsigned Iterations;292unsigned TotalCycles;293294bool PressureIncreasedBecauseOfResources;295bool PressureIncreasedBecauseOfRegisterDependencies;296bool PressureIncreasedBecauseOfMemoryDependencies;297// True if throughput was affected by dispatch stalls.298bool SeenStallCycles;299300struct BackPressureInfo {301// Cycles where backpressure increased.302unsigned PressureIncreaseCycles;303// Cycles where backpressure increased because of pipeline pressure.304unsigned ResourcePressureCycles;305// Cycles where backpressure increased because of data dependencies.306unsigned DataDependencyCycles;307// Cycles where backpressure increased because of register dependencies.308unsigned RegisterDependencyCycles;309// Cycles where backpressure increased because of memory dependencies.310unsigned MemoryDependencyCycles;311};312BackPressureInfo BPI;313314// Used to populate the dependency graph DG.315void addRegisterDep(unsigned From, unsigned To, unsigned RegID, unsigned Cy);316void addMemoryDep(unsigned From, unsigned To, unsigned Cy);317void addResourceDep(unsigned From, unsigned To, uint64_t Mask, unsigned Cy);318319void printInstruction(formatted_raw_ostream &FOS, const MCInst &MCI,320bool UseDifferentColor = false) const;321322// Prints a bottleneck message to OS.323void printBottleneckHints(raw_ostream &OS) const;324void printCriticalSequence(raw_ostream &OS) const;325326public:327BottleneckAnalysis(const MCSubtargetInfo &STI, MCInstPrinter &MCIP,328ArrayRef<MCInst> Sequence, unsigned Iterations);329330void onCycleEnd() override;331void onEvent(const HWStallEvent &Event) override { SeenStallCycles = true; }332void onEvent(const HWPressureEvent &Event) override;333void onEvent(const HWInstructionEvent &Event) override;334335void printView(raw_ostream &OS) const override;336StringRef getNameAsString() const override { return "BottleneckAnalysis"; }337bool isSerializable() const override { return false; }338339#ifndef NDEBUG340void dump(raw_ostream &OS, MCInstPrinter &MCIP) const { DG.dump(OS, MCIP); }341#endif342};343344} // namespace mca345} // namespace llvm346347#endif348349350