Path: blob/main/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
35269 views
//===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP ------------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7//8// \file This file defines a set of schedule DAG mutations that can be used to9// override default scheduler behavior to enforce specific scheduling patterns.10// They should be used in cases where runtime performance considerations such as11// inter-wavefront interactions, mean that compile-time heuristics cannot12// predict the optimal instruction ordering, or in kernels where optimum13// instruction scheduling is important enough to warrant manual intervention.14//15//===----------------------------------------------------------------------===//1617#include "AMDGPUIGroupLP.h"18#include "AMDGPUTargetMachine.h"19#include "MCTargetDesc/AMDGPUMCTargetDesc.h"20#include "SIInstrInfo.h"21#include "SIMachineFunctionInfo.h"22#include "llvm/ADT/BitmaskEnum.h"23#include "llvm/ADT/DenseMap.h"24#include "llvm/CodeGen/MachineScheduler.h"25#include "llvm/CodeGen/TargetOpcodes.h"2627using namespace llvm;2829#define DEBUG_TYPE "igrouplp"3031namespace {3233static cl::opt<bool> EnableExactSolver(34"amdgpu-igrouplp-exact-solver", cl::Hidden,35cl::desc("Whether to use the exponential time solver to fit "36"the instructions to the pipeline as closely as "37"possible."),38cl::init(false));3940static cl::opt<unsigned> CutoffForExact(41"amdgpu-igrouplp-exact-solver-cutoff", cl::init(0), cl::Hidden,42cl::desc("The maximum number of scheduling group conflicts "43"which we attempt to solve with the exponential time "44"exact solver. Problem sizes greater than this will"45"be solved by the less accurate greedy algorithm. Selecting "46"solver by size is superseded by manually selecting "47"the solver (e.g. by amdgpu-igrouplp-exact-solver"));4849static cl::opt<uint64_t> MaxBranchesExplored(50"amdgpu-igrouplp-exact-solver-max-branches", cl::init(0), cl::Hidden,51cl::desc("The amount of branches that we are willing to explore with"52"the exact algorithm before giving up."));5354static cl::opt<bool> UseCostHeur(55"amdgpu-igrouplp-exact-solver-cost-heur", cl::init(true), cl::Hidden,56cl::desc("Whether to use the cost heuristic to make choices as we "57"traverse the search space using the exact solver. Defaulted "58"to on, and if turned off, we will use the node order -- "59"attempting to put the later nodes in the later sched groups. "60"Experimentally, results are mixed, so this should be set on a "61"case-by-case basis."));6263// Components of the mask that determines which instruction types may be may be64// classified into a SchedGroup.65enum class SchedGroupMask {66NONE = 0u,67ALU = 1u << 0,68VALU = 1u << 1,69SALU = 1u << 2,70MFMA = 1u << 3,71VMEM = 1u << 4,72VMEM_READ = 1u << 5,73VMEM_WRITE = 1u << 6,74DS = 1u << 7,75DS_READ = 1u << 8,76DS_WRITE = 1u << 9,77TRANS = 1u << 10,78ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |79DS_READ | DS_WRITE | TRANS,80LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)81};8283class SchedGroup;8485// InstructionRule class is used to enact a filter which determines whether or86// not an SU maps to a given SchedGroup. It contains complementary data87// structures (e.g Cache) to help those filters.88class InstructionRule {89protected:90const SIInstrInfo *TII;91unsigned SGID;92// A cache made available to the Filter to store SUnits for subsequent93// invocations of the Filter94std::optional<SmallVector<SUnit *, 4>> Cache;9596public:97virtual bool98apply(const SUnit *, const ArrayRef<SUnit *>,99SmallVectorImpl<SchedGroup> &) {100return true;101};102103InstructionRule(const SIInstrInfo *TII, unsigned SGID,104bool NeedsCache = false)105: TII(TII), SGID(SGID) {106if (NeedsCache) {107Cache = SmallVector<SUnit *, 4>();108}109}110111virtual ~InstructionRule() = default;112};113114using SUnitsToCandidateSGsMap = DenseMap<SUnit *, SmallVector<int, 4>>;115116// Classify instructions into groups to enable fine tuned control over the117// scheduler. These groups may be more specific than current SchedModel118// instruction classes.119class SchedGroup {120private:121// Mask that defines which instruction types can be classified into this122// SchedGroup. The instruction types correspond to the mask from SCHED_BARRIER123// and SCHED_GROUP_BARRIER.124SchedGroupMask SGMask;125126// Maximum number of SUnits that can be added to this group.127std::optional<unsigned> MaxSize;128129// SchedGroups will only synchronize with other SchedGroups that have the same130// SyncID.131int SyncID = 0;132133// SGID is used to map instructions to candidate SchedGroups134unsigned SGID;135136// The different rules each instruction in this SchedGroup must conform to137SmallVector<std::shared_ptr<InstructionRule>, 4> Rules;138139// Count of the number of created SchedGroups, used to initialize SGID.140static unsigned NumSchedGroups;141142// Try to add and edge from SU A to SU B.143bool tryAddEdge(SUnit *A, SUnit *B);144145// Use SGMask to determine whether we can classify MI as a member of this146// SchedGroup object.147bool canAddMI(const MachineInstr &MI) const;148149public:150// Collection of SUnits that are classified as members of this group.151SmallVector<SUnit *, 32> Collection;152153ScheduleDAGInstrs *DAG;154const SIInstrInfo *TII;155156// Returns true if SU can be added to this SchedGroup.157bool canAddSU(SUnit &SU) const;158159// Add DAG dependencies from all SUnits in this SchedGroup and this SU. If160// MakePred is true, SU will be a predecessor of the SUnits in this161// SchedGroup, otherwise SU will be a successor.162void link(SUnit &SU, bool MakePred = false);163164// Add DAG dependencies and track which edges are added, and the count of165// missed edges166int link(SUnit &SU, bool MakePred,167std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);168169// Add DAG dependencies from all SUnits in this SchedGroup and this SU.170// Use the predicate to determine whether SU should be a predecessor (P =171// true) or a successor (P = false) of this SchedGroup.172void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P);173174// Add DAG dependencies such that SUnits in this group shall be ordered175// before SUnits in OtherGroup.176void link(SchedGroup &OtherGroup);177178// Returns true if no more instructions may be added to this group.179bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; }180181// Append a constraint that SUs must meet in order to fit into this182// SchedGroup. Since many rules involve the relationship between a SchedGroup183// and the SUnits in other SchedGroups, rules are checked at Pipeline Solve184// time (rather than SchedGroup init time.)185void addRule(std::shared_ptr<InstructionRule> NewRule) {186Rules.push_back(NewRule);187}188189// Returns true if the SU matches all rules190bool allowedByRules(const SUnit *SU,191SmallVectorImpl<SchedGroup> &SyncPipe) const {192for (auto &Rule : Rules) {193if (!Rule.get()->apply(SU, Collection, SyncPipe))194return false;195}196return true;197}198199// Add SU to the SchedGroup.200void add(SUnit &SU) {201LLVM_DEBUG(dbgs() << "For SchedGroup with mask "202<< format_hex((int)SGMask, 10, true) << " adding "203<< *SU.getInstr());204Collection.push_back(&SU);205}206207// Remove last element in the SchedGroup208void pop() { Collection.pop_back(); }209210// Identify and add all relevant SUs from the DAG to this SchedGroup.211void initSchedGroup();212213// Add instructions to the SchedGroup bottom up starting from RIter.214// PipelineInstrs is a set of instructions that should not be added to the215// SchedGroup even when the other conditions for adding it are satisfied.216// RIter will be added to the SchedGroup as well, and dependencies will be217// added so that RIter will always be scheduled at the end of the group.218void initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,219SUnitsToCandidateSGsMap &SyncedInstrs);220221void initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs);222223int getSyncID() { return SyncID; }224225int getSGID() { return SGID; }226227SchedGroupMask getMask() { return SGMask; }228229SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,230ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)231: SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) {232SGID = NumSchedGroups++;233}234235SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID,236ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)237: SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) {238SGID = NumSchedGroups++;239}240};241242// Remove all existing edges from a SCHED_BARRIER or SCHED_GROUP_BARRIER.243static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) {244assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER ||245SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||246SU.getInstr()->getOpcode() == AMDGPU::IGLP_OPT);247248while (!SU.Preds.empty())249for (auto &P : SU.Preds)250SU.removePred(P);251252while (!SU.Succs.empty())253for (auto &S : SU.Succs)254for (auto &SP : S.getSUnit()->Preds)255if (SP.getSUnit() == &SU)256S.getSUnit()->removePred(SP);257}258259using SUToCandSGsPair = std::pair<SUnit *, SmallVector<int, 4>>;260using SUsToCandSGsVec = SmallVector<SUToCandSGsPair, 4>;261262// The PipelineSolver is used to assign SUnits to SchedGroups in a pipeline263// in non-trivial cases. For example, if the requested pipeline is264// {VMEM_READ, VALU, MFMA, VMEM_READ} and we encounter a VMEM_READ instruction265// in the DAG, then we will have an instruction that can not be trivially266// assigned to a SchedGroup. The PipelineSolver class implements two algorithms267// to find a good solution to the pipeline -- a greedy algorithm and an exact268// algorithm. The exact algorithm has an exponential time complexity and should269// only be used for small sized problems or medium sized problems where an exact270// solution is highly desired.271class PipelineSolver {272ScheduleDAGMI *DAG;273274// Instructions that can be assigned to multiple SchedGroups275DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;276SmallVector<SUsToCandSGsVec, 4> PipelineInstrs;277DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;278// The current working pipeline279SmallVector<SmallVector<SchedGroup, 4>, 4> CurrPipeline;280// The pipeline that has the best solution found so far281SmallVector<SmallVector<SchedGroup, 4>, 4> BestPipeline;282283// Whether or not we actually have any SyncedInstrs to try to solve.284bool NeedsSolver = false;285286// Compute an estimate of the size of search tree -- the true size is287// the product of each conflictedInst.Matches.size() across all SyncPipelines288unsigned computeProblemSize();289290// The cost penalty of not assigning a SU to a SchedGroup291int MissPenalty = 0;292293// Costs in terms of the number of edges we are unable to add294int BestCost = -1;295int CurrCost = 0;296297// Index pointing to the conflicting instruction that is currently being298// fitted299int CurrConflInstNo = 0;300// Index to the pipeline that is currently being fitted301int CurrSyncGroupIdx = 0;302// The first non trivial pipeline303int BeginSyncGroupIdx = 0;304305// How many branches we have explored306uint64_t BranchesExplored = 0;307308// The direction in which we process the candidate SchedGroups per SU309bool IsBottomUp = true;310311// Update indices to fit next conflicting instruction312void advancePosition();313// Recede indices to attempt to find better fit for previous conflicting314// instruction315void retreatPosition();316317// The exponential time algorithm which finds the provably best fit318bool solveExact();319// The polynomial time algorithm which attempts to find a good fit320bool solveGreedy();321// Find the best SchedGroup for the current SU using the heuristic given all322// current information. One step in the greedy algorithm. Templated against323// the SchedGroup iterator (either reverse or forward).324template <typename T>325void greedyFind(std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I,326T E);327// Whether or not the current solution is optimal328bool checkOptimal();329// Populate the ready list, prioiritizing fewest missed edges first330// Templated against the SchedGroup iterator (either reverse or forward).331template <typename T>332void populateReadyList(SmallVectorImpl<std::pair<int, int>> &ReadyList, T I,333T E);334// Add edges corresponding to the SchedGroups as assigned by solver335void makePipeline();336// Link the SchedGroups in the best found pipeline.337// Tmplated against the SchedGroup iterator (either reverse or forward).338template <typename T> void linkSchedGroups(T I, T E);339// Add the edges from the SU to the other SchedGroups in pipeline, and340// return the number of edges missed.341int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,342std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);343/// Link the pipeline as if \p SU was in the SchedGroup with ID \p SGID. It344/// returns the cost (in terms of missed pipeline edges), and tracks the edges345/// added in \p AddedEdges346template <typename T>347int linkSUnit(SUnit *SU, int SGID,348std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E);349/// Remove the edges passed via \p AddedEdges350void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);351// Convert the passed in maps to arrays for bidirectional iterators352void convertSyncMapsToArrays();353354void reset();355356public:357// Invoke the solver to map instructions to instruction groups. Heuristic &&358// command-line-option determines to use exact or greedy algorithm.359void solve();360361PipelineSolver(DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,362DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,363ScheduleDAGMI *DAG, bool IsBottomUp = true)364: DAG(DAG), SyncedInstrs(SyncedInstrs),365SyncedSchedGroups(SyncedSchedGroups), IsBottomUp(IsBottomUp) {366367for (auto &PipelineInstrs : SyncedInstrs) {368if (PipelineInstrs.second.size() > 0) {369NeedsSolver = true;370break;371}372}373374if (!NeedsSolver)375return;376377convertSyncMapsToArrays();378379CurrPipeline = BestPipeline;380381while (static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.size() &&382PipelineInstrs[BeginSyncGroupIdx].size() == 0)383++BeginSyncGroupIdx;384385if (static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.size())386return;387}388};389390void PipelineSolver::reset() {391392for (auto &SyncPipeline : CurrPipeline) {393for (auto &SG : SyncPipeline) {394SmallVector<SUnit *, 32> TempCollection = SG.Collection;395SG.Collection.clear();396auto SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) {397return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER;398});399if (SchedBarr != TempCollection.end())400SG.Collection.push_back(*SchedBarr);401}402}403404CurrSyncGroupIdx = BeginSyncGroupIdx;405CurrConflInstNo = 0;406CurrCost = 0;407}408409void PipelineSolver::convertSyncMapsToArrays() {410for (auto &SyncPipe : SyncedSchedGroups) {411BestPipeline.insert(BestPipeline.begin(), SyncPipe.second);412}413414int PipelineIDx = SyncedInstrs.size() - 1;415PipelineInstrs.resize(SyncedInstrs.size());416for (auto &SyncInstrMap : SyncedInstrs) {417for (auto &SUsToCandSGs : SyncInstrMap.second) {418if (PipelineInstrs[PipelineIDx].size() == 0) {419PipelineInstrs[PipelineIDx].push_back(420std::pair(SUsToCandSGs.first, SUsToCandSGs.second));421continue;422}423auto SortPosition = PipelineInstrs[PipelineIDx].begin();424// Insert them in sorted order -- this allows for good parsing order in425// the greedy algorithm426while (SortPosition != PipelineInstrs[PipelineIDx].end() &&427SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum)428++SortPosition;429PipelineInstrs[PipelineIDx].insert(430SortPosition, std::pair(SUsToCandSGs.first, SUsToCandSGs.second));431}432--PipelineIDx;433}434}435436template <typename T> void PipelineSolver::linkSchedGroups(T I, T E) {437for (; I != E; ++I) {438auto &GroupA = *I;439for (auto J = std::next(I); J != E; ++J) {440auto &GroupB = *J;441GroupA.link(GroupB);442}443}444}445446void PipelineSolver::makePipeline() {447// Preserve the order of barrier for subsequent SchedGroupBarrier mutations448for (auto &SyncPipeline : BestPipeline) {449LLVM_DEBUG(dbgs() << "Printing SchedGroups\n");450for (auto &SG : SyncPipeline) {451LLVM_DEBUG(dbgs() << "SchedGroup with SGID " << SG.getSGID()452<< " has: \n");453SUnit *SGBarr = nullptr;454for (auto &SU : SG.Collection) {455if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)456SGBarr = SU;457LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ")\n");458}459// Command line requested IGroupLP doesn't have SGBarr460if (!SGBarr)461continue;462resetEdges(*SGBarr, DAG);463SG.link(*SGBarr, false);464}465}466467for (auto &SyncPipeline : BestPipeline) {468IsBottomUp ? linkSchedGroups(SyncPipeline.rbegin(), SyncPipeline.rend())469: linkSchedGroups(SyncPipeline.begin(), SyncPipeline.end());470}471}472473template <typename T>474int PipelineSolver::linkSUnit(475SUnit *SU, int SGID, std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges,476T I, T E) {477bool MakePred = false;478int AddedCost = 0;479for (; I < E; ++I) {480if (I->getSGID() == SGID) {481MakePred = true;482continue;483}484auto Group = *I;485AddedCost += Group.link(*SU, MakePred, AddedEdges);486assert(AddedCost >= 0);487}488return AddedCost;489}490491int PipelineSolver::addEdges(492SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,493std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {494495// For IsBottomUp, the first SchedGroup in SyncPipeline contains the496// instructions that are the ultimate successors in the resultant mutation.497// Therefore, in such a configuration, the SchedGroups occurring before the498// candidate SGID are successors of the candidate SchedGroup, thus the current499// SU should be linked as a predecessor to SUs in those SchedGroups. The500// opposite is true if !IsBottomUp. IsBottomUp occurs in the case of multiple501// SCHED_GROUP_BARRIERS, or if a user specifies IGLP_OPT SchedGroups using502// IsBottomUp (in reverse).503return IsBottomUp ? linkSUnit(SU, SGID, AddedEdges, SyncPipeline.rbegin(),504SyncPipeline.rend())505: linkSUnit(SU, SGID, AddedEdges, SyncPipeline.begin(),506SyncPipeline.end());507}508509void PipelineSolver::removeEdges(510const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {511// Only remove the edges that we have added when testing512// the fit.513for (auto &PredSuccPair : EdgesToRemove) {514SUnit *Pred = PredSuccPair.first;515SUnit *Succ = PredSuccPair.second;516517auto Match = llvm::find_if(518Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; });519if (Match != Succ->Preds.end()) {520assert(Match->isArtificial());521Succ->removePred(*Match);522}523}524}525526void PipelineSolver::advancePosition() {527++CurrConflInstNo;528529if (static_cast<size_t>(CurrConflInstNo) >=530PipelineInstrs[CurrSyncGroupIdx].size()) {531CurrConflInstNo = 0;532++CurrSyncGroupIdx;533// Advance to next non-trivial pipeline534while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() &&535PipelineInstrs[CurrSyncGroupIdx].size() == 0)536++CurrSyncGroupIdx;537}538}539540void PipelineSolver::retreatPosition() {541assert(CurrConflInstNo >= 0);542assert(CurrSyncGroupIdx >= 0);543544if (CurrConflInstNo > 0) {545--CurrConflInstNo;546return;547}548549if (CurrConflInstNo == 0) {550// If we return to the starting position, we have explored551// the entire tree552if (CurrSyncGroupIdx == BeginSyncGroupIdx)553return;554555--CurrSyncGroupIdx;556// Go to previous non-trivial pipeline557while (PipelineInstrs[CurrSyncGroupIdx].size() == 0)558--CurrSyncGroupIdx;559560CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1;561}562}563564bool PipelineSolver::checkOptimal() {565if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) {566if (BestCost == -1 || CurrCost < BestCost) {567BestPipeline = CurrPipeline;568BestCost = CurrCost;569LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << "\n");570}571assert(BestCost >= 0);572}573574bool DoneExploring = false;575if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)576DoneExploring = true;577578return (DoneExploring || BestCost == 0);579}580581template <typename T>582void PipelineSolver::populateReadyList(583SmallVectorImpl<std::pair<int, int>> &ReadyList, T I, T E) {584SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];585auto SyncPipeline = CurrPipeline[CurrSyncGroupIdx];586assert(CurrSU.second.size() >= 1);587588for (; I != E; ++I) {589std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;590int CandSGID = *I;591SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {592return SG.getSGID() == CandSGID;593});594assert(Match);595596if (UseCostHeur) {597if (Match->isFull()) {598ReadyList.push_back(std::pair(*I, MissPenalty));599continue;600}601602int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);603ReadyList.push_back(std::pair(*I, TempCost));604removeEdges(AddedEdges);605} else606ReadyList.push_back(std::pair(*I, -1));607}608609if (UseCostHeur) {610std::sort(ReadyList.begin(), ReadyList.end(),611[](std::pair<int, int> A, std::pair<int, int> B) {612return A.second < B.second;613});614}615616assert(ReadyList.size() == CurrSU.second.size());617}618619bool PipelineSolver::solveExact() {620if (checkOptimal())621return true;622623if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size())624return false;625626assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size());627assert(static_cast<size_t>(CurrConflInstNo) <628PipelineInstrs[CurrSyncGroupIdx].size());629SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];630LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum631<< ") in Pipeline # " << CurrSyncGroupIdx << "\n");632633// SchedGroup -> Cost pairs634SmallVector<std::pair<int, int>, 4> ReadyList;635// Prioritize the candidate sched groups in terms of lowest cost first636IsBottomUp ? populateReadyList(ReadyList, CurrSU.second.rbegin(),637CurrSU.second.rend())638: populateReadyList(ReadyList, CurrSU.second.begin(),639CurrSU.second.end());640641auto I = ReadyList.begin();642auto E = ReadyList.end();643for (; I != E; ++I) {644// If we are trying SGs in least cost order, and the current SG is cost645// infeasible, then all subsequent SGs will also be cost infeasible, so we646// can prune.647if (BestCost != -1 && (CurrCost + I->second > BestCost))648return false;649650int CandSGID = I->first;651int AddedCost = 0;652std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;653auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];654SchedGroup *Match;655for (auto &SG : SyncPipeline) {656if (SG.getSGID() == CandSGID)657Match = &SG;658}659660if (Match->isFull())661continue;662663if (!Match->allowedByRules(CurrSU.first, SyncPipeline))664continue;665666LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask "667<< (int)Match->getMask() << "and ID " << CandSGID668<< "\n");669Match->add(*CurrSU.first);670AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);671LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n");672CurrCost += AddedCost;673advancePosition();674++BranchesExplored;675bool FinishedExploring = false;676// If the Cost after adding edges is greater than a known solution,677// backtrack678if (CurrCost < BestCost || BestCost == -1) {679if (solveExact()) {680FinishedExploring = BestCost != 0;681if (!FinishedExploring)682return true;683}684}685686retreatPosition();687CurrCost -= AddedCost;688removeEdges(AddedEdges);689Match->pop();690CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;691if (FinishedExploring)692return true;693}694695// Try the pipeline where the current instruction is omitted696// Potentially if we omit a problematic instruction from the pipeline,697// all the other instructions can nicely fit.698CurrCost += MissPenalty;699advancePosition();700701LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n");702703bool FinishedExploring = false;704if (CurrCost < BestCost || BestCost == -1) {705if (solveExact()) {706bool FinishedExploring = BestCost != 0;707if (!FinishedExploring)708return true;709}710}711712retreatPosition();713CurrCost -= MissPenalty;714return FinishedExploring;715}716717template <typename T>718void PipelineSolver::greedyFind(719std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E) {720SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];721int BestNodeCost = -1;722int TempCost;723SchedGroup *BestGroup = nullptr;724int BestGroupID = -1;725auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];726LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum727<< ") in Pipeline # " << CurrSyncGroupIdx << "\n");728729// Since we have added the potential SchedGroups from bottom up, but730// traversed the DAG from top down, parse over the groups from last to731// first. If we fail to do this for the greedy algorithm, the solution will732// likely not be good in more complex cases.733for (; I != E; ++I) {734std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;735int CandSGID = *I;736SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {737return SG.getSGID() == CandSGID;738});739assert(Match);740741LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "742<< (int)Match->getMask() << "\n");743744if (Match->isFull()) {745LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n");746continue;747}748if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) {749LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " has conflicting rule\n");750continue;751}752TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);753LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n");754if (TempCost < BestNodeCost || BestNodeCost == -1) {755BestGroup = Match;756BestNodeCost = TempCost;757BestGroupID = CandSGID;758}759removeEdges(AddedEdges);760if (BestNodeCost == 0)761break;762}763764if (BestGroupID != -1) {765BestGroup->add(*CurrSU.first);766addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges);767LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask"768<< (int)BestGroup->getMask() << "\n");769BestCost += TempCost;770} else771BestCost += MissPenalty;772773CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;774}775776bool PipelineSolver::solveGreedy() {777BestCost = 0;778std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;779780while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {781SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];782IsBottomUp783? greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend())784: greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end());785advancePosition();786}787BestPipeline = CurrPipeline;788removeEdges(AddedEdges);789return false;790}791792unsigned PipelineSolver::computeProblemSize() {793unsigned ProblemSize = 0;794for (auto &PipeConflicts : PipelineInstrs) {795ProblemSize += PipeConflicts.size();796}797798return ProblemSize;799}800801void PipelineSolver::solve() {802if (!NeedsSolver)803return;804805unsigned ProblemSize = computeProblemSize();806assert(ProblemSize > 0);807808bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact;809MissPenalty = (ProblemSize / 2) + 1;810811LLVM_DEBUG(DAG->dump());812if (EnableExactSolver || BelowCutoff) {813LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n");814solveGreedy();815reset();816LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n");817if (BestCost > 0) {818LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n");819solveExact();820LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n");821}822} else { // Use the Greedy Algorithm by default823LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n");824solveGreedy();825}826827makePipeline();828LLVM_DEBUG(dbgs() << "After applying mutation\n");829LLVM_DEBUG(DAG->dump());830}831832enum IGLPStrategyID : int {833MFMASmallGemmOptID = 0,834MFMASmallGemmSingleWaveOptID = 1,835MFMAExpInterleave = 2836};837838// Implement a IGLP scheduling strategy.839class IGLPStrategy {840protected:841ScheduleDAGInstrs *DAG;842843const SIInstrInfo *TII;844845public:846/// Add SchedGroups to \p SyncedSchedGroups to implement this Strategy.847virtual bool applyIGLPStrategy(848DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,849DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,850AMDGPU::SchedulingPhase Phase) = 0;851852// Returns true if this strategy should be applied to a ScheduleDAG.853virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,854AMDGPU::SchedulingPhase Phase) = 0;855856bool IsBottomUp = true;857858IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)859: DAG(DAG), TII(TII) {}860861virtual ~IGLPStrategy() = default;862};863864class MFMASmallGemmOpt final : public IGLPStrategy {865private:866public:867bool applyIGLPStrategy(868DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,869DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,870AMDGPU::SchedulingPhase Phase) override;871872bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,873AMDGPU::SchedulingPhase Phase) override {874return true;875}876877MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)878: IGLPStrategy(DAG, TII) {879IsBottomUp = true;880}881};882883bool MFMASmallGemmOpt::applyIGLPStrategy(884DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,885DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,886AMDGPU::SchedulingPhase Phase) {887// Count the number of MFMA instructions.888unsigned MFMACount = 0;889for (const MachineInstr &I : *DAG)890if (TII->isMFMAorWMMA(I))891++MFMACount;892893const unsigned PipelineSyncID = 0;894SchedGroup *SG = nullptr;895for (unsigned I = 0; I < MFMACount * 3; ++I) {896SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(897SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII);898SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);899900SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(901SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);902SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);903}904905return true;906}907908class MFMAExpInterleaveOpt final : public IGLPStrategy {909private:910// The count of TRANS SUs involved in the interleaved pipeline911static unsigned TransPipeCount;912// The count of MFMA SUs involved in the interleaved pipeline913static unsigned MFMAPipeCount;914// The count of Add SUs involved in the interleaved pipeline915static unsigned AddPipeCount;916// The number of transitive MFMA successors for each TRANS SU917static unsigned MFMAEnablement;918// The number of transitive TRANS predecessors for each MFMA SU919static unsigned ExpRequirement;920// The count of independent "chains" of MFMA instructions in the pipeline921static unsigned MFMAChains;922// The length of each independent "chain" of MFMA instructions923static unsigned MFMAChainLength;924// Whether or not the pipeline has V_CVT instructions925static bool HasCvt;926// Whether or not there are instructions between the TRANS instruction and927// V_CVT928static bool HasChainBetweenCvt;929// The first occuring DS_READ which feeds an MFMA chain930static std::optional<unsigned> FirstPipeDSR;931// The MFMAPipe SUs with no MFMA predecessors932SmallVector<SUnit *, 4> MFMAChainSeeds;933// Compute the heuristics for the pipeline, returning whether or not the DAG934// is well formatted for the mutation935bool analyzeDAG(const SIInstrInfo *TII);936937/// Whether or not the instruction is a transitive predecessor of an MFMA938/// instruction939class IsPipeExp final : public InstructionRule {940public:941bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,942SmallVectorImpl<SchedGroup> &SyncPipe) override {943944auto DAG = SyncPipe[0].DAG;945946if (Cache->empty()) {947auto I = DAG->SUnits.rbegin();948auto E = DAG->SUnits.rend();949for (; I != E; I++) {950if (TII->isMFMAorWMMA(*I->getInstr()))951Cache->push_back(&*I);952}953if (Cache->empty())954return false;955}956957auto Reaches = (std::any_of(958Cache->begin(), Cache->end(), [&SU, &DAG](SUnit *TargetSU) {959return DAG->IsReachable(TargetSU, const_cast<SUnit *>(SU));960}));961962return Reaches;963}964IsPipeExp(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)965: InstructionRule(TII, SGID, NeedsCache) {}966};967968/// Whether or not the instruction is a transitive predecessor of the969/// \p Number th MFMA of the MFMAs occuring after a TRANS instruction970class EnablesNthMFMA final : public InstructionRule {971private:972unsigned Number = 1;973974public:975bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,976SmallVectorImpl<SchedGroup> &SyncPipe) override {977bool FoundTrans = false;978unsigned Counter = 1;979auto DAG = SyncPipe[0].DAG;980981if (Cache->empty()) {982SmallVector<SUnit *, 8> Worklist;983984auto I = DAG->SUnits.begin();985auto E = DAG->SUnits.end();986for (; I != E; I++) {987if (FoundTrans && TII->isMFMAorWMMA(*I->getInstr())) {988if (Counter == Number) {989Cache->push_back(&*I);990break;991}992++Counter;993}994if (!FoundTrans && TII->isTRANS(I->getInstr()->getOpcode()))995FoundTrans = true;996}997if (Cache->empty())998return false;999}10001001return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU));1002}10031004EnablesNthMFMA(unsigned Number, const SIInstrInfo *TII, unsigned SGID,1005bool NeedsCache = false)1006: InstructionRule(TII, SGID, NeedsCache), Number(Number) {}1007};10081009/// Whether or not the instruction enables the exact MFMA that is the \p1010/// Number th MFMA in the chain starting with \p ChainSeed1011class EnablesNthMFMAInChain final : public InstructionRule {1012private:1013unsigned Number = 1;1014SUnit *ChainSeed;10151016public:1017bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,1018SmallVectorImpl<SchedGroup> &SyncPipe) override {1019auto DAG = SyncPipe[0].DAG;10201021if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr()))1022return false;10231024if (Cache->empty()) {1025auto TempSU = ChainSeed;1026auto Depth = Number;1027while (Depth > 0) {1028--Depth;1029bool Found = false;1030for (auto &Succ : TempSU->Succs) {1031if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) {1032TempSU = Succ.getSUnit();1033Found = true;1034break;1035}1036}1037if (!Found)1038return false;1039}10401041Cache->push_back(TempSU);1042}1043// If we failed to find the instruction to be placed into the cache, we1044// would have already exited.1045assert(!Cache->empty());10461047return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU));1048}10491050EnablesNthMFMAInChain(unsigned Number, SUnit *ChainSeed,1051const SIInstrInfo *TII, unsigned SGID,1052bool NeedsCache = false)1053: InstructionRule(TII, SGID, NeedsCache), Number(Number),1054ChainSeed(ChainSeed) {}1055};10561057/// Whether or not the instruction has less than \p Size immediate successors.1058/// If \p HasIntermediary is true, this tests also whether all successors of1059/// the SUnit have less than \p Size successors.1060class LessThanNSuccs final : public InstructionRule {1061private:1062unsigned Size = 1;1063bool HasIntermediary = false;10641065public:1066bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,1067SmallVectorImpl<SchedGroup> &SyncPipe) override {1068if (!SyncPipe.size())1069return false;10701071auto SuccSize = std::count_if(1072SU->Succs.begin(), SU->Succs.end(),1073[](const SDep &Succ) { return Succ.getKind() == SDep::Data; });1074if (SuccSize >= Size)1075return false;10761077if (HasIntermediary) {1078for (auto Succ : SU->Succs) {1079auto SuccSize = std::count_if(1080Succ.getSUnit()->Succs.begin(), Succ.getSUnit()->Succs.end(),1081[](const SDep &SuccSucc) {1082return SuccSucc.getKind() == SDep::Data;1083});1084if (SuccSize >= Size)1085return false;1086}1087}10881089return true;1090}1091LessThanNSuccs(unsigned Size, const SIInstrInfo *TII, unsigned SGID,1092bool HasIntermediary = false, bool NeedsCache = false)1093: InstructionRule(TII, SGID, NeedsCache), Size(Size),1094HasIntermediary(HasIntermediary) {}1095};10961097/// Whether or not the instruction has greater than or equal to \p Size1098/// immediate successors. If \p HasIntermediary is true, this tests also1099/// whether all successors of the SUnit have greater than or equal to \p Size1100/// successors.1101class GreaterThanOrEqualToNSuccs final : public InstructionRule {1102private:1103unsigned Size = 1;1104bool HasIntermediary = false;11051106public:1107bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,1108SmallVectorImpl<SchedGroup> &SyncPipe) override {1109if (!SyncPipe.size())1110return false;11111112auto SuccSize = std::count_if(1113SU->Succs.begin(), SU->Succs.end(),1114[](const SDep &Succ) { return Succ.getKind() == SDep::Data; });1115if (SuccSize >= Size)1116return true;11171118if (HasIntermediary) {1119for (auto Succ : SU->Succs) {1120auto SuccSize = std::count_if(1121Succ.getSUnit()->Succs.begin(), Succ.getSUnit()->Succs.end(),1122[](const SDep &SuccSucc) {1123return SuccSucc.getKind() == SDep::Data;1124});1125if (SuccSize >= Size)1126return true;1127}1128}11291130return false;1131}1132GreaterThanOrEqualToNSuccs(unsigned Size, const SIInstrInfo *TII,1133unsigned SGID, bool HasIntermediary = false,1134bool NeedsCache = false)1135: InstructionRule(TII, SGID, NeedsCache), Size(Size),1136HasIntermediary(HasIntermediary) {}1137};11381139// Whether or not the instruction is a relevant V_CVT instruction.1140class IsCvt final : public InstructionRule {1141public:1142bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,1143SmallVectorImpl<SchedGroup> &SyncPipe) override {1144auto Opc = SU->getInstr()->getOpcode();1145return Opc == AMDGPU::V_CVT_F16_F32_e32 ||1146Opc == AMDGPU::V_CVT_I32_F32_e32;1147}1148IsCvt(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)1149: InstructionRule(TII, SGID, NeedsCache) {}1150};11511152// Whether or not the instruction is FMA_F32.1153class IsFMA final : public InstructionRule {1154public:1155bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,1156SmallVectorImpl<SchedGroup> &SyncPipe) override {1157return SU->getInstr()->getOpcode() == AMDGPU::V_FMA_F32_e64 ||1158SU->getInstr()->getOpcode() == AMDGPU::V_PK_FMA_F32;1159}1160IsFMA(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)1161: InstructionRule(TII, SGID, NeedsCache) {}1162};11631164// Whether or not the instruction is a V_ADD_F32 instruction.1165class IsPipeAdd final : public InstructionRule {1166public:1167bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,1168SmallVectorImpl<SchedGroup> &SyncPipe) override {1169return SU->getInstr()->getOpcode() == AMDGPU::V_ADD_F32_e32;1170}1171IsPipeAdd(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)1172: InstructionRule(TII, SGID, NeedsCache) {}1173};11741175/// Whether or not the instruction is an immediate RAW successor1176/// of the SchedGroup \p Distance steps before.1177class IsSuccOfPrevNthGroup final : public InstructionRule {1178private:1179unsigned Distance = 1;11801181public:1182bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,1183SmallVectorImpl<SchedGroup> &SyncPipe) override {1184SchedGroup *OtherGroup = nullptr;1185if (!SyncPipe.size())1186return false;11871188for (auto &PipeSG : SyncPipe) {1189if ((unsigned)PipeSG.getSGID() == SGID - Distance)1190OtherGroup = &PipeSG;1191}11921193if (!OtherGroup)1194return false;1195if (!OtherGroup->Collection.size())1196return true;11971198for (auto &OtherEle : OtherGroup->Collection) {1199for (auto &Succ : OtherEle->Succs) {1200if (Succ.getSUnit() == SU && Succ.getKind() == SDep::Data)1201return true;1202}1203}12041205return false;1206}1207IsSuccOfPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,1208unsigned SGID, bool NeedsCache = false)1209: InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}1210};12111212/// Whether or not the instruction is a transitive successor of any1213/// instruction the the SchedGroup \p Distance steps before.1214class IsReachableFromPrevNthGroup final : public InstructionRule {1215private:1216unsigned Distance = 1;12171218public:1219bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,1220SmallVectorImpl<SchedGroup> &SyncPipe) override {1221SchedGroup *OtherGroup = nullptr;1222if (!SyncPipe.size())1223return false;12241225for (auto &PipeSG : SyncPipe) {1226if ((unsigned)PipeSG.getSGID() == SGID - Distance)1227OtherGroup = &PipeSG;1228}12291230if (!OtherGroup)1231return false;1232if (!OtherGroup->Collection.size())1233return true;12341235auto DAG = SyncPipe[0].DAG;12361237for (auto &OtherEle : OtherGroup->Collection)1238if (DAG->IsReachable(const_cast<SUnit *>(SU), OtherEle))1239return true;12401241return false;1242}1243IsReachableFromPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,1244unsigned SGID, bool NeedsCache = false)1245: InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}1246};12471248/// Whether or not the instruction occurs after the SU with NodeNUm \p Number1249class OccursAtOrAfterNode final : public InstructionRule {1250private:1251unsigned Number = 1;12521253public:1254bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,1255SmallVectorImpl<SchedGroup> &SyncPipe) override {12561257return SU->NodeNum >= Number;1258}1259OccursAtOrAfterNode(unsigned Number, const SIInstrInfo *TII, unsigned SGID,1260bool NeedsCache = false)1261: InstructionRule(TII, SGID, NeedsCache), Number(Number) {}1262};12631264/// Whether or not the SU is exactly the \p Number th MFMA in the chain1265/// starting with \p ChainSeed1266class IsExactMFMA final : public InstructionRule {1267private:1268unsigned Number = 1;1269SUnit *ChainSeed;12701271public:1272bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,1273SmallVectorImpl<SchedGroup> &SyncPipe) override {1274if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr()))1275return false;12761277if (Cache->empty()) {1278auto TempSU = ChainSeed;1279auto Depth = Number;1280while (Depth > 0) {1281--Depth;1282bool Found = false;1283for (auto &Succ : TempSU->Succs) {1284if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) {1285TempSU = Succ.getSUnit();1286Found = true;1287break;1288}1289}1290if (!Found) {1291return false;1292}1293}1294Cache->push_back(TempSU);1295}1296// If we failed to find the instruction to be placed into the cache, we1297// would have already exited.1298assert(!Cache->empty());12991300return (*Cache)[0] == SU;1301}13021303IsExactMFMA(unsigned Number, SUnit *ChainSeed, const SIInstrInfo *TII,1304unsigned SGID, bool NeedsCache = false)1305: InstructionRule(TII, SGID, NeedsCache), Number(Number),1306ChainSeed(ChainSeed) {}1307};13081309// Whether the instruction occurs after the first TRANS instruction. This1310// implies the instruction can not be a predecessor of the first TRANS1311// insruction1312class OccursAfterExp final : public InstructionRule {1313public:1314bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,1315SmallVectorImpl<SchedGroup> &SyncPipe) override {13161317SmallVector<SUnit *, 12> Worklist;1318auto DAG = SyncPipe[0].DAG;1319if (Cache->empty()) {1320for (auto &SU : DAG->SUnits)1321if (TII->isTRANS(SU.getInstr()->getOpcode())) {1322Cache->push_back(&SU);1323break;1324}1325if (Cache->empty())1326return false;1327}13281329return SU->NodeNum > (*Cache)[0]->NodeNum;1330}13311332OccursAfterExp(const SIInstrInfo *TII, unsigned SGID,1333bool NeedsCache = false)1334: InstructionRule(TII, SGID, NeedsCache) {}1335};13361337public:1338bool applyIGLPStrategy(1339DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,1340DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,1341AMDGPU::SchedulingPhase Phase) override;13421343bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,1344AMDGPU::SchedulingPhase Phase) override;13451346MFMAExpInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)1347: IGLPStrategy(DAG, TII) {1348IsBottomUp = false;1349}1350};13511352unsigned MFMAExpInterleaveOpt::TransPipeCount = 0;1353unsigned MFMAExpInterleaveOpt::MFMAPipeCount = 0;1354unsigned MFMAExpInterleaveOpt::AddPipeCount = 0;1355unsigned MFMAExpInterleaveOpt::MFMAEnablement = 0;1356unsigned MFMAExpInterleaveOpt::ExpRequirement = 0;1357unsigned MFMAExpInterleaveOpt::MFMAChains = 0;1358unsigned MFMAExpInterleaveOpt::MFMAChainLength = 0;1359bool MFMAExpInterleaveOpt::HasCvt = false;1360bool MFMAExpInterleaveOpt::HasChainBetweenCvt = false;1361std::optional<unsigned> MFMAExpInterleaveOpt::FirstPipeDSR = std::nullopt;13621363bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {1364SmallVector<SUnit *, 10> ExpPipeCands;1365SmallVector<SUnit *, 10> MFMAPipeCands;1366SmallVector<SUnit *, 10> MFMAPipeSUs;1367SmallVector<SUnit *, 10> PackSUs;1368SmallVector<SUnit *, 10> CvtSUs;13691370auto isBitPack = [](unsigned Opc) {1371return Opc == AMDGPU::V_PACK_B32_F16_e64 || Opc == AMDGPU::V_PERM_B32_e64;1372};13731374auto isCvt = [](unsigned Opc) {1375return Opc == AMDGPU::V_CVT_F16_F32_e32 || Opc == AMDGPU::V_CVT_I32_F32_e32;1376};13771378auto isAdd = [](unsigned Opc) { return Opc == AMDGPU::V_ADD_F32_e32; };13791380AddPipeCount = 0;1381for (SUnit &SU : DAG->SUnits) {1382auto Opc = SU.getInstr()->getOpcode();1383if (TII->isTRANS(Opc)) {1384// Avoid counting a potential bonus V_EXP which all the MFMA depend on1385if (SU.Succs.size() >= 7)1386continue;1387for (auto &Succ : SU.Succs) {1388if (Succ.getSUnit()->Succs.size() >= 7)1389continue;1390}1391ExpPipeCands.push_back(&SU);1392}13931394if (TII->isMFMAorWMMA(*SU.getInstr()))1395MFMAPipeCands.push_back(&SU);13961397if (isBitPack(Opc))1398PackSUs.push_back(&SU);13991400if (isCvt(Opc))1401CvtSUs.push_back(&SU);14021403if (isAdd(Opc))1404++AddPipeCount;1405}14061407if (!(PackSUs.size() && MFMAPipeCands.size() && ExpPipeCands.size()))1408return false;14091410TransPipeCount = 0;14111412std::optional<SUnit *> TempMFMA;1413std::optional<SUnit *> TempExp;1414// Count the number of EXPs that reach an MFMA1415for (auto &PredSU : ExpPipeCands) {1416for (auto &SuccSU : MFMAPipeCands) {1417if (DAG->IsReachable(SuccSU, PredSU)) {1418if (!TempExp) {1419TempExp = PredSU;1420TempMFMA = SuccSU;1421}1422MFMAPipeSUs.push_back(SuccSU);1423++TransPipeCount;1424break;1425}1426}1427}14281429if (!(TempExp && TempMFMA))1430return false;14311432HasChainBetweenCvt =1433std::find_if((*TempExp)->Succs.begin(), (*TempExp)->Succs.end(),1434[&isCvt](SDep &Succ) {1435return isCvt(Succ.getSUnit()->getInstr()->getOpcode());1436}) == (*TempExp)->Succs.end();14371438// Count the number of MFMAs that are reached by an EXP1439for (auto &SuccSU : MFMAPipeCands) {1440if (MFMAPipeSUs.size() &&1441std::find_if(MFMAPipeSUs.begin(), MFMAPipeSUs.end(),1442[&SuccSU](SUnit *PotentialMatch) {1443return PotentialMatch->NodeNum == SuccSU->NodeNum;1444}) != MFMAPipeSUs.end())1445continue;14461447for (auto &PredSU : ExpPipeCands) {1448if (DAG->IsReachable(SuccSU, PredSU)) {1449MFMAPipeSUs.push_back(SuccSU);1450break;1451}1452}1453}14541455MFMAPipeCount = MFMAPipeSUs.size();14561457assert(TempExp && TempMFMA);1458assert(MFMAPipeCount > 0);14591460std::optional<SUnit *> TempCvt;1461for (auto &SuccSU : CvtSUs) {1462if (DAG->IsReachable(SuccSU, *TempExp)) {1463TempCvt = SuccSU;1464break;1465}1466}14671468HasCvt = false;1469if (TempCvt.has_value()) {1470for (auto &SuccSU : MFMAPipeSUs) {1471if (DAG->IsReachable(SuccSU, *TempCvt)) {1472HasCvt = true;1473break;1474}1475}1476}14771478MFMAChains = 0;1479for (auto &MFMAPipeSU : MFMAPipeSUs) {1480if (is_contained(MFMAChainSeeds, MFMAPipeSU))1481continue;1482if (!std::any_of(MFMAPipeSU->Preds.begin(), MFMAPipeSU->Preds.end(),1483[&TII](SDep &Succ) {1484return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());1485})) {1486MFMAChainSeeds.push_back(MFMAPipeSU);1487++MFMAChains;1488}1489}14901491if (!MFMAChains)1492return false;14931494for (auto Pred : MFMAChainSeeds[0]->Preds) {1495if (TII->isDS(Pred.getSUnit()->getInstr()->getOpcode()) &&1496Pred.getSUnit()->getInstr()->mayLoad())1497FirstPipeDSR = Pred.getSUnit()->NodeNum;1498}14991500MFMAChainLength = MFMAPipeCount / MFMAChains;15011502// The number of bit pack operations that depend on a single V_EXP1503unsigned PackSuccCount = std::count_if(1504PackSUs.begin(), PackSUs.end(), [this, &TempExp](SUnit *VPack) {1505return DAG->IsReachable(VPack, *TempExp);1506});15071508// The number of bit pack operations an MFMA depends on1509unsigned PackPredCount =1510std::count_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(),1511[&isBitPack](SDep &Pred) {1512auto Opc = Pred.getSUnit()->getInstr()->getOpcode();1513return isBitPack(Opc);1514});15151516auto PackPred =1517std::find_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(),1518[&isBitPack](SDep &Pred) {1519auto Opc = Pred.getSUnit()->getInstr()->getOpcode();1520return isBitPack(Opc);1521});15221523if (PackPred == (*TempMFMA)->Preds.end())1524return false;15251526MFMAEnablement = 0;1527ExpRequirement = 0;1528// How many MFMAs depend on a single bit pack operation1529MFMAEnablement =1530std::count_if(PackPred->getSUnit()->Succs.begin(),1531PackPred->getSUnit()->Succs.end(), [&TII](SDep &Succ) {1532return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());1533});15341535// The number of MFMAs that depend on a single V_EXP1536MFMAEnablement *= PackSuccCount;15371538// The number of V_EXPs required to resolve all dependencies for an MFMA1539ExpRequirement =1540std::count_if(ExpPipeCands.begin(), ExpPipeCands.end(),1541[this, &PackPred](SUnit *ExpBase) {1542return DAG->IsReachable(PackPred->getSUnit(), ExpBase);1543});15441545ExpRequirement *= PackPredCount;1546return true;1547}15481549bool MFMAExpInterleaveOpt::shouldApplyStrategy(ScheduleDAGInstrs *DAG,1550AMDGPU::SchedulingPhase Phase) {1551const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();1552const SIInstrInfo *TII = ST.getInstrInfo();15531554if (Phase != AMDGPU::SchedulingPhase::PostRA)1555MFMAChainSeeds.clear();1556if (Phase != AMDGPU::SchedulingPhase::PostRA && !analyzeDAG(TII))1557return false;15581559return true;1560}15611562bool MFMAExpInterleaveOpt::applyIGLPStrategy(1563DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,1564DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,1565AMDGPU::SchedulingPhase Phase) {15661567bool IsSmallKernelType =1568MFMAEnablement == 2 && ExpRequirement == 4 && TransPipeCount == 32;1569bool IsLargeKernelType =1570MFMAEnablement == 4 && ExpRequirement == 4 && TransPipeCount == 64;15711572if (!(IsSmallKernelType || IsLargeKernelType))1573return false;15741575const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();1576const SIInstrInfo *TII = ST.getInstrInfo();15771578unsigned PipelineSyncID = 0;1579SchedGroup *SG = nullptr;15801581unsigned MFMAChain = 0;1582unsigned PositionInChain = 0;1583unsigned CurrMFMAForTransPosition = 0;15841585auto incrementTransPosition = [&MFMAChain, &PositionInChain,1586&CurrMFMAForTransPosition]() {1587CurrMFMAForTransPosition += MFMAEnablement;1588PositionInChain = (CurrMFMAForTransPosition / MFMAChains);1589MFMAChain = CurrMFMAForTransPosition % MFMAChains;1590};15911592auto getNextTransPositionInChain = [&CurrMFMAForTransPosition]() {1593auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;1594return (TempMFMAForTrans / MFMAChains);1595};15961597auto getNextTransMFMAChain = [&CurrMFMAForTransPosition]() {1598auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;1599return TempMFMAForTrans % MFMAChains;1600};16011602unsigned CurrMFMAPosition = 0;1603unsigned MFMAChainForMFMA = 0;1604unsigned PositionInChainForMFMA = 0;16051606auto incrementMFMAPosition = [&CurrMFMAPosition, &MFMAChainForMFMA,1607&PositionInChainForMFMA]() {1608++CurrMFMAPosition;1609MFMAChainForMFMA = CurrMFMAPosition % MFMAChains;1610PositionInChainForMFMA = CurrMFMAPosition / MFMAChains;1611};16121613bool IsPostRA = Phase == AMDGPU::SchedulingPhase::PostRA;1614assert(IsPostRA || MFMAChainSeeds.size() == MFMAChains);16151616bool UsesFMA = IsSmallKernelType || !IsPostRA;1617bool UsesDSRead = IsLargeKernelType && !IsPostRA && FirstPipeDSR;1618bool UsesCvt = HasCvt && (IsSmallKernelType || !IsPostRA);1619bool UsesVALU = IsSmallKernelType;16201621// PHASE 1: "Prefetch"1622if (UsesFMA) {1623// First Round FMA1624SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(1625SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII);1626if (!IsPostRA && MFMAChains) {1627SG->addRule(std::make_shared<EnablesNthMFMAInChain>(1628PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),1629true));1630} else1631SG->addRule(1632std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true));1633SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));1634SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);16351636// Second Round FMA1637SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(1638SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII);1639if (!IsPostRA && MFMAChains) {1640SG->addRule(std::make_shared<EnablesNthMFMAInChain>(1641getNextTransPositionInChain(),1642MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true));1643} else1644SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1, TII,1645SG->getSGID(), true));1646SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));1647SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);1648}16491650if (UsesDSRead) {1651SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(1652SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII);1653SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR, TII,1654SG->getSGID()));1655SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);1656}16571658// First Round EXP1659SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(1660SchedGroupMask::TRANS, ExpRequirement, PipelineSyncID, DAG, TII);1661if (!IsPostRA && MFMAChains)1662SG->addRule(std::make_shared<EnablesNthMFMAInChain>(1663PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(), true));1664else1665SG->addRule(std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true));1666SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));1667SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),1668HasChainBetweenCvt));1669SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);16701671incrementTransPosition();16721673// First Round CVT, Third Round FMA, Second Round EXP; interleaved1674for (unsigned I = 0; I < ExpRequirement; I++) {1675// First Round CVT1676if (UsesCvt) {1677SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(1678SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);1679SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID()));1680if (HasChainBetweenCvt)1681SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(16821 + (2 + UsesFMA) * I, TII, SG->getSGID()));1683else1684SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(16851 + (2 + UsesFMA) * I, TII, SG->getSGID()));1686SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);1687}16881689// Third Round FMA1690if (UsesFMA) {1691SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(1692SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);1693if (!IsPostRA && MFMAChains) {1694SG->addRule(std::make_shared<EnablesNthMFMAInChain>(1695getNextTransPositionInChain(),1696MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true));1697} else1698SG->addRule(std::make_shared<EnablesNthMFMA>(2 * MFMAEnablement + 1,1699TII, SG->getSGID(), true));1700SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));1701SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);1702}17031704// Second Round EXP1705SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(1706SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);1707if (!IsPostRA && MFMAChains)1708SG->addRule(std::make_shared<EnablesNthMFMAInChain>(1709PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),1710true));1711else1712SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1, TII,1713SG->getSGID(), true));1714SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));1715SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),1716HasChainBetweenCvt));1717SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);1718}17191720// The "extra" EXP which enables all MFMA1721// TODO: UsesExtraExp1722SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(1723SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);1724SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));1725SG->addRule(std::make_shared<GreaterThanOrEqualToNSuccs>(17268, TII, SG->getSGID(), HasChainBetweenCvt));1727SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);17281729// PHASE 2: Main Interleave Loop17301731// The number of MFMAs per iteration1732unsigned MFMARatio =1733MFMAEnablement > ExpRequirement ? MFMAEnablement / ExpRequirement : 1;1734// The number of Exps per iteration1735unsigned ExpRatio =1736MFMAEnablement > ExpRequirement ? 1 : ExpRequirement / MFMAEnablement;1737// The reamaining Exps1738unsigned RemainingExp = TransPipeCount > (2 * ExpRequirement)1739? TransPipeCount - (2 * ExpRequirement)1740: 0;1741unsigned ExpLoopCount = RemainingExp / ExpRatio;1742// In loop MFMAs1743unsigned MFMAInLoop = MFMAPipeCount > (MFMAEnablement * 2)1744? MFMAPipeCount - (MFMAEnablement * 2)1745: 0;1746unsigned MFMALoopCount = MFMAInLoop / MFMARatio;1747unsigned VALUOps =1748AddPipeCount < MFMAPipeCount ? 1 : AddPipeCount / MFMAPipeCount;1749unsigned LoopSize = std::min(ExpLoopCount, MFMALoopCount);17501751for (unsigned I = 0; I < LoopSize; I++) {1752if (!(I * ExpRatio % ExpRequirement))1753incrementTransPosition();17541755// Round N MFMA1756SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(1757SchedGroupMask::MFMA, MFMARatio, PipelineSyncID, DAG, TII);1758if (!IsPostRA && MFMAChains)1759SG->addRule(std::make_shared<IsExactMFMA>(1760PositionInChainForMFMA, MFMAChainSeeds[MFMAChainForMFMA], TII,1761SG->getSGID(), true));1762else1763SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true));1764SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);1765incrementMFMAPosition();17661767if (UsesVALU) {1768SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(1769SchedGroupMask::VALU, VALUOps, PipelineSyncID, DAG, TII);1770SG->addRule(std::make_shared<IsPipeAdd>(TII, SG->getSGID()));1771SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);1772}17731774if (UsesDSRead && !(I % 4)) {1775SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(1776SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII);1777SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR, TII,1778SG->getSGID()));1779SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);1780}17811782// CVT, EXP, FMA Interleaving1783for (unsigned J = 0; J < ExpRatio; J++) {1784auto MFMAOffset = (1 + UsesVALU) * MFMARatio * (I + 1);1785auto MaxMFMAOffset =1786(1 + UsesVALU) * ExpRequirement * MFMARatio / ExpRatio;17871788// Round N + 1 CVT1789if (UsesCvt) {1790SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(1791SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);1792SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID()));1793auto BaseDiff = (2 + UsesFMA) * (ExpRequirement - 1) + 1;1794auto DSROffset = I / 4 + 1;1795auto MaxDSROffset = MaxMFMAOffset / 4;1796// TODO: UsesExtraExp1797auto ExpOffset = I * ExpRatio + J >= ExpRequirement ? 0 : 1;1798auto CurrentOffset = UsesDSRead * std::min(MaxDSROffset, DSROffset) +1799std::min(MaxMFMAOffset, MFMAOffset) + BaseDiff +1800ExpOffset;1801if (HasChainBetweenCvt)1802SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(1803CurrentOffset, TII, SG->getSGID()));1804else1805SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(CurrentOffset, TII,1806SG->getSGID()));1807SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);1808}18091810// Round N + 3 FMA1811if (UsesFMA) {1812SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(1813SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);1814if (!IsPostRA && MFMAChains)1815SG->addRule(std::make_shared<EnablesNthMFMAInChain>(1816getNextTransPositionInChain(),1817MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(),1818true));1819else1820SG->addRule(std::make_shared<EnablesNthMFMA>(1821(((I * ExpRatio + J) / ExpRequirement) + 3) * MFMAEnablement + 1,1822TII, SG->getSGID(), true));1823SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));1824SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);1825}18261827// Round N + 2 Exp1828SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(1829SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);1830if (!IsPostRA && MFMAChains)1831SG->addRule(std::make_shared<EnablesNthMFMAInChain>(1832PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),1833true));1834else1835SG->addRule(std::make_shared<EnablesNthMFMA>(1836(((I * ExpRatio + J) / ExpRequirement) + 2) * MFMAEnablement + 1,1837TII, SG->getSGID(), true));1838SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));1839SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),1840HasChainBetweenCvt));1841SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);1842}1843}18441845// PHASE 3: Remaining MFMAs1846SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(1847SchedGroupMask::MFMA, MFMAEnablement * 2, PipelineSyncID, DAG, TII);1848SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true));1849SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);1850return true;1851}18521853class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {1854private:1855// Whether the DS_READ is a predecessor of first four MFMA in region1856class EnablesInitialMFMA final : public InstructionRule {1857public:1858bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,1859SmallVectorImpl<SchedGroup> &SyncPipe) override {1860if (!SyncPipe.size())1861return false;1862int MFMAsFound = 0;1863if (!Cache->size()) {1864for (auto &Elt : SyncPipe[0].DAG->SUnits) {1865if (TII->isMFMAorWMMA(*Elt.getInstr())) {1866++MFMAsFound;1867if (MFMAsFound > 4)1868break;1869Cache->push_back(&Elt);1870}1871}1872}18731874assert(Cache->size());1875auto DAG = SyncPipe[0].DAG;1876for (auto &Elt : *Cache) {1877if (DAG->IsReachable(Elt, const_cast<SUnit *>(SU)))1878return true;1879}1880return false;1881}18821883EnablesInitialMFMA(const SIInstrInfo *TII, unsigned SGID,1884bool NeedsCache = false)1885: InstructionRule(TII, SGID, NeedsCache) {}1886};18871888// Whether the MI is a V_PERM and is a predecessor of a common DS_WRITE1889class IsPermForDSW final : public InstructionRule {1890public:1891bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,1892SmallVectorImpl<SchedGroup> &SyncPipe) override {1893auto MI = SU->getInstr();1894if (MI->getOpcode() != AMDGPU::V_PERM_B32_e64)1895return false;18961897bool FitsInGroup = false;1898// Does the VALU have a DS_WRITE successor1899if (!Collection.size()) {1900for (auto &Succ : SU->Succs) {1901SUnit *SuccUnit = Succ.getSUnit();1902if (TII->isDS(*SuccUnit->getInstr()) &&1903SuccUnit->getInstr()->mayStore()) {1904Cache->push_back(SuccUnit);1905FitsInGroup = true;1906}1907}1908return FitsInGroup;1909}19101911assert(Cache->size());19121913// Does the VALU have a DS_WRITE successor that is the same as other1914// VALU already in the group. The V_PERMs will all share 1 DS_W succ1915return llvm::any_of(*Cache, [&SU](SUnit *Elt) {1916return llvm::any_of(SU->Succs, [&Elt](const SDep &ThisSucc) {1917return ThisSucc.getSUnit() == Elt;1918});1919});1920}19211922IsPermForDSW(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)1923: InstructionRule(TII, SGID, NeedsCache) {}1924};19251926// Whether the SU is a successor of any element in previous SchedGroup1927class IsSuccOfPrevGroup final : public InstructionRule {1928public:1929bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,1930SmallVectorImpl<SchedGroup> &SyncPipe) override {1931SchedGroup *OtherGroup = nullptr;1932for (auto &PipeSG : SyncPipe) {1933if ((unsigned)PipeSG.getSGID() == SGID - 1) {1934OtherGroup = &PipeSG;1935}1936}19371938if (!OtherGroup)1939return false;1940if (!OtherGroup->Collection.size())1941return true;19421943// Does the previous VALU have this DS_Write as a successor1944return (std::any_of(OtherGroup->Collection.begin(),1945OtherGroup->Collection.end(), [&SU](SUnit *Elt) {1946return std::any_of(Elt->Succs.begin(),1947Elt->Succs.end(),1948[&SU](SDep &Succ) {1949return Succ.getSUnit() == SU;1950});1951}));1952}1953IsSuccOfPrevGroup(const SIInstrInfo *TII, unsigned SGID,1954bool NeedsCache = false)1955: InstructionRule(TII, SGID, NeedsCache) {}1956};19571958// Whether the combined load width of group is 128 bits1959class VMEMSize final : public InstructionRule {1960public:1961bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,1962SmallVectorImpl<SchedGroup> &SyncPipe) override {1963auto MI = SU->getInstr();1964if (MI->getOpcode() == TargetOpcode::BUNDLE)1965return false;1966if (!Collection.size())1967return true;19681969int NumBits = 0;19701971auto TRI = TII->getRegisterInfo();1972auto &MRI = MI->getParent()->getParent()->getRegInfo();1973for (auto &Elt : Collection) {1974auto Op = Elt->getInstr()->getOperand(0);1975auto Size =1976TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(MRI, Op));1977NumBits += Size;1978}19791980if (NumBits < 128) {1981assert(TII->isVMEM(*MI) && MI->mayLoad());1982if (NumBits + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(1983MRI, MI->getOperand(0))) <=1984128)1985return true;1986}19871988return false;1989}19901991VMEMSize(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)1992: InstructionRule(TII, SGID, NeedsCache) {}1993};19941995/// Whether the SU shares a V_PERM predecessor with any SU in the SchedGroup1996/// that is \p Distance steps away1997class SharesPredWithPrevNthGroup final : public InstructionRule {1998private:1999unsigned Distance = 1;20002001public:2002bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,2003SmallVectorImpl<SchedGroup> &SyncPipe) override {2004SchedGroup *OtherGroup = nullptr;2005if (!SyncPipe.size())2006return false;20072008if (!Cache->size()) {20092010for (auto &PipeSG : SyncPipe) {2011if ((unsigned)PipeSG.getSGID() == SGID - Distance) {2012OtherGroup = &PipeSG;2013}2014}20152016if (!OtherGroup)2017return false;2018if (!OtherGroup->Collection.size())2019return true;20202021for (auto &OtherEle : OtherGroup->Collection) {2022for (auto &Pred : OtherEle->Preds) {2023if (Pred.getSUnit()->getInstr()->getOpcode() ==2024AMDGPU::V_PERM_B32_e64)2025Cache->push_back(Pred.getSUnit());2026}2027}20282029// If the other group has no PERM preds, then this group won't share any2030if (!Cache->size())2031return false;2032}20332034auto DAG = SyncPipe[0].DAG;2035// Does the previous DS_WRITE share a V_PERM predecessor with this2036// VMEM_READ2037return llvm::any_of(*Cache, [&SU, &DAG](SUnit *Elt) {2038return DAG->IsReachable(const_cast<SUnit *>(SU), Elt);2039});2040}2041SharesPredWithPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,2042unsigned SGID, bool NeedsCache = false)2043: InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}2044};20452046public:2047bool applyIGLPStrategy(2048DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,2049DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,2050AMDGPU::SchedulingPhase Phase) override;20512052bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,2053AMDGPU::SchedulingPhase Phase) override {2054return true;2055}20562057MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)2058: IGLPStrategy(DAG, TII) {2059IsBottomUp = false;2060}2061};20622063static unsigned DSWCount = 0;2064static unsigned DSWWithPermCount = 0;2065static unsigned DSWWithSharedVMEMCount = 0;20662067bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(2068DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,2069DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,2070AMDGPU::SchedulingPhase Phase) {2071unsigned MFMACount = 0;2072unsigned DSRCount = 0;20732074bool IsInitial = Phase == AMDGPU::SchedulingPhase::Initial;20752076assert((!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 &&2077DSWWithSharedVMEMCount == 0)) &&2078"DSWCounters should be zero in pre-RA scheduling!");2079SmallVector<SUnit *, 6> DSWithPerms;2080for (auto &SU : DAG->SUnits) {2081auto I = SU.getInstr();2082if (TII->isMFMAorWMMA(*I))2083++MFMACount;2084else if (TII->isDS(*I)) {2085if (I->mayLoad())2086++DSRCount;2087else if (I->mayStore() && IsInitial) {2088++DSWCount;2089for (auto Pred : SU.Preds) {2090if (Pred.getSUnit()->getInstr()->getOpcode() ==2091AMDGPU::V_PERM_B32_e64) {2092DSWithPerms.push_back(&SU);2093break;2094}2095}2096}2097}2098}20992100if (IsInitial) {2101DSWWithPermCount = DSWithPerms.size();2102auto I = DSWithPerms.begin();2103auto E = DSWithPerms.end();21042105// Get the count of DS_WRITES with V_PERM predecessors which2106// have loop carried dependencies (WAR) on the same VMEM_READs.2107// We consider partial overlap as a miss -- in other words,2108// for a given DS_W, we only consider another DS_W as matching2109// if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred2110// for every V_PERM pred of this DS_W.2111DenseMap<MachineInstr *, SUnit *> VMEMLookup;2112SmallVector<SUnit *, 6> Counted;2113for (; I != E; I++) {2114SUnit *Cand = nullptr;2115bool MissedAny = false;2116for (auto &Pred : (*I)->Preds) {2117if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)2118continue;21192120if (Cand && llvm::is_contained(Counted, Cand))2121break;21222123for (auto &Succ : Pred.getSUnit()->Succs) {2124auto MI = Succ.getSUnit()->getInstr();2125if (!TII->isVMEM(*MI) || !MI->mayLoad())2126continue;21272128if (MissedAny || !VMEMLookup.size()) {2129MissedAny = true;2130VMEMLookup[MI] = *I;2131continue;2132}21332134if (!VMEMLookup.contains(MI)) {2135MissedAny = true;2136VMEMLookup[MI] = *I;2137continue;2138}21392140Cand = VMEMLookup[MI];2141if (llvm::is_contained(Counted, Cand)) {2142MissedAny = true;2143break;2144}2145}2146}2147if (!MissedAny && Cand) {2148DSWWithSharedVMEMCount += 2;2149Counted.push_back(Cand);2150Counted.push_back(*I);2151}2152}2153}21542155assert(DSWWithSharedVMEMCount <= DSWWithPermCount);2156SchedGroup *SG;2157unsigned PipelineSyncID = 0;2158// For kernels with V_PERM, there are enough VALU to mix in between MFMAs2159if (DSWWithPermCount) {2160for (unsigned I = 0; I < MFMACount; I++) {2161SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(2162SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);2163SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);21642165SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(2166SchedGroupMask::VALU, 2, PipelineSyncID, DAG, TII);2167SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);2168}2169}21702171PipelineSyncID = 1;2172// Phase 1: Break up DS_READ and MFMA clusters.2173// First DS_READ to make ready initial MFMA, then interleave MFMA with DS_READ2174// prefetch21752176// Make ready initial MFMA2177SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(2178SchedGroupMask::DS_READ, 4, PipelineSyncID, DAG, TII);2179SG->addRule(std::make_shared<EnablesInitialMFMA>(TII, SG->getSGID(), true));2180SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);21812182SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(2183SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);2184SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);21852186// Interleave MFMA with DS_READ prefetch2187for (unsigned I = 0; I < DSRCount - 4; ++I) {2188SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(2189SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII);2190SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);21912192SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(2193SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);2194SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);2195}21962197// Phase 2a: Loop carried dependency with V_PERM2198// Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they2199// depend on. Interleave MFMA to keep XDL unit busy throughout.2200for (unsigned I = 0; I < DSWWithPermCount - DSWWithSharedVMEMCount; ++I) {2201SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(2202SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);2203SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));2204SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);22052206SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(2207SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);2208SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));2209SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);22102211SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(2212SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);2213SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(22141, TII, SG->getSGID(), true));2215SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));2216SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);22172218SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(2219SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);2220SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);22212222SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(2223SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);2224SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(22253, TII, SG->getSGID(), true));2226SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));2227SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);22282229SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(2230SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);2231SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);2232}22332234// Phase 2b: Loop carried dependency without V_PERM2235// Schedule DS_WRITE as closely as possible to the VMEM_READ they depend on.2236// Interleave MFMA to keep XDL unit busy throughout.2237for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) {2238SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(2239SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);2240SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);22412242SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(2243SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);2244SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));2245SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);22462247SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(2248SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);2249SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);2250}22512252// Phase 2c: Loop carried dependency with V_PERM, VMEM_READs are2253// ultimately used by two DS_WRITE2254// Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they2255// depend on. Interleave MFMA to keep XDL unit busy throughout.22562257for (unsigned I = 0; I < DSWWithSharedVMEMCount; ++I) {2258SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(2259SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);2260SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));2261SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);22622263SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(2264SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);2265SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));2266SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);22672268SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(2269SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);2270SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);22712272SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(2273SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);2274SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));2275SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);22762277SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(2278SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);2279SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));2280SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);22812282SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(2283SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);2284SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);22852286SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(2287SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);2288SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(22892, TII, SG->getSGID(), true));2290SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));2291SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);22922293SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(2294SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);2295SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);22962297SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(2298SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);2299SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(23004, TII, SG->getSGID(), true));2301SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));2302SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);23032304SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(2305SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);2306SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);2307}23082309return true;2310}23112312static std::unique_ptr<IGLPStrategy>2313createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,2314const SIInstrInfo *TII) {2315switch (ID) {2316case MFMASmallGemmOptID:2317return std::make_unique<MFMASmallGemmOpt>(DAG, TII);2318case MFMASmallGemmSingleWaveOptID:2319return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII);2320case MFMAExpInterleave:2321return std::make_unique<MFMAExpInterleaveOpt>(DAG, TII);2322}23232324llvm_unreachable("Unknown IGLPStrategyID");2325}23262327class IGroupLPDAGMutation : public ScheduleDAGMutation {2328private:2329const SIInstrInfo *TII;23302331ScheduleDAGMI *DAG;23322333// Organize lists of SchedGroups by their SyncID. SchedGroups /2334// SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added2335// between then.2336DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;23372338// Used to track instructions that can be mapped to multiple sched groups2339DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;23402341// Add DAG edges that enforce SCHED_BARRIER ordering.2342void addSchedBarrierEdges(SUnit &SU);23432344// Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should2345// not be reordered accross the SCHED_BARRIER. This is used for the base2346// SCHED_BARRIER, and not SCHED_GROUP_BARRIER. The difference is that2347// SCHED_BARRIER will always block all instructions that can be classified2348// into a particular SchedClass, whereas SCHED_GROUP_BARRIER has a fixed size2349// and may only synchronize with some SchedGroups. Returns the inverse of2350// Mask. SCHED_BARRIER's mask describes which instruction types should be2351// allowed to be scheduled across it. Invert the mask to get the2352// SchedGroupMask of instructions that should be barred.2353SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const;23542355// Create SchedGroups for a SCHED_GROUP_BARRIER.2356void initSchedGroupBarrierPipelineStage(2357std::vector<SUnit>::reverse_iterator RIter);23582359bool initIGLPOpt(SUnit &SU);23602361public:2362void apply(ScheduleDAGInstrs *DAGInstrs) override;23632364// The order in which the PipelineSolver should process the candidate2365// SchedGroup for a PipelineInstr. BOTTOM_UP will try to add SUs to the last2366// created SchedGroup first, and will consider that as the ultimate2367// predecessor group when linking. TOP_DOWN instead links and processes the2368// first created SchedGroup first.2369bool IsBottomUp = true;23702371// The scheduling phase this application of IGLP corresponds with.2372AMDGPU::SchedulingPhase Phase = AMDGPU::SchedulingPhase::Initial;23732374IGroupLPDAGMutation() = default;2375IGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) : Phase(Phase) {}2376};23772378unsigned SchedGroup::NumSchedGroups = 0;23792380bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) {2381if (A != B && DAG->canAddEdge(B, A)) {2382DAG->addEdge(B, SDep(A, SDep::Artificial));2383return true;2384}2385return false;2386}23872388bool SchedGroup::canAddMI(const MachineInstr &MI) const {2389bool Result = false;2390if (MI.isMetaInstruction())2391Result = false;23922393else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&2394(TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) ||2395TII->isTRANS(MI)))2396Result = true;23972398else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&2399TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI))2400Result = true;24012402else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&2403TII->isSALU(MI))2404Result = true;24052406else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&2407TII->isMFMAorWMMA(MI))2408Result = true;24092410else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&2411(TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))2412Result = true;24132414else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&2415MI.mayLoad() &&2416(TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))2417Result = true;24182419else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&2420MI.mayStore() &&2421(TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))2422Result = true;24232424else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&2425TII->isDS(MI))2426Result = true;24272428else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) &&2429MI.mayLoad() && TII->isDS(MI))2430Result = true;24312432else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) &&2433MI.mayStore() && TII->isDS(MI))2434Result = true;24352436else if (((SGMask & SchedGroupMask::TRANS) != SchedGroupMask::NONE) &&2437TII->isTRANS(MI))2438Result = true;24392440LLVM_DEBUG(2441dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)2442<< (Result ? " could classify " : " unable to classify ") << MI);24432444return Result;2445}24462447int SchedGroup::link(SUnit &SU, bool MakePred,2448std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {2449int MissedEdges = 0;2450for (auto *A : Collection) {2451SUnit *B = &SU;2452if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)2453continue;2454if (MakePred)2455std::swap(A, B);24562457if (DAG->IsReachable(B, A))2458continue;24592460// tryAddEdge returns false if there is a dependency that makes adding2461// the A->B edge impossible, otherwise it returns true;2462bool Added = tryAddEdge(A, B);2463if (Added)2464AddedEdges.emplace_back(A, B);2465else2466++MissedEdges;2467}24682469return MissedEdges;2470}24712472void SchedGroup::link(SUnit &SU, bool MakePred) {2473for (auto *A : Collection) {2474SUnit *B = &SU;2475if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)2476continue;2477if (MakePred)2478std::swap(A, B);24792480tryAddEdge(A, B);2481}2482}24832484void SchedGroup::link(SUnit &SU,2485function_ref<bool(const SUnit *A, const SUnit *B)> P) {2486for (auto *A : Collection) {2487SUnit *B = &SU;2488if (P(A, B))2489std::swap(A, B);24902491tryAddEdge(A, B);2492}2493}24942495void SchedGroup::link(SchedGroup &OtherGroup) {2496for (auto *B : OtherGroup.Collection)2497link(*B);2498}24992500bool SchedGroup::canAddSU(SUnit &SU) const {2501MachineInstr &MI = *SU.getInstr();2502if (MI.getOpcode() != TargetOpcode::BUNDLE)2503return canAddMI(MI);25042505// Special case for bundled MIs.2506const MachineBasicBlock *MBB = MI.getParent();2507MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B;2508while (E != MBB->end() && E->isBundledWithPred())2509++E;25102511// Return true if all of the bundled MIs can be added to this group.2512return std::all_of(B, E, [this](MachineInstr &MI) { return canAddMI(MI); });2513}25142515void SchedGroup::initSchedGroup() {2516for (auto &SU : DAG->SUnits) {2517if (isFull())2518break;25192520if (canAddSU(SU))2521add(SU);2522}2523}25242525void SchedGroup::initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,2526SUnitsToCandidateSGsMap &SyncedInstrs) {2527SUnit &InitSU = *RIter;2528for (auto E = DAG->SUnits.rend(); RIter != E; ++RIter) {2529auto &SU = *RIter;2530if (isFull())2531break;25322533if (canAddSU(SU))2534SyncedInstrs[&SU].push_back(SGID);2535}25362537add(InitSU);2538assert(MaxSize);2539(*MaxSize)++;2540}25412542void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) {2543auto I = DAG->SUnits.rbegin();2544auto E = DAG->SUnits.rend();2545for (; I != E; ++I) {2546auto &SU = *I;2547if (isFull())2548break;2549if (canAddSU(SU))2550SyncedInstrs[&SU].push_back(SGID);2551}2552}25532554void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {2555const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();2556if (!TSchedModel || DAGInstrs->SUnits.empty())2557return;25582559LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n");2560const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();2561TII = ST.getInstrInfo();2562DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);2563SyncedSchedGroups.clear();2564SyncedInstrs.clear();2565bool FoundSB = false;2566bool FoundIGLP = false;2567bool ShouldApplyIGLP = false;2568for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) {2569unsigned Opc = R->getInstr()->getOpcode();2570// SCHED_[GROUP_]BARRIER and IGLP are mutually exclusive.2571if (Opc == AMDGPU::SCHED_BARRIER) {2572addSchedBarrierEdges(*R);2573FoundSB = true;2574} else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) {2575initSchedGroupBarrierPipelineStage(R);2576FoundSB = true;2577} else if (Opc == AMDGPU::IGLP_OPT) {2578resetEdges(*R, DAG);2579if (!FoundSB && !FoundIGLP) {2580FoundIGLP = true;2581ShouldApplyIGLP = initIGLPOpt(*R);2582}2583}2584}25852586if (FoundSB || (FoundIGLP && ShouldApplyIGLP)) {2587PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp);2588// PipelineSolver performs the mutation by adding the edges it2589// determined as the best2590PS.solve();2591return;2592}2593}25942595void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {2596MachineInstr &MI = *SchedBarrier.getInstr();2597assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);2598// Remove all existing edges from the SCHED_BARRIER that were added due to the2599// instruction having side effects.2600resetEdges(SchedBarrier, DAG);2601LLVM_DEBUG(dbgs() << "Building SchedGroup for SchedBarrier with Mask: "2602<< MI.getOperand(0).getImm() << "\n");2603auto InvertedMask =2604invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm());2605SchedGroup SG(InvertedMask, std::nullopt, DAG, TII);2606SG.initSchedGroup();26072608// Preserve original instruction ordering relative to the SCHED_BARRIER.2609SG.link(2610SchedBarrier,2611(function_ref<bool(const SUnit *A, const SUnit *B)>)[](2612const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; });2613}26142615SchedGroupMask2616IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {2617// Invert mask and erase bits for types of instructions that are implied to be2618// allowed past the SCHED_BARRIER.2619SchedGroupMask InvertedMask = ~Mask;26202621// ALU implies VALU, SALU, MFMA, TRANS.2622if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)2623InvertedMask &= ~SchedGroupMask::VALU & ~SchedGroupMask::SALU &2624~SchedGroupMask::MFMA & ~SchedGroupMask::TRANS;2625// VALU, SALU, MFMA, TRANS implies ALU.2626else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE ||2627(InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE ||2628(InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE ||2629(InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE)2630InvertedMask &= ~SchedGroupMask::ALU;26312632// VMEM implies VMEM_READ, VMEM_WRITE.2633if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)2634InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE;2635// VMEM_READ, VMEM_WRITE implies VMEM.2636else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE ||2637(InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE)2638InvertedMask &= ~SchedGroupMask::VMEM;26392640// DS implies DS_READ, DS_WRITE.2641if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE)2642InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE;2643// DS_READ, DS_WRITE implies DS.2644else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE ||2645(InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE)2646InvertedMask &= ~SchedGroupMask::DS;26472648LLVM_DEBUG(dbgs() << "After Inverting, SchedGroup Mask: " << (int)InvertedMask2649<< "\n");26502651return InvertedMask;2652}26532654void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(2655std::vector<SUnit>::reverse_iterator RIter) {2656// Remove all existing edges from the SCHED_GROUP_BARRIER that were added due2657// to the instruction having side effects.2658resetEdges(*RIter, DAG);2659MachineInstr &SGB = *RIter->getInstr();2660assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER);2661int32_t SGMask = SGB.getOperand(0).getImm();2662int32_t Size = SGB.getOperand(1).getImm();2663int32_t SyncID = SGB.getOperand(2).getImm();26642665auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask,2666Size, SyncID, DAG, TII);26672668SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]);2669}26702671bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {2672IGLPStrategyID StrategyID =2673(IGLPStrategyID)SU.getInstr()->getOperand(0).getImm();2674auto S = createIGLPStrategy(StrategyID, DAG, TII);2675if (!S->shouldApplyStrategy(DAG, Phase))2676return false;26772678IsBottomUp = S->IsBottomUp;2679return S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, Phase);2680}26812682} // namespace26832684namespace llvm {26852686/// \p Phase specifes whether or not this is a reentry into the2687/// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the2688/// same scheduling region (e.g. pre and post-RA scheduling / multiple2689/// scheduling "phases"), we can reenter this mutation framework more than once2690/// for a given region.2691std::unique_ptr<ScheduleDAGMutation>2692createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) {2693return std::make_unique<IGroupLPDAGMutation>(Phase);2694}26952696} // end namespace llvm269726982699