CoCalc -- AMDGPUIGroupLP.cpp

GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
³⁵²⁶⁹ views
1
//===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP  ------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// \file This file defines a set of schedule DAG mutations that can be used to
10
// override default scheduler behavior to enforce specific scheduling patterns.
11
// They should be used in cases where runtime performance considerations such as
12
// inter-wavefront interactions, mean that compile-time heuristics cannot
13
// predict the optimal instruction ordering, or in kernels where optimum
14
// instruction scheduling is important enough to warrant manual intervention.
15
//
16
//===----------------------------------------------------------------------===//
17

18
#include "AMDGPUIGroupLP.h"
19
#include "AMDGPUTargetMachine.h"
20
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21
#include "SIInstrInfo.h"
22
#include "SIMachineFunctionInfo.h"
23
#include "llvm/ADT/BitmaskEnum.h"
24
#include "llvm/ADT/DenseMap.h"
25
#include "llvm/CodeGen/MachineScheduler.h"
26
#include "llvm/CodeGen/TargetOpcodes.h"
27

28
using namespace llvm;
29

30
#define DEBUG_TYPE "igrouplp"
31

32
namespace {
33

34
static cl::opt<bool> EnableExactSolver(
35
    "amdgpu-igrouplp-exact-solver", cl::Hidden,
36
    cl::desc("Whether to use the exponential time solver to fit "
37
             "the instructions to the pipeline as closely as "
38
             "possible."),
39
    cl::init(false));
40

41
static cl::opt<unsigned> CutoffForExact(
42
    "amdgpu-igrouplp-exact-solver-cutoff", cl::init(0), cl::Hidden,
43
    cl::desc("The maximum number of scheduling group conflicts "
44
             "which we attempt to solve with the exponential time "
45
             "exact solver. Problem sizes greater than this will"
46
             "be solved by the less accurate greedy algorithm. Selecting "
47
             "solver by size is superseded by manually selecting "
48
             "the solver (e.g. by amdgpu-igrouplp-exact-solver"));
49

50
static cl::opt<uint64_t> MaxBranchesExplored(
51
    "amdgpu-igrouplp-exact-solver-max-branches", cl::init(0), cl::Hidden,
52
    cl::desc("The amount of branches that we are willing to explore with"
53
             "the exact algorithm before giving up."));
54

55
static cl::opt<bool> UseCostHeur(
56
    "amdgpu-igrouplp-exact-solver-cost-heur", cl::init(true), cl::Hidden,
57
    cl::desc("Whether to use the cost heuristic to make choices as we "
58
             "traverse the search space using the exact solver. Defaulted "
59
             "to on, and if turned off, we will use the node order -- "
60
             "attempting to put the later nodes in the later sched groups. "
61
             "Experimentally, results are mixed, so this should be set on a "
62
             "case-by-case basis."));
63

64
// Components of the mask that determines which instruction types may be may be
65
// classified into a SchedGroup.
66
enum class SchedGroupMask {
67
  NONE = 0u,
68
  ALU = 1u << 0,
69
  VALU = 1u << 1,
70
  SALU = 1u << 2,
71
  MFMA = 1u << 3,
72
  VMEM = 1u << 4,
73
  VMEM_READ = 1u << 5,
74
  VMEM_WRITE = 1u << 6,
75
  DS = 1u << 7,
76
  DS_READ = 1u << 8,
77
  DS_WRITE = 1u << 9,
78
  TRANS = 1u << 10,
79
  ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
80
        DS_READ | DS_WRITE | TRANS,
81
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
82
};
83

84
class SchedGroup;
85

86
// InstructionRule class is used to enact a filter which determines whether or
87
// not an SU maps to a given SchedGroup. It contains complementary data
88
// structures (e.g Cache) to help those filters.
89
class InstructionRule {
90
protected:
91
  const SIInstrInfo *TII;
92
  unsigned SGID;
93
  // A cache made available to the Filter to store SUnits for subsequent
94
  // invocations of the Filter
95
  std::optional<SmallVector<SUnit *, 4>> Cache;
96

97
public:
98
  virtual bool
99
  apply(const SUnit *, const ArrayRef<SUnit *>,
100
        SmallVectorImpl<SchedGroup> &) {
101
    return true;
102
  };
103

104
  InstructionRule(const SIInstrInfo *TII, unsigned SGID,
105
                  bool NeedsCache = false)
106
      : TII(TII), SGID(SGID) {
107
    if (NeedsCache) {
108
      Cache = SmallVector<SUnit *, 4>();
109
    }
110
  }
111

112
  virtual ~InstructionRule() = default;
113
};
114

115
using SUnitsToCandidateSGsMap = DenseMap<SUnit *, SmallVector<int, 4>>;
116

117
// Classify instructions into groups to enable fine tuned control over the
118
// scheduler. These groups may be more specific than current SchedModel
119
// instruction classes.
120
class SchedGroup {
121
private:
122
  // Mask that defines which instruction types can be classified into this
123
  // SchedGroup. The instruction types correspond to the mask from SCHED_BARRIER
124
  // and SCHED_GROUP_BARRIER.
125
  SchedGroupMask SGMask;
126

127
  // Maximum number of SUnits that can be added to this group.
128
  std::optional<unsigned> MaxSize;
129

130
  // SchedGroups will only synchronize with other SchedGroups that have the same
131
  // SyncID.
132
  int SyncID = 0;
133

134
  // SGID is used to map instructions to candidate SchedGroups
135
  unsigned SGID;
136

137
  // The different rules each instruction in this SchedGroup must conform to
138
  SmallVector<std::shared_ptr<InstructionRule>, 4> Rules;
139

140
  // Count of the number of created SchedGroups, used to initialize SGID.
141
  static unsigned NumSchedGroups;
142

143
  // Try to add and edge from SU A to SU B.
144
  bool tryAddEdge(SUnit *A, SUnit *B);
145

146
  // Use SGMask to determine whether we can classify MI as a member of this
147
  // SchedGroup object.
148
  bool canAddMI(const MachineInstr &MI) const;
149

150
public:
151
  // Collection of SUnits that are classified as members of this group.
152
  SmallVector<SUnit *, 32> Collection;
153

154
  ScheduleDAGInstrs *DAG;
155
  const SIInstrInfo *TII;
156

157
  // Returns true if SU can be added to this SchedGroup.
158
  bool canAddSU(SUnit &SU) const;
159

160
  // Add DAG dependencies from all SUnits in this SchedGroup and this SU. If
161
  // MakePred is true, SU will be a predecessor of the SUnits in this
162
  // SchedGroup, otherwise SU will be a successor.
163
  void link(SUnit &SU, bool MakePred = false);
164

165
  // Add DAG dependencies and track which edges are added, and the count of
166
  // missed edges
167
  int link(SUnit &SU, bool MakePred,
168
           std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
169

170
  // Add DAG dependencies from all SUnits in this SchedGroup and this SU.
171
  // Use the predicate to determine whether SU should be a predecessor (P =
172
  // true) or a successor (P = false) of this SchedGroup.
173
  void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P);
174

175
  // Add DAG dependencies such that SUnits in this group shall be ordered
176
  // before SUnits in OtherGroup.
177
  void link(SchedGroup &OtherGroup);
178

179
  // Returns true if no more instructions may be added to this group.
180
  bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; }
181

182
  // Append a constraint that SUs must meet in order to fit into this
183
  // SchedGroup. Since many rules involve the relationship between a SchedGroup
184
  // and the SUnits in other SchedGroups, rules are checked at Pipeline Solve
185
  // time (rather than SchedGroup init time.)
186
  void addRule(std::shared_ptr<InstructionRule> NewRule) {
187
    Rules.push_back(NewRule);
188
  }
189

190
  // Returns true if the SU matches all rules
191
  bool allowedByRules(const SUnit *SU,
192
                      SmallVectorImpl<SchedGroup> &SyncPipe) const {
193
    for (auto &Rule : Rules) {
194
      if (!Rule.get()->apply(SU, Collection, SyncPipe))
195
        return false;
196
    }
197
    return true;
198
  }
199

200
  // Add SU to the SchedGroup.
201
  void add(SUnit &SU) {
202
    LLVM_DEBUG(dbgs() << "For SchedGroup with mask "
203
                      << format_hex((int)SGMask, 10, true) << " adding "
204
                      << *SU.getInstr());
205
    Collection.push_back(&SU);
206
  }
207

208
  // Remove last element in the SchedGroup
209
  void pop() { Collection.pop_back(); }
210

211
  // Identify and add all relevant SUs from the DAG to this SchedGroup.
212
  void initSchedGroup();
213

214
  // Add instructions to the SchedGroup bottom up starting from RIter.
215
  // PipelineInstrs is a set of instructions that should not be added to the
216
  // SchedGroup even when the other conditions for adding it are satisfied.
217
  // RIter will be added to the SchedGroup as well, and dependencies will be
218
  // added so that RIter will always be scheduled at the end of the group.
219
  void initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
220
                      SUnitsToCandidateSGsMap &SyncedInstrs);
221

222
  void initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs);
223

224
  int getSyncID() { return SyncID; }
225

226
  int getSGID() { return SGID; }
227

228
  SchedGroupMask getMask() { return SGMask; }
229

230
  SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
231
             ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
232
      : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) {
233
    SGID = NumSchedGroups++;
234
  }
235

236
  SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID,
237
             ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
238
      : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) {
239
    SGID = NumSchedGroups++;
240
  }
241
};
242

243
// Remove all existing edges from a SCHED_BARRIER or SCHED_GROUP_BARRIER.
244
static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) {
245
  assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER ||
246
         SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
247
         SU.getInstr()->getOpcode() == AMDGPU::IGLP_OPT);
248

249
  while (!SU.Preds.empty())
250
    for (auto &P : SU.Preds)
251
      SU.removePred(P);
252

253
  while (!SU.Succs.empty())
254
    for (auto &S : SU.Succs)
255
      for (auto &SP : S.getSUnit()->Preds)
256
        if (SP.getSUnit() == &SU)
257
          S.getSUnit()->removePred(SP);
258
}
259

260
using SUToCandSGsPair = std::pair<SUnit *, SmallVector<int, 4>>;
261
using SUsToCandSGsVec = SmallVector<SUToCandSGsPair, 4>;
262

263
// The PipelineSolver is used to assign SUnits to SchedGroups in a pipeline
264
// in non-trivial cases. For example, if the requested pipeline is
265
// {VMEM_READ, VALU, MFMA, VMEM_READ} and we encounter a VMEM_READ instruction
266
// in the DAG, then we will have an instruction that can not be trivially
267
// assigned to a SchedGroup. The PipelineSolver class implements two algorithms
268
// to find a good solution to the pipeline -- a greedy algorithm and an exact
269
// algorithm. The exact algorithm has an exponential time complexity and should
270
// only be used for small sized problems or medium sized problems where an exact
271
// solution is highly desired.
272
class PipelineSolver {
273
  ScheduleDAGMI *DAG;
274

275
  // Instructions that can be assigned to multiple SchedGroups
276
  DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
277
  SmallVector<SUsToCandSGsVec, 4> PipelineInstrs;
278
  DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
279
  // The current working pipeline
280
  SmallVector<SmallVector<SchedGroup, 4>, 4> CurrPipeline;
281
  // The pipeline that has the best solution found so far
282
  SmallVector<SmallVector<SchedGroup, 4>, 4> BestPipeline;
283

284
  // Whether or not we actually have any SyncedInstrs to try to solve.
285
  bool NeedsSolver = false;
286

287
  // Compute an estimate of the size of search tree -- the true size is
288
  // the product of each conflictedInst.Matches.size() across all SyncPipelines
289
  unsigned computeProblemSize();
290

291
  // The cost penalty of not assigning a SU to a SchedGroup
292
  int MissPenalty = 0;
293

294
  // Costs in terms of the number of edges we are unable to add
295
  int BestCost = -1;
296
  int CurrCost = 0;
297

298
  // Index pointing to the conflicting instruction that is currently being
299
  // fitted
300
  int CurrConflInstNo = 0;
301
  // Index to the pipeline that is currently being fitted
302
  int CurrSyncGroupIdx = 0;
303
  // The first non trivial pipeline
304
  int BeginSyncGroupIdx = 0;
305

306
  // How many branches we have explored
307
  uint64_t BranchesExplored = 0;
308

309
  // The direction in which we process the candidate SchedGroups per SU
310
  bool IsBottomUp = true;
311

312
  // Update indices to fit next conflicting instruction
313
  void advancePosition();
314
  // Recede indices to attempt to find better fit for previous conflicting
315
  // instruction
316
  void retreatPosition();
317

318
  // The exponential time algorithm which finds the provably best fit
319
  bool solveExact();
320
  // The polynomial time algorithm which attempts to find a good fit
321
  bool solveGreedy();
322
  // Find the best SchedGroup for the current SU using the heuristic given all
323
  // current information. One step in the greedy algorithm. Templated against
324
  // the SchedGroup iterator (either reverse or forward).
325
  template <typename T>
326
  void greedyFind(std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I,
327
                  T E);
328
  // Whether or not the current solution is optimal
329
  bool checkOptimal();
330
  // Populate the ready list, prioiritizing fewest missed edges first
331
  // Templated against the SchedGroup iterator (either reverse or forward).
332
  template <typename T>
333
  void populateReadyList(SmallVectorImpl<std::pair<int, int>> &ReadyList, T I,
334
                         T E);
335
  // Add edges corresponding to the SchedGroups as assigned by solver
336
  void makePipeline();
337
  // Link the SchedGroups in the best found pipeline.
338
  // Tmplated against the SchedGroup iterator (either reverse or forward).
339
  template <typename T> void linkSchedGroups(T I, T E);
340
  // Add the edges from the SU to the other SchedGroups in pipeline, and
341
  // return the number of edges missed.
342
  int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
343
               std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
344
  /// Link the pipeline as if \p SU was in the SchedGroup with ID \p SGID. It
345
  /// returns the cost (in terms of missed pipeline edges), and tracks the edges
346
  /// added in \p AddedEdges
347
  template <typename T>
348
  int linkSUnit(SUnit *SU, int SGID,
349
                std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E);
350
  /// Remove the edges passed via \p AddedEdges
351
  void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
352
  // Convert the passed in maps to arrays for bidirectional iterators
353
  void convertSyncMapsToArrays();
354

355
  void reset();
356

357
public:
358
  // Invoke the solver to map instructions to instruction groups. Heuristic &&
359
  // command-line-option determines to use exact or greedy algorithm.
360
  void solve();
361

362
  PipelineSolver(DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
363
                 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
364
                 ScheduleDAGMI *DAG, bool IsBottomUp = true)
365
      : DAG(DAG), SyncedInstrs(SyncedInstrs),
366
        SyncedSchedGroups(SyncedSchedGroups), IsBottomUp(IsBottomUp) {
367

368
    for (auto &PipelineInstrs : SyncedInstrs) {
369
      if (PipelineInstrs.second.size() > 0) {
370
        NeedsSolver = true;
371
        break;
372
      }
373
    }
374

375
    if (!NeedsSolver)
376
      return;
377

378
    convertSyncMapsToArrays();
379

380
    CurrPipeline = BestPipeline;
381

382
    while (static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.size() &&
383
           PipelineInstrs[BeginSyncGroupIdx].size() == 0)
384
      ++BeginSyncGroupIdx;
385

386
    if (static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.size())
387
      return;
388
  }
389
};
390

391
void PipelineSolver::reset() {
392

393
  for (auto &SyncPipeline : CurrPipeline) {
394
    for (auto &SG : SyncPipeline) {
395
      SmallVector<SUnit *, 32> TempCollection = SG.Collection;
396
      SG.Collection.clear();
397
      auto SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) {
398
        return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER;
399
      });
400
      if (SchedBarr != TempCollection.end())
401
        SG.Collection.push_back(*SchedBarr);
402
    }
403
  }
404

405
  CurrSyncGroupIdx = BeginSyncGroupIdx;
406
  CurrConflInstNo = 0;
407
  CurrCost = 0;
408
}
409

410
void PipelineSolver::convertSyncMapsToArrays() {
411
  for (auto &SyncPipe : SyncedSchedGroups) {
412
    BestPipeline.insert(BestPipeline.begin(), SyncPipe.second);
413
  }
414

415
  int PipelineIDx = SyncedInstrs.size() - 1;
416
  PipelineInstrs.resize(SyncedInstrs.size());
417
  for (auto &SyncInstrMap : SyncedInstrs) {
418
    for (auto &SUsToCandSGs : SyncInstrMap.second) {
419
      if (PipelineInstrs[PipelineIDx].size() == 0) {
420
        PipelineInstrs[PipelineIDx].push_back(
421
            std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
422
        continue;
423
      }
424
      auto SortPosition = PipelineInstrs[PipelineIDx].begin();
425
      // Insert them in sorted order -- this allows for good parsing order in
426
      // the greedy algorithm
427
      while (SortPosition != PipelineInstrs[PipelineIDx].end() &&
428
             SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum)
429
        ++SortPosition;
430
      PipelineInstrs[PipelineIDx].insert(
431
          SortPosition, std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
432
    }
433
    --PipelineIDx;
434
  }
435
}
436

437
template <typename T> void PipelineSolver::linkSchedGroups(T I, T E) {
438
  for (; I != E; ++I) {
439
    auto &GroupA = *I;
440
    for (auto J = std::next(I); J != E; ++J) {
441
      auto &GroupB = *J;
442
      GroupA.link(GroupB);
443
    }
444
  }
445
}
446

447
void PipelineSolver::makePipeline() {
448
  // Preserve the order of barrier for subsequent SchedGroupBarrier mutations
449
  for (auto &SyncPipeline : BestPipeline) {
450
    LLVM_DEBUG(dbgs() << "Printing SchedGroups\n");
451
    for (auto &SG : SyncPipeline) {
452
      LLVM_DEBUG(dbgs() << "SchedGroup with SGID " << SG.getSGID()
453
                        << " has: \n");
454
      SUnit *SGBarr = nullptr;
455
      for (auto &SU : SG.Collection) {
456
        if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
457
          SGBarr = SU;
458
        LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ")\n");
459
      }
460
      // Command line requested IGroupLP doesn't have SGBarr
461
      if (!SGBarr)
462
        continue;
463
      resetEdges(*SGBarr, DAG);
464
      SG.link(*SGBarr, false);
465
    }
466
  }
467

468
  for (auto &SyncPipeline : BestPipeline) {
469
    IsBottomUp ? linkSchedGroups(SyncPipeline.rbegin(), SyncPipeline.rend())
470
               : linkSchedGroups(SyncPipeline.begin(), SyncPipeline.end());
471
  }
472
}
473

474
template <typename T>
475
int PipelineSolver::linkSUnit(
476
    SUnit *SU, int SGID, std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges,
477
    T I, T E) {
478
  bool MakePred = false;
479
  int AddedCost = 0;
480
  for (; I < E; ++I) {
481
    if (I->getSGID() == SGID) {
482
      MakePred = true;
483
      continue;
484
    }
485
    auto Group = *I;
486
    AddedCost += Group.link(*SU, MakePred, AddedEdges);
487
    assert(AddedCost >= 0);
488
  }
489
  return AddedCost;
490
}
491

492
int PipelineSolver::addEdges(
493
    SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
494
    std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
495

496
  // For IsBottomUp, the first SchedGroup in SyncPipeline contains the
497
  // instructions that are the ultimate successors in the resultant mutation.
498
  // Therefore, in such a configuration, the SchedGroups occurring before the
499
  // candidate SGID are successors of the candidate SchedGroup, thus the current
500
  // SU should be linked as a predecessor to SUs in those SchedGroups. The
501
  // opposite is true if !IsBottomUp. IsBottomUp occurs in the case of multiple
502
  // SCHED_GROUP_BARRIERS, or if a user specifies IGLP_OPT SchedGroups using
503
  // IsBottomUp (in reverse).
504
  return IsBottomUp ? linkSUnit(SU, SGID, AddedEdges, SyncPipeline.rbegin(),
505
                                SyncPipeline.rend())
506
                    : linkSUnit(SU, SGID, AddedEdges, SyncPipeline.begin(),
507
                                SyncPipeline.end());
508
}
509

510
void PipelineSolver::removeEdges(
511
    const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {
512
  // Only remove the edges that we have added when testing
513
  // the fit.
514
  for (auto &PredSuccPair : EdgesToRemove) {
515
    SUnit *Pred = PredSuccPair.first;
516
    SUnit *Succ = PredSuccPair.second;
517

518
    auto Match = llvm::find_if(
519
        Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; });
520
    if (Match != Succ->Preds.end()) {
521
      assert(Match->isArtificial());
522
      Succ->removePred(*Match);
523
    }
524
  }
525
}
526

527
void PipelineSolver::advancePosition() {
528
  ++CurrConflInstNo;
529

530
  if (static_cast<size_t>(CurrConflInstNo) >=
531
      PipelineInstrs[CurrSyncGroupIdx].size()) {
532
    CurrConflInstNo = 0;
533
    ++CurrSyncGroupIdx;
534
    // Advance to next non-trivial pipeline
535
    while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() &&
536
           PipelineInstrs[CurrSyncGroupIdx].size() == 0)
537
      ++CurrSyncGroupIdx;
538
  }
539
}
540

541
void PipelineSolver::retreatPosition() {
542
  assert(CurrConflInstNo >= 0);
543
  assert(CurrSyncGroupIdx >= 0);
544

545
  if (CurrConflInstNo > 0) {
546
    --CurrConflInstNo;
547
    return;
548
  }
549

550
  if (CurrConflInstNo == 0) {
551
    // If we return to the starting position, we have explored
552
    // the entire tree
553
    if (CurrSyncGroupIdx == BeginSyncGroupIdx)
554
      return;
555

556
    --CurrSyncGroupIdx;
557
    // Go to previous non-trivial pipeline
558
    while (PipelineInstrs[CurrSyncGroupIdx].size() == 0)
559
      --CurrSyncGroupIdx;
560

561
    CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1;
562
  }
563
}
564

565
bool PipelineSolver::checkOptimal() {
566
  if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) {
567
    if (BestCost == -1 || CurrCost < BestCost) {
568
      BestPipeline = CurrPipeline;
569
      BestCost = CurrCost;
570
      LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << "\n");
571
    }
572
    assert(BestCost >= 0);
573
  }
574

575
  bool DoneExploring = false;
576
  if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)
577
    DoneExploring = true;
578

579
  return (DoneExploring || BestCost == 0);
580
}
581

582
template <typename T>
583
void PipelineSolver::populateReadyList(
584
    SmallVectorImpl<std::pair<int, int>> &ReadyList, T I, T E) {
585
  SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
586
  auto SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
587
  assert(CurrSU.second.size() >= 1);
588

589
  for (; I != E; ++I) {
590
    std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
591
    int CandSGID = *I;
592
    SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
593
      return SG.getSGID() == CandSGID;
594
    });
595
    assert(Match);
596

597
    if (UseCostHeur) {
598
      if (Match->isFull()) {
599
        ReadyList.push_back(std::pair(*I, MissPenalty));
600
        continue;
601
      }
602

603
      int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
604
      ReadyList.push_back(std::pair(*I, TempCost));
605
      removeEdges(AddedEdges);
606
    } else
607
      ReadyList.push_back(std::pair(*I, -1));
608
  }
609

610
  if (UseCostHeur) {
611
    std::sort(ReadyList.begin(), ReadyList.end(),
612
              [](std::pair<int, int> A, std::pair<int, int> B) {
613
                return A.second < B.second;
614
              });
615
  }
616

617
  assert(ReadyList.size() == CurrSU.second.size());
618
}
619

620
bool PipelineSolver::solveExact() {
621
  if (checkOptimal())
622
    return true;
623

624
  if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size())
625
    return false;
626

627
  assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size());
628
  assert(static_cast<size_t>(CurrConflInstNo) <
629
         PipelineInstrs[CurrSyncGroupIdx].size());
630
  SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
631
  LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
632
                    << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
633

634
  // SchedGroup -> Cost pairs
635
  SmallVector<std::pair<int, int>, 4> ReadyList;
636
  // Prioritize the candidate sched groups in terms of lowest cost first
637
  IsBottomUp ? populateReadyList(ReadyList, CurrSU.second.rbegin(),
638
                                 CurrSU.second.rend())
639
             : populateReadyList(ReadyList, CurrSU.second.begin(),
640
                                 CurrSU.second.end());
641

642
  auto I = ReadyList.begin();
643
  auto E = ReadyList.end();
644
  for (; I != E; ++I) {
645
    // If we are trying SGs in least cost order, and the current SG is cost
646
    // infeasible, then all subsequent SGs will also be cost infeasible, so we
647
    // can prune.
648
    if (BestCost != -1 && (CurrCost + I->second > BestCost))
649
      return false;
650

651
    int CandSGID = I->first;
652
    int AddedCost = 0;
653
    std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
654
    auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
655
    SchedGroup *Match;
656
    for (auto &SG : SyncPipeline) {
657
      if (SG.getSGID() == CandSGID)
658
        Match = &SG;
659
    }
660

661
    if (Match->isFull())
662
      continue;
663

664
    if (!Match->allowedByRules(CurrSU.first, SyncPipeline))
665
      continue;
666

667
    LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask "
668
                      << (int)Match->getMask() << "and ID " << CandSGID
669
                      << "\n");
670
    Match->add(*CurrSU.first);
671
    AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
672
    LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n");
673
    CurrCost += AddedCost;
674
    advancePosition();
675
    ++BranchesExplored;
676
    bool FinishedExploring = false;
677
    // If the Cost after adding edges is greater than a known solution,
678
    // backtrack
679
    if (CurrCost < BestCost || BestCost == -1) {
680
      if (solveExact()) {
681
        FinishedExploring = BestCost != 0;
682
        if (!FinishedExploring)
683
          return true;
684
      }
685
    }
686

687
    retreatPosition();
688
    CurrCost -= AddedCost;
689
    removeEdges(AddedEdges);
690
    Match->pop();
691
    CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
692
    if (FinishedExploring)
693
      return true;
694
  }
695

696
  // Try the pipeline where the current instruction is omitted
697
  // Potentially if we omit a problematic instruction from the pipeline,
698
  // all the other instructions can nicely fit.
699
  CurrCost += MissPenalty;
700
  advancePosition();
701

702
  LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n");
703

704
  bool FinishedExploring = false;
705
  if (CurrCost < BestCost || BestCost == -1) {
706
    if (solveExact()) {
707
      bool FinishedExploring = BestCost != 0;
708
      if (!FinishedExploring)
709
        return true;
710
    }
711
  }
712

713
  retreatPosition();
714
  CurrCost -= MissPenalty;
715
  return FinishedExploring;
716
}
717

718
template <typename T>
719
void PipelineSolver::greedyFind(
720
    std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E) {
721
  SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
722
  int BestNodeCost = -1;
723
  int TempCost;
724
  SchedGroup *BestGroup = nullptr;
725
  int BestGroupID = -1;
726
  auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
727
  LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
728
                    << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
729

730
  // Since we have added the potential SchedGroups from bottom up, but
731
  // traversed the DAG from top down, parse over the groups from last to
732
  // first. If we fail to do this for the greedy algorithm, the solution will
733
  // likely not be good in more complex cases.
734
  for (; I != E; ++I) {
735
    std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
736
    int CandSGID = *I;
737
    SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
738
      return SG.getSGID() == CandSGID;
739
    });
740
    assert(Match);
741

742
    LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "
743
                      << (int)Match->getMask() << "\n");
744

745
    if (Match->isFull()) {
746
      LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n");
747
      continue;
748
    }
749
    if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) {
750
      LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " has conflicting rule\n");
751
      continue;
752
    }
753
    TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
754
    LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n");
755
    if (TempCost < BestNodeCost || BestNodeCost == -1) {
756
      BestGroup = Match;
757
      BestNodeCost = TempCost;
758
      BestGroupID = CandSGID;
759
    }
760
    removeEdges(AddedEdges);
761
    if (BestNodeCost == 0)
762
      break;
763
  }
764

765
  if (BestGroupID != -1) {
766
    BestGroup->add(*CurrSU.first);
767
    addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges);
768
    LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask"
769
                      << (int)BestGroup->getMask() << "\n");
770
    BestCost += TempCost;
771
  } else
772
    BestCost += MissPenalty;
773

774
  CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
775
}
776

777
bool PipelineSolver::solveGreedy() {
778
  BestCost = 0;
779
  std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
780

781
  while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
782
    SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
783
    IsBottomUp
784
        ? greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend())
785
        : greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end());
786
    advancePosition();
787
  }
788
  BestPipeline = CurrPipeline;
789
  removeEdges(AddedEdges);
790
  return false;
791
}
792

793
unsigned PipelineSolver::computeProblemSize() {
794
  unsigned ProblemSize = 0;
795
  for (auto &PipeConflicts : PipelineInstrs) {
796
    ProblemSize += PipeConflicts.size();
797
  }
798

799
  return ProblemSize;
800
}
801

802
void PipelineSolver::solve() {
803
  if (!NeedsSolver)
804
    return;
805

806
  unsigned ProblemSize = computeProblemSize();
807
  assert(ProblemSize > 0);
808

809
  bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact;
810
  MissPenalty = (ProblemSize / 2) + 1;
811

812
  LLVM_DEBUG(DAG->dump());
813
  if (EnableExactSolver || BelowCutoff) {
814
    LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n");
815
    solveGreedy();
816
    reset();
817
    LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n");
818
    if (BestCost > 0) {
819
      LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n");
820
      solveExact();
821
      LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n");
822
    }
823
  } else { // Use the Greedy Algorithm by default
824
    LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n");
825
    solveGreedy();
826
  }
827

828
  makePipeline();
829
  LLVM_DEBUG(dbgs() << "After applying mutation\n");
830
  LLVM_DEBUG(DAG->dump());
831
}
832

833
enum IGLPStrategyID : int {
834
  MFMASmallGemmOptID = 0,
835
  MFMASmallGemmSingleWaveOptID = 1,
836
  MFMAExpInterleave = 2
837
};
838

839
// Implement a IGLP scheduling strategy.
840
class IGLPStrategy {
841
protected:
842
  ScheduleDAGInstrs *DAG;
843

844
  const SIInstrInfo *TII;
845

846
public:
847
  /// Add SchedGroups to \p SyncedSchedGroups to implement this Strategy.
848
  virtual bool applyIGLPStrategy(
849
      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
850
      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
851
      AMDGPU::SchedulingPhase Phase) = 0;
852

853
  // Returns true if this strategy should be applied to a ScheduleDAG.
854
  virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
855
                                   AMDGPU::SchedulingPhase Phase) = 0;
856

857
  bool IsBottomUp = true;
858

859
  IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
860
      : DAG(DAG), TII(TII) {}
861

862
  virtual ~IGLPStrategy() = default;
863
};
864

865
class MFMASmallGemmOpt final : public IGLPStrategy {
866
private:
867
public:
868
  bool applyIGLPStrategy(
869
      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
870
      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
871
      AMDGPU::SchedulingPhase Phase) override;
872

873
  bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
874
                           AMDGPU::SchedulingPhase Phase) override {
875
    return true;
876
  }
877

878
  MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
879
      : IGLPStrategy(DAG, TII) {
880
    IsBottomUp = true;
881
  }
882
};
883

884
bool MFMASmallGemmOpt::applyIGLPStrategy(
885
    DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
886
    DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
887
    AMDGPU::SchedulingPhase Phase) {
888
  // Count the number of MFMA instructions.
889
  unsigned MFMACount = 0;
890
  for (const MachineInstr &I : *DAG)
891
    if (TII->isMFMAorWMMA(I))
892
      ++MFMACount;
893

894
  const unsigned PipelineSyncID = 0;
895
  SchedGroup *SG = nullptr;
896
  for (unsigned I = 0; I < MFMACount * 3; ++I) {
897
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
898
        SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII);
899
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
900

901
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
902
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
903
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
904
  }
905

906
  return true;
907
}
908

909
class MFMAExpInterleaveOpt final : public IGLPStrategy {
910
private:
911
  // The count of TRANS SUs involved in the interleaved pipeline
912
  static unsigned TransPipeCount;
913
  // The count of MFMA SUs involved in the interleaved pipeline
914
  static unsigned MFMAPipeCount;
915
  // The count of Add SUs involved in the interleaved pipeline
916
  static unsigned AddPipeCount;
917
  // The number of transitive MFMA successors for each TRANS SU
918
  static unsigned MFMAEnablement;
919
  // The number of transitive TRANS predecessors for each MFMA SU
920
  static unsigned ExpRequirement;
921
  // The count of independent "chains" of MFMA instructions in the pipeline
922
  static unsigned MFMAChains;
923
  // The length of each independent "chain" of MFMA instructions
924
  static unsigned MFMAChainLength;
925
  // Whether or not the pipeline has V_CVT instructions
926
  static bool HasCvt;
927
  // Whether or not there are instructions between the TRANS instruction and
928
  // V_CVT
929
  static bool HasChainBetweenCvt;
930
  // The first occuring DS_READ which feeds an MFMA chain
931
  static std::optional<unsigned> FirstPipeDSR;
932
  // The MFMAPipe SUs with no MFMA predecessors
933
  SmallVector<SUnit *, 4> MFMAChainSeeds;
934
  // Compute the heuristics for the pipeline, returning whether or not the DAG
935
  // is well formatted for the mutation
936
  bool analyzeDAG(const SIInstrInfo *TII);
937

938
  /// Whether or not the instruction is a transitive predecessor of an MFMA
939
  /// instruction
940
  class IsPipeExp final : public InstructionRule {
941
  public:
942
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
943
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
944

945
      auto DAG = SyncPipe[0].DAG;
946

947
      if (Cache->empty()) {
948
        auto I = DAG->SUnits.rbegin();
949
        auto E = DAG->SUnits.rend();
950
        for (; I != E; I++) {
951
          if (TII->isMFMAorWMMA(*I->getInstr()))
952
            Cache->push_back(&*I);
953
        }
954
        if (Cache->empty())
955
          return false;
956
      }
957

958
      auto Reaches = (std::any_of(
959
          Cache->begin(), Cache->end(), [&SU, &DAG](SUnit *TargetSU) {
960
            return DAG->IsReachable(TargetSU, const_cast<SUnit *>(SU));
961
          }));
962

963
      return Reaches;
964
    }
965
    IsPipeExp(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
966
        : InstructionRule(TII, SGID, NeedsCache) {}
967
  };
968

969
  /// Whether or not the instruction is a transitive predecessor of the
970
  /// \p Number th MFMA of the MFMAs occuring after a TRANS instruction
971
  class EnablesNthMFMA final : public InstructionRule {
972
  private:
973
    unsigned Number = 1;
974

975
  public:
976
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
977
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
978
      bool FoundTrans = false;
979
      unsigned Counter = 1;
980
      auto DAG = SyncPipe[0].DAG;
981

982
      if (Cache->empty()) {
983
        SmallVector<SUnit *, 8> Worklist;
984

985
        auto I = DAG->SUnits.begin();
986
        auto E = DAG->SUnits.end();
987
        for (; I != E; I++) {
988
          if (FoundTrans && TII->isMFMAorWMMA(*I->getInstr())) {
989
            if (Counter == Number) {
990
              Cache->push_back(&*I);
991
              break;
992
            }
993
            ++Counter;
994
          }
995
          if (!FoundTrans && TII->isTRANS(I->getInstr()->getOpcode()))
996
            FoundTrans = true;
997
        }
998
        if (Cache->empty())
999
          return false;
1000
      }
1001

1002
      return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU));
1003
    }
1004

1005
    EnablesNthMFMA(unsigned Number, const SIInstrInfo *TII, unsigned SGID,
1006
                   bool NeedsCache = false)
1007
        : InstructionRule(TII, SGID, NeedsCache), Number(Number) {}
1008
  };
1009

1010
  /// Whether or not the instruction enables the exact MFMA that is the \p
1011
  /// Number th MFMA in the chain starting with \p ChainSeed
1012
  class EnablesNthMFMAInChain final : public InstructionRule {
1013
  private:
1014
    unsigned Number = 1;
1015
    SUnit *ChainSeed;
1016

1017
  public:
1018
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1019
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
1020
      auto DAG = SyncPipe[0].DAG;
1021

1022
      if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr()))
1023
        return false;
1024

1025
      if (Cache->empty()) {
1026
        auto TempSU = ChainSeed;
1027
        auto Depth = Number;
1028
        while (Depth > 0) {
1029
          --Depth;
1030
          bool Found = false;
1031
          for (auto &Succ : TempSU->Succs) {
1032
            if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) {
1033
              TempSU = Succ.getSUnit();
1034
              Found = true;
1035
              break;
1036
            }
1037
          }
1038
          if (!Found)
1039
            return false;
1040
        }
1041

1042
        Cache->push_back(TempSU);
1043
      }
1044
      // If we failed to find the instruction to be placed into the cache, we
1045
      // would have already exited.
1046
      assert(!Cache->empty());
1047

1048
      return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU));
1049
    }
1050

1051
    EnablesNthMFMAInChain(unsigned Number, SUnit *ChainSeed,
1052
                          const SIInstrInfo *TII, unsigned SGID,
1053
                          bool NeedsCache = false)
1054
        : InstructionRule(TII, SGID, NeedsCache), Number(Number),
1055
          ChainSeed(ChainSeed) {}
1056
  };
1057

1058
  /// Whether or not the instruction has less than \p Size immediate successors.
1059
  /// If \p HasIntermediary is true, this tests also whether all successors of
1060
  /// the SUnit have less than \p Size successors.
1061
  class LessThanNSuccs final : public InstructionRule {
1062
  private:
1063
    unsigned Size = 1;
1064
    bool HasIntermediary = false;
1065

1066
  public:
1067
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1068
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
1069
      if (!SyncPipe.size())
1070
        return false;
1071

1072
      auto SuccSize = std::count_if(
1073
          SU->Succs.begin(), SU->Succs.end(),
1074
          [](const SDep &Succ) { return Succ.getKind() == SDep::Data; });
1075
      if (SuccSize >= Size)
1076
        return false;
1077

1078
      if (HasIntermediary) {
1079
        for (auto Succ : SU->Succs) {
1080
          auto SuccSize = std::count_if(
1081
              Succ.getSUnit()->Succs.begin(), Succ.getSUnit()->Succs.end(),
1082
              [](const SDep &SuccSucc) {
1083
                return SuccSucc.getKind() == SDep::Data;
1084
              });
1085
          if (SuccSize >= Size)
1086
            return false;
1087
        }
1088
      }
1089

1090
      return true;
1091
    }
1092
    LessThanNSuccs(unsigned Size, const SIInstrInfo *TII, unsigned SGID,
1093
                   bool HasIntermediary = false, bool NeedsCache = false)
1094
        : InstructionRule(TII, SGID, NeedsCache), Size(Size),
1095
          HasIntermediary(HasIntermediary) {}
1096
  };
1097

1098
  /// Whether or not the instruction has greater than or equal to \p Size
1099
  /// immediate successors. If \p HasIntermediary is true, this tests also
1100
  /// whether all successors of the SUnit have greater than or equal to \p Size
1101
  /// successors.
1102
  class GreaterThanOrEqualToNSuccs final : public InstructionRule {
1103
  private:
1104
    unsigned Size = 1;
1105
    bool HasIntermediary = false;
1106

1107
  public:
1108
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1109
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
1110
      if (!SyncPipe.size())
1111
        return false;
1112

1113
      auto SuccSize = std::count_if(
1114
          SU->Succs.begin(), SU->Succs.end(),
1115
          [](const SDep &Succ) { return Succ.getKind() == SDep::Data; });
1116
      if (SuccSize >= Size)
1117
        return true;
1118

1119
      if (HasIntermediary) {
1120
        for (auto Succ : SU->Succs) {
1121
          auto SuccSize = std::count_if(
1122
              Succ.getSUnit()->Succs.begin(), Succ.getSUnit()->Succs.end(),
1123
              [](const SDep &SuccSucc) {
1124
                return SuccSucc.getKind() == SDep::Data;
1125
              });
1126
          if (SuccSize >= Size)
1127
            return true;
1128
        }
1129
      }
1130

1131
      return false;
1132
    }
1133
    GreaterThanOrEqualToNSuccs(unsigned Size, const SIInstrInfo *TII,
1134
                               unsigned SGID, bool HasIntermediary = false,
1135
                               bool NeedsCache = false)
1136
        : InstructionRule(TII, SGID, NeedsCache), Size(Size),
1137
          HasIntermediary(HasIntermediary) {}
1138
  };
1139

1140
  // Whether or not the instruction is a relevant V_CVT instruction.
1141
  class IsCvt final : public InstructionRule {
1142
  public:
1143
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1144
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
1145
      auto Opc = SU->getInstr()->getOpcode();
1146
      return Opc == AMDGPU::V_CVT_F16_F32_e32 ||
1147
             Opc == AMDGPU::V_CVT_I32_F32_e32;
1148
    }
1149
    IsCvt(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
1150
        : InstructionRule(TII, SGID, NeedsCache) {}
1151
  };
1152

1153
  // Whether or not the instruction is FMA_F32.
1154
  class IsFMA final : public InstructionRule {
1155
  public:
1156
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1157
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
1158
      return SU->getInstr()->getOpcode() == AMDGPU::V_FMA_F32_e64 ||
1159
             SU->getInstr()->getOpcode() == AMDGPU::V_PK_FMA_F32;
1160
    }
1161
    IsFMA(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
1162
        : InstructionRule(TII, SGID, NeedsCache) {}
1163
  };
1164

1165
  // Whether or not the instruction is a V_ADD_F32 instruction.
1166
  class IsPipeAdd final : public InstructionRule {
1167
  public:
1168
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1169
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
1170
      return SU->getInstr()->getOpcode() == AMDGPU::V_ADD_F32_e32;
1171
    }
1172
    IsPipeAdd(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
1173
        : InstructionRule(TII, SGID, NeedsCache) {}
1174
  };
1175

1176
  /// Whether or not the instruction is an immediate RAW successor
1177
  /// of the SchedGroup \p Distance steps before.
1178
  class IsSuccOfPrevNthGroup final : public InstructionRule {
1179
  private:
1180
    unsigned Distance = 1;
1181

1182
  public:
1183
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1184
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
1185
      SchedGroup *OtherGroup = nullptr;
1186
      if (!SyncPipe.size())
1187
        return false;
1188

1189
      for (auto &PipeSG : SyncPipe) {
1190
        if ((unsigned)PipeSG.getSGID() == SGID - Distance)
1191
          OtherGroup = &PipeSG;
1192
      }
1193

1194
      if (!OtherGroup)
1195
        return false;
1196
      if (!OtherGroup->Collection.size())
1197
        return true;
1198

1199
      for (auto &OtherEle : OtherGroup->Collection) {
1200
        for (auto &Succ : OtherEle->Succs) {
1201
          if (Succ.getSUnit() == SU && Succ.getKind() == SDep::Data)
1202
            return true;
1203
        }
1204
      }
1205

1206
      return false;
1207
    }
1208
    IsSuccOfPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
1209
                         unsigned SGID, bool NeedsCache = false)
1210
        : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
1211
  };
1212

1213
  /// Whether or not the instruction is a transitive successor of any
1214
  /// instruction the the SchedGroup \p Distance steps before.
1215
  class IsReachableFromPrevNthGroup final : public InstructionRule {
1216
  private:
1217
    unsigned Distance = 1;
1218

1219
  public:
1220
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1221
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
1222
      SchedGroup *OtherGroup = nullptr;
1223
      if (!SyncPipe.size())
1224
        return false;
1225

1226
      for (auto &PipeSG : SyncPipe) {
1227
        if ((unsigned)PipeSG.getSGID() == SGID - Distance)
1228
          OtherGroup = &PipeSG;
1229
      }
1230

1231
      if (!OtherGroup)
1232
        return false;
1233
      if (!OtherGroup->Collection.size())
1234
        return true;
1235

1236
      auto DAG = SyncPipe[0].DAG;
1237

1238
      for (auto &OtherEle : OtherGroup->Collection)
1239
        if (DAG->IsReachable(const_cast<SUnit *>(SU), OtherEle))
1240
          return true;
1241

1242
      return false;
1243
    }
1244
    IsReachableFromPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
1245
                                unsigned SGID, bool NeedsCache = false)
1246
        : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
1247
  };
1248

1249
  /// Whether or not the instruction occurs after the SU with NodeNUm \p Number
1250
  class OccursAtOrAfterNode final : public InstructionRule {
1251
  private:
1252
    unsigned Number = 1;
1253

1254
  public:
1255
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1256
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
1257

1258
      return SU->NodeNum >= Number;
1259
    }
1260
    OccursAtOrAfterNode(unsigned Number, const SIInstrInfo *TII, unsigned SGID,
1261
                        bool NeedsCache = false)
1262
        : InstructionRule(TII, SGID, NeedsCache), Number(Number) {}
1263
  };
1264

1265
  /// Whether or not the SU is exactly the \p Number th MFMA in the chain
1266
  /// starting with \p ChainSeed
1267
  class IsExactMFMA final : public InstructionRule {
1268
  private:
1269
    unsigned Number = 1;
1270
    SUnit *ChainSeed;
1271

1272
  public:
1273
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1274
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
1275
      if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr()))
1276
        return false;
1277

1278
      if (Cache->empty()) {
1279
        auto TempSU = ChainSeed;
1280
        auto Depth = Number;
1281
        while (Depth > 0) {
1282
          --Depth;
1283
          bool Found = false;
1284
          for (auto &Succ : TempSU->Succs) {
1285
            if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) {
1286
              TempSU = Succ.getSUnit();
1287
              Found = true;
1288
              break;
1289
            }
1290
          }
1291
          if (!Found) {
1292
            return false;
1293
          }
1294
        }
1295
        Cache->push_back(TempSU);
1296
      }
1297
      // If we failed to find the instruction to be placed into the cache, we
1298
      // would have already exited.
1299
      assert(!Cache->empty());
1300

1301
      return (*Cache)[0] == SU;
1302
    }
1303

1304
    IsExactMFMA(unsigned Number, SUnit *ChainSeed, const SIInstrInfo *TII,
1305
                unsigned SGID, bool NeedsCache = false)
1306
        : InstructionRule(TII, SGID, NeedsCache), Number(Number),
1307
          ChainSeed(ChainSeed) {}
1308
  };
1309

1310
  // Whether the instruction occurs after the first TRANS instruction. This
1311
  // implies the instruction can not be a predecessor of the first TRANS
1312
  // insruction
1313
  class OccursAfterExp final : public InstructionRule {
1314
  public:
1315
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1316
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
1317

1318
      SmallVector<SUnit *, 12> Worklist;
1319
      auto DAG = SyncPipe[0].DAG;
1320
      if (Cache->empty()) {
1321
        for (auto &SU : DAG->SUnits)
1322
          if (TII->isTRANS(SU.getInstr()->getOpcode())) {
1323
            Cache->push_back(&SU);
1324
            break;
1325
          }
1326
        if (Cache->empty())
1327
          return false;
1328
      }
1329

1330
      return SU->NodeNum > (*Cache)[0]->NodeNum;
1331
    }
1332

1333
    OccursAfterExp(const SIInstrInfo *TII, unsigned SGID,
1334
                   bool NeedsCache = false)
1335
        : InstructionRule(TII, SGID, NeedsCache) {}
1336
  };
1337

1338
public:
1339
  bool applyIGLPStrategy(
1340
      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
1341
      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
1342
      AMDGPU::SchedulingPhase Phase) override;
1343

1344
  bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
1345
                           AMDGPU::SchedulingPhase Phase) override;
1346

1347
  MFMAExpInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
1348
      : IGLPStrategy(DAG, TII) {
1349
    IsBottomUp = false;
1350
  }
1351
};
1352

1353
unsigned MFMAExpInterleaveOpt::TransPipeCount = 0;
1354
unsigned MFMAExpInterleaveOpt::MFMAPipeCount = 0;
1355
unsigned MFMAExpInterleaveOpt::AddPipeCount = 0;
1356
unsigned MFMAExpInterleaveOpt::MFMAEnablement = 0;
1357
unsigned MFMAExpInterleaveOpt::ExpRequirement = 0;
1358
unsigned MFMAExpInterleaveOpt::MFMAChains = 0;
1359
unsigned MFMAExpInterleaveOpt::MFMAChainLength = 0;
1360
bool MFMAExpInterleaveOpt::HasCvt = false;
1361
bool MFMAExpInterleaveOpt::HasChainBetweenCvt = false;
1362
std::optional<unsigned> MFMAExpInterleaveOpt::FirstPipeDSR = std::nullopt;
1363

1364
bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
1365
  SmallVector<SUnit *, 10> ExpPipeCands;
1366
  SmallVector<SUnit *, 10> MFMAPipeCands;
1367
  SmallVector<SUnit *, 10> MFMAPipeSUs;
1368
  SmallVector<SUnit *, 10> PackSUs;
1369
  SmallVector<SUnit *, 10> CvtSUs;
1370

1371
  auto isBitPack = [](unsigned Opc) {
1372
    return Opc == AMDGPU::V_PACK_B32_F16_e64 || Opc == AMDGPU::V_PERM_B32_e64;
1373
  };
1374

1375
  auto isCvt = [](unsigned Opc) {
1376
    return Opc == AMDGPU::V_CVT_F16_F32_e32 || Opc == AMDGPU::V_CVT_I32_F32_e32;
1377
  };
1378

1379
  auto isAdd = [](unsigned Opc) { return Opc == AMDGPU::V_ADD_F32_e32; };
1380

1381
  AddPipeCount = 0;
1382
  for (SUnit &SU : DAG->SUnits) {
1383
    auto Opc = SU.getInstr()->getOpcode();
1384
    if (TII->isTRANS(Opc)) {
1385
      // Avoid counting a potential bonus V_EXP which all the MFMA depend on
1386
      if (SU.Succs.size() >= 7)
1387
        continue;
1388
      for (auto &Succ : SU.Succs) {
1389
        if (Succ.getSUnit()->Succs.size() >= 7)
1390
          continue;
1391
      }
1392
      ExpPipeCands.push_back(&SU);
1393
    }
1394

1395
    if (TII->isMFMAorWMMA(*SU.getInstr()))
1396
      MFMAPipeCands.push_back(&SU);
1397

1398
    if (isBitPack(Opc))
1399
      PackSUs.push_back(&SU);
1400

1401
    if (isCvt(Opc))
1402
      CvtSUs.push_back(&SU);
1403

1404
    if (isAdd(Opc))
1405
      ++AddPipeCount;
1406
  }
1407

1408
  if (!(PackSUs.size() && MFMAPipeCands.size() && ExpPipeCands.size()))
1409
    return false;
1410

1411
  TransPipeCount = 0;
1412

1413
  std::optional<SUnit *> TempMFMA;
1414
  std::optional<SUnit *> TempExp;
1415
  // Count the number of EXPs that reach an MFMA
1416
  for (auto &PredSU : ExpPipeCands) {
1417
    for (auto &SuccSU : MFMAPipeCands) {
1418
      if (DAG->IsReachable(SuccSU, PredSU)) {
1419
        if (!TempExp) {
1420
          TempExp = PredSU;
1421
          TempMFMA = SuccSU;
1422
        }
1423
        MFMAPipeSUs.push_back(SuccSU);
1424
        ++TransPipeCount;
1425
        break;
1426
      }
1427
    }
1428
  }
1429

1430
  if (!(TempExp && TempMFMA))
1431
    return false;
1432

1433
  HasChainBetweenCvt =
1434
      std::find_if((*TempExp)->Succs.begin(), (*TempExp)->Succs.end(),
1435
                   [&isCvt](SDep &Succ) {
1436
                     return isCvt(Succ.getSUnit()->getInstr()->getOpcode());
1437
                   }) == (*TempExp)->Succs.end();
1438

1439
  // Count the number of MFMAs that are reached by an EXP
1440
  for (auto &SuccSU : MFMAPipeCands) {
1441
    if (MFMAPipeSUs.size() &&
1442
        std::find_if(MFMAPipeSUs.begin(), MFMAPipeSUs.end(),
1443
                     [&SuccSU](SUnit *PotentialMatch) {
1444
                       return PotentialMatch->NodeNum == SuccSU->NodeNum;
1445
                     }) != MFMAPipeSUs.end())
1446
      continue;
1447

1448
    for (auto &PredSU : ExpPipeCands) {
1449
      if (DAG->IsReachable(SuccSU, PredSU)) {
1450
        MFMAPipeSUs.push_back(SuccSU);
1451
        break;
1452
      }
1453
    }
1454
  }
1455

1456
  MFMAPipeCount = MFMAPipeSUs.size();
1457

1458
  assert(TempExp && TempMFMA);
1459
  assert(MFMAPipeCount > 0);
1460

1461
  std::optional<SUnit *> TempCvt;
1462
  for (auto &SuccSU : CvtSUs) {
1463
    if (DAG->IsReachable(SuccSU, *TempExp)) {
1464
      TempCvt = SuccSU;
1465
      break;
1466
    }
1467
  }
1468

1469
  HasCvt = false;
1470
  if (TempCvt.has_value()) {
1471
    for (auto &SuccSU : MFMAPipeSUs) {
1472
      if (DAG->IsReachable(SuccSU, *TempCvt)) {
1473
        HasCvt = true;
1474
        break;
1475
      }
1476
    }
1477
  }
1478

1479
  MFMAChains = 0;
1480
  for (auto &MFMAPipeSU : MFMAPipeSUs) {
1481
    if (is_contained(MFMAChainSeeds, MFMAPipeSU))
1482
      continue;
1483
    if (!std::any_of(MFMAPipeSU->Preds.begin(), MFMAPipeSU->Preds.end(),
1484
                     [&TII](SDep &Succ) {
1485
                       return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
1486
                     })) {
1487
      MFMAChainSeeds.push_back(MFMAPipeSU);
1488
      ++MFMAChains;
1489
    }
1490
  }
1491

1492
  if (!MFMAChains)
1493
    return false;
1494

1495
  for (auto Pred : MFMAChainSeeds[0]->Preds) {
1496
    if (TII->isDS(Pred.getSUnit()->getInstr()->getOpcode()) &&
1497
        Pred.getSUnit()->getInstr()->mayLoad())
1498
      FirstPipeDSR = Pred.getSUnit()->NodeNum;
1499
  }
1500

1501
  MFMAChainLength = MFMAPipeCount / MFMAChains;
1502

1503
  // The number of bit pack operations that depend on a single V_EXP
1504
  unsigned PackSuccCount = std::count_if(
1505
      PackSUs.begin(), PackSUs.end(), [this, &TempExp](SUnit *VPack) {
1506
        return DAG->IsReachable(VPack, *TempExp);
1507
      });
1508

1509
  // The number of bit pack operations an MFMA depends on
1510
  unsigned PackPredCount =
1511
      std::count_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(),
1512
                    [&isBitPack](SDep &Pred) {
1513
                      auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
1514
                      return isBitPack(Opc);
1515
                    });
1516

1517
  auto PackPred =
1518
      std::find_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(),
1519
                   [&isBitPack](SDep &Pred) {
1520
                     auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
1521
                     return isBitPack(Opc);
1522
                   });
1523

1524
  if (PackPred == (*TempMFMA)->Preds.end())
1525
    return false;
1526

1527
  MFMAEnablement = 0;
1528
  ExpRequirement = 0;
1529
  // How many MFMAs depend on a single bit pack operation
1530
  MFMAEnablement =
1531
      std::count_if(PackPred->getSUnit()->Succs.begin(),
1532
                    PackPred->getSUnit()->Succs.end(), [&TII](SDep &Succ) {
1533
                      return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
1534
                    });
1535

1536
  // The number of MFMAs that depend on a single V_EXP
1537
  MFMAEnablement *= PackSuccCount;
1538

1539
  // The number of V_EXPs required to resolve all dependencies for an MFMA
1540
  ExpRequirement =
1541
      std::count_if(ExpPipeCands.begin(), ExpPipeCands.end(),
1542
                    [this, &PackPred](SUnit *ExpBase) {
1543
                      return DAG->IsReachable(PackPred->getSUnit(), ExpBase);
1544
                    });
1545

1546
  ExpRequirement *= PackPredCount;
1547
  return true;
1548
}
1549

1550
bool MFMAExpInterleaveOpt::shouldApplyStrategy(ScheduleDAGInstrs *DAG,
1551
                                               AMDGPU::SchedulingPhase Phase) {
1552
  const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
1553
  const SIInstrInfo *TII = ST.getInstrInfo();
1554

1555
  if (Phase != AMDGPU::SchedulingPhase::PostRA)
1556
    MFMAChainSeeds.clear();
1557
  if (Phase != AMDGPU::SchedulingPhase::PostRA && !analyzeDAG(TII))
1558
    return false;
1559

1560
  return true;
1561
}
1562

1563
bool MFMAExpInterleaveOpt::applyIGLPStrategy(
1564
    DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
1565
    DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
1566
    AMDGPU::SchedulingPhase Phase) {
1567

1568
  bool IsSmallKernelType =
1569
      MFMAEnablement == 2 && ExpRequirement == 4 && TransPipeCount == 32;
1570
  bool IsLargeKernelType =
1571
      MFMAEnablement == 4 && ExpRequirement == 4 && TransPipeCount == 64;
1572

1573
  if (!(IsSmallKernelType || IsLargeKernelType))
1574
    return false;
1575

1576
  const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
1577
  const SIInstrInfo *TII = ST.getInstrInfo();
1578

1579
  unsigned PipelineSyncID = 0;
1580
  SchedGroup *SG = nullptr;
1581

1582
  unsigned MFMAChain = 0;
1583
  unsigned PositionInChain = 0;
1584
  unsigned CurrMFMAForTransPosition = 0;
1585

1586
  auto incrementTransPosition = [&MFMAChain, &PositionInChain,
1587
                                 &CurrMFMAForTransPosition]() {
1588
    CurrMFMAForTransPosition += MFMAEnablement;
1589
    PositionInChain = (CurrMFMAForTransPosition / MFMAChains);
1590
    MFMAChain = CurrMFMAForTransPosition % MFMAChains;
1591
  };
1592

1593
  auto getNextTransPositionInChain = [&CurrMFMAForTransPosition]() {
1594
    auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
1595
    return (TempMFMAForTrans / MFMAChains);
1596
  };
1597

1598
  auto getNextTransMFMAChain = [&CurrMFMAForTransPosition]() {
1599
    auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
1600
    return TempMFMAForTrans % MFMAChains;
1601
  };
1602

1603
  unsigned CurrMFMAPosition = 0;
1604
  unsigned MFMAChainForMFMA = 0;
1605
  unsigned PositionInChainForMFMA = 0;
1606

1607
  auto incrementMFMAPosition = [&CurrMFMAPosition, &MFMAChainForMFMA,
1608
                                &PositionInChainForMFMA]() {
1609
    ++CurrMFMAPosition;
1610
    MFMAChainForMFMA = CurrMFMAPosition % MFMAChains;
1611
    PositionInChainForMFMA = CurrMFMAPosition / MFMAChains;
1612
  };
1613

1614
  bool IsPostRA = Phase == AMDGPU::SchedulingPhase::PostRA;
1615
  assert(IsPostRA || MFMAChainSeeds.size() == MFMAChains);
1616

1617
  bool UsesFMA = IsSmallKernelType || !IsPostRA;
1618
  bool UsesDSRead = IsLargeKernelType && !IsPostRA && FirstPipeDSR;
1619
  bool UsesCvt = HasCvt && (IsSmallKernelType || !IsPostRA);
1620
  bool UsesVALU = IsSmallKernelType;
1621

1622
  // PHASE 1: "Prefetch"
1623
  if (UsesFMA) {
1624
    // First Round FMA
1625
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1626
        SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII);
1627
    if (!IsPostRA && MFMAChains) {
1628
      SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1629
          PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),
1630
          true));
1631
    } else
1632
      SG->addRule(
1633
          std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true));
1634
    SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
1635
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1636

1637
    // Second Round FMA
1638
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1639
        SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII);
1640
    if (!IsPostRA && MFMAChains) {
1641
      SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1642
          getNextTransPositionInChain(),
1643
          MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true));
1644
    } else
1645
      SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1, TII,
1646
                                                   SG->getSGID(), true));
1647
    SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
1648
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1649
  }
1650

1651
  if (UsesDSRead) {
1652
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1653
        SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII);
1654
    SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR, TII,
1655
                                                      SG->getSGID()));
1656
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1657
  }
1658

1659
  // First Round EXP
1660
  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1661
      SchedGroupMask::TRANS, ExpRequirement, PipelineSyncID, DAG, TII);
1662
  if (!IsPostRA && MFMAChains)
1663
    SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1664
        PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(), true));
1665
  else
1666
    SG->addRule(std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true));
1667
  SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
1668
  SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
1669
                                               HasChainBetweenCvt));
1670
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1671

1672
  incrementTransPosition();
1673

1674
  // First Round CVT, Third Round FMA, Second Round EXP; interleaved
1675
  for (unsigned I = 0; I < ExpRequirement; I++) {
1676
    // First Round CVT
1677
    if (UsesCvt) {
1678
      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1679
          SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
1680
      SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID()));
1681
      if (HasChainBetweenCvt)
1682
        SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
1683
            1 + (2 + UsesFMA) * I, TII, SG->getSGID()));
1684
      else
1685
        SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(
1686
            1 + (2 + UsesFMA) * I, TII, SG->getSGID()));
1687
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1688
    }
1689

1690
    // Third Round FMA
1691
    if (UsesFMA) {
1692
      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1693
          SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
1694
      if (!IsPostRA && MFMAChains) {
1695
        SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1696
            getNextTransPositionInChain(),
1697
            MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true));
1698
      } else
1699
        SG->addRule(std::make_shared<EnablesNthMFMA>(2 * MFMAEnablement + 1,
1700
                                                     TII, SG->getSGID(), true));
1701
      SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
1702
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1703
    }
1704

1705
    // Second Round EXP
1706
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1707
        SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
1708
    if (!IsPostRA && MFMAChains)
1709
      SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1710
          PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),
1711
          true));
1712
    else
1713
      SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1, TII,
1714
                                                   SG->getSGID(), true));
1715
    SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
1716
    SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
1717
                                                 HasChainBetweenCvt));
1718
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1719
  }
1720

1721
  // The "extra" EXP which enables all MFMA
1722
  // TODO: UsesExtraExp
1723
  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1724
      SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
1725
  SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
1726
  SG->addRule(std::make_shared<GreaterThanOrEqualToNSuccs>(
1727
      8, TII, SG->getSGID(), HasChainBetweenCvt));
1728
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1729

1730
  // PHASE 2: Main Interleave Loop
1731

1732
  // The number of MFMAs per iteration
1733
  unsigned MFMARatio =
1734
      MFMAEnablement > ExpRequirement ? MFMAEnablement / ExpRequirement : 1;
1735
  // The number of Exps per iteration
1736
  unsigned ExpRatio =
1737
      MFMAEnablement > ExpRequirement ? 1 : ExpRequirement / MFMAEnablement;
1738
  // The reamaining Exps
1739
  unsigned RemainingExp = TransPipeCount > (2 * ExpRequirement)
1740
                              ? TransPipeCount - (2 * ExpRequirement)
1741
                              : 0;
1742
  unsigned ExpLoopCount = RemainingExp / ExpRatio;
1743
  // In loop MFMAs
1744
  unsigned MFMAInLoop = MFMAPipeCount > (MFMAEnablement * 2)
1745
                            ? MFMAPipeCount - (MFMAEnablement * 2)
1746
                            : 0;
1747
  unsigned MFMALoopCount = MFMAInLoop / MFMARatio;
1748
  unsigned VALUOps =
1749
      AddPipeCount < MFMAPipeCount ? 1 : AddPipeCount / MFMAPipeCount;
1750
  unsigned LoopSize = std::min(ExpLoopCount, MFMALoopCount);
1751

1752
  for (unsigned I = 0; I < LoopSize; I++) {
1753
    if (!(I * ExpRatio % ExpRequirement))
1754
      incrementTransPosition();
1755

1756
    // Round N MFMA
1757
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1758
        SchedGroupMask::MFMA, MFMARatio, PipelineSyncID, DAG, TII);
1759
    if (!IsPostRA && MFMAChains)
1760
      SG->addRule(std::make_shared<IsExactMFMA>(
1761
          PositionInChainForMFMA, MFMAChainSeeds[MFMAChainForMFMA], TII,
1762
          SG->getSGID(), true));
1763
    else
1764
      SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true));
1765
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1766
    incrementMFMAPosition();
1767

1768
    if (UsesVALU) {
1769
      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1770
          SchedGroupMask::VALU, VALUOps, PipelineSyncID, DAG, TII);
1771
      SG->addRule(std::make_shared<IsPipeAdd>(TII, SG->getSGID()));
1772
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1773
    }
1774

1775
    if (UsesDSRead && !(I % 4)) {
1776
      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1777
          SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII);
1778
      SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR, TII,
1779
                                                        SG->getSGID()));
1780
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1781
    }
1782

1783
    // CVT, EXP, FMA Interleaving
1784
    for (unsigned J = 0; J < ExpRatio; J++) {
1785
      auto MFMAOffset = (1 + UsesVALU) * MFMARatio * (I + 1);
1786
      auto MaxMFMAOffset =
1787
          (1 + UsesVALU) * ExpRequirement * MFMARatio / ExpRatio;
1788

1789
      // Round N + 1 CVT
1790
      if (UsesCvt) {
1791
        SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1792
            SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
1793
        SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID()));
1794
        auto BaseDiff = (2 + UsesFMA) * (ExpRequirement - 1) + 1;
1795
        auto DSROffset = I / 4 + 1;
1796
        auto MaxDSROffset = MaxMFMAOffset / 4;
1797
        // TODO: UsesExtraExp
1798
        auto ExpOffset = I * ExpRatio + J >= ExpRequirement ? 0 : 1;
1799
        auto CurrentOffset = UsesDSRead * std::min(MaxDSROffset, DSROffset) +
1800
                             std::min(MaxMFMAOffset, MFMAOffset) + BaseDiff +
1801
                             ExpOffset;
1802
        if (HasChainBetweenCvt)
1803
          SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
1804
              CurrentOffset, TII, SG->getSGID()));
1805
        else
1806
          SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(CurrentOffset, TII,
1807
                                                             SG->getSGID()));
1808
        SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1809
      }
1810

1811
      // Round N + 3 FMA
1812
      if (UsesFMA) {
1813
        SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1814
            SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
1815
        if (!IsPostRA && MFMAChains)
1816
          SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1817
              getNextTransPositionInChain(),
1818
              MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(),
1819
              true));
1820
        else
1821
          SG->addRule(std::make_shared<EnablesNthMFMA>(
1822
              (((I * ExpRatio + J) / ExpRequirement) + 3) * MFMAEnablement + 1,
1823
              TII, SG->getSGID(), true));
1824
        SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
1825
        SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1826
      }
1827

1828
      // Round N + 2 Exp
1829
      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1830
          SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
1831
      if (!IsPostRA && MFMAChains)
1832
        SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1833
            PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),
1834
            true));
1835
      else
1836
        SG->addRule(std::make_shared<EnablesNthMFMA>(
1837
            (((I * ExpRatio + J) / ExpRequirement) + 2) * MFMAEnablement + 1,
1838
            TII, SG->getSGID(), true));
1839
      SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
1840
      SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
1841
                                                   HasChainBetweenCvt));
1842
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1843
    }
1844
  }
1845

1846
  // PHASE 3: Remaining MFMAs
1847
  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1848
      SchedGroupMask::MFMA, MFMAEnablement * 2, PipelineSyncID, DAG, TII);
1849
  SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true));
1850
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1851
  return true;
1852
}
1853

1854
class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
1855
private:
1856
  // Whether the DS_READ is a predecessor of first four MFMA in region
1857
  class EnablesInitialMFMA final : public InstructionRule {
1858
  public:
1859
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1860
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
1861
      if (!SyncPipe.size())
1862
        return false;
1863
      int MFMAsFound = 0;
1864
      if (!Cache->size()) {
1865
        for (auto &Elt : SyncPipe[0].DAG->SUnits) {
1866
          if (TII->isMFMAorWMMA(*Elt.getInstr())) {
1867
            ++MFMAsFound;
1868
            if (MFMAsFound > 4)
1869
              break;
1870
            Cache->push_back(&Elt);
1871
          }
1872
        }
1873
      }
1874

1875
      assert(Cache->size());
1876
      auto DAG = SyncPipe[0].DAG;
1877
      for (auto &Elt : *Cache) {
1878
        if (DAG->IsReachable(Elt, const_cast<SUnit *>(SU)))
1879
          return true;
1880
      }
1881
      return false;
1882
    }
1883

1884
    EnablesInitialMFMA(const SIInstrInfo *TII, unsigned SGID,
1885
                       bool NeedsCache = false)
1886
        : InstructionRule(TII, SGID, NeedsCache) {}
1887
  };
1888

1889
  // Whether the MI is a V_PERM and is a predecessor of a common DS_WRITE
1890
  class IsPermForDSW final : public InstructionRule {
1891
  public:
1892
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1893
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
1894
      auto MI = SU->getInstr();
1895
      if (MI->getOpcode() != AMDGPU::V_PERM_B32_e64)
1896
        return false;
1897

1898
      bool FitsInGroup = false;
1899
      // Does the VALU have a DS_WRITE successor
1900
      if (!Collection.size()) {
1901
        for (auto &Succ : SU->Succs) {
1902
          SUnit *SuccUnit = Succ.getSUnit();
1903
          if (TII->isDS(*SuccUnit->getInstr()) &&
1904
              SuccUnit->getInstr()->mayStore()) {
1905
            Cache->push_back(SuccUnit);
1906
            FitsInGroup = true;
1907
          }
1908
        }
1909
        return FitsInGroup;
1910
      }
1911

1912
      assert(Cache->size());
1913

1914
      // Does the VALU have a DS_WRITE successor that is the same as other
1915
      // VALU already in the group. The V_PERMs will all share 1 DS_W succ
1916
      return llvm::any_of(*Cache, [&SU](SUnit *Elt) {
1917
        return llvm::any_of(SU->Succs, [&Elt](const SDep &ThisSucc) {
1918
          return ThisSucc.getSUnit() == Elt;
1919
        });
1920
      });
1921
    }
1922

1923
    IsPermForDSW(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
1924
        : InstructionRule(TII, SGID, NeedsCache) {}
1925
  };
1926

1927
  // Whether the SU is a successor of any element in previous SchedGroup
1928
  class IsSuccOfPrevGroup final : public InstructionRule {
1929
  public:
1930
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1931
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
1932
      SchedGroup *OtherGroup = nullptr;
1933
      for (auto &PipeSG : SyncPipe) {
1934
        if ((unsigned)PipeSG.getSGID() == SGID - 1) {
1935
          OtherGroup = &PipeSG;
1936
        }
1937
      }
1938

1939
      if (!OtherGroup)
1940
        return false;
1941
      if (!OtherGroup->Collection.size())
1942
        return true;
1943

1944
      // Does the previous VALU have this DS_Write as a successor
1945
      return (std::any_of(OtherGroup->Collection.begin(),
1946
                          OtherGroup->Collection.end(), [&SU](SUnit *Elt) {
1947
                            return std::any_of(Elt->Succs.begin(),
1948
                                               Elt->Succs.end(),
1949
                                               [&SU](SDep &Succ) {
1950
                                                 return Succ.getSUnit() == SU;
1951
                                               });
1952
                          }));
1953
    }
1954
    IsSuccOfPrevGroup(const SIInstrInfo *TII, unsigned SGID,
1955
                      bool NeedsCache = false)
1956
        : InstructionRule(TII, SGID, NeedsCache) {}
1957
  };
1958

1959
  // Whether the combined load width of group is 128 bits
1960
  class VMEMSize final : public InstructionRule {
1961
  public:
1962
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1963
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
1964
      auto MI = SU->getInstr();
1965
      if (MI->getOpcode() == TargetOpcode::BUNDLE)
1966
        return false;
1967
      if (!Collection.size())
1968
        return true;
1969

1970
      int NumBits = 0;
1971

1972
      auto TRI = TII->getRegisterInfo();
1973
      auto &MRI = MI->getParent()->getParent()->getRegInfo();
1974
      for (auto &Elt : Collection) {
1975
        auto Op = Elt->getInstr()->getOperand(0);
1976
        auto Size =
1977
            TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(MRI, Op));
1978
        NumBits += Size;
1979
      }
1980

1981
      if (NumBits < 128) {
1982
        assert(TII->isVMEM(*MI) && MI->mayLoad());
1983
        if (NumBits + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(
1984
                          MRI, MI->getOperand(0))) <=
1985
            128)
1986
          return true;
1987
      }
1988

1989
      return false;
1990
    }
1991

1992
    VMEMSize(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
1993
        : InstructionRule(TII, SGID, NeedsCache) {}
1994
  };
1995

1996
  /// Whether the SU shares a V_PERM predecessor with any SU in the SchedGroup
1997
  /// that is \p Distance steps away
1998
  class SharesPredWithPrevNthGroup final : public InstructionRule {
1999
  private:
2000
    unsigned Distance = 1;
2001

2002
  public:
2003
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
2004
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
2005
      SchedGroup *OtherGroup = nullptr;
2006
      if (!SyncPipe.size())
2007
        return false;
2008

2009
      if (!Cache->size()) {
2010

2011
        for (auto &PipeSG : SyncPipe) {
2012
          if ((unsigned)PipeSG.getSGID() == SGID - Distance) {
2013
            OtherGroup = &PipeSG;
2014
          }
2015
        }
2016

2017
        if (!OtherGroup)
2018
          return false;
2019
        if (!OtherGroup->Collection.size())
2020
          return true;
2021

2022
        for (auto &OtherEle : OtherGroup->Collection) {
2023
          for (auto &Pred : OtherEle->Preds) {
2024
            if (Pred.getSUnit()->getInstr()->getOpcode() ==
2025
                AMDGPU::V_PERM_B32_e64)
2026
              Cache->push_back(Pred.getSUnit());
2027
          }
2028
        }
2029

2030
        // If the other group has no PERM preds, then this group won't share any
2031
        if (!Cache->size())
2032
          return false;
2033
      }
2034

2035
      auto DAG = SyncPipe[0].DAG;
2036
      // Does the previous DS_WRITE share a V_PERM predecessor with this
2037
      // VMEM_READ
2038
      return llvm::any_of(*Cache, [&SU, &DAG](SUnit *Elt) {
2039
        return DAG->IsReachable(const_cast<SUnit *>(SU), Elt);
2040
      });
2041
    }
2042
    SharesPredWithPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
2043
                               unsigned SGID, bool NeedsCache = false)
2044
        : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
2045
  };
2046

2047
public:
2048
  bool applyIGLPStrategy(
2049
      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
2050
      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
2051
      AMDGPU::SchedulingPhase Phase) override;
2052

2053
  bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
2054
                           AMDGPU::SchedulingPhase Phase) override {
2055
    return true;
2056
  }
2057

2058
  MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
2059
      : IGLPStrategy(DAG, TII) {
2060
    IsBottomUp = false;
2061
  }
2062
};
2063

2064
static unsigned DSWCount = 0;
2065
static unsigned DSWWithPermCount = 0;
2066
static unsigned DSWWithSharedVMEMCount = 0;
2067

2068
bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
2069
    DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
2070
    DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
2071
    AMDGPU::SchedulingPhase Phase) {
2072
  unsigned MFMACount = 0;
2073
  unsigned DSRCount = 0;
2074

2075
  bool IsInitial = Phase == AMDGPU::SchedulingPhase::Initial;
2076

2077
  assert((!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 &&
2078
                         DSWWithSharedVMEMCount == 0)) &&
2079
         "DSWCounters should be zero in pre-RA scheduling!");
2080
  SmallVector<SUnit *, 6> DSWithPerms;
2081
  for (auto &SU : DAG->SUnits) {
2082
    auto I = SU.getInstr();
2083
    if (TII->isMFMAorWMMA(*I))
2084
      ++MFMACount;
2085
    else if (TII->isDS(*I)) {
2086
      if (I->mayLoad())
2087
        ++DSRCount;
2088
      else if (I->mayStore() && IsInitial) {
2089
        ++DSWCount;
2090
        for (auto Pred : SU.Preds) {
2091
          if (Pred.getSUnit()->getInstr()->getOpcode() ==
2092
              AMDGPU::V_PERM_B32_e64) {
2093
            DSWithPerms.push_back(&SU);
2094
            break;
2095
          }
2096
        }
2097
      }
2098
    }
2099
  }
2100

2101
  if (IsInitial) {
2102
    DSWWithPermCount = DSWithPerms.size();
2103
    auto I = DSWithPerms.begin();
2104
    auto E = DSWithPerms.end();
2105

2106
    // Get the count of DS_WRITES with V_PERM predecessors which
2107
    // have loop carried dependencies (WAR) on the same VMEM_READs.
2108
    // We consider partial overlap as a miss -- in other words,
2109
    // for a given DS_W, we only consider another DS_W as matching
2110
    // if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred
2111
    // for every V_PERM pred of this DS_W.
2112
    DenseMap<MachineInstr *, SUnit *> VMEMLookup;
2113
    SmallVector<SUnit *, 6> Counted;
2114
    for (; I != E; I++) {
2115
      SUnit *Cand = nullptr;
2116
      bool MissedAny = false;
2117
      for (auto &Pred : (*I)->Preds) {
2118
        if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
2119
          continue;
2120

2121
        if (Cand && llvm::is_contained(Counted, Cand))
2122
          break;
2123

2124
        for (auto &Succ : Pred.getSUnit()->Succs) {
2125
          auto MI = Succ.getSUnit()->getInstr();
2126
          if (!TII->isVMEM(*MI) || !MI->mayLoad())
2127
            continue;
2128

2129
          if (MissedAny || !VMEMLookup.size()) {
2130
            MissedAny = true;
2131
            VMEMLookup[MI] = *I;
2132
            continue;
2133
          }
2134

2135
          if (!VMEMLookup.contains(MI)) {
2136
            MissedAny = true;
2137
            VMEMLookup[MI] = *I;
2138
            continue;
2139
          }
2140

2141
          Cand = VMEMLookup[MI];
2142
          if (llvm::is_contained(Counted, Cand)) {
2143
            MissedAny = true;
2144
            break;
2145
          }
2146
        }
2147
      }
2148
      if (!MissedAny && Cand) {
2149
        DSWWithSharedVMEMCount += 2;
2150
        Counted.push_back(Cand);
2151
        Counted.push_back(*I);
2152
      }
2153
    }
2154
  }
2155

2156
  assert(DSWWithSharedVMEMCount <= DSWWithPermCount);
2157
  SchedGroup *SG;
2158
  unsigned PipelineSyncID = 0;
2159
  // For kernels with V_PERM, there are enough VALU to mix in between MFMAs
2160
  if (DSWWithPermCount) {
2161
    for (unsigned I = 0; I < MFMACount; I++) {
2162
      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2163
          SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2164
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2165

2166
      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2167
          SchedGroupMask::VALU, 2, PipelineSyncID, DAG, TII);
2168
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2169
    }
2170
  }
2171

2172
  PipelineSyncID = 1;
2173
  // Phase 1: Break up DS_READ and MFMA clusters.
2174
  // First DS_READ to make ready initial MFMA, then interleave MFMA with DS_READ
2175
  // prefetch
2176

2177
  // Make ready initial MFMA
2178
  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2179
      SchedGroupMask::DS_READ, 4, PipelineSyncID, DAG, TII);
2180
  SG->addRule(std::make_shared<EnablesInitialMFMA>(TII, SG->getSGID(), true));
2181
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2182

2183
  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2184
      SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2185
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2186

2187
  // Interleave MFMA with DS_READ prefetch
2188
  for (unsigned I = 0; I < DSRCount - 4; ++I) {
2189
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2190
        SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII);
2191
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2192

2193
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2194
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2195
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2196
  }
2197

2198
  // Phase 2a: Loop carried dependency with V_PERM
2199
  // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they
2200
  // depend on. Interleave MFMA to keep XDL unit busy throughout.
2201
  for (unsigned I = 0; I < DSWWithPermCount - DSWWithSharedVMEMCount; ++I) {
2202
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2203
        SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
2204
    SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
2205
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2206

2207
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2208
        SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
2209
    SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
2210
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2211

2212
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2213
        SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
2214
    SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2215
        1, TII, SG->getSGID(), true));
2216
    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
2217
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2218

2219
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2220
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2221
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2222

2223
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2224
        SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
2225
    SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2226
        3, TII, SG->getSGID(), true));
2227
    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
2228
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2229

2230
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2231
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2232
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2233
  }
2234

2235
  // Phase 2b: Loop carried dependency without V_PERM
2236
  // Schedule DS_WRITE as closely as possible to the VMEM_READ they depend on.
2237
  // Interleave MFMA to keep XDL unit busy throughout.
2238
  for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) {
2239
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2240
        SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
2241
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2242

2243
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2244
        SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
2245
    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
2246
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2247

2248
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2249
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2250
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2251
  }
2252

2253
  // Phase 2c: Loop carried dependency with V_PERM, VMEM_READs are
2254
  // ultimately used by two DS_WRITE
2255
  // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they
2256
  // depend on. Interleave MFMA to keep XDL unit busy throughout.
2257

2258
  for (unsigned I = 0; I < DSWWithSharedVMEMCount; ++I) {
2259
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2260
        SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
2261
    SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
2262
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2263

2264
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2265
        SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
2266
    SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
2267
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2268

2269
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2270
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2271
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2272

2273
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2274
        SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
2275
    SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
2276
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2277

2278
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2279
        SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
2280
    SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
2281
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2282

2283
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2284
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2285
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2286

2287
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2288
        SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
2289
    SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2290
        2, TII, SG->getSGID(), true));
2291
    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
2292
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2293

2294
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2295
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2296
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2297

2298
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2299
        SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
2300
    SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2301
        4, TII, SG->getSGID(), true));
2302
    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
2303
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2304

2305
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2306
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2307
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2308
  }
2309

2310
  return true;
2311
}
2312

2313
static std::unique_ptr<IGLPStrategy>
2314
createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
2315
                   const SIInstrInfo *TII) {
2316
  switch (ID) {
2317
  case MFMASmallGemmOptID:
2318
    return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
2319
  case MFMASmallGemmSingleWaveOptID:
2320
    return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII);
2321
  case MFMAExpInterleave:
2322
    return std::make_unique<MFMAExpInterleaveOpt>(DAG, TII);
2323
  }
2324

2325
  llvm_unreachable("Unknown IGLPStrategyID");
2326
}
2327

2328
class IGroupLPDAGMutation : public ScheduleDAGMutation {
2329
private:
2330
  const SIInstrInfo *TII;
2331

2332
  ScheduleDAGMI *DAG;
2333

2334
  // Organize lists of SchedGroups by their SyncID. SchedGroups /
2335
  // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added
2336
  // between then.
2337
  DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
2338

2339
  // Used to track instructions that can be mapped to multiple sched groups
2340
  DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
2341

2342
  // Add DAG edges that enforce SCHED_BARRIER ordering.
2343
  void addSchedBarrierEdges(SUnit &SU);
2344

2345
  // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should
2346
  // not be reordered accross the SCHED_BARRIER. This is used for the base
2347
  // SCHED_BARRIER, and not SCHED_GROUP_BARRIER. The difference is that
2348
  // SCHED_BARRIER will always block all instructions that can be classified
2349
  // into a particular SchedClass, whereas SCHED_GROUP_BARRIER has a fixed size
2350
  // and may only synchronize with some SchedGroups. Returns the inverse of
2351
  // Mask. SCHED_BARRIER's mask describes which instruction types should be
2352
  // allowed to be scheduled across it. Invert the mask to get the
2353
  // SchedGroupMask of instructions that should be barred.
2354
  SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const;
2355

2356
  // Create SchedGroups for a SCHED_GROUP_BARRIER.
2357
  void initSchedGroupBarrierPipelineStage(
2358
      std::vector<SUnit>::reverse_iterator RIter);
2359

2360
  bool initIGLPOpt(SUnit &SU);
2361

2362
public:
2363
  void apply(ScheduleDAGInstrs *DAGInstrs) override;
2364

2365
  // The order in which the PipelineSolver should process the candidate
2366
  // SchedGroup for a PipelineInstr. BOTTOM_UP will try to add SUs to the last
2367
  // created SchedGroup first, and will consider that as the ultimate
2368
  // predecessor group when linking. TOP_DOWN instead links and processes the
2369
  // first created SchedGroup first.
2370
  bool IsBottomUp = true;
2371

2372
  // The scheduling phase this application of IGLP corresponds with.
2373
  AMDGPU::SchedulingPhase Phase = AMDGPU::SchedulingPhase::Initial;
2374

2375
  IGroupLPDAGMutation() = default;
2376
  IGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) : Phase(Phase) {}
2377
};
2378

2379
unsigned SchedGroup::NumSchedGroups = 0;
2380

2381
bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) {
2382
  if (A != B && DAG->canAddEdge(B, A)) {
2383
    DAG->addEdge(B, SDep(A, SDep::Artificial));
2384
    return true;
2385
  }
2386
  return false;
2387
}
2388

2389
bool SchedGroup::canAddMI(const MachineInstr &MI) const {
2390
  bool Result = false;
2391
  if (MI.isMetaInstruction())
2392
    Result = false;
2393

2394
  else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
2395
           (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) ||
2396
            TII->isTRANS(MI)))
2397
    Result = true;
2398

2399
  else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
2400
           TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI))
2401
    Result = true;
2402

2403
  else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
2404
           TII->isSALU(MI))
2405
    Result = true;
2406

2407
  else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
2408
           TII->isMFMAorWMMA(MI))
2409
    Result = true;
2410

2411
  else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
2412
           (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
2413
    Result = true;
2414

2415
  else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
2416
           MI.mayLoad() &&
2417
           (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
2418
    Result = true;
2419

2420
  else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
2421
           MI.mayStore() &&
2422
           (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
2423
    Result = true;
2424

2425
  else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
2426
           TII->isDS(MI))
2427
    Result = true;
2428

2429
  else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) &&
2430
           MI.mayLoad() && TII->isDS(MI))
2431
    Result = true;
2432

2433
  else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) &&
2434
           MI.mayStore() && TII->isDS(MI))
2435
    Result = true;
2436

2437
  else if (((SGMask & SchedGroupMask::TRANS) != SchedGroupMask::NONE) &&
2438
           TII->isTRANS(MI))
2439
    Result = true;
2440

2441
  LLVM_DEBUG(
2442
      dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
2443
             << (Result ? " could classify " : " unable to classify ") << MI);
2444

2445
  return Result;
2446
}
2447

2448
int SchedGroup::link(SUnit &SU, bool MakePred,
2449
                     std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
2450
  int MissedEdges = 0;
2451
  for (auto *A : Collection) {
2452
    SUnit *B = &SU;
2453
    if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
2454
      continue;
2455
    if (MakePred)
2456
      std::swap(A, B);
2457

2458
    if (DAG->IsReachable(B, A))
2459
      continue;
2460

2461
    // tryAddEdge returns false if there is a dependency that makes adding
2462
    // the A->B edge impossible, otherwise it returns true;
2463
    bool Added = tryAddEdge(A, B);
2464
    if (Added)
2465
      AddedEdges.emplace_back(A, B);
2466
    else
2467
      ++MissedEdges;
2468
  }
2469

2470
  return MissedEdges;
2471
}
2472

2473
void SchedGroup::link(SUnit &SU, bool MakePred) {
2474
  for (auto *A : Collection) {
2475
    SUnit *B = &SU;
2476
    if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
2477
      continue;
2478
    if (MakePred)
2479
      std::swap(A, B);
2480

2481
    tryAddEdge(A, B);
2482
  }
2483
}
2484

2485
void SchedGroup::link(SUnit &SU,
2486
                      function_ref<bool(const SUnit *A, const SUnit *B)> P) {
2487
  for (auto *A : Collection) {
2488
    SUnit *B = &SU;
2489
    if (P(A, B))
2490
      std::swap(A, B);
2491

2492
    tryAddEdge(A, B);
2493
  }
2494
}
2495

2496
void SchedGroup::link(SchedGroup &OtherGroup) {
2497
  for (auto *B : OtherGroup.Collection)
2498
    link(*B);
2499
}
2500

2501
bool SchedGroup::canAddSU(SUnit &SU) const {
2502
  MachineInstr &MI = *SU.getInstr();
2503
  if (MI.getOpcode() != TargetOpcode::BUNDLE)
2504
    return canAddMI(MI);
2505

2506
  // Special case for bundled MIs.
2507
  const MachineBasicBlock *MBB = MI.getParent();
2508
  MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B;
2509
  while (E != MBB->end() && E->isBundledWithPred())
2510
    ++E;
2511

2512
  // Return true if all of the bundled MIs can be added to this group.
2513
  return std::all_of(B, E, [this](MachineInstr &MI) { return canAddMI(MI); });
2514
}
2515

2516
void SchedGroup::initSchedGroup() {
2517
  for (auto &SU : DAG->SUnits) {
2518
    if (isFull())
2519
      break;
2520

2521
    if (canAddSU(SU))
2522
      add(SU);
2523
  }
2524
}
2525

2526
void SchedGroup::initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
2527
                                SUnitsToCandidateSGsMap &SyncedInstrs) {
2528
  SUnit &InitSU = *RIter;
2529
  for (auto E = DAG->SUnits.rend(); RIter != E; ++RIter) {
2530
    auto &SU = *RIter;
2531
    if (isFull())
2532
      break;
2533

2534
    if (canAddSU(SU))
2535
      SyncedInstrs[&SU].push_back(SGID);
2536
  }
2537

2538
  add(InitSU);
2539
  assert(MaxSize);
2540
  (*MaxSize)++;
2541
}
2542

2543
void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) {
2544
  auto I = DAG->SUnits.rbegin();
2545
  auto E = DAG->SUnits.rend();
2546
  for (; I != E; ++I) {
2547
    auto &SU = *I;
2548
    if (isFull())
2549
      break;
2550
    if (canAddSU(SU))
2551
      SyncedInstrs[&SU].push_back(SGID);
2552
  }
2553
}
2554

2555
void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
2556
  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
2557
  if (!TSchedModel || DAGInstrs->SUnits.empty())
2558
    return;
2559

2560
  LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n");
2561
  const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
2562
  TII = ST.getInstrInfo();
2563
  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
2564
  SyncedSchedGroups.clear();
2565
  SyncedInstrs.clear();
2566
  bool FoundSB = false;
2567
  bool FoundIGLP = false;
2568
  bool ShouldApplyIGLP = false;
2569
  for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) {
2570
    unsigned Opc = R->getInstr()->getOpcode();
2571
    // SCHED_[GROUP_]BARRIER and IGLP are mutually exclusive.
2572
    if (Opc == AMDGPU::SCHED_BARRIER) {
2573
      addSchedBarrierEdges(*R);
2574
      FoundSB = true;
2575
    } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) {
2576
      initSchedGroupBarrierPipelineStage(R);
2577
      FoundSB = true;
2578
    } else if (Opc == AMDGPU::IGLP_OPT) {
2579
      resetEdges(*R, DAG);
2580
      if (!FoundSB && !FoundIGLP) {
2581
        FoundIGLP = true;
2582
        ShouldApplyIGLP = initIGLPOpt(*R);
2583
      }
2584
    }
2585
  }
2586

2587
  if (FoundSB || (FoundIGLP && ShouldApplyIGLP)) {
2588
    PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp);
2589
    // PipelineSolver performs the mutation by adding the edges it
2590
    // determined as the best
2591
    PS.solve();
2592
    return;
2593
  }
2594
}
2595

2596
void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
2597
  MachineInstr &MI = *SchedBarrier.getInstr();
2598
  assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
2599
  // Remove all existing edges from the SCHED_BARRIER that were added due to the
2600
  // instruction having side effects.
2601
  resetEdges(SchedBarrier, DAG);
2602
  LLVM_DEBUG(dbgs() << "Building SchedGroup for SchedBarrier with Mask: "
2603
                    << MI.getOperand(0).getImm() << "\n");
2604
  auto InvertedMask =
2605
      invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm());
2606
  SchedGroup SG(InvertedMask, std::nullopt, DAG, TII);
2607
  SG.initSchedGroup();
2608

2609
  // Preserve original instruction ordering relative to the SCHED_BARRIER.
2610
  SG.link(
2611
      SchedBarrier,
2612
      (function_ref<bool(const SUnit *A, const SUnit *B)>)[](
2613
          const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; });
2614
}
2615

2616
SchedGroupMask
2617
IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
2618
  // Invert mask and erase bits for types of instructions that are implied to be
2619
  // allowed past the SCHED_BARRIER.
2620
  SchedGroupMask InvertedMask = ~Mask;
2621

2622
  // ALU implies VALU, SALU, MFMA, TRANS.
2623
  if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
2624
    InvertedMask &= ~SchedGroupMask::VALU & ~SchedGroupMask::SALU &
2625
                    ~SchedGroupMask::MFMA & ~SchedGroupMask::TRANS;
2626
  // VALU, SALU, MFMA, TRANS implies ALU.
2627
  else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE ||
2628
           (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE ||
2629
           (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE ||
2630
           (InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE)
2631
    InvertedMask &= ~SchedGroupMask::ALU;
2632

2633
  // VMEM implies VMEM_READ, VMEM_WRITE.
2634
  if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)
2635
    InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE;
2636
  // VMEM_READ, VMEM_WRITE implies VMEM.
2637
  else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE ||
2638
           (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE)
2639
    InvertedMask &= ~SchedGroupMask::VMEM;
2640

2641
  // DS implies DS_READ, DS_WRITE.
2642
  if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE)
2643
    InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE;
2644
  // DS_READ, DS_WRITE implies DS.
2645
  else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE ||
2646
           (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE)
2647
    InvertedMask &= ~SchedGroupMask::DS;
2648

2649
  LLVM_DEBUG(dbgs() << "After Inverting, SchedGroup Mask: " << (int)InvertedMask
2650
                    << "\n");
2651

2652
  return InvertedMask;
2653
}
2654

2655
void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
2656
    std::vector<SUnit>::reverse_iterator RIter) {
2657
  // Remove all existing edges from the SCHED_GROUP_BARRIER that were added due
2658
  // to the instruction having side effects.
2659
  resetEdges(*RIter, DAG);
2660
  MachineInstr &SGB = *RIter->getInstr();
2661
  assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER);
2662
  int32_t SGMask = SGB.getOperand(0).getImm();
2663
  int32_t Size = SGB.getOperand(1).getImm();
2664
  int32_t SyncID = SGB.getOperand(2).getImm();
2665

2666
  auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask,
2667
                                                    Size, SyncID, DAG, TII);
2668

2669
  SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]);
2670
}
2671

2672
bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
2673
  IGLPStrategyID StrategyID =
2674
      (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm();
2675
  auto S = createIGLPStrategy(StrategyID, DAG, TII);
2676
  if (!S->shouldApplyStrategy(DAG, Phase))
2677
    return false;
2678

2679
  IsBottomUp = S->IsBottomUp;
2680
  return S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, Phase);
2681
}
2682

2683
} // namespace
2684

2685
namespace llvm {
2686

2687
/// \p Phase specifes whether or not this is a reentry into the
2688
/// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the
2689
/// same scheduling region (e.g. pre and post-RA scheduling / multiple
2690
/// scheduling "phases"), we can reenter this mutation framework more than once
2691
/// for a given region.
2692
std::unique_ptr<ScheduleDAGMutation>
2693
createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) {
2694
  return std::make_unique<IGroupLPDAGMutation>(Phase);
2695
}
2696

2697
} // end namespace llvm
2698

2699
Product

Resources

Company