Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
35266 views
1
//===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
/// \file
10
/// Insert s_delay_alu instructions to avoid stalls on GFX11+.
11
//
12
//===----------------------------------------------------------------------===//
13
14
#include "AMDGPU.h"
15
#include "GCNSubtarget.h"
16
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
17
#include "SIInstrInfo.h"
18
#include "llvm/ADT/SetVector.h"
19
20
using namespace llvm;
21
22
#define DEBUG_TYPE "amdgpu-insert-delay-alu"
23
24
namespace {
25
26
class AMDGPUInsertDelayAlu : public MachineFunctionPass {
27
public:
28
static char ID;
29
30
const SIInstrInfo *SII;
31
const TargetRegisterInfo *TRI;
32
33
TargetSchedModel SchedModel;
34
35
AMDGPUInsertDelayAlu() : MachineFunctionPass(ID) {}
36
37
void getAnalysisUsage(AnalysisUsage &AU) const override {
38
AU.setPreservesCFG();
39
MachineFunctionPass::getAnalysisUsage(AU);
40
}
41
42
// Return true if MI waits for all outstanding VALU instructions to complete.
43
static bool instructionWaitsForVALU(const MachineInstr &MI) {
44
// These instruction types wait for VA_VDST==0 before issuing.
45
const uint64_t VA_VDST_0 = SIInstrFlags::DS | SIInstrFlags::EXP |
46
SIInstrFlags::FLAT | SIInstrFlags::MIMG |
47
SIInstrFlags::MTBUF | SIInstrFlags::MUBUF;
48
if (MI.getDesc().TSFlags & VA_VDST_0)
49
return true;
50
if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 ||
51
MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
52
return true;
53
if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
54
AMDGPU::DepCtr::decodeFieldVaVdst(MI.getOperand(0).getImm()) == 0)
55
return true;
56
return false;
57
}
58
59
// Types of delay that can be encoded in an s_delay_alu instruction.
60
enum DelayType { VALU, TRANS, SALU, OTHER };
61
62
// Get the delay type for an instruction with the specified TSFlags.
63
static DelayType getDelayType(uint64_t TSFlags) {
64
if (TSFlags & SIInstrFlags::TRANS)
65
return TRANS;
66
if (TSFlags & SIInstrFlags::VALU)
67
return VALU;
68
if (TSFlags & SIInstrFlags::SALU)
69
return SALU;
70
return OTHER;
71
}
72
73
// Information about the last instruction(s) that wrote to a particular
74
// regunit. In straight-line code there will only be one such instruction, but
75
// when control flow converges we merge the delay information from each path
76
// to represent the union of the worst-case delays of each type.
77
struct DelayInfo {
78
// One larger than the maximum number of (non-TRANS) VALU instructions we
79
// can encode in an s_delay_alu instruction.
80
static constexpr unsigned VALU_MAX = 5;
81
82
// One larger than the maximum number of TRANS instructions we can encode in
83
// an s_delay_alu instruction.
84
static constexpr unsigned TRANS_MAX = 4;
85
86
// One larger than the maximum number of SALU cycles we can encode in an
87
// s_delay_alu instruction.
88
static constexpr unsigned SALU_CYCLES_MAX = 4;
89
90
// If it was written by a (non-TRANS) VALU, remember how many clock cycles
91
// are left until it completes, and how many other (non-TRANS) VALU we have
92
// seen since it was issued.
93
uint8_t VALUCycles = 0;
94
uint8_t VALUNum = VALU_MAX;
95
96
// If it was written by a TRANS, remember how many clock cycles are left
97
// until it completes, and how many other TRANS we have seen since it was
98
// issued.
99
uint8_t TRANSCycles = 0;
100
uint8_t TRANSNum = TRANS_MAX;
101
// Also remember how many other (non-TRANS) VALU we have seen since it was
102
// issued. When an instruction depends on both a prior TRANS and a prior
103
// non-TRANS VALU, this is used to decide whether to encode a wait for just
104
// one or both of them.
105
uint8_t TRANSNumVALU = VALU_MAX;
106
107
// If it was written by an SALU, remember how many clock cycles are left
108
// until it completes.
109
uint8_t SALUCycles = 0;
110
111
DelayInfo() = default;
112
113
DelayInfo(DelayType Type, unsigned Cycles) {
114
switch (Type) {
115
default:
116
llvm_unreachable("unexpected type");
117
case VALU:
118
VALUCycles = Cycles;
119
VALUNum = 0;
120
break;
121
case TRANS:
122
TRANSCycles = Cycles;
123
TRANSNum = 0;
124
TRANSNumVALU = 0;
125
break;
126
case SALU:
127
// Guard against pseudo-instructions like SI_CALL which are marked as
128
// SALU but with a very high latency.
129
SALUCycles = std::min(Cycles, SALU_CYCLES_MAX);
130
break;
131
}
132
}
133
134
bool operator==(const DelayInfo &RHS) const {
135
return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum &&
136
TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum &&
137
TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles;
138
}
139
140
bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); }
141
142
// Merge another DelayInfo into this one, to represent the union of the
143
// worst-case delays of each type.
144
void merge(const DelayInfo &RHS) {
145
VALUCycles = std::max(VALUCycles, RHS.VALUCycles);
146
VALUNum = std::min(VALUNum, RHS.VALUNum);
147
TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles);
148
TRANSNum = std::min(TRANSNum, RHS.TRANSNum);
149
TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU);
150
SALUCycles = std::max(SALUCycles, RHS.SALUCycles);
151
}
152
153
// Update this DelayInfo after issuing an instruction. IsVALU should be 1
154
// when issuing a (non-TRANS) VALU, else 0. IsTRANS should be 1 when issuing
155
// a TRANS, else 0. Cycles is the number of cycles it takes to issue the
156
// instruction. Return true if there is no longer any useful delay info.
157
bool advance(DelayType Type, unsigned Cycles) {
158
bool Erase = true;
159
160
VALUNum += (Type == VALU);
161
if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {
162
// Forget about the VALU instruction. It was too far back or has
163
// definitely completed by now.
164
VALUNum = VALU_MAX;
165
VALUCycles = 0;
166
} else {
167
VALUCycles -= Cycles;
168
Erase = false;
169
}
170
171
TRANSNum += (Type == TRANS);
172
TRANSNumVALU += (Type == VALU);
173
if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {
174
// Forget about any TRANS instruction. It was too far back or has
175
// definitely completed by now.
176
TRANSNum = TRANS_MAX;
177
TRANSNumVALU = VALU_MAX;
178
TRANSCycles = 0;
179
} else {
180
TRANSCycles -= Cycles;
181
Erase = false;
182
}
183
184
if (SALUCycles <= Cycles) {
185
// Forget about any SALU instruction. It has definitely completed by
186
// now.
187
SALUCycles = 0;
188
} else {
189
SALUCycles -= Cycles;
190
Erase = false;
191
}
192
193
return Erase;
194
}
195
196
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
197
void dump() const {
198
if (VALUCycles)
199
dbgs() << " VALUCycles=" << (int)VALUCycles;
200
if (VALUNum < VALU_MAX)
201
dbgs() << " VALUNum=" << (int)VALUNum;
202
if (TRANSCycles)
203
dbgs() << " TRANSCycles=" << (int)TRANSCycles;
204
if (TRANSNum < TRANS_MAX)
205
dbgs() << " TRANSNum=" << (int)TRANSNum;
206
if (TRANSNumVALU < VALU_MAX)
207
dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU;
208
if (SALUCycles)
209
dbgs() << " SALUCycles=" << (int)SALUCycles;
210
}
211
#endif
212
};
213
214
// A map from regunits to the delay info for that regunit.
215
struct DelayState : DenseMap<unsigned, DelayInfo> {
216
// Merge another DelayState into this one by merging the delay info for each
217
// regunit.
218
void merge(const DelayState &RHS) {
219
for (const auto &KV : RHS) {
220
iterator It;
221
bool Inserted;
222
std::tie(It, Inserted) = insert(KV);
223
if (!Inserted)
224
It->second.merge(KV.second);
225
}
226
}
227
228
// Advance the delay info for each regunit, erasing any that are no longer
229
// useful.
230
void advance(DelayType Type, unsigned Cycles) {
231
iterator Next;
232
for (auto I = begin(), E = end(); I != E; I = Next) {
233
Next = std::next(I);
234
if (I->second.advance(Type, Cycles))
235
erase(I);
236
}
237
}
238
239
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
240
void dump(const TargetRegisterInfo *TRI) const {
241
if (empty()) {
242
dbgs() << " empty\n";
243
return;
244
}
245
246
// Dump DelayInfo for each RegUnit in numerical order.
247
SmallVector<const_iterator, 8> Order;
248
Order.reserve(size());
249
for (const_iterator I = begin(), E = end(); I != E; ++I)
250
Order.push_back(I);
251
llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) {
252
return A->first < B->first;
253
});
254
for (const_iterator I : Order) {
255
dbgs() << " " << printRegUnit(I->first, TRI);
256
I->second.dump();
257
dbgs() << "\n";
258
}
259
}
260
#endif
261
};
262
263
// The saved delay state at the end of each basic block.
264
DenseMap<MachineBasicBlock *, DelayState> BlockState;
265
266
// Emit an s_delay_alu instruction if necessary before MI.
267
MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay,
268
MachineInstr *LastDelayAlu) {
269
unsigned Imm = 0;
270
271
// Wait for a TRANS instruction.
272
if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
273
Imm |= 4 + Delay.TRANSNum;
274
275
// Wait for a VALU instruction (if it's more recent than any TRANS
276
// instruction that we're also waiting for).
277
if (Delay.VALUNum < DelayInfo::VALU_MAX &&
278
Delay.VALUNum <= Delay.TRANSNumVALU) {
279
if (Imm & 0xf)
280
Imm |= Delay.VALUNum << 7;
281
else
282
Imm |= Delay.VALUNum;
283
}
284
285
// Wait for an SALU instruction.
286
if (Delay.SALUCycles) {
287
assert(Delay.SALUCycles < DelayInfo::SALU_CYCLES_MAX);
288
if (Imm & 0x780) {
289
// We have already encoded a VALU and a TRANS delay. There's no room in
290
// the encoding for an SALU delay as well, so just drop it.
291
} else if (Imm & 0xf) {
292
Imm |= (Delay.SALUCycles + 8) << 7;
293
} else {
294
Imm |= Delay.SALUCycles + 8;
295
}
296
}
297
298
// Don't emit the s_delay_alu instruction if there's nothing to wait for.
299
if (!Imm)
300
return LastDelayAlu;
301
302
// If we only need to wait for one instruction, try encoding it in the last
303
// s_delay_alu that we emitted.
304
if (!(Imm & 0x780) && LastDelayAlu) {
305
unsigned Skip = 0;
306
for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu),
307
E = MachineBasicBlock::instr_iterator(MI);
308
++I != E;) {
309
if (!I->isBundle() && !I->isMetaInstruction())
310
++Skip;
311
}
312
if (Skip < 6) {
313
MachineOperand &Op = LastDelayAlu->getOperand(0);
314
unsigned LastImm = Op.getImm();
315
assert((LastImm & ~0xf) == 0 &&
316
"Remembered an s_delay_alu with no room for another delay!");
317
LastImm |= Imm << 7 | Skip << 4;
318
Op.setImm(LastImm);
319
return nullptr;
320
}
321
}
322
323
auto &MBB = *MI.getParent();
324
MachineInstr *DelayAlu =
325
BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU)).addImm(Imm);
326
// Remember the s_delay_alu for next time if there is still room in it to
327
// encode another delay.
328
return (Imm & 0x780) ? nullptr : DelayAlu;
329
}
330
331
bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
332
DelayState State;
333
for (auto *Pred : MBB.predecessors())
334
State.merge(BlockState[Pred]);
335
336
LLVM_DEBUG(dbgs() << " State at start of " << printMBBReference(MBB)
337
<< "\n";
338
State.dump(TRI););
339
340
bool Changed = false;
341
MachineInstr *LastDelayAlu = nullptr;
342
343
// Iterate over the contents of bundles, but don't emit any instructions
344
// inside a bundle.
345
for (auto &MI : MBB.instrs()) {
346
if (MI.isBundle() || MI.isMetaInstruction())
347
continue;
348
349
// Ignore some more instructions that do not generate any code.
350
switch (MI.getOpcode()) {
351
case AMDGPU::SI_RETURN_TO_EPILOG:
352
continue;
353
}
354
355
DelayType Type = getDelayType(MI.getDesc().TSFlags);
356
357
if (instructionWaitsForVALU(MI)) {
358
// Forget about all outstanding VALU delays.
359
// TODO: This is overkill since it also forgets about SALU delays.
360
State = DelayState();
361
} else if (Type != OTHER) {
362
DelayInfo Delay;
363
// TODO: Scan implicit uses too?
364
for (const auto &Op : MI.explicit_uses()) {
365
if (Op.isReg()) {
366
// One of the operands of the writelane is also the output operand.
367
// This creates the insertion of redundant delays. Hence, we have to
368
// ignore this operand.
369
if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied())
370
continue;
371
for (MCRegUnit Unit : TRI->regunits(Op.getReg())) {
372
auto It = State.find(Unit);
373
if (It != State.end()) {
374
Delay.merge(It->second);
375
State.erase(Unit);
376
}
377
}
378
}
379
}
380
if (Emit && !MI.isBundledWithPred()) {
381
// TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
382
// just ignore them?
383
LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu);
384
}
385
}
386
387
if (Type != OTHER) {
388
// TODO: Scan implicit defs too?
389
for (const auto &Op : MI.defs()) {
390
unsigned Latency = SchedModel.computeOperandLatency(
391
&MI, Op.getOperandNo(), nullptr, 0);
392
for (MCRegUnit Unit : TRI->regunits(Op.getReg()))
393
State[Unit] = DelayInfo(Type, Latency);
394
}
395
}
396
397
// Advance by the number of cycles it takes to issue this instruction.
398
// TODO: Use a more advanced model that accounts for instructions that
399
// take multiple cycles to issue on a particular pipeline.
400
unsigned Cycles = SIInstrInfo::getNumWaitStates(MI);
401
// TODO: In wave64 mode, double the number of cycles for VALU and VMEM
402
// instructions on the assumption that they will usually have to be issued
403
// twice?
404
State.advance(Type, Cycles);
405
406
LLVM_DEBUG(dbgs() << " State after " << MI; State.dump(TRI););
407
}
408
409
if (Emit) {
410
assert(State == BlockState[&MBB] &&
411
"Basic block state should not have changed on final pass!");
412
} else if (State != BlockState[&MBB]) {
413
BlockState[&MBB] = std::move(State);
414
Changed = true;
415
}
416
return Changed;
417
}
418
419
bool runOnMachineFunction(MachineFunction &MF) override {
420
if (skipFunction(MF.getFunction()))
421
return false;
422
423
LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName()
424
<< "\n");
425
426
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
427
if (!ST.hasDelayAlu())
428
return false;
429
430
SII = ST.getInstrInfo();
431
TRI = ST.getRegisterInfo();
432
433
SchedModel.init(&ST);
434
435
// Calculate the delay state for each basic block, iterating until we reach
436
// a fixed point.
437
SetVector<MachineBasicBlock *> WorkList;
438
for (auto &MBB : reverse(MF))
439
WorkList.insert(&MBB);
440
while (!WorkList.empty()) {
441
auto &MBB = *WorkList.pop_back_val();
442
bool Changed = runOnMachineBasicBlock(MBB, false);
443
if (Changed)
444
WorkList.insert(MBB.succ_begin(), MBB.succ_end());
445
}
446
447
LLVM_DEBUG(dbgs() << "Final pass over all BBs\n");
448
449
// Make one last pass over all basic blocks to emit s_delay_alu
450
// instructions.
451
bool Changed = false;
452
for (auto &MBB : MF)
453
Changed |= runOnMachineBasicBlock(MBB, true);
454
return Changed;
455
}
456
};
457
458
} // namespace
459
460
char AMDGPUInsertDelayAlu::ID = 0;
461
462
char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAlu::ID;
463
464
INITIALIZE_PASS(AMDGPUInsertDelayAlu, DEBUG_TYPE, "AMDGPU Insert Delay ALU",
465
false, false)
466
467