Path: blob/main/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
35266 views
//===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7//8/// \file9/// Insert s_delay_alu instructions to avoid stalls on GFX11+.10//11//===----------------------------------------------------------------------===//1213#include "AMDGPU.h"14#include "GCNSubtarget.h"15#include "MCTargetDesc/AMDGPUMCTargetDesc.h"16#include "SIInstrInfo.h"17#include "llvm/ADT/SetVector.h"1819using namespace llvm;2021#define DEBUG_TYPE "amdgpu-insert-delay-alu"2223namespace {2425class AMDGPUInsertDelayAlu : public MachineFunctionPass {26public:27static char ID;2829const SIInstrInfo *SII;30const TargetRegisterInfo *TRI;3132TargetSchedModel SchedModel;3334AMDGPUInsertDelayAlu() : MachineFunctionPass(ID) {}3536void getAnalysisUsage(AnalysisUsage &AU) const override {37AU.setPreservesCFG();38MachineFunctionPass::getAnalysisUsage(AU);39}4041// Return true if MI waits for all outstanding VALU instructions to complete.42static bool instructionWaitsForVALU(const MachineInstr &MI) {43// These instruction types wait for VA_VDST==0 before issuing.44const uint64_t VA_VDST_0 = SIInstrFlags::DS | SIInstrFlags::EXP |45SIInstrFlags::FLAT | SIInstrFlags::MIMG |46SIInstrFlags::MTBUF | SIInstrFlags::MUBUF;47if (MI.getDesc().TSFlags & VA_VDST_0)48return true;49if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 ||50MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)51return true;52if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&53AMDGPU::DepCtr::decodeFieldVaVdst(MI.getOperand(0).getImm()) == 0)54return true;55return false;56}5758// Types of delay that can be encoded in an s_delay_alu instruction.59enum DelayType { VALU, TRANS, SALU, OTHER };6061// Get the delay type for an instruction with the specified TSFlags.62static DelayType getDelayType(uint64_t TSFlags) {63if (TSFlags & SIInstrFlags::TRANS)64return TRANS;65if (TSFlags & SIInstrFlags::VALU)66return VALU;67if (TSFlags & SIInstrFlags::SALU)68return SALU;69return OTHER;70}7172// Information about the last instruction(s) that wrote to a particular73// regunit. In straight-line code there will only be one such instruction, but74// when control flow converges we merge the delay information from each path75// to represent the union of the worst-case delays of each type.76struct DelayInfo {77// One larger than the maximum number of (non-TRANS) VALU instructions we78// can encode in an s_delay_alu instruction.79static constexpr unsigned VALU_MAX = 5;8081// One larger than the maximum number of TRANS instructions we can encode in82// an s_delay_alu instruction.83static constexpr unsigned TRANS_MAX = 4;8485// One larger than the maximum number of SALU cycles we can encode in an86// s_delay_alu instruction.87static constexpr unsigned SALU_CYCLES_MAX = 4;8889// If it was written by a (non-TRANS) VALU, remember how many clock cycles90// are left until it completes, and how many other (non-TRANS) VALU we have91// seen since it was issued.92uint8_t VALUCycles = 0;93uint8_t VALUNum = VALU_MAX;9495// If it was written by a TRANS, remember how many clock cycles are left96// until it completes, and how many other TRANS we have seen since it was97// issued.98uint8_t TRANSCycles = 0;99uint8_t TRANSNum = TRANS_MAX;100// Also remember how many other (non-TRANS) VALU we have seen since it was101// issued. When an instruction depends on both a prior TRANS and a prior102// non-TRANS VALU, this is used to decide whether to encode a wait for just103// one or both of them.104uint8_t TRANSNumVALU = VALU_MAX;105106// If it was written by an SALU, remember how many clock cycles are left107// until it completes.108uint8_t SALUCycles = 0;109110DelayInfo() = default;111112DelayInfo(DelayType Type, unsigned Cycles) {113switch (Type) {114default:115llvm_unreachable("unexpected type");116case VALU:117VALUCycles = Cycles;118VALUNum = 0;119break;120case TRANS:121TRANSCycles = Cycles;122TRANSNum = 0;123TRANSNumVALU = 0;124break;125case SALU:126// Guard against pseudo-instructions like SI_CALL which are marked as127// SALU but with a very high latency.128SALUCycles = std::min(Cycles, SALU_CYCLES_MAX);129break;130}131}132133bool operator==(const DelayInfo &RHS) const {134return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum &&135TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum &&136TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles;137}138139bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); }140141// Merge another DelayInfo into this one, to represent the union of the142// worst-case delays of each type.143void merge(const DelayInfo &RHS) {144VALUCycles = std::max(VALUCycles, RHS.VALUCycles);145VALUNum = std::min(VALUNum, RHS.VALUNum);146TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles);147TRANSNum = std::min(TRANSNum, RHS.TRANSNum);148TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU);149SALUCycles = std::max(SALUCycles, RHS.SALUCycles);150}151152// Update this DelayInfo after issuing an instruction. IsVALU should be 1153// when issuing a (non-TRANS) VALU, else 0. IsTRANS should be 1 when issuing154// a TRANS, else 0. Cycles is the number of cycles it takes to issue the155// instruction. Return true if there is no longer any useful delay info.156bool advance(DelayType Type, unsigned Cycles) {157bool Erase = true;158159VALUNum += (Type == VALU);160if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {161// Forget about the VALU instruction. It was too far back or has162// definitely completed by now.163VALUNum = VALU_MAX;164VALUCycles = 0;165} else {166VALUCycles -= Cycles;167Erase = false;168}169170TRANSNum += (Type == TRANS);171TRANSNumVALU += (Type == VALU);172if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {173// Forget about any TRANS instruction. It was too far back or has174// definitely completed by now.175TRANSNum = TRANS_MAX;176TRANSNumVALU = VALU_MAX;177TRANSCycles = 0;178} else {179TRANSCycles -= Cycles;180Erase = false;181}182183if (SALUCycles <= Cycles) {184// Forget about any SALU instruction. It has definitely completed by185// now.186SALUCycles = 0;187} else {188SALUCycles -= Cycles;189Erase = false;190}191192return Erase;193}194195#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)196void dump() const {197if (VALUCycles)198dbgs() << " VALUCycles=" << (int)VALUCycles;199if (VALUNum < VALU_MAX)200dbgs() << " VALUNum=" << (int)VALUNum;201if (TRANSCycles)202dbgs() << " TRANSCycles=" << (int)TRANSCycles;203if (TRANSNum < TRANS_MAX)204dbgs() << " TRANSNum=" << (int)TRANSNum;205if (TRANSNumVALU < VALU_MAX)206dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU;207if (SALUCycles)208dbgs() << " SALUCycles=" << (int)SALUCycles;209}210#endif211};212213// A map from regunits to the delay info for that regunit.214struct DelayState : DenseMap<unsigned, DelayInfo> {215// Merge another DelayState into this one by merging the delay info for each216// regunit.217void merge(const DelayState &RHS) {218for (const auto &KV : RHS) {219iterator It;220bool Inserted;221std::tie(It, Inserted) = insert(KV);222if (!Inserted)223It->second.merge(KV.second);224}225}226227// Advance the delay info for each regunit, erasing any that are no longer228// useful.229void advance(DelayType Type, unsigned Cycles) {230iterator Next;231for (auto I = begin(), E = end(); I != E; I = Next) {232Next = std::next(I);233if (I->second.advance(Type, Cycles))234erase(I);235}236}237238#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)239void dump(const TargetRegisterInfo *TRI) const {240if (empty()) {241dbgs() << " empty\n";242return;243}244245// Dump DelayInfo for each RegUnit in numerical order.246SmallVector<const_iterator, 8> Order;247Order.reserve(size());248for (const_iterator I = begin(), E = end(); I != E; ++I)249Order.push_back(I);250llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) {251return A->first < B->first;252});253for (const_iterator I : Order) {254dbgs() << " " << printRegUnit(I->first, TRI);255I->second.dump();256dbgs() << "\n";257}258}259#endif260};261262// The saved delay state at the end of each basic block.263DenseMap<MachineBasicBlock *, DelayState> BlockState;264265// Emit an s_delay_alu instruction if necessary before MI.266MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay,267MachineInstr *LastDelayAlu) {268unsigned Imm = 0;269270// Wait for a TRANS instruction.271if (Delay.TRANSNum < DelayInfo::TRANS_MAX)272Imm |= 4 + Delay.TRANSNum;273274// Wait for a VALU instruction (if it's more recent than any TRANS275// instruction that we're also waiting for).276if (Delay.VALUNum < DelayInfo::VALU_MAX &&277Delay.VALUNum <= Delay.TRANSNumVALU) {278if (Imm & 0xf)279Imm |= Delay.VALUNum << 7;280else281Imm |= Delay.VALUNum;282}283284// Wait for an SALU instruction.285if (Delay.SALUCycles) {286assert(Delay.SALUCycles < DelayInfo::SALU_CYCLES_MAX);287if (Imm & 0x780) {288// We have already encoded a VALU and a TRANS delay. There's no room in289// the encoding for an SALU delay as well, so just drop it.290} else if (Imm & 0xf) {291Imm |= (Delay.SALUCycles + 8) << 7;292} else {293Imm |= Delay.SALUCycles + 8;294}295}296297// Don't emit the s_delay_alu instruction if there's nothing to wait for.298if (!Imm)299return LastDelayAlu;300301// If we only need to wait for one instruction, try encoding it in the last302// s_delay_alu that we emitted.303if (!(Imm & 0x780) && LastDelayAlu) {304unsigned Skip = 0;305for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu),306E = MachineBasicBlock::instr_iterator(MI);307++I != E;) {308if (!I->isBundle() && !I->isMetaInstruction())309++Skip;310}311if (Skip < 6) {312MachineOperand &Op = LastDelayAlu->getOperand(0);313unsigned LastImm = Op.getImm();314assert((LastImm & ~0xf) == 0 &&315"Remembered an s_delay_alu with no room for another delay!");316LastImm |= Imm << 7 | Skip << 4;317Op.setImm(LastImm);318return nullptr;319}320}321322auto &MBB = *MI.getParent();323MachineInstr *DelayAlu =324BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU)).addImm(Imm);325// Remember the s_delay_alu for next time if there is still room in it to326// encode another delay.327return (Imm & 0x780) ? nullptr : DelayAlu;328}329330bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {331DelayState State;332for (auto *Pred : MBB.predecessors())333State.merge(BlockState[Pred]);334335LLVM_DEBUG(dbgs() << " State at start of " << printMBBReference(MBB)336<< "\n";337State.dump(TRI););338339bool Changed = false;340MachineInstr *LastDelayAlu = nullptr;341342// Iterate over the contents of bundles, but don't emit any instructions343// inside a bundle.344for (auto &MI : MBB.instrs()) {345if (MI.isBundle() || MI.isMetaInstruction())346continue;347348// Ignore some more instructions that do not generate any code.349switch (MI.getOpcode()) {350case AMDGPU::SI_RETURN_TO_EPILOG:351continue;352}353354DelayType Type = getDelayType(MI.getDesc().TSFlags);355356if (instructionWaitsForVALU(MI)) {357// Forget about all outstanding VALU delays.358// TODO: This is overkill since it also forgets about SALU delays.359State = DelayState();360} else if (Type != OTHER) {361DelayInfo Delay;362// TODO: Scan implicit uses too?363for (const auto &Op : MI.explicit_uses()) {364if (Op.isReg()) {365// One of the operands of the writelane is also the output operand.366// This creates the insertion of redundant delays. Hence, we have to367// ignore this operand.368if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied())369continue;370for (MCRegUnit Unit : TRI->regunits(Op.getReg())) {371auto It = State.find(Unit);372if (It != State.end()) {373Delay.merge(It->second);374State.erase(Unit);375}376}377}378}379if (Emit && !MI.isBundledWithPred()) {380// TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or381// just ignore them?382LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu);383}384}385386if (Type != OTHER) {387// TODO: Scan implicit defs too?388for (const auto &Op : MI.defs()) {389unsigned Latency = SchedModel.computeOperandLatency(390&MI, Op.getOperandNo(), nullptr, 0);391for (MCRegUnit Unit : TRI->regunits(Op.getReg()))392State[Unit] = DelayInfo(Type, Latency);393}394}395396// Advance by the number of cycles it takes to issue this instruction.397// TODO: Use a more advanced model that accounts for instructions that398// take multiple cycles to issue on a particular pipeline.399unsigned Cycles = SIInstrInfo::getNumWaitStates(MI);400// TODO: In wave64 mode, double the number of cycles for VALU and VMEM401// instructions on the assumption that they will usually have to be issued402// twice?403State.advance(Type, Cycles);404405LLVM_DEBUG(dbgs() << " State after " << MI; State.dump(TRI););406}407408if (Emit) {409assert(State == BlockState[&MBB] &&410"Basic block state should not have changed on final pass!");411} else if (State != BlockState[&MBB]) {412BlockState[&MBB] = std::move(State);413Changed = true;414}415return Changed;416}417418bool runOnMachineFunction(MachineFunction &MF) override {419if (skipFunction(MF.getFunction()))420return false;421422LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName()423<< "\n");424425const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();426if (!ST.hasDelayAlu())427return false;428429SII = ST.getInstrInfo();430TRI = ST.getRegisterInfo();431432SchedModel.init(&ST);433434// Calculate the delay state for each basic block, iterating until we reach435// a fixed point.436SetVector<MachineBasicBlock *> WorkList;437for (auto &MBB : reverse(MF))438WorkList.insert(&MBB);439while (!WorkList.empty()) {440auto &MBB = *WorkList.pop_back_val();441bool Changed = runOnMachineBasicBlock(MBB, false);442if (Changed)443WorkList.insert(MBB.succ_begin(), MBB.succ_end());444}445446LLVM_DEBUG(dbgs() << "Final pass over all BBs\n");447448// Make one last pass over all basic blocks to emit s_delay_alu449// instructions.450bool Changed = false;451for (auto &MBB : MF)452Changed |= runOnMachineBasicBlock(MBB, true);453return Changed;454}455};456457} // namespace458459char AMDGPUInsertDelayAlu::ID = 0;460461char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAlu::ID;462463INITIALIZE_PASS(AMDGPUInsertDelayAlu, DEBUG_TYPE, "AMDGPU Insert Delay ALU",464false, false)465466467