Path: blob/main/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
35268 views
//===-- AMDGPUGlobalISelDivergenceLowering.cpp ----------------------------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7//8/// \file9/// GlobalISel pass that selects divergent i1 phis as lane mask phis.10/// Lane mask merging uses same algorithm as SDAG in SILowerI1Copies.11/// Handles all cases of temporal divergence.12/// For divergent non-phi i1 and uniform i1 uses outside of the cycle this pass13/// currently depends on LCSSA to insert phis with one incoming.14//15//===----------------------------------------------------------------------===//1617#include "AMDGPU.h"18#include "SILowerI1Copies.h"19#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"20#include "llvm/CodeGen/MachineFunctionPass.h"21#include "llvm/CodeGen/MachineUniformityAnalysis.h"22#include "llvm/InitializePasses.h"2324#define DEBUG_TYPE "amdgpu-global-isel-divergence-lowering"2526using namespace llvm;2728namespace {2930class AMDGPUGlobalISelDivergenceLowering : public MachineFunctionPass {31public:32static char ID;3334public:35AMDGPUGlobalISelDivergenceLowering() : MachineFunctionPass(ID) {36initializeAMDGPUGlobalISelDivergenceLoweringPass(37*PassRegistry::getPassRegistry());38}3940bool runOnMachineFunction(MachineFunction &MF) override;4142StringRef getPassName() const override {43return "AMDGPU GlobalISel divergence lowering";44}4546void getAnalysisUsage(AnalysisUsage &AU) const override {47AU.setPreservesCFG();48AU.addRequired<MachineDominatorTreeWrapperPass>();49AU.addRequired<MachinePostDominatorTreeWrapperPass>();50AU.addRequired<MachineUniformityAnalysisPass>();51MachineFunctionPass::getAnalysisUsage(AU);52}53};5455class DivergenceLoweringHelper : public PhiLoweringHelper {56public:57DivergenceLoweringHelper(MachineFunction *MF, MachineDominatorTree *DT,58MachinePostDominatorTree *PDT,59MachineUniformityInfo *MUI);6061private:62MachineUniformityInfo *MUI = nullptr;63MachineIRBuilder B;64Register buildRegCopyToLaneMask(Register Reg);6566public:67void markAsLaneMask(Register DstReg) const override;68void getCandidatesForLowering(69SmallVectorImpl<MachineInstr *> &Vreg1Phis) const override;70void collectIncomingValuesFromPhi(71const MachineInstr *MI,72SmallVectorImpl<Incoming> &Incomings) const override;73void replaceDstReg(Register NewReg, Register OldReg,74MachineBasicBlock *MBB) override;75void buildMergeLaneMasks(MachineBasicBlock &MBB,76MachineBasicBlock::iterator I, const DebugLoc &DL,77Register DstReg, Register PrevReg,78Register CurReg) override;79void constrainAsLaneMask(Incoming &In) override;80};8182DivergenceLoweringHelper::DivergenceLoweringHelper(83MachineFunction *MF, MachineDominatorTree *DT,84MachinePostDominatorTree *PDT, MachineUniformityInfo *MUI)85: PhiLoweringHelper(MF, DT, PDT), MUI(MUI), B(*MF) {}8687// _(s1) -> SReg_32/64(s1)88void DivergenceLoweringHelper::markAsLaneMask(Register DstReg) const {89assert(MRI->getType(DstReg) == LLT::scalar(1));9091if (MRI->getRegClassOrNull(DstReg)) {92if (MRI->constrainRegClass(DstReg, ST->getBoolRC()))93return;94llvm_unreachable("Failed to constrain register class");95}9697MRI->setRegClass(DstReg, ST->getBoolRC());98}99100void DivergenceLoweringHelper::getCandidatesForLowering(101SmallVectorImpl<MachineInstr *> &Vreg1Phis) const {102LLT S1 = LLT::scalar(1);103104// Add divergent i1 phis to the list105for (MachineBasicBlock &MBB : *MF) {106for (MachineInstr &MI : MBB.phis()) {107Register Dst = MI.getOperand(0).getReg();108if (MRI->getType(Dst) == S1 && MUI->isDivergent(Dst))109Vreg1Phis.push_back(&MI);110}111}112}113114void DivergenceLoweringHelper::collectIncomingValuesFromPhi(115const MachineInstr *MI, SmallVectorImpl<Incoming> &Incomings) const {116for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {117Incomings.emplace_back(MI->getOperand(i).getReg(),118MI->getOperand(i + 1).getMBB(), Register());119}120}121122void DivergenceLoweringHelper::replaceDstReg(Register NewReg, Register OldReg,123MachineBasicBlock *MBB) {124BuildMI(*MBB, MBB->getFirstNonPHI(), {}, TII->get(AMDGPU::COPY), OldReg)125.addReg(NewReg);126}127128// Copy Reg to new lane mask register, insert a copy after instruction that129// defines Reg while skipping phis if needed.130Register DivergenceLoweringHelper::buildRegCopyToLaneMask(Register Reg) {131Register LaneMask = createLaneMaskReg(MRI, LaneMaskRegAttrs);132MachineInstr *Instr = MRI->getVRegDef(Reg);133MachineBasicBlock *MBB = Instr->getParent();134B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Instr->getIterator())));135B.buildCopy(LaneMask, Reg);136return LaneMask;137}138139// bb.previous140// %PrevReg = ...141//142// bb.current143// %CurReg = ...144//145// %DstReg - not defined146//147// -> (wave32 example, new registers have sreg_32 reg class and S1 LLT)148//149// bb.previous150// %PrevReg = ...151// %PrevRegCopy:sreg_32(s1) = COPY %PrevReg152//153// bb.current154// %CurReg = ...155// %CurRegCopy:sreg_32(s1) = COPY %CurReg156// ...157// %PrevMaskedReg:sreg_32(s1) = ANDN2 %PrevRegCopy, ExecReg - active lanes 0158// %CurMaskedReg:sreg_32(s1) = AND %ExecReg, CurRegCopy - inactive lanes to 0159// %DstReg:sreg_32(s1) = OR %PrevMaskedReg, CurMaskedReg160//161// DstReg = for active lanes rewrite bit in PrevReg with bit from CurReg162void DivergenceLoweringHelper::buildMergeLaneMasks(163MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL,164Register DstReg, Register PrevReg, Register CurReg) {165// DstReg = (PrevReg & !EXEC) | (CurReg & EXEC)166// TODO: check if inputs are constants or results of a compare.167168Register PrevRegCopy = buildRegCopyToLaneMask(PrevReg);169Register CurRegCopy = buildRegCopyToLaneMask(CurReg);170Register PrevMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);171Register CurMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);172173B.setInsertPt(MBB, I);174B.buildInstr(AndN2Op, {PrevMaskedReg}, {PrevRegCopy, ExecReg});175B.buildInstr(AndOp, {CurMaskedReg}, {ExecReg, CurRegCopy});176B.buildInstr(OrOp, {DstReg}, {PrevMaskedReg, CurMaskedReg});177}178179// GlobalISel has to constrain S1 incoming taken as-is with lane mask register180// class. Insert a copy of Incoming.Reg to new lane mask inside Incoming.Block,181// Incoming.Reg becomes that new lane mask.182void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) {183B.setInsertPt(*In.Block, In.Block->getFirstTerminator());184185auto Copy = B.buildCopy(LLT::scalar(1), In.Reg);186MRI->setRegClass(Copy.getReg(0), ST->getBoolRC());187In.Reg = Copy.getReg(0);188}189190} // End anonymous namespace.191192INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,193"AMDGPU GlobalISel divergence lowering", false, false)194INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)195INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)196INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass)197INITIALIZE_PASS_END(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,198"AMDGPU GlobalISel divergence lowering", false, false)199200char AMDGPUGlobalISelDivergenceLowering::ID = 0;201202char &llvm::AMDGPUGlobalISelDivergenceLoweringID =203AMDGPUGlobalISelDivergenceLowering::ID;204205FunctionPass *llvm::createAMDGPUGlobalISelDivergenceLoweringPass() {206return new AMDGPUGlobalISelDivergenceLowering();207}208209bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction(210MachineFunction &MF) {211MachineDominatorTree &DT =212getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();213MachinePostDominatorTree &PDT =214getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();215MachineUniformityInfo &MUI =216getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();217218DivergenceLoweringHelper Helper(&MF, &DT, &PDT, &MUI);219220return Helper.lowerPhis();221}222223224