Path: blob/main/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
35294 views
//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7//8/// \file9///10/// The AMDGPUAsmPrinter is used to print both assembly string and also binary11/// code. When passed an MCAsmStreamer it prints assembly and when passed12/// an MCObjectStreamer it outputs binary code.13//14//===----------------------------------------------------------------------===//15//1617#include "AMDGPUAsmPrinter.h"18#include "AMDGPU.h"19#include "AMDGPUHSAMetadataStreamer.h"20#include "AMDGPUResourceUsageAnalysis.h"21#include "GCNSubtarget.h"22#include "MCTargetDesc/AMDGPUInstPrinter.h"23#include "MCTargetDesc/AMDGPUMCExpr.h"24#include "MCTargetDesc/AMDGPUMCKernelDescriptor.h"25#include "MCTargetDesc/AMDGPUTargetStreamer.h"26#include "R600AsmPrinter.h"27#include "SIMachineFunctionInfo.h"28#include "TargetInfo/AMDGPUTargetInfo.h"29#include "Utils/AMDGPUBaseInfo.h"30#include "Utils/AMDKernelCodeTUtils.h"31#include "Utils/SIDefinesUtils.h"32#include "llvm/Analysis/OptimizationRemarkEmitter.h"33#include "llvm/BinaryFormat/ELF.h"34#include "llvm/CodeGen/MachineFrameInfo.h"35#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"36#include "llvm/IR/DiagnosticInfo.h"37#include "llvm/MC/MCAssembler.h"38#include "llvm/MC/MCContext.h"39#include "llvm/MC/MCSectionELF.h"40#include "llvm/MC/MCStreamer.h"41#include "llvm/MC/TargetRegistry.h"42#include "llvm/Support/AMDHSAKernelDescriptor.h"43#include "llvm/Target/TargetLoweringObjectFile.h"44#include "llvm/Target/TargetMachine.h"45#include "llvm/TargetParser/TargetParser.h"4647using namespace llvm;48using namespace llvm::AMDGPU;4950// This should get the default rounding mode from the kernel. We just set the51// default here, but this could change if the OpenCL rounding mode pragmas are52// used.53//54// The denormal mode here should match what is reported by the OpenCL runtime55// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but56// can also be override to flush with the -cl-denorms-are-zero compiler flag.57//58// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double59// precision, and leaves single precision to flush all and does not report60// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports61// CL_FP_DENORM for both.62//63// FIXME: It seems some instructions do not support single precision denormals64// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,65// and sin_f32, cos_f32 on most parts).6667// We want to use these instructions, and using fp32 denormals also causes68// instructions to run at the double precision rate for the device so it's69// probably best to just report no single precision denormals.70static uint32_t getFPMode(SIModeRegisterDefaults Mode) {71return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |72FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |73FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) |74FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue());75}7677static AsmPrinter *78createAMDGPUAsmPrinterPass(TargetMachine &tm,79std::unique_ptr<MCStreamer> &&Streamer) {80return new AMDGPUAsmPrinter(tm, std::move(Streamer));81}8283extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter() {84TargetRegistry::RegisterAsmPrinter(getTheR600Target(),85llvm::createR600AsmPrinterPass);86TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(),87createAMDGPUAsmPrinterPass);88}8990AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,91std::unique_ptr<MCStreamer> Streamer)92: AsmPrinter(TM, std::move(Streamer)) {93assert(OutStreamer && "AsmPrinter constructed without streamer");94}9596StringRef AMDGPUAsmPrinter::getPassName() const {97return "AMDGPU Assembly Printer";98}99100const MCSubtargetInfo *AMDGPUAsmPrinter::getGlobalSTI() const {101return TM.getMCSubtargetInfo();102}103104AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const {105if (!OutStreamer)106return nullptr;107return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());108}109110void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) {111IsTargetStreamerInitialized = false;112}113114void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {115IsTargetStreamerInitialized = true;116117// TODO: Which one is called first, emitStartOfAsmFile or118// emitFunctionBodyStart?119if (getTargetStreamer() && !getTargetStreamer()->getTargetID())120initializeTargetID(M);121122if (TM.getTargetTriple().getOS() != Triple::AMDHSA &&123TM.getTargetTriple().getOS() != Triple::AMDPAL)124return;125126getTargetStreamer()->EmitDirectiveAMDGCNTarget();127128if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {129getTargetStreamer()->EmitDirectiveAMDHSACodeObjectVersion(130CodeObjectVersion);131HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID());132}133134if (TM.getTargetTriple().getOS() == Triple::AMDPAL)135getTargetStreamer()->getPALMetadata()->readFromIR(M);136}137138void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {139// Init target streamer if it has not yet happened140if (!IsTargetStreamerInitialized)141initTargetStreamer(M);142143if (TM.getTargetTriple().getOS() != Triple::AMDHSA)144getTargetStreamer()->EmitISAVersion();145146// Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).147// Emit HSA Metadata (NT_AMD_HSA_METADATA).148if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {149HSAMetadataStream->end();150bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());151(void)Success;152assert(Success && "Malformed HSA Metadata");153}154}155156void AMDGPUAsmPrinter::emitFunctionBodyStart() {157const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();158const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();159const Function &F = MF->getFunction();160161// TODO: We're checking this late, would be nice to check it earlier.162if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {163report_fatal_error(164STM.getCPU() + " is only available on code object version 6 or better",165/*gen_crash_diag*/ false);166}167168// TODO: Which one is called first, emitStartOfAsmFile or169// emitFunctionBodyStart?170if (!getTargetStreamer()->getTargetID())171initializeTargetID(*F.getParent());172173const auto &FunctionTargetID = STM.getTargetID();174// Make sure function's xnack settings are compatible with module's175// xnack settings.176if (FunctionTargetID.isXnackSupported() &&177FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&178FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) {179OutContext.reportError({}, "xnack setting of '" + Twine(MF->getName()) +180"' function does not match module xnack setting");181return;182}183// Make sure function's sramecc settings are compatible with module's184// sramecc settings.185if (FunctionTargetID.isSramEccSupported() &&186FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&187FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) {188OutContext.reportError({}, "sramecc setting of '" + Twine(MF->getName()) +189"' function does not match module sramecc setting");190return;191}192193if (!MFI.isEntryFunction())194return;195196if (STM.isMesaKernel(F) &&197(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||198F.getCallingConv() == CallingConv::SPIR_KERNEL)) {199AMDGPUMCKernelCodeT KernelCode;200getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);201KernelCode.validate(&STM, MF->getContext());202getTargetStreamer()->EmitAMDKernelCodeT(KernelCode);203}204205if (STM.isAmdHsaOS())206HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);207208if (MFI.getNumKernargPreloadedSGPRs() > 0) {209assert(AMDGPU::hasKernargPreload(STM));210getTargetStreamer()->EmitKernargPreloadHeader(*getGlobalSTI(),211STM.isAmdHsaOS());212}213}214215void AMDGPUAsmPrinter::emitFunctionBodyEnd() {216const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();217if (!MFI.isEntryFunction())218return;219220if (TM.getTargetTriple().getOS() != Triple::AMDHSA)221return;222223auto &Streamer = getTargetStreamer()->getStreamer();224auto &Context = Streamer.getContext();225auto &ObjectFileInfo = *Context.getObjectFileInfo();226auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();227228Streamer.pushSection();229Streamer.switchSection(&ReadOnlySection);230231// CP microcode requires the kernel descriptor to be allocated on 64 byte232// alignment.233Streamer.emitValueToAlignment(Align(64), 0, 1, 0);234ReadOnlySection.ensureMinAlignment(Align(64));235236const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();237238SmallString<128> KernelName;239getNameWithPrefix(KernelName, &MF->getFunction());240getTargetStreamer()->EmitAmdhsaKernelDescriptor(241STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),242CurrentProgramInfo.NumVGPRsForWavesPerEU,243MCBinaryExpr::createSub(244CurrentProgramInfo.NumSGPRsForWavesPerEU,245AMDGPUMCExpr::createExtraSGPRs(246CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,247getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Context),248Context),249CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);250251Streamer.popSection();252}253254void AMDGPUAsmPrinter::emitImplicitDef(const MachineInstr *MI) const {255Register RegNo = MI->getOperand(0).getReg();256257SmallString<128> Str;258raw_svector_ostream OS(Str);259OS << "implicit-def: "260<< printReg(RegNo, MF->getSubtarget().getRegisterInfo());261262if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL)263OS << " : SGPR spill to VGPR lane";264265OutStreamer->AddComment(OS.str());266OutStreamer->addBlankLine();267}268269void AMDGPUAsmPrinter::emitFunctionEntryLabel() {270if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {271AsmPrinter::emitFunctionEntryLabel();272return;273}274275const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();276const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();277if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {278SmallString<128> SymbolName;279getNameWithPrefix(SymbolName, &MF->getFunction()),280getTargetStreamer()->EmitAMDGPUSymbolType(281SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);282}283if (DumpCodeInstEmitter) {284// Disassemble function name label to text.285DisasmLines.push_back(MF->getName().str() + ":");286DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());287HexLines.emplace_back("");288}289290AsmPrinter::emitFunctionEntryLabel();291}292293void AMDGPUAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) {294if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {295// Write a line for the basic block label if it is not only fallthrough.296DisasmLines.push_back(297(Twine("BB") + Twine(getFunctionNumber())298+ "_" + Twine(MBB.getNumber()) + ":").str());299DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());300HexLines.emplace_back("");301}302AsmPrinter::emitBasicBlockStart(MBB);303}304305void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {306if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {307if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {308OutContext.reportError({},309Twine(GV->getName()) +310": unsupported initializer for address space");311return;312}313314// LDS variables aren't emitted in HSA or PAL yet.315const Triple::OSType OS = TM.getTargetTriple().getOS();316if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)317return;318319MCSymbol *GVSym = getSymbol(GV);320321GVSym->redefineIfPossible();322if (GVSym->isDefined() || GVSym->isVariable())323report_fatal_error("symbol '" + Twine(GVSym->getName()) +324"' is already defined");325326const DataLayout &DL = GV->getDataLayout();327uint64_t Size = DL.getTypeAllocSize(GV->getValueType());328Align Alignment = GV->getAlign().value_or(Align(4));329330emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());331emitLinkage(GV, GVSym);332auto TS = getTargetStreamer();333TS->emitAMDGPULDS(GVSym, Size, Alignment);334return;335}336337AsmPrinter::emitGlobalVariable(GV);338}339340bool AMDGPUAsmPrinter::doInitialization(Module &M) {341CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M);342343if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {344switch (CodeObjectVersion) {345case AMDGPU::AMDHSA_COV4:346HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV4>();347break;348case AMDGPU::AMDHSA_COV5:349HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV5>();350break;351case AMDGPU::AMDHSA_COV6:352HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV6>();353break;354default:355report_fatal_error("Unexpected code object version");356}357}358return AsmPrinter::doInitialization(M);359}360361bool AMDGPUAsmPrinter::doFinalization(Module &M) {362// Pad with s_code_end to help tools and guard against instruction prefetch363// causing stale data in caches. Arguably this should be done by the linker,364// which is why this isn't done for Mesa.365const MCSubtargetInfo &STI = *getGlobalSTI();366if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&367(STI.getTargetTriple().getOS() == Triple::AMDHSA ||368STI.getTargetTriple().getOS() == Triple::AMDPAL)) {369OutStreamer->switchSection(getObjFileLowering().getTextSection());370getTargetStreamer()->EmitCodeEnd(STI);371}372373return AsmPrinter::doFinalization(M);374}375376// Print comments that apply to both callable functions and entry points.377void AMDGPUAsmPrinter::emitCommonFunctionComments(378uint32_t NumVGPR, std::optional<uint32_t> NumAGPR, uint32_t TotalNumVGPR,379uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize,380const AMDGPUMachineFunction *MFI) {381OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);382OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);383OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);384if (NumAGPR) {385OutStreamer->emitRawComment(" NumAgprs: " + Twine(*NumAGPR), false);386OutStreamer->emitRawComment(" TotalNumVgprs: " + Twine(TotalNumVGPR),387false);388}389OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);390OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),391false);392}393394SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {395SmallString<128> Str;396raw_svector_ostream OSS(Str);397int64_t IVal;398if (Value->evaluateAsAbsolute(IVal)) {399OSS << static_cast<uint64_t>(IVal);400} else {401Value->print(OSS, MAI);402}403return Str;404}405406void AMDGPUAsmPrinter::emitCommonFunctionComments(407const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR,408const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize,409const AMDGPUMachineFunction *MFI) {410OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);411OutStreamer->emitRawComment(" NumSgprs: " + getMCExprStr(NumSGPR), false);412OutStreamer->emitRawComment(" NumVgprs: " + getMCExprStr(NumVGPR), false);413if (NumAGPR && TotalNumVGPR) {414OutStreamer->emitRawComment(" NumAgprs: " + getMCExprStr(NumAGPR), false);415OutStreamer->emitRawComment(" TotalNumVgprs: " + getMCExprStr(TotalNumVGPR),416false);417}418OutStreamer->emitRawComment(" ScratchSize: " + getMCExprStr(ScratchSize),419false);420OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),421false);422}423424const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(425const MachineFunction &MF) const {426const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();427MCContext &Ctx = MF.getContext();428uint16_t KernelCodeProperties = 0;429const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();430431if (UserSGPRInfo.hasPrivateSegmentBuffer()) {432KernelCodeProperties |=433amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;434}435if (UserSGPRInfo.hasDispatchPtr()) {436KernelCodeProperties |=437amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;438}439if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) {440KernelCodeProperties |=441amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;442}443if (UserSGPRInfo.hasKernargSegmentPtr()) {444KernelCodeProperties |=445amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;446}447if (UserSGPRInfo.hasDispatchID()) {448KernelCodeProperties |=449amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;450}451if (UserSGPRInfo.hasFlatScratchInit()) {452KernelCodeProperties |=453amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;454}455if (UserSGPRInfo.hasPrivateSegmentSize()) {456KernelCodeProperties |=457amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;458}459if (MF.getSubtarget<GCNSubtarget>().isWave32()) {460KernelCodeProperties |=461amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;462}463464// CurrentProgramInfo.DynamicCallStack is a MCExpr and could be465// un-evaluatable at this point so it cannot be conditionally checked here.466// Instead, we'll directly shift the possibly unknown MCExpr into its place467// and bitwise-or it into KernelCodeProperties.468const MCExpr *KernelCodePropExpr =469MCConstantExpr::create(KernelCodeProperties, Ctx);470const MCExpr *OrValue = MCConstantExpr::create(471amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT, Ctx);472OrValue = MCBinaryExpr::createShl(CurrentProgramInfo.DynamicCallStack,473OrValue, Ctx);474KernelCodePropExpr = MCBinaryExpr::createOr(KernelCodePropExpr, OrValue, Ctx);475476return KernelCodePropExpr;477}478479MCKernelDescriptor480AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,481const SIProgramInfo &PI) const {482const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();483const Function &F = MF.getFunction();484const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();485MCContext &Ctx = MF.getContext();486487MCKernelDescriptor KernelDescriptor;488489KernelDescriptor.group_segment_fixed_size =490MCConstantExpr::create(PI.LDSSize, Ctx);491KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;492493Align MaxKernArgAlign;494KernelDescriptor.kernarg_size = MCConstantExpr::create(495STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx);496497KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM, Ctx);498KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx);499KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);500501int64_t PGRM_Rsrc3 = 1;502bool EvaluatableRsrc3 =503CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(PGRM_Rsrc3);504(void)PGRM_Rsrc3;505(void)EvaluatableRsrc3;506assert(STM.hasGFX90AInsts() || !EvaluatableRsrc3 ||507static_cast<uint64_t>(PGRM_Rsrc3) == 0);508KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3GFX90A;509510KernelDescriptor.kernarg_preload = MCConstantExpr::create(511AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,512Ctx);513514return KernelDescriptor;515}516517bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {518// Init target streamer lazily on the first function so that previous passes519// can set metadata.520if (!IsTargetStreamerInitialized)521initTargetStreamer(*MF.getFunction().getParent());522523ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>();524CurrentProgramInfo.reset(MF);525526const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();527MCContext &Ctx = MF.getContext();528529// The starting address of all shader programs must be 256 bytes aligned.530// Regular functions just need the basic required instruction alignment.531MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));532533SetupMachineFunction(MF);534535const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();536MCContext &Context = getObjFileLowering().getContext();537// FIXME: This should be an explicit check for Mesa.538if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {539MCSectionELF *ConfigSection =540Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);541OutStreamer->switchSection(ConfigSection);542}543544if (MFI->isModuleEntryFunction()) {545getSIProgramInfo(CurrentProgramInfo, MF);546}547548if (STM.isAmdPalOS()) {549if (MFI->isEntryFunction())550EmitPALMetadata(MF, CurrentProgramInfo);551else if (MFI->isModuleEntryFunction())552emitPALFunctionMetadata(MF);553} else if (!STM.isAmdHsaOS()) {554EmitProgramInfoSI(MF, CurrentProgramInfo);555}556557DumpCodeInstEmitter = nullptr;558if (STM.dumpCode()) {559// For -dumpcode, get the assembler out of the streamer. This only works560// with -filetype=obj.561MCAssembler *Assembler = OutStreamer->getAssemblerPtr();562if (Assembler)563DumpCodeInstEmitter = Assembler->getEmitterPtr();564}565566DisasmLines.clear();567HexLines.clear();568DisasmLineMaxLen = 0;569570emitFunctionBody();571572emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),573STM.hasMAIInsts());574575if (isVerbose()) {576MCSectionELF *CommentSection =577Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);578OutStreamer->switchSection(CommentSection);579580if (!MFI->isEntryFunction()) {581OutStreamer->emitRawComment(" Function info:", false);582const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =583ResourceUsage->getResourceInfo(&MF.getFunction());584emitCommonFunctionComments(585Info.NumVGPR,586STM.hasMAIInsts() ? Info.NumAGPR : std::optional<uint32_t>(),587Info.getTotalNumVGPRs(STM),588Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),589Info.PrivateSegmentSize, getFunctionCodeSize(MF), MFI);590return false;591}592593OutStreamer->emitRawComment(" Kernel info:", false);594emitCommonFunctionComments(595CurrentProgramInfo.NumArchVGPR,596STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR : nullptr,597CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,598CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI);599600OutStreamer->emitRawComment(601" FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);602OutStreamer->emitRawComment(603" IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);604OutStreamer->emitRawComment(605" LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +606" bytes/workgroup (compile time only)", false);607608OutStreamer->emitRawComment(609" SGPRBlocks: " + getMCExprStr(CurrentProgramInfo.SGPRBlocks), false);610611OutStreamer->emitRawComment(612" VGPRBlocks: " + getMCExprStr(CurrentProgramInfo.VGPRBlocks), false);613614OutStreamer->emitRawComment(615" NumSGPRsForWavesPerEU: " +616getMCExprStr(CurrentProgramInfo.NumSGPRsForWavesPerEU),617false);618OutStreamer->emitRawComment(619" NumVGPRsForWavesPerEU: " +620getMCExprStr(CurrentProgramInfo.NumVGPRsForWavesPerEU),621false);622623if (STM.hasGFX90AInsts()) {624const MCExpr *AdjustedAccum = MCBinaryExpr::createAdd(625CurrentProgramInfo.AccumOffset, MCConstantExpr::create(1, Ctx), Ctx);626AdjustedAccum = MCBinaryExpr::createMul(627AdjustedAccum, MCConstantExpr::create(4, Ctx), Ctx);628OutStreamer->emitRawComment(629" AccumOffset: " + getMCExprStr(AdjustedAccum), false);630}631632OutStreamer->emitRawComment(633" Occupancy: " + getMCExprStr(CurrentProgramInfo.Occupancy), false);634635OutStreamer->emitRawComment(636" WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);637638OutStreamer->emitRawComment(639" COMPUTE_PGM_RSRC2:SCRATCH_EN: " +640getMCExprStr(CurrentProgramInfo.ScratchEnable),641false);642OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +643Twine(CurrentProgramInfo.UserSGPR),644false);645OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +646Twine(CurrentProgramInfo.TrapHandlerEnable),647false);648OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +649Twine(CurrentProgramInfo.TGIdXEnable),650false);651OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +652Twine(CurrentProgramInfo.TGIdYEnable),653false);654OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +655Twine(CurrentProgramInfo.TGIdZEnable),656false);657OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +658Twine(CurrentProgramInfo.TIdIGCompCount),659false);660661[[maybe_unused]] int64_t PGMRSrc3;662assert(STM.hasGFX90AInsts() ||663(CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(664PGMRSrc3) &&665static_cast<uint64_t>(PGMRSrc3) == 0));666if (STM.hasGFX90AInsts()) {667OutStreamer->emitRawComment(668" COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +669getMCExprStr(MCKernelDescriptor::bits_get(670CurrentProgramInfo.ComputePGMRSrc3GFX90A,671amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,672amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),673false);674OutStreamer->emitRawComment(675" COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +676getMCExprStr(MCKernelDescriptor::bits_get(677CurrentProgramInfo.ComputePGMRSrc3GFX90A,678amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,679amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),680false);681}682}683684if (DumpCodeInstEmitter) {685686OutStreamer->switchSection(687Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));688689for (size_t i = 0; i < DisasmLines.size(); ++i) {690std::string Comment = "\n";691if (!HexLines[i].empty()) {692Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');693Comment += " ; " + HexLines[i] + "\n";694}695696OutStreamer->emitBytes(StringRef(DisasmLines[i]));697OutStreamer->emitBytes(StringRef(Comment));698}699}700701return false;702}703704// TODO: Fold this into emitFunctionBodyStart.705void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {706// In the beginning all features are either 'Any' or 'NotSupported',707// depending on global target features. This will cover empty modules.708getTargetStreamer()->initializeTargetID(*getGlobalSTI(),709getGlobalSTI()->getFeatureString());710711// If module is empty, we are done.712if (M.empty())713return;714715// If module is not empty, need to find first 'Off' or 'On' feature716// setting per feature from functions in module.717for (auto &F : M) {718auto &TSTargetID = getTargetStreamer()->getTargetID();719if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&720(!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))721break;722723const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);724const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();725if (TSTargetID->isXnackSupported())726if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)727TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());728if (TSTargetID->isSramEccSupported())729if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)730TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());731}732}733734uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {735const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();736const SIInstrInfo *TII = STM.getInstrInfo();737738uint64_t CodeSize = 0;739740for (const MachineBasicBlock &MBB : MF) {741for (const MachineInstr &MI : MBB) {742// TODO: CodeSize should account for multiple functions.743744// TODO: Should we count size of debug info?745if (MI.isDebugInstr())746continue;747748CodeSize += TII->getInstSizeInBytes(MI);749}750}751752return CodeSize;753}754755void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,756const MachineFunction &MF) {757const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =758ResourceUsage->getResourceInfo(&MF.getFunction());759const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();760MCContext &Ctx = MF.getContext();761762auto CreateExpr = [&Ctx](int64_t Value) {763return MCConstantExpr::create(Value, Ctx);764};765766auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {767int64_t Val;768if (Value->evaluateAsAbsolute(Val)) {769Res = Val;770return true;771}772return false;773};774775ProgInfo.NumArchVGPR = CreateExpr(Info.NumVGPR);776ProgInfo.NumAccVGPR = CreateExpr(Info.NumAGPR);777ProgInfo.NumVGPR = CreateExpr(Info.getTotalNumVGPRs(STM));778ProgInfo.AccumOffset =779CreateExpr(alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1);780ProgInfo.TgSplit = STM.isTgSplitEnabled();781ProgInfo.NumSGPR = CreateExpr(Info.NumExplicitSGPR);782ProgInfo.ScratchSize = CreateExpr(Info.PrivateSegmentSize);783ProgInfo.VCCUsed = CreateExpr(Info.UsesVCC);784ProgInfo.FlatUsed = CreateExpr(Info.UsesFlatScratch);785ProgInfo.DynamicCallStack =786CreateExpr(Info.HasDynamicallySizedStack || Info.HasRecursion);787788const uint64_t MaxScratchPerWorkitem =789STM.getMaxWaveScratchSize() / STM.getWavefrontSize();790uint64_t ScratchSize;791if (TryGetMCExprValue(ProgInfo.ScratchSize, ScratchSize) &&792ScratchSize > MaxScratchPerWorkitem) {793DiagnosticInfoStackSize DiagStackSize(MF.getFunction(), ScratchSize,794MaxScratchPerWorkitem, DS_Error);795MF.getFunction().getContext().diagnose(DiagStackSize);796}797798const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();799800// The calculations related to SGPR/VGPR blocks are801// duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be802// unified.803const MCExpr *ExtraSGPRs = AMDGPUMCExpr::createExtraSGPRs(804ProgInfo.VCCUsed, ProgInfo.FlatUsed,805getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx);806807// Check the addressable register limit before we add ExtraSGPRs.808if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&809!STM.hasSGPRInitBug()) {810unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();811uint64_t NumSgpr;812if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&813NumSgpr > MaxAddressableNumSGPRs) {814// This can happen due to a compiler bug or when using inline asm.815LLVMContext &Ctx = MF.getFunction().getContext();816DiagnosticInfoResourceLimit Diag(817MF.getFunction(), "addressable scalar registers", NumSgpr,818MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit);819Ctx.diagnose(Diag);820ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs - 1);821}822}823824// Account for extra SGPRs and VGPRs reserved for debugger use.825ProgInfo.NumSGPR = MCBinaryExpr::createAdd(ProgInfo.NumSGPR, ExtraSGPRs, Ctx);826827const Function &F = MF.getFunction();828829// Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave830// dispatch registers are function args.831unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;832833if (isShader(F.getCallingConv())) {834bool IsPixelShader =835F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();836837// Calculate the number of VGPR registers based on the SPI input registers838uint32_t InputEna = 0;839uint32_t InputAddr = 0;840unsigned LastEna = 0;841842if (IsPixelShader) {843// Note for IsPixelShader:844// By this stage, all enabled inputs are tagged in InputAddr as well.845// We will use InputAddr to determine whether the input counts against the846// vgpr total and only use the InputEnable to determine the last input847// that is relevant - if extra arguments are used, then we have to honour848// the InputAddr for any intermediate non-enabled inputs.849InputEna = MFI->getPSInputEnable();850InputAddr = MFI->getPSInputAddr();851852// We only need to consider input args up to the last used arg.853assert((InputEna || InputAddr) &&854"PSInputAddr and PSInputEnable should "855"never both be 0 for AMDGPU_PS shaders");856// There are some rare circumstances where InputAddr is non-zero and857// InputEna can be set to 0. In this case we default to setting LastEna858// to 1.859LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1;860}861862// FIXME: We should be using the number of registers determined during863// calling convention lowering to legalize the types.864const DataLayout &DL = F.getDataLayout();865unsigned PSArgCount = 0;866unsigned IntermediateVGPR = 0;867for (auto &Arg : F.args()) {868unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32;869if (Arg.hasAttribute(Attribute::InReg)) {870WaveDispatchNumSGPR += NumRegs;871} else {872// If this is a PS shader and we're processing the PS Input args (first873// 16 VGPR), use the InputEna and InputAddr bits to define how many874// VGPRs are actually used.875// Any extra VGPR arguments are handled as normal arguments (and876// contribute to the VGPR count whether they're used or not).877if (IsPixelShader && PSArgCount < 16) {878if ((1 << PSArgCount) & InputAddr) {879if (PSArgCount < LastEna)880WaveDispatchNumVGPR += NumRegs;881else882IntermediateVGPR += NumRegs;883}884PSArgCount++;885} else {886// If there are extra arguments we have to include the allocation for887// the non-used (but enabled with InputAddr) input arguments888if (IntermediateVGPR) {889WaveDispatchNumVGPR += IntermediateVGPR;890IntermediateVGPR = 0;891}892WaveDispatchNumVGPR += NumRegs;893}894}895}896ProgInfo.NumSGPR = AMDGPUMCExpr::createMax(897{ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx);898899ProgInfo.NumArchVGPR = AMDGPUMCExpr::createMax(900{ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);901902ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(903ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);904}905906// Adjust number of registers used to meet default/requested minimum/maximum907// number of waves per execution unit request.908unsigned MaxWaves = MFI->getMaxWavesPerEU();909ProgInfo.NumSGPRsForWavesPerEU =910AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, CreateExpr(1ul),911CreateExpr(STM.getMinNumSGPRs(MaxWaves))},912Ctx);913ProgInfo.NumVGPRsForWavesPerEU =914AMDGPUMCExpr::createMax({ProgInfo.NumVGPR, CreateExpr(1ul),915CreateExpr(STM.getMinNumVGPRs(MaxWaves))},916Ctx);917918if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||919STM.hasSGPRInitBug()) {920unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();921uint64_t NumSgpr;922if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&923NumSgpr > MaxAddressableNumSGPRs) {924// This can happen due to a compiler bug or when using inline asm to use925// the registers which are usually reserved for vcc etc.926LLVMContext &Ctx = MF.getFunction().getContext();927DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",928NumSgpr, MaxAddressableNumSGPRs,929DS_Error, DK_ResourceLimit);930Ctx.diagnose(Diag);931ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs);932ProgInfo.NumSGPRsForWavesPerEU = CreateExpr(MaxAddressableNumSGPRs);933}934}935936if (STM.hasSGPRInitBug()) {937ProgInfo.NumSGPR =938CreateExpr(AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG);939ProgInfo.NumSGPRsForWavesPerEU =940CreateExpr(AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG);941}942943if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {944LLVMContext &Ctx = MF.getFunction().getContext();945DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",946MFI->getNumUserSGPRs(),947STM.getMaxNumUserSGPRs(), DS_Error);948Ctx.diagnose(Diag);949}950951if (MFI->getLDSSize() >952static_cast<unsigned>(STM.getAddressableLocalMemorySize())) {953LLVMContext &Ctx = MF.getFunction().getContext();954DiagnosticInfoResourceLimit Diag(955MF.getFunction(), "local memory", MFI->getLDSSize(),956STM.getAddressableLocalMemorySize(), DS_Error);957Ctx.diagnose(Diag);958}959// The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks:960// (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1961auto GetNumGPRBlocks = [&CreateExpr, &Ctx](const MCExpr *NumGPR,962unsigned Granule) {963const MCExpr *OneConst = CreateExpr(1ul);964const MCExpr *GranuleConst = CreateExpr(Granule);965const MCExpr *MaxNumGPR = AMDGPUMCExpr::createMax({NumGPR, OneConst}, Ctx);966const MCExpr *AlignToGPR =967AMDGPUMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx);968const MCExpr *DivGPR =969MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx);970const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx);971return SubGPR;972};973974ProgInfo.SGPRBlocks = GetNumGPRBlocks(ProgInfo.NumSGPRsForWavesPerEU,975IsaInfo::getSGPREncodingGranule(&STM));976ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU,977IsaInfo::getVGPREncodingGranule(&STM));978979const SIModeRegisterDefaults Mode = MFI->getMode();980981// Set the value to initialize FP_ROUND and FP_DENORM parts of the mode982// register.983ProgInfo.FloatMode = getFPMode(Mode);984985ProgInfo.IEEEMode = Mode.IEEE;986987// Make clamp modifier on NaN input returns 0.988ProgInfo.DX10Clamp = Mode.DX10Clamp;989990unsigned LDSAlignShift;991if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {992// LDS is allocated in 64 dword blocks.993LDSAlignShift = 8;994} else {995// LDS is allocated in 128 dword blocks.996LDSAlignShift = 9;997}998999ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();1000ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();10011002ProgInfo.LDSSize = MFI->getLDSSize();1003ProgInfo.LDSBlocks =1004alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;10051006// The MCExpr equivalent of divideCeil.1007auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) {1008const MCExpr *Ceil =1009AMDGPUMCExpr::createAlignTo(Numerator, Denominator, Ctx);1010return MCBinaryExpr::createDiv(Ceil, Denominator, Ctx);1011};10121013// Scratch is allocated in 64-dword or 256-dword blocks.1014unsigned ScratchAlignShift =1015STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;1016// We need to program the hardware with the amount of scratch memory that1017// is used by the entire wave. ProgInfo.ScratchSize is the amount of1018// scratch memory used per thread.1019ProgInfo.ScratchBlocks = DivideCeil(1020MCBinaryExpr::createMul(ProgInfo.ScratchSize,1021CreateExpr(STM.getWavefrontSize()), Ctx),1022CreateExpr(1ULL << ScratchAlignShift));10231024if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {1025ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;1026ProgInfo.MemOrdered = 1;1027}10281029// 0 = X, 1 = XY, 2 = XYZ1030unsigned TIDIGCompCnt = 0;1031if (MFI->hasWorkItemIDZ())1032TIDIGCompCnt = 2;1033else if (MFI->hasWorkItemIDY())1034TIDIGCompCnt = 1;10351036// The private segment wave byte offset is the last of the system SGPRs. We1037// initially assumed it was allocated, and may have used it. It shouldn't harm1038// anything to disable it if we know the stack isn't used here. We may still1039// have emitted code reading it to initialize scratch, but if that's unused1040// reading garbage should be OK.1041ProgInfo.ScratchEnable = MCBinaryExpr::createLOr(1042MCBinaryExpr::createGT(ProgInfo.ScratchBlocks,1043MCConstantExpr::create(0, Ctx), Ctx),1044ProgInfo.DynamicCallStack, Ctx);10451046ProgInfo.UserSGPR = MFI->getNumUserSGPRs();1047// For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.1048ProgInfo.TrapHandlerEnable =1049STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled();1050ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();1051ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();1052ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();1053ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();1054ProgInfo.TIdIGCompCount = TIDIGCompCnt;1055ProgInfo.EXCPEnMSB = 0;1056// For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.1057ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;1058ProgInfo.EXCPEnable = 0;10591060if (STM.hasGFX90AInsts()) {1061// return ((Dst & ~Mask) | (Value << Shift))1062auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,1063uint32_t Shift) {1064auto Shft = MCConstantExpr::create(Shift, Ctx);1065auto Msk = MCConstantExpr::create(Mask, Ctx);1066Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);1067Dst = MCBinaryExpr::createOr(1068Dst, MCBinaryExpr::createShl(Value, Shft, Ctx), Ctx);1069return Dst;1070};10711072ProgInfo.ComputePGMRSrc3GFX90A =1073SetBits(ProgInfo.ComputePGMRSrc3GFX90A, ProgInfo.AccumOffset,1074amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,1075amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);1076ProgInfo.ComputePGMRSrc3GFX90A =1077SetBits(ProgInfo.ComputePGMRSrc3GFX90A, CreateExpr(ProgInfo.TgSplit),1078amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,1079amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);1080}10811082ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy(1083STM.computeOccupancy(F, ProgInfo.LDSSize), ProgInfo.NumSGPRsForWavesPerEU,1084ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx);10851086const auto [MinWEU, MaxWEU] =1087AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);1088uint64_t Occupancy;1089if (TryGetMCExprValue(ProgInfo.Occupancy, Occupancy) && Occupancy < MinWEU) {1090DiagnosticInfoOptimizationFailure Diag(1091F, F.getSubprogram(),1092"failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "1093"'" +1094F.getName() + "': desired occupancy was " + Twine(MinWEU) +1095", final occupancy is " + Twine(Occupancy));1096F.getContext().diagnose(Diag);1097}1098}10991100static unsigned getRsrcReg(CallingConv::ID CallConv) {1101switch (CallConv) {1102default: [[fallthrough]];1103case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;1104case CallingConv::AMDGPU_LS: return R_00B528_SPI_SHADER_PGM_RSRC1_LS;1105case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS;1106case CallingConv::AMDGPU_ES: return R_00B328_SPI_SHADER_PGM_RSRC1_ES;1107case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;1108case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;1109case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;1110}1111}11121113void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,1114const SIProgramInfo &CurrentProgramInfo) {1115const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();1116const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();1117unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());1118MCContext &Ctx = MF.getContext();11191120// (((Value) & Mask) << Shift)1121auto SetBits = [&Ctx](const MCExpr *Value, uint32_t Mask, uint32_t Shift) {1122const MCExpr *msk = MCConstantExpr::create(Mask, Ctx);1123const MCExpr *shft = MCConstantExpr::create(Shift, Ctx);1124return MCBinaryExpr::createShl(MCBinaryExpr::createAnd(Value, msk, Ctx),1125shft, Ctx);1126};11271128auto EmitResolvedOrExpr = [this](const MCExpr *Value, unsigned Size) {1129int64_t Val;1130if (Value->evaluateAsAbsolute(Val))1131OutStreamer->emitIntValue(static_cast<uint64_t>(Val), Size);1132else1133OutStreamer->emitValue(Value, Size);1134};11351136if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {1137OutStreamer->emitInt32(R_00B848_COMPUTE_PGM_RSRC1);11381139EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx),1140/*Size=*/4);11411142OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2);1143EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(Ctx), /*Size=*/4);11441145OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);11461147// Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the1148// appropriate generation.1149if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {1150EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,1151/*Mask=*/0x3FFFF, /*Shift=*/12),1152/*Size=*/4);1153} else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {1154EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,1155/*Mask=*/0x7FFF, /*Shift=*/12),1156/*Size=*/4);1157} else {1158EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,1159/*Mask=*/0x1FFF, /*Shift=*/12),1160/*Size=*/4);1161}11621163// TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =1164// 0" comment but I don't see a corresponding field in the register spec.1165} else {1166OutStreamer->emitInt32(RsrcReg);11671168const MCExpr *GPRBlocks = MCBinaryExpr::createOr(1169SetBits(CurrentProgramInfo.VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0),1170SetBits(CurrentProgramInfo.SGPRBlocks, /*Mask=*/0x0F, /*Shift=*/6),1171MF.getContext());1172EmitResolvedOrExpr(GPRBlocks, /*Size=*/4);1173OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE);11741175// Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the1176// appropriate generation.1177if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {1178EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,1179/*Mask=*/0x3FFFF, /*Shift=*/12),1180/*Size=*/4);1181} else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {1182EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,1183/*Mask=*/0x7FFF, /*Shift=*/12),1184/*Size=*/4);1185} else {1186EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,1187/*Mask=*/0x1FFF, /*Shift=*/12),1188/*Size=*/4);1189}1190}11911192if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {1193OutStreamer->emitInt32(R_00B02C_SPI_SHADER_PGM_RSRC2_PS);1194unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX111195? divideCeil(CurrentProgramInfo.LDSBlocks, 2)1196: CurrentProgramInfo.LDSBlocks;1197OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));1198OutStreamer->emitInt32(R_0286CC_SPI_PS_INPUT_ENA);1199OutStreamer->emitInt32(MFI->getPSInputEnable());1200OutStreamer->emitInt32(R_0286D0_SPI_PS_INPUT_ADDR);1201OutStreamer->emitInt32(MFI->getPSInputAddr());1202}12031204OutStreamer->emitInt32(R_SPILLED_SGPRS);1205OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());1206OutStreamer->emitInt32(R_SPILLED_VGPRS);1207OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());1208}12091210// Helper function to add common PAL Metadata 3.0+1211static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,1212const SIProgramInfo &CurrentProgramInfo,1213CallingConv::ID CC, const GCNSubtarget &ST) {1214if (ST.hasIEEEMode())1215MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);12161217MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);1218MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);12191220if (AMDGPU::isCompute(CC)) {1221MD->setHwStage(CC, ".trap_present",1222(bool)CurrentProgramInfo.TrapHandlerEnable);1223MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);1224}12251226MD->setHwStage(CC, ".lds_size",1227(unsigned)(CurrentProgramInfo.LdsSize *1228getLdsDwGranularity(ST) * sizeof(uint32_t)));1229}12301231// This is the equivalent of EmitProgramInfoSI above, but for when the OS type1232// is AMDPAL. It stores each compute/SPI register setting and other PAL1233// metadata items into the PALMD::Metadata, combining with any provided by the1234// frontend as LLVM metadata. Once all functions are written, the PAL metadata1235// is then written as a single block in the .note section.1236void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,1237const SIProgramInfo &CurrentProgramInfo) {1238const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();1239auto CC = MF.getFunction().getCallingConv();1240auto MD = getTargetStreamer()->getPALMetadata();1241auto &Ctx = MF.getContext();12421243MD->setEntryPoint(CC, MF.getFunction().getName());1244MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx);12451246// Only set AGPRs for supported devices1247const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();1248if (STM.hasMAIInsts()) {1249MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);1250}12511252MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx);1253if (MD->getPALMajorVersion() < 3) {1254MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM, Ctx), Ctx);1255if (AMDGPU::isCompute(CC)) {1256MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);1257} else {1258const MCExpr *HasScratchBlocks =1259MCBinaryExpr::createGT(CurrentProgramInfo.ScratchBlocks,1260MCConstantExpr::create(0, Ctx), Ctx);1261auto [Shift, Mask] = getShiftMask(C_00B84C_SCRATCH_EN);1262MD->setRsrc2(CC, maskShiftSet(HasScratchBlocks, Mask, Shift, Ctx), Ctx);1263}1264} else {1265MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);1266MD->setHwStage(CC, ".scratch_en", msgpack::Type::Boolean,1267CurrentProgramInfo.ScratchEnable);1268EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM);1269}12701271// ScratchSize is in bytes, 16 aligned.1272MD->setScratchSize(1273CC,1274AMDGPUMCExpr::createAlignTo(CurrentProgramInfo.ScratchSize,1275MCConstantExpr::create(16, Ctx), Ctx),1276Ctx);12771278if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {1279unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX111280? divideCeil(CurrentProgramInfo.LDSBlocks, 2)1281: CurrentProgramInfo.LDSBlocks;1282if (MD->getPALMajorVersion() < 3) {1283MD->setRsrc2(1284CC,1285MCConstantExpr::create(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize), Ctx),1286Ctx);1287MD->setSpiPsInputEna(MFI->getPSInputEnable());1288MD->setSpiPsInputAddr(MFI->getPSInputAddr());1289} else {1290// Graphics registers1291const unsigned ExtraLdsDwGranularity =1292STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128;1293MD->setGraphicsRegisters(1294".ps_extra_lds_size",1295(unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));12961297// Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr1298static StringLiteral const PsInputFields[] = {1299".persp_sample_ena", ".persp_center_ena",1300".persp_centroid_ena", ".persp_pull_model_ena",1301".linear_sample_ena", ".linear_center_ena",1302".linear_centroid_ena", ".line_stipple_tex_ena",1303".pos_x_float_ena", ".pos_y_float_ena",1304".pos_z_float_ena", ".pos_w_float_ena",1305".front_face_ena", ".ancillary_ena",1306".sample_coverage_ena", ".pos_fixed_pt_ena"};1307unsigned PSInputEna = MFI->getPSInputEnable();1308unsigned PSInputAddr = MFI->getPSInputAddr();1309for (auto [Idx, Field] : enumerate(PsInputFields)) {1310MD->setGraphicsRegisters(".spi_ps_input_ena", Field,1311(bool)((PSInputEna >> Idx) & 1));1312MD->setGraphicsRegisters(".spi_ps_input_addr", Field,1313(bool)((PSInputAddr >> Idx) & 1));1314}1315}1316}13171318// For version 3 and above the wave front size is already set in the metadata1319if (MD->getPALMajorVersion() < 3 && STM.isWave32())1320MD->setWave32(MF.getFunction().getCallingConv());1321}13221323void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {1324auto *MD = getTargetStreamer()->getPALMetadata();1325const MachineFrameInfo &MFI = MF.getFrameInfo();1326StringRef FnName = MF.getFunction().getName();1327MD->setFunctionScratchSize(FnName, MFI.getStackSize());1328const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();1329MCContext &Ctx = MF.getContext();13301331if (MD->getPALMajorVersion() < 3) {1332// Set compute registers1333MD->setRsrc1(1334CallingConv::AMDGPU_CS,1335CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST, Ctx), Ctx);1336MD->setRsrc2(CallingConv::AMDGPU_CS,1337CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);1338} else {1339EmitPALMetadataCommon(MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST);1340}13411342// Set optional info1343MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);1344MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU);1345MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU);1346}13471348// This is supposed to be log2(Size)1349static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {1350switch (Size) {1351case 4:1352return AMD_ELEMENT_4_BYTES;1353case 8:1354return AMD_ELEMENT_8_BYTES;1355case 16:1356return AMD_ELEMENT_16_BYTES;1357default:1358llvm_unreachable("invalid private_element_size");1359}1360}13611362void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out,1363const SIProgramInfo &CurrentProgramInfo,1364const MachineFunction &MF) const {1365const Function &F = MF.getFunction();1366assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||1367F.getCallingConv() == CallingConv::SPIR_KERNEL);13681369const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();1370const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();1371MCContext &Ctx = MF.getContext();13721373Out.initDefault(&STM, Ctx, /*InitMCExpr=*/false);13741375Out.compute_pgm_resource1_registers =1376CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx);1377Out.compute_pgm_resource2_registers =1378CurrentProgramInfo.getComputePGMRSrc2(Ctx);1379Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64;13801381Out.is_dynamic_callstack = CurrentProgramInfo.DynamicCallStack;13821383AMD_HSA_BITS_SET(Out.code_properties, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,1384getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));13851386const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();1387if (UserSGPRInfo.hasPrivateSegmentBuffer()) {1388Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;1389}13901391if (UserSGPRInfo.hasDispatchPtr())1392Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;13931394if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5)1395Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;13961397if (UserSGPRInfo.hasKernargSegmentPtr())1398Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;13991400if (UserSGPRInfo.hasDispatchID())1401Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;14021403if (UserSGPRInfo.hasFlatScratchInit())1404Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;14051406if (UserSGPRInfo.hasPrivateSegmentSize())1407Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;14081409if (UserSGPRInfo.hasDispatchPtr())1410Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;14111412if (STM.isXNACKEnabled())1413Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;14141415Align MaxKernArgAlign;1416Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);1417Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;1418Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;1419Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;1420Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;14211422// kernarg_segment_alignment is specified as log of the alignment.1423// The minimum alignment is 16.1424// FIXME: The metadata treats the minimum as 4?1425Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));1426}14271428bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,1429const char *ExtraCode, raw_ostream &O) {1430// First try the generic code, which knows about modifiers like 'c' and 'n'.1431if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))1432return false;14331434if (ExtraCode && ExtraCode[0]) {1435if (ExtraCode[1] != 0)1436return true; // Unknown modifier.14371438switch (ExtraCode[0]) {1439case 'r':1440break;1441default:1442return true;1443}1444}14451446// TODO: Should be able to support other operand types like globals.1447const MachineOperand &MO = MI->getOperand(OpNo);1448if (MO.isReg()) {1449AMDGPUInstPrinter::printRegOperand(MO.getReg(), O,1450*MF->getSubtarget().getRegisterInfo());1451return false;1452}1453if (MO.isImm()) {1454int64_t Val = MO.getImm();1455if (AMDGPU::isInlinableIntLiteral(Val)) {1456O << Val;1457} else if (isUInt<16>(Val)) {1458O << format("0x%" PRIx16, static_cast<uint16_t>(Val));1459} else if (isUInt<32>(Val)) {1460O << format("0x%" PRIx32, static_cast<uint32_t>(Val));1461} else {1462O << format("0x%" PRIx64, static_cast<uint64_t>(Val));1463}1464return false;1465}1466return true;1467}14681469void AMDGPUAsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {1470AU.addRequired<AMDGPUResourceUsageAnalysis>();1471AU.addPreserved<AMDGPUResourceUsageAnalysis>();1472AsmPrinter::getAnalysisUsage(AU);1473}14741475void AMDGPUAsmPrinter::emitResourceUsageRemarks(1476const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,1477bool isModuleEntryFunction, bool hasMAIInsts) {1478if (!ORE)1479return;14801481const char *Name = "kernel-resource-usage";1482const char *Indent = " ";14831484// If the remark is not specifically enabled, do not output to yaml1485LLVMContext &Ctx = MF.getFunction().getContext();1486if (!Ctx.getDiagHandlerPtr()->isAnalysisRemarkEnabled(Name))1487return;14881489// Currently non-kernel functions have no resources to emit.1490if (!isEntryFunctionCC(MF.getFunction().getCallingConv()))1491return;14921493auto EmitResourceUsageRemark = [&](StringRef RemarkName,1494StringRef RemarkLabel, auto Argument) {1495// Add an indent for every line besides the line with the kernel name. This1496// makes it easier to tell which resource usage go with which kernel since1497// the kernel name will always be displayed first.1498std::string LabelStr = RemarkLabel.str() + ": ";1499if (RemarkName != "FunctionName")1500LabelStr = Indent + LabelStr;15011502ORE->emit([&]() {1503return MachineOptimizationRemarkAnalysis(Name, RemarkName,1504MF.getFunction().getSubprogram(),1505&MF.front())1506<< LabelStr << ore::NV(RemarkName, Argument);1507});1508};15091510// FIXME: Formatting here is pretty nasty because clang does not accept1511// newlines from diagnostics. This forces us to emit multiple diagnostic1512// remarks to simulate newlines. If and when clang does accept newlines, this1513// formatting should be aggregated into one remark with newlines to avoid1514// printing multiple diagnostic location and diag opts.1515EmitResourceUsageRemark("FunctionName", "Function Name",1516MF.getFunction().getName());1517EmitResourceUsageRemark("NumSGPR", "SGPRs",1518getMCExprStr(CurrentProgramInfo.NumSGPR));1519EmitResourceUsageRemark("NumVGPR", "VGPRs",1520getMCExprStr(CurrentProgramInfo.NumArchVGPR));1521if (hasMAIInsts) {1522EmitResourceUsageRemark("NumAGPR", "AGPRs",1523getMCExprStr(CurrentProgramInfo.NumAccVGPR));1524}1525EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",1526getMCExprStr(CurrentProgramInfo.ScratchSize));1527int64_t DynStack;1528bool DynStackEvaluatable =1529CurrentProgramInfo.DynamicCallStack->evaluateAsAbsolute(DynStack);1530StringRef DynamicStackStr =1531DynStackEvaluatable && DynStack ? "True" : "False";1532EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);1533EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",1534getMCExprStr(CurrentProgramInfo.Occupancy));1535EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",1536CurrentProgramInfo.SGPRSpill);1537EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",1538CurrentProgramInfo.VGPRSpill);1539if (isModuleEntryFunction)1540EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",1541CurrentProgramInfo.LDSSize);1542}154315441545