Path: blob/main/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
//===- AMDGPUAttributor.cpp -----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass uses Attributor framework to deduce AMDGPU attributes.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/CycleAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/Attributor.h"

#define DEBUG_TYPE "amdgpu-attributor"

namespace llvm {
void initializeCycleInfoWrapperPassPass(PassRegistry &);
} // namespace llvm

using namespace llvm;

static cl::opt<unsigned> KernargPreloadCount(
    "amdgpu-kernarg-preload-count",
    cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0));

#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,

enum ImplicitArgumentPositions {
#include "AMDGPUAttributes.def"
  LAST_ARG_POS
};

#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,

enum ImplicitArgumentMask {
  NOT_IMPLICIT_INPUT = 0,
#include "AMDGPUAttributes.def"
  ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1
};

#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
static constexpr std::pair<ImplicitArgumentMask,
                           StringLiteral> ImplicitAttrs[] = {
#include "AMDGPUAttributes.def"
};

// We do not need to note the x workitem or workgroup id because they are
// always initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static ImplicitArgumentMask
intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
                    bool HasApertureRegs, bool SupportsGetDoorBellID,
                    unsigned CodeObjectVersion) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return WORKITEM_ID_X;
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return WORKGROUP_ID_X;
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return WORKITEM_ID_Y;
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return WORKITEM_ID_Z;
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return WORKGROUP_ID_Y;
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return WORKGROUP_ID_Z;
  case Intrinsic::amdgcn_lds_kernel_id:
    return LDS_KERNEL_ID;
  case Intrinsic::amdgcn_dispatch_ptr:
    return DISPATCH_PTR;
  case Intrinsic::amdgcn_dispatch_id:
    return DISPATCH_ID;
  case Intrinsic::amdgcn_implicitarg_ptr:
    return IMPLICIT_ARG_PTR;
  // Need queue_ptr anyway. But under V5, we also need implicitarg_ptr to
  // access queue_ptr.
  case Intrinsic::amdgcn_queue_ptr:
    NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
    return QUEUE_PTR;
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    if (HasApertureRegs)
      return NOT_IMPLICIT_INPUT;
    // Under V5, we need implicitarg_ptr + offsets to access private_base or
    // shared_base. For pre-V5, however, we need to access them through
    // queue_ptr + offsets.
    return CodeObjectVersion >= AMDGPU::AMDHSA_COV5 ? IMPLICIT_ARG_PTR
                                                    : QUEUE_PTR;
  case Intrinsic::trap:
    if (SupportsGetDoorBellID) // GetDoorbellID support implemented since V4.
      return CodeObjectVersion >= AMDGPU::AMDHSA_COV4 ? NOT_IMPLICIT_INPUT
                                                      : QUEUE_PTR;
    NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
    return QUEUE_PTR;
  default:
    return NOT_IMPLICIT_INPUT;
  }
}

static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool isDSAddress(const Constant *C) {
  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
  if (!GV)
    return false;
  unsigned AS = GV->getAddressSpace();
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

/// Returns true if the function requires the implicit argument be passed
/// regardless of the function contents.
static bool funcRequiresHostcallPtr(const Function &F) {
  // Sanitizers require the hostcall buffer passed in the implicit arguments.
  return F.hasFnAttribute(Attribute::SanitizeAddress) ||
         F.hasFnAttribute(Attribute::SanitizeThread) ||
         F.hasFnAttribute(Attribute::SanitizeMemory) ||
         F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
         F.hasFnAttribute(Attribute::SanitizeMemTag);
}

namespace {
class AMDGPUInformationCache : public InformationCache {
public:
  AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
                         BumpPtrAllocator &Allocator,
                         SetVector<Function *> *CGSCC, TargetMachine &TM)
      : InformationCache(M, AG, Allocator, CGSCC), TM(TM),
        CodeObjectVersion(AMDGPU::getAMDHSACodeObjectVersion(M)) {}

  TargetMachine &TM;

  enum ConstantStatus { DS_GLOBAL = 1 << 0, ADDR_SPACE_CAST = 1 << 1 };

  /// Check if the subtarget has aperture regs.
  bool hasApertureRegs(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.hasApertureRegs();
  }

  /// Check if the subtarget supports GetDoorbellID.
  bool supportsGetDoorbellID(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.supportsGetDoorbellID();
  }

  std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getFlatWorkGroupSizes(F);
  }

  std::pair<unsigned, unsigned>
  getMaximumFlatWorkGroupRange(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
  }

  /// Get code object version.
  unsigned getCodeObjectVersion() const {
    return CodeObjectVersion;
  }

  /// Get the effective value of "amdgpu-waves-per-eu" for the function,
  /// accounting for the interaction with the passed value to use for
  /// "amdgpu-flat-work-group-size".
  std::pair<unsigned, unsigned>
  getWavesPerEU(const Function &F,
                std::pair<unsigned, unsigned> FlatWorkGroupSize) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getWavesPerEU(F, FlatWorkGroupSize);
  }

  std::pair<unsigned, unsigned>
  getEffectiveWavesPerEU(const Function &F,
                         std::pair<unsigned, unsigned> WavesPerEU,
                         std::pair<unsigned, unsigned> FlatWorkGroupSize) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getEffectiveWavesPerEU(WavesPerEU, FlatWorkGroupSize);
  }

  unsigned getMaxWavesPerEU(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getMaxWavesPerEU();
  }

private:
  /// Check if the ConstantExpr \p CE requires the queue pointer.
  static bool visitConstExpr(const ConstantExpr *CE) {
    if (CE->getOpcode() == Instruction::AddrSpaceCast) {
      unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
      return castRequiresQueuePtr(SrcAS);
    }
    return false;
  }

  /// Get the constant access bitmap for \p C.
  uint8_t getConstantAccess(const Constant *C,
                            SmallPtrSetImpl<const Constant *> &Visited) {
    auto It = ConstantStatus.find(C);
    if (It != ConstantStatus.end())
      return It->second;

    uint8_t Result = 0;
    if (isDSAddress(C))
      Result = DS_GLOBAL;

    if (const auto *CE = dyn_cast<ConstantExpr>(C))
      if (visitConstExpr(CE))
        Result |= ADDR_SPACE_CAST;

    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC || !Visited.insert(OpC).second)
        continue;

      Result |= getConstantAccess(OpC, Visited);
    }
    return Result;
  }

public:
  /// Returns true if \p Fn needs the queue pointer because of \p C.
  bool needsQueuePtr(const Constant *C, Function &Fn) {
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
    bool HasAperture = hasApertureRegs(Fn);

    // No need to explore the constants.
    if (!IsNonEntryFunc && HasAperture)
      return false;

    SmallPtrSet<const Constant *, 8> Visited;
    uint8_t Access = getConstantAccess(C, Visited);

    // We need to trap on DS globals in non-entry functions.
    if (IsNonEntryFunc && (Access & DS_GLOBAL))
      return true;

    return !HasAperture && (Access & ADDR_SPACE_CAST);
  }

private:
  /// Used to determine if the Constant needs the queue pointer.
  DenseMap<const Constant *, uint8_t> ConstantStatus;
  const unsigned CodeObjectVersion;
};

struct AAAMDAttributes
    : public StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
                          AbstractAttribute> {
  using Base = StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
                            AbstractAttribute>;

  AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDAttributes &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override { return "AAAMDAttributes"; }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDAttributes.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAAMDAttributes::ID = 0;

struct AAUniformWorkGroupSize
    : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAUniformWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAUniformWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override {
    return "AAUniformWorkGroupSize";
  }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAUniformWorkGroupSize.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAUniformWorkGroupSize::ID = 0;

struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
  AAUniformWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
      : AAUniformWorkGroupSize(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    CallingConv::ID CC = F->getCallingConv();

    if (CC != CallingConv::AMDGPU_KERNEL)
      return;

    bool InitialValue = false;
    if (F->hasFnAttribute("uniform-work-group-size"))
      InitialValue =
          F->getFnAttribute("uniform-work-group-size").getValueAsString() ==
          "true";

    if (InitialValue)
      indicateOptimisticFixpoint();
    else
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAUniformWorkGroupSize] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << "\n");

      const auto *CallerInfo = A.getAAFor<AAUniformWorkGroupSize>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerInfo)
        return false;

      Change = Change | clampStateAndIndicateChange(this->getState(),
                                                    CallerInfo->getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    AttrList.push_back(Attribute::get(Ctx, "uniform-work-group-size",
                                      getAssumed() ? "true" : "false"));
    return A.manifestAttrs(getIRPosition(), AttrList,
                           /* ForceReplace */ true);
  }

  bool isValidState() const override {
    // This state is always valid, even when the state is false.
    return true;
  }

  const std::string getAsStr(Attributor *) const override {
    return "AMDWorkGroupSize[" + std::to_string(getAssumed()) + "]";
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}
};

AAUniformWorkGroupSize &
AAUniformWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAUniformWorkGroupSizeFunction(IRP, A);
  llvm_unreachable(
      "AAUniformWorkGroupSize is only valid for function position");
}

struct AAAMDAttributesFunction : public AAAMDAttributes {
  AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
      : AAAMDAttributes(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();

    // If the function requires the implicit arg pointer due to sanitizers,
    // assume it's needed even if explicitly marked as not requiring it.
    const bool NeedsHostcall = funcRequiresHostcallPtr(*F);
    if (NeedsHostcall) {
      removeAssumedBits(IMPLICIT_ARG_PTR);
      removeAssumedBits(HOSTCALL_PTR);
    }

    for (auto Attr : ImplicitAttrs) {
      if (NeedsHostcall &&
          (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR))
        continue;

      if (F->hasFnAttribute(Attr.second))
        addKnownBits(Attr.first);
    }

    if (F->isDeclaration())
      return;

    // Ignore functions with graphics calling conventions, these are currently
    // not allowed to have kernel arguments.
    if (AMDGPU::isGraphics(F->getCallingConv())) {
      indicatePessimisticFixpoint();
      return;
    }
  }

  ChangeStatus updateImpl(Attributor &A) override {
    Function *F = getAssociatedFunction();
    // The current assumed state used to determine a change.
    auto OrigAssumed = getAssumed();

    // Check for Intrinsics and propagate attributes.
    const AACallEdges *AAEdges = A.getAAFor<AACallEdges>(
        *this, this->getIRPosition(), DepClassTy::REQUIRED);
    if (!AAEdges || AAEdges->hasNonAsmUnknownCallee())
      return indicatePessimisticFixpoint();

    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    bool NeedsImplicit = false;
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
    bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*F);
    unsigned COV = InfoCache.getCodeObjectVersion();

    for (Function *Callee : AAEdges->getOptimisticEdges()) {
      Intrinsic::ID IID = Callee->getIntrinsicID();
      if (IID == Intrinsic::not_intrinsic) {
        const AAAMDAttributes *AAAMD = A.getAAFor<AAAMDAttributes>(
            *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
        if (!AAAMD)
          return indicatePessimisticFixpoint();
        *this &= *AAAMD;
        continue;
      }

      bool NonKernelOnly = false;
      ImplicitArgumentMask AttrMask =
          intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
                              HasApertureRegs, SupportsGetDoorbellID, COV);
      if (AttrMask != NOT_IMPLICIT_INPUT) {
        if ((IsNonEntryFunc || !NonKernelOnly))
          removeAssumedBits(AttrMask);
      }
    }

    // Need implicitarg_ptr to access queue_ptr, private_base, and shared_base.
    if (NeedsImplicit)
      removeAssumedBits(IMPLICIT_ARG_PTR);

    if (isAssumed(QUEUE_PTR) && checkForQueuePtr(A)) {
      // Under V5, we need implicitarg_ptr + offsets to access private_base or
      // shared_base. We do not actually need queue_ptr.
      if (COV >= 5)
        removeAssumedBits(IMPLICIT_ARG_PTR);
      else
        removeAssumedBits(QUEUE_PTR);
    }

    if (funcRetrievesMultigridSyncArg(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) &&
             "multigrid_sync_arg needs implicitarg_ptr");
      removeAssumedBits(MULTIGRID_SYNC_ARG);
    }

    if (funcRetrievesHostcallPtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr");
      removeAssumedBits(HOSTCALL_PTR);
    }

    if (funcRetrievesHeapPtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr");
      removeAssumedBits(HEAP_PTR);
    }

    if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr");
      removeAssumedBits(QUEUE_PTR);
    }

    if (isAssumed(LDS_KERNEL_ID) && funcRetrievesLDSKernelId(A)) {
      removeAssumedBits(LDS_KERNEL_ID);
    }

    if (isAssumed(DEFAULT_QUEUE) && funcRetrievesDefaultQueue(A, COV))
      removeAssumedBits(DEFAULT_QUEUE);

    if (isAssumed(COMPLETION_ACTION) && funcRetrievesCompletionAction(A, COV))
      removeAssumedBits(COMPLETION_ACTION);

    return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
                                       : ChangeStatus::UNCHANGED;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    for (auto Attr : ImplicitAttrs) {
      if (isKnown(Attr.first))
        AttrList.push_back(Attribute::get(Ctx, Attr.second));
    }

    return A.manifestAttrs(getIRPosition(), AttrList,
                           /* ForceReplace */ true);
  }

  const std::string getAsStr(Attributor *) const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << "AMDInfo[";
    for (auto Attr : ImplicitAttrs)
      if (isAssumed(Attr.first))
        OS << ' ' << Attr.second;
    OS << " ]";
    return OS.str();
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

private:
  bool checkForQueuePtr(Attributor &A) {
    Function *F = getAssociatedFunction();
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    bool NeedsQueuePtr = false;

    auto CheckAddrSpaceCasts = [&](Instruction &I) {
      unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
      if (castRequiresQueuePtr(SrcAS)) {
        NeedsQueuePtr = true;
        return false;
      }
      return true;
    };

    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);

    // `checkForAllInstructions` is much cheaper than going through all
    // instructions, so try it first.

    // The queue pointer is not needed if aperture regs are present.
    if (!HasApertureRegs) {
      bool UsedAssumedInformation = false;
      A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
                                {Instruction::AddrSpaceCast},
                                UsedAssumedInformation);
    }

    // If we found that we need the queue pointer, nothing else to do.
    if (NeedsQueuePtr)
      return true;

    if (!IsNonEntryFunc && HasApertureRegs)
      return false;

    for (BasicBlock &BB : *F) {
      for (Instruction &I : BB) {
        for (const Use &U : I.operands()) {
          if (const auto *C = dyn_cast<Constant>(U)) {
            if (InfoCache.needsQueuePtr(C, *F))
              return true;
          }
        }
      }
    }

    return false;
  }

  bool funcRetrievesMultigridSyncArg(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getMultigridSyncArgImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesHostcallPtr(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesDefaultQueue(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getDefaultQueueImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesCompletionAction(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getCompletionActionImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesHeapPtr(Attributor &A, unsigned COV) {
    if (COV < 5)
      return false;
    AA::RangeTy Range(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesQueuePtr(Attributor &A, unsigned COV) {
    if (COV < 5)
      return false;
    AA::RangeTy Range(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesImplicitKernelArg(Attributor &A, AA::RangeTy Range) {
    // Check if this is a call to the implicitarg_ptr builtin and it
    // is used to retrieve the hostcall pointer. The implicit arg for
    // hostcall is not used only if every use of the implicitarg_ptr
    // is a load that clearly does not retrieve any byte of the
    // hostcall pointer. We check this by tracing all the uses of the
    // initial call to the implicitarg_ptr intrinsic.
    auto DoesNotLeadToKernelArgLoc = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
        return true;

      const auto *PointerInfoAA = A.getAAFor<AAPointerInfo>(
          *this, IRPosition::callsite_returned(Call), DepClassTy::REQUIRED);
      if (!PointerInfoAA)
        return false;

      return PointerInfoAA->forallInterferingAccesses(
          Range, [](const AAPointerInfo::Access &Acc, bool IsExact) {
            return Acc.getRemoteInst()->isDroppable();
          });
    };

    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *this,
                                              UsedAssumedInformation);
  }

  bool funcRetrievesLDSKernelId(Attributor &A) {
    auto DoesNotRetrieve = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      return Call.getIntrinsicID() != Intrinsic::amdgcn_lds_kernel_id;
    };
    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotRetrieve, *this,
                                              UsedAssumedInformation);
  }
};

AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
  llvm_unreachable("AAAMDAttributes is only valid for function position");
}

/// Base class to derive different size ranges.
struct AAAMDSizeRangeAttribute
    : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
  using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;

  StringRef AttrName;

  AAAMDSizeRangeAttribute(const IRPosition &IRP, Attributor &A,
                          StringRef AttrName)
      : Base(IRP, 32), AttrName(AttrName) {}

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

  template <class AttributeImpl>
  ChangeStatus updateImplImpl(Attributor &A) {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << '\n');

      const auto *CallerInfo = A.getAAFor<AttributeImpl>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerInfo)
        return false;

      Change |=
          clampStateAndIndicateChange(this->getState(), CallerInfo->getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus emitAttributeIfNotDefault(Attributor &A, unsigned Min,
                                         unsigned Max) {
    // Don't add the attribute if it's the implied default.
    if (getAssumed().getLower() == Min && getAssumed().getUpper() - 1 == Max)
      return ChangeStatus::UNCHANGED;

    Function *F = getAssociatedFunction();
    LLVMContext &Ctx = F->getContext();
    SmallString<10> Buffer;
    raw_svector_ostream OS(Buffer);
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
    return A.manifestAttrs(getIRPosition(),
                           {Attribute::get(Ctx, AttrName, OS.str())},
                           /* ForceReplace */ true);
  }

  const std::string getAsStr(Attributor *) const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << getName() << '[';
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
    OS << ']';
    return OS.str();
  }
};

/// Propagate amdgpu-flat-work-group-size attribute.
struct AAAMDFlatWorkGroupSize : public AAAMDSizeRangeAttribute {
  AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
      : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-flat-work-group-size") {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    unsigned MinGroupSize, MaxGroupSize;
    std::tie(MinGroupSize, MaxGroupSize) = InfoCache.getFlatWorkGroupSizes(*F);
    intersectKnown(
        ConstantRange(APInt(32, MinGroupSize), APInt(32, MaxGroupSize + 1)));

    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    return updateImplImpl<AAAMDFlatWorkGroupSize>(A);
  }

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  ChangeStatus manifest(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    unsigned Min, Max;
    std::tie(Min, Max) = InfoCache.getMaximumFlatWorkGroupRange(*F);
    return emitAttributeIfNotDefault(A, Min, Max);
  }

  /// See AbstractAttribute::getName()
  const std::string getName() const override {
    return "AAAMDFlatWorkGroupSize";
  }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDFlatWorkGroupSize
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDFlatWorkGroupSize::ID = 0;

AAAMDFlatWorkGroupSize &
AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDFlatWorkGroupSize(IRP, A);
  llvm_unreachable(
      "AAAMDFlatWorkGroupSize is only valid for function position");
}

/// Propagate amdgpu-waves-per-eu attribute.
struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
  AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A)
      : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {}

  bool isValidState() const override {
    return !Assumed.isEmptySet() && IntegerRangeState::isValidState();
  }

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    if (const auto *AssumedGroupSize = A.getAAFor<AAAMDFlatWorkGroupSize>(
            *this, IRPosition::function(*F), DepClassTy::REQUIRED)) {

      unsigned Min, Max;
      std::tie(Min, Max) = InfoCache.getWavesPerEU(
          *F, {AssumedGroupSize->getAssumed().getLower().getZExtValue(),
               AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1});

      ConstantRange Range(APInt(32, Min), APInt(32, Max + 1));
      intersectKnown(Range);
    }

    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      Function *Func = getAssociatedFunction();
      LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
                        << "->" << Func->getName() << '\n');

      const auto *CallerInfo = A.getAAFor<AAAMDWavesPerEU>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      const auto *AssumedGroupSize = A.getAAFor<AAAMDFlatWorkGroupSize>(
          *this, IRPosition::function(*Func), DepClassTy::REQUIRED);
      if (!CallerInfo || !AssumedGroupSize)
        return false;

      unsigned Min, Max;
      std::tie(Min, Max) = InfoCache.getEffectiveWavesPerEU(
          *Caller,
          {CallerInfo->getAssumed().getLower().getZExtValue(),
           CallerInfo->getAssumed().getUpper().getZExtValue() - 1},
          {AssumedGroupSize->getAssumed().getLower().getZExtValue(),
           AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1});
      ConstantRange CallerRange(APInt(32, Min), APInt(32, Max + 1));
      IntegerRangeState CallerRangeState(CallerRange);
      Change |= clampStateAndIndicateChange(this->getState(), CallerRangeState);

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDWavesPerEU &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  ChangeStatus manifest(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    unsigned Max = InfoCache.getMaxWavesPerEU(*F);
    return emitAttributeIfNotDefault(A, 1, Max);
  }

  /// See AbstractAttribute::getName()
  const std::string getName() const override { return "AAAMDWavesPerEU"; }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDWavesPerEU
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDWavesPerEU::ID = 0;

AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDWavesPerEU(IRP, A);
  llvm_unreachable("AAAMDWavesPerEU is only valid for function position");
}

static bool inlineAsmUsesAGPRs(const InlineAsm *IA) {
  for (const auto &CI : IA->ParseConstraints()) {
    for (StringRef Code : CI.Codes) {
      Code.consume_front("{");
      if (Code.starts_with("a"))
        return true;
    }
  }

  return false;
}

struct AAAMDGPUNoAGPR
    : public IRAttribute<Attribute::NoUnwind,
                         StateWrapper<BooleanState, AbstractAttribute>,
                         AAAMDGPUNoAGPR> {
  AAAMDGPUNoAGPR(const IRPosition &IRP, Attributor &A) : IRAttribute(IRP) {}

  static AAAMDGPUNoAGPR &createForPosition(const IRPosition &IRP,
                                           Attributor &A) {
    if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
      return *new (A.Allocator) AAAMDGPUNoAGPR(IRP, A);
    llvm_unreachable("AAAMDGPUNoAGPR is only valid for function position");
  }

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    if (F->hasFnAttribute("amdgpu-no-agpr"))
      indicateOptimisticFixpoint();
  }

  const std::string getAsStr(Attributor *A) const override {
    return getAssumed() ? "amdgpu-no-agpr" : "amdgpu-maybe-agpr";
  }

  void trackStatistics() const override {}

  ChangeStatus updateImpl(Attributor &A) override {
    // TODO: Use AACallEdges, but then we need a way to inspect asm edges.

    auto CheckForNoAGPRs = [&](Instruction &I) {
      const auto &CB = cast<CallBase>(I);
      const Value *CalleeOp = CB.getCalledOperand();
      const Function *Callee = dyn_cast<Function>(CalleeOp);
      if (!Callee) {
        if (const InlineAsm *IA = dyn_cast<InlineAsm>(CalleeOp))
          return !inlineAsmUsesAGPRs(IA);
        return false;
      }

      // Some intrinsics may use AGPRs, but if we have a choice, we are not
      // required to use AGPRs.
      if (Callee->isIntrinsic())
        return true;

      // TODO: Handle callsite attributes
      const auto *CalleeInfo = A.getAAFor<AAAMDGPUNoAGPR>(
          *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
      return CalleeInfo && CalleeInfo->getAssumed();
    };

    bool UsedAssumedInformation = false;
    if (!A.checkForAllCallLikeInstructions(CheckForNoAGPRs, *this,
                                           UsedAssumedInformation))
      return indicatePessimisticFixpoint();
    return ChangeStatus::UNCHANGED;
  }

  ChangeStatus manifest(Attributor &A) override {
    if (!getAssumed())
      return ChangeStatus::UNCHANGED;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();
    return A.manifestAttrs(getIRPosition(),
                           {Attribute::get(Ctx, "amdgpu-no-agpr")});
  }

  const std::string getName() const override { return "AAAMDGPUNoAGPR"; }
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDGPUNoAGPR
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  static const char ID;
};

const char AAAMDGPUNoAGPR::ID = 0;

static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  for (unsigned I = 0;
       I < F.arg_size() &&
       I < std::min(KernargPreloadCount.getValue(), ST.getMaxNumUserSGPRs());
       ++I) {
    Argument &Arg = *F.getArg(I);
    // Check for incompatible attributes.
    if (Arg.hasByRefAttr() || Arg.hasNestAttr())
      break;

    Arg.addAttr(Attribute::InReg);
  }
}

static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM) {
  SetVector<Function *> Functions;
  for (Function &F : M) {
    if (!F.isIntrinsic())
      Functions.insert(&F);
  }

  CallGraphUpdater CGUpdater;
  BumpPtrAllocator Allocator;
  AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, TM);
  DenseSet<const char *> Allowed(
      {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
       &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
       &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID, &AACallEdges::ID,
       &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
       &AAUnderlyingObjects::ID});

  AttributorConfig AC(CGUpdater);
  AC.Allowed = &Allowed;
  AC.IsModulePass = true;
  AC.DefaultInitializeLiveInternals = false;
  AC.IPOAmendableCB = [](const Function &F) {
    return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
  };

  Attributor A(Functions, InfoCache, AC);

  for (Function &F : M) {
    if (!F.isIntrinsic()) {
      A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(F));
      A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(F));
      A.getOrCreateAAFor<AAAMDGPUNoAGPR>(IRPosition::function(F));
      CallingConv::ID CC = F.getCallingConv();
      if (!AMDGPU::isEntryFunctionCC(CC)) {
        A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(F));
        A.getOrCreateAAFor<AAAMDWavesPerEU>(IRPosition::function(F));
      } else if (CC == CallingConv::AMDGPU_KERNEL) {
        addPreloadKernArgHint(F, TM);
      }
    }
  }

  ChangeStatus Change = A.run();
  return Change == ChangeStatus::CHANGED;
}

class AMDGPUAttributorLegacy : public ModulePass {
public:
  AMDGPUAttributorLegacy() : ModulePass(ID) {}

  /// doInitialization - Virtual method overridden by subclasses to do
  /// any necessary initialization before any pass is run.
  bool doInitialization(Module &) override {
    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
    if (!TPC)
      report_fatal_error("TargetMachine is required");

    TM = &TPC->getTM<TargetMachine>();
    return false;
  }

  bool runOnModule(Module &M) override {
    AnalysisGetter AG(this);
    return runImpl(M, AG, *TM);
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<CycleInfoWrapperPass>();
  }

  StringRef getPassName() const override { return "AMDGPU Attributor"; }
  TargetMachine *TM;
  static char ID;
};
} // namespace

PreservedAnalyses llvm::AMDGPUAttributorPass::run(Module &M,
                                                  ModuleAnalysisManager &AM) {

  FunctionAnalysisManager &FAM =
      AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
  AnalysisGetter AG(FAM);

  // TODO: Probably preserves CFG
  return runImpl(M, AG, TM) ? PreservedAnalyses::none()
                            : PreservedAnalyses::all();
}

char AMDGPUAttributorLegacy::ID = 0;

Pass *llvm::createAMDGPUAttributorLegacyPass() {
  return new AMDGPUAttributorLegacy();
}
INITIALIZE_PASS_BEGIN(AMDGPUAttributorLegacy, DEBUG_TYPE, "AMDGPU Attributor",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(CycleInfoWrapperPass);
INITIALIZE_PASS_END(AMDGPUAttributorLegacy, DEBUG_TYPE, "AMDGPU Attributor",
                    false, false)