GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
//===- AMDGPUAttributor.cpp -----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass uses the Attributor framework to deduce AMDGPU attributes.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/CycleAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/Attributor.h"

#define DEBUG_TYPE "amdgpu-attributor"

namespace llvm {
void initializeCycleInfoWrapperPassPass(PassRegistry &);
} // namespace llvm

using namespace llvm;

static cl::opt<unsigned> KernargPreloadCount(
    "amdgpu-kernarg-preload-count",
    cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0));

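// The enums and the ImplicitAttrs table below are generated from
// AMDGPUAttributes.def via the usual X-macro pattern: the .def file is
// included three times, each time with a different definition of
// AMDGPU_ATTRIBUTE(Name, Str). The first expansion produces a bit position
// per implicit argument, the second turns each position into a mask bit, and
// the third builds the (mask, attribute-string) lookup table used by
// initialize() and manifest() below.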
#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,

enum ImplicitArgumentPositions {
  #include "AMDGPUAttributes.def"
  LAST_ARG_POS
};

#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,

enum ImplicitArgumentMask {
  NOT_IMPLICIT_INPUT = 0,
  #include "AMDGPUAttributes.def"
  ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1
};

#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
static constexpr std::pair<ImplicitArgumentMask,
                           StringLiteral> ImplicitAttrs[] = {
  #include "AMDGPUAttributes.def"
};

// We do not need to note the x workitem or workgroup id because they are
// always initialized.
//
// TODO: We should not add the attributes if the known compile-time workgroup
// size is 1 for y/z.
static ImplicitArgumentMask
intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
                    bool HasApertureRegs, bool SupportsGetDoorBellID,
                    unsigned CodeObjectVersion) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return WORKITEM_ID_X;
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return WORKGROUP_ID_X;
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return WORKITEM_ID_Y;
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return WORKITEM_ID_Z;
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return WORKGROUP_ID_Y;
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return WORKGROUP_ID_Z;
  case Intrinsic::amdgcn_lds_kernel_id:
    return LDS_KERNEL_ID;
  case Intrinsic::amdgcn_dispatch_ptr:
    return DISPATCH_PTR;
  case Intrinsic::amdgcn_dispatch_id:
    return DISPATCH_ID;
  case Intrinsic::amdgcn_implicitarg_ptr:
    return IMPLICIT_ARG_PTR;
  // Need queue_ptr anyway. But under V5, we also need implicitarg_ptr to
  // access queue_ptr.
  case Intrinsic::amdgcn_queue_ptr:
    NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
    return QUEUE_PTR;
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    if (HasApertureRegs)
      return NOT_IMPLICIT_INPUT;
    // Under V5, we need implicitarg_ptr + offsets to access private_base or
    // shared_base. For pre-V5, however, we need to access them through
    // queue_ptr + offsets.
    return CodeObjectVersion >= AMDGPU::AMDHSA_COV5 ? IMPLICIT_ARG_PTR :
                                                      QUEUE_PTR;
  case Intrinsic::trap:
    if (SupportsGetDoorBellID) // GetDoorbellID support is implemented since V4.
      return CodeObjectVersion >= AMDGPU::AMDHSA_COV4 ? NOT_IMPLICIT_INPUT :
                                                        QUEUE_PTR;
    NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
    return QUEUE_PTR;
  default:
    return NOT_IMPLICIT_INPUT;
  }
}

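// Note: the masks returned by intrinsicToAttrMask() are consumed in
// AAAMDAttributesFunction::updateImpl() below, which clears the corresponding
// assumed bits so that manifest() only emits the attribute strings from
// ImplicitAttrs that are still known to hold.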
static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool isDSAddress(const Constant *C) {
  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
  if (!GV)
    return false;
  unsigned AS = GV->getAddressSpace();
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

/// Returns true if the function requires the implicit argument be passed
/// regardless of the function contents.
static bool funcRequiresHostcallPtr(const Function &F) {
  // Sanitizers require the hostcall buffer passed in the implicit arguments.
  return F.hasFnAttribute(Attribute::SanitizeAddress) ||
         F.hasFnAttribute(Attribute::SanitizeThread) ||
         F.hasFnAttribute(Attribute::SanitizeMemory) ||
         F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
         F.hasFnAttribute(Attribute::SanitizeMemTag);
}

namespace {
class AMDGPUInformationCache : public InformationCache {
public:
  AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
                         BumpPtrAllocator &Allocator,
                         SetVector<Function *> *CGSCC, TargetMachine &TM)
      : InformationCache(M, AG, Allocator, CGSCC), TM(TM),
        CodeObjectVersion(AMDGPU::getAMDHSACodeObjectVersion(M)) {}

  TargetMachine &TM;

  enum ConstantStatus { DS_GLOBAL = 1 << 0, ADDR_SPACE_CAST = 1 << 1 };

  /// Check if the subtarget has aperture regs.
  bool hasApertureRegs(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.hasApertureRegs();
  }

  /// Check if the subtarget supports GetDoorbellID.
  bool supportsGetDoorbellID(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.supportsGetDoorbellID();
  }

  std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getFlatWorkGroupSizes(F);
  }

  std::pair<unsigned, unsigned>
  getMaximumFlatWorkGroupRange(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
  }

  /// Get code object version.
  unsigned getCodeObjectVersion() const {
    return CodeObjectVersion;
  }

  /// Get the effective value of "amdgpu-waves-per-eu" for the function,
  /// accounting for the interaction with the passed value to use for
  /// "amdgpu-flat-work-group-size".
  std::pair<unsigned, unsigned>
  getWavesPerEU(const Function &F,
                std::pair<unsigned, unsigned> FlatWorkGroupSize) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getWavesPerEU(F, FlatWorkGroupSize);
  }

  std::pair<unsigned, unsigned>
  getEffectiveWavesPerEU(const Function &F,
                         std::pair<unsigned, unsigned> WavesPerEU,
                         std::pair<unsigned, unsigned> FlatWorkGroupSize) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getEffectiveWavesPerEU(WavesPerEU, FlatWorkGroupSize);
  }

  unsigned getMaxWavesPerEU(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getMaxWavesPerEU();
  }

private:
  /// Check if the ConstantExpr \p CE requires the queue pointer.
  static bool visitConstExpr(const ConstantExpr *CE) {
    if (CE->getOpcode() == Instruction::AddrSpaceCast) {
      unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
      return castRequiresQueuePtr(SrcAS);
    }
    return false;
  }

  /// Get the constant access bitmap for \p C.
  uint8_t getConstantAccess(const Constant *C,
                            SmallPtrSetImpl<const Constant *> &Visited) {
    auto It = ConstantStatus.find(C);
    if (It != ConstantStatus.end())
      return It->second;

    uint8_t Result = 0;
    if (isDSAddress(C))
      Result = DS_GLOBAL;

    if (const auto *CE = dyn_cast<ConstantExpr>(C))
      if (visitConstExpr(CE))
        Result |= ADDR_SPACE_CAST;

    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC || !Visited.insert(OpC).second)
        continue;

      Result |= getConstantAccess(OpC, Visited);
    }
    return Result;
  }

public:
  /// Returns true if \p Fn needs the queue pointer because of \p C.
  bool needsQueuePtr(const Constant *C, Function &Fn) {
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
    bool HasAperture = hasApertureRegs(Fn);

    // No need to explore the constants.
    if (!IsNonEntryFunc && HasAperture)
      return false;

    SmallPtrSet<const Constant *, 8> Visited;
    uint8_t Access = getConstantAccess(C, Visited);

    // We need to trap on DS globals in non-entry functions.
    if (IsNonEntryFunc && (Access & DS_GLOBAL))
      return true;

    return !HasAperture && (Access & ADDR_SPACE_CAST);
  }

private:
  /// Used to determine if the Constant needs the queue pointer.
  DenseMap<const Constant *, uint8_t> ConstantStatus;
  const unsigned CodeObjectVersion;
};

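// The abstract attributes below follow the usual Attributor structure:
// initialize() seeds the optimistic state, updateImpl() is re-run until a
// fixpoint is reached (clamping against callers or callees as needed), and
// manifest() writes the deduced information back as IR function attributes.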
struct AAAMDAttributes
    : public StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
                          AbstractAttribute> {
  using Base = StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
                            AbstractAttribute>;

  AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDAttributes &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override { return "AAAMDAttributes"; }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDAttributes.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAAMDAttributes::ID = 0;

struct AAUniformWorkGroupSize
    : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAUniformWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAUniformWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override {
    return "AAUniformWorkGroupSize";
  }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAUniformWorkGroupSize.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAUniformWorkGroupSize::ID = 0;

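/// Function-position implementation of AAUniformWorkGroupSize. Kernels seed
/// the state from any existing "uniform-work-group-size" attribute; other
/// functions inherit a clamped value from all of their call sites in
/// updateImpl(), and manifest() re-emits the attribute as "true" or "false".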
struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
  AAUniformWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
      : AAUniformWorkGroupSize(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    CallingConv::ID CC = F->getCallingConv();

    if (CC != CallingConv::AMDGPU_KERNEL)
      return;

    bool InitialValue = false;
    if (F->hasFnAttribute("uniform-work-group-size"))
      InitialValue =
          F->getFnAttribute("uniform-work-group-size").getValueAsString() ==
          "true";

    if (InitialValue)
      indicateOptimisticFixpoint();
    else
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAUniformWorkGroupSize] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << "\n");

      const auto *CallerInfo = A.getAAFor<AAUniformWorkGroupSize>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerInfo)
        return false;

      Change = Change | clampStateAndIndicateChange(this->getState(),
                                                    CallerInfo->getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    AttrList.push_back(Attribute::get(Ctx, "uniform-work-group-size",
                                      getAssumed() ? "true" : "false"));
    return A.manifestAttrs(getIRPosition(), AttrList,
                           /* ForceReplace */ true);
  }

  bool isValidState() const override {
    // This state is always valid, even when the state is false.
    return true;
  }

  const std::string getAsStr(Attributor *) const override {
    return "AMDWorkGroupSize[" + std::to_string(getAssumed()) + "]";
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}
};

AAUniformWorkGroupSize &
AAUniformWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAUniformWorkGroupSizeFunction(IRP, A);
  llvm_unreachable(
      "AAUniformWorkGroupSize is only valid for function position");
}

struct AAAMDAttributesFunction : public AAAMDAttributes {
  AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
      : AAAMDAttributes(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();

    // If the function requires the implicit arg pointer due to sanitizers,
    // assume it's needed even if explicitly marked as not requiring it.
    const bool NeedsHostcall = funcRequiresHostcallPtr(*F);
    if (NeedsHostcall) {
      removeAssumedBits(IMPLICIT_ARG_PTR);
      removeAssumedBits(HOSTCALL_PTR);
    }

    for (auto Attr : ImplicitAttrs) {
      if (NeedsHostcall &&
          (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR))
        continue;

      if (F->hasFnAttribute(Attr.second))
        addKnownBits(Attr.first);
    }

    if (F->isDeclaration())
      return;

    // Ignore functions with graphics calling conventions; these are currently
    // not allowed to have kernel arguments.
    if (AMDGPU::isGraphics(F->getCallingConv())) {
      indicatePessimisticFixpoint();
      return;
    }
  }

ChangeStatus updateImpl(Attributor &A) override {
439
Function *F = getAssociatedFunction();
440
// The current assumed state used to determine a change.
441
auto OrigAssumed = getAssumed();
442
443
// Check for Intrinsics and propagate attributes.
444
const AACallEdges *AAEdges = A.getAAFor<AACallEdges>(
445
*this, this->getIRPosition(), DepClassTy::REQUIRED);
446
if (!AAEdges || AAEdges->hasNonAsmUnknownCallee())
447
return indicatePessimisticFixpoint();
448
449
bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());
450
451
bool NeedsImplicit = false;
452
auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
453
bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
454
bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*F);
455
unsigned COV = InfoCache.getCodeObjectVersion();
456
457
for (Function *Callee : AAEdges->getOptimisticEdges()) {
458
Intrinsic::ID IID = Callee->getIntrinsicID();
459
if (IID == Intrinsic::not_intrinsic) {
460
const AAAMDAttributes *AAAMD = A.getAAFor<AAAMDAttributes>(
461
*this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
462
if (!AAAMD)
463
return indicatePessimisticFixpoint();
464
*this &= *AAAMD;
465
continue;
466
}
467
468
bool NonKernelOnly = false;
469
ImplicitArgumentMask AttrMask =
470
intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
471
HasApertureRegs, SupportsGetDoorbellID, COV);
472
if (AttrMask != NOT_IMPLICIT_INPUT) {
473
if ((IsNonEntryFunc || !NonKernelOnly))
474
removeAssumedBits(AttrMask);
475
}
476
}
477
478
// Need implicitarg_ptr to acess queue_ptr, private_base, and shared_base.
479
if (NeedsImplicit)
480
removeAssumedBits(IMPLICIT_ARG_PTR);
481
482
if (isAssumed(QUEUE_PTR) && checkForQueuePtr(A)) {
483
// Under V5, we need implicitarg_ptr + offsets to access private_base or
484
// shared_base. We do not actually need queue_ptr.
485
if (COV >= 5)
486
removeAssumedBits(IMPLICIT_ARG_PTR);
487
else
488
removeAssumedBits(QUEUE_PTR);
489
}
490
491
if (funcRetrievesMultigridSyncArg(A, COV)) {
492
assert(!isAssumed(IMPLICIT_ARG_PTR) &&
493
"multigrid_sync_arg needs implicitarg_ptr");
494
removeAssumedBits(MULTIGRID_SYNC_ARG);
495
}
496
497
if (funcRetrievesHostcallPtr(A, COV)) {
498
assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr");
499
removeAssumedBits(HOSTCALL_PTR);
500
}
501
502
if (funcRetrievesHeapPtr(A, COV)) {
503
assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr");
504
removeAssumedBits(HEAP_PTR);
505
}
506
507
if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(A, COV)) {
508
assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr");
509
removeAssumedBits(QUEUE_PTR);
510
}
511
512
if (isAssumed(LDS_KERNEL_ID) && funcRetrievesLDSKernelId(A)) {
513
removeAssumedBits(LDS_KERNEL_ID);
514
}
515
516
if (isAssumed(DEFAULT_QUEUE) && funcRetrievesDefaultQueue(A, COV))
517
removeAssumedBits(DEFAULT_QUEUE);
518
519
if (isAssumed(COMPLETION_ACTION) && funcRetrievesCompletionAction(A, COV))
520
removeAssumedBits(COMPLETION_ACTION);
521
522
return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
523
: ChangeStatus::UNCHANGED;
524
}
525
  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    for (auto Attr : ImplicitAttrs) {
      if (isKnown(Attr.first))
        AttrList.push_back(Attribute::get(Ctx, Attr.second));
    }

    return A.manifestAttrs(getIRPosition(), AttrList,
                           /* ForceReplace */ true);
  }

  const std::string getAsStr(Attributor *) const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << "AMDInfo[";
    for (auto Attr : ImplicitAttrs)
      if (isAssumed(Attr.first))
        OS << ' ' << Attr.second;
    OS << " ]";
    return OS.str();
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

private:
  bool checkForQueuePtr(Attributor &A) {
    Function *F = getAssociatedFunction();
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    bool NeedsQueuePtr = false;

    auto CheckAddrSpaceCasts = [&](Instruction &I) {
      unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
      if (castRequiresQueuePtr(SrcAS)) {
        NeedsQueuePtr = true;
        return false;
      }
      return true;
    };

    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);

    // `checkForAllInstructions` is much cheaper than going through all
    // instructions, so try it first.

    // The queue pointer is not needed if aperture regs are present.
    if (!HasApertureRegs) {
      bool UsedAssumedInformation = false;
      A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
                                {Instruction::AddrSpaceCast},
                                UsedAssumedInformation);
    }

    // If we found that we need the queue pointer, nothing else to do.
    if (NeedsQueuePtr)
      return true;

    if (!IsNonEntryFunc && HasApertureRegs)
      return false;

    for (BasicBlock &BB : *F) {
      for (Instruction &I : BB) {
        for (const Use &U : I.operands()) {
          if (const auto *C = dyn_cast<Constant>(U)) {
            if (InfoCache.needsQueuePtr(C, *F))
              return true;
          }
        }
      }
    }

    return false;
  }

  bool funcRetrievesMultigridSyncArg(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getMultigridSyncArgImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesHostcallPtr(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesDefaultQueue(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getDefaultQueueImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesCompletionAction(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getCompletionActionImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesHeapPtr(Attributor &A, unsigned COV) {
    if (COV < 5)
      return false;
    AA::RangeTy Range(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesQueuePtr(Attributor &A, unsigned COV) {
    if (COV < 5)
      return false;
    AA::RangeTy Range(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesImplicitKernelArg(Attributor &A, AA::RangeTy Range) {
    // Check if this is a call to the implicitarg_ptr builtin and it
    // is used to retrieve the hostcall pointer. The implicit arg for
    // hostcall is not used only if every use of the implicitarg_ptr
    // is a load that clearly does not retrieve any byte of the
    // hostcall pointer. We check this by tracing all the uses of the
    // initial call to the implicitarg_ptr intrinsic.
    auto DoesNotLeadToKernelArgLoc = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
        return true;

      const auto *PointerInfoAA = A.getAAFor<AAPointerInfo>(
          *this, IRPosition::callsite_returned(Call), DepClassTy::REQUIRED);
      if (!PointerInfoAA)
        return false;

      return PointerInfoAA->forallInterferingAccesses(
          Range, [](const AAPointerInfo::Access &Acc, bool IsExact) {
            return Acc.getRemoteInst()->isDroppable();
          });
    };

    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *this,
                                              UsedAssumedInformation);
  }

  bool funcRetrievesLDSKernelId(Attributor &A) {
    auto DoesNotRetrieve = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      return Call.getIntrinsicID() != Intrinsic::amdgcn_lds_kernel_id;
    };
    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotRetrieve, *this,
                                              UsedAssumedInformation);
  }
};

AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
  llvm_unreachable("AAAMDAttributes is only valid for function position");
}

/// Base class to derive different size ranges.
struct AAAMDSizeRangeAttribute
    : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
  using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;

  StringRef AttrName;

  AAAMDSizeRangeAttribute(const IRPosition &IRP, Attributor &A,
                          StringRef AttrName)
      : Base(IRP, 32), AttrName(AttrName) {}

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

  template <class AttributeImpl>
  ChangeStatus updateImplImpl(Attributor &A) {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << '\n');

      const auto *CallerInfo = A.getAAFor<AttributeImpl>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerInfo)
        return false;

      Change |=
          clampStateAndIndicateChange(this->getState(), CallerInfo->getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus emitAttributeIfNotDefault(Attributor &A, unsigned Min,
                                         unsigned Max) {
    // Don't add the attribute if it's the implied default.
    if (getAssumed().getLower() == Min && getAssumed().getUpper() - 1 == Max)
      return ChangeStatus::UNCHANGED;

    Function *F = getAssociatedFunction();
    LLVMContext &Ctx = F->getContext();
    SmallString<10> Buffer;
    raw_svector_ostream OS(Buffer);
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
    return A.manifestAttrs(getIRPosition(),
                           {Attribute::get(Ctx, AttrName, OS.str())},
                           /* ForceReplace */ true);
  }

  const std::string getAsStr(Attributor *) const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << getName() << '[';
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
    OS << ']';
    return OS.str();
  }
};

/// Propagate amdgpu-flat-work-group-size attribute.
struct AAAMDFlatWorkGroupSize : public AAAMDSizeRangeAttribute {
  AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
      : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-flat-work-group-size") {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    unsigned MinGroupSize, MaxGroupSize;
    std::tie(MinGroupSize, MaxGroupSize) = InfoCache.getFlatWorkGroupSizes(*F);
    intersectKnown(
        ConstantRange(APInt(32, MinGroupSize), APInt(32, MaxGroupSize + 1)));

    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    return updateImplImpl<AAAMDFlatWorkGroupSize>(A);
  }

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  ChangeStatus manifest(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    unsigned Min, Max;
    std::tie(Min, Max) = InfoCache.getMaximumFlatWorkGroupRange(*F);
    return emitAttributeIfNotDefault(A, Min, Max);
  }

  /// See AbstractAttribute::getName()
  const std::string getName() const override {
    return "AAAMDFlatWorkGroupSize";
  }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDFlatWorkGroupSize
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDFlatWorkGroupSize::ID = 0;

AAAMDFlatWorkGroupSize &
AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDFlatWorkGroupSize(IRP, A);
  llvm_unreachable(
      "AAAMDFlatWorkGroupSize is only valid for function position");
}

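// Note: the waves-per-eu range deduced below is intertwined with the flat
// work group size. Both initialize() and updateImpl() query the assumed
// AAAMDFlatWorkGroupSize range and pass it to the subtarget helpers
// (getWavesPerEU / getEffectiveWavesPerEU) so the two attributes stay
// consistent with each other.
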
/// Propagate amdgpu-waves-per-eu attribute.
struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
  AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A)
      : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {}

  bool isValidState() const override {
    return !Assumed.isEmptySet() && IntegerRangeState::isValidState();
  }

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    if (const auto *AssumedGroupSize = A.getAAFor<AAAMDFlatWorkGroupSize>(
            *this, IRPosition::function(*F), DepClassTy::REQUIRED)) {

      unsigned Min, Max;
      std::tie(Min, Max) = InfoCache.getWavesPerEU(
          *F, {AssumedGroupSize->getAssumed().getLower().getZExtValue(),
               AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1});

      ConstantRange Range(APInt(32, Min), APInt(32, Max + 1));
      intersectKnown(Range);
    }

    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      Function *Func = getAssociatedFunction();
      LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
                        << "->" << Func->getName() << '\n');

      const auto *CallerInfo = A.getAAFor<AAAMDWavesPerEU>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      const auto *AssumedGroupSize = A.getAAFor<AAAMDFlatWorkGroupSize>(
          *this, IRPosition::function(*Func), DepClassTy::REQUIRED);
      if (!CallerInfo || !AssumedGroupSize)
        return false;

      unsigned Min, Max;
      std::tie(Min, Max) = InfoCache.getEffectiveWavesPerEU(
          *Caller,
          {CallerInfo->getAssumed().getLower().getZExtValue(),
           CallerInfo->getAssumed().getUpper().getZExtValue() - 1},
          {AssumedGroupSize->getAssumed().getLower().getZExtValue(),
           AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1});
      ConstantRange CallerRange(APInt(32, Min), APInt(32, Max + 1));
      IntegerRangeState CallerRangeState(CallerRange);
      Change |= clampStateAndIndicateChange(this->getState(), CallerRangeState);

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDWavesPerEU &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  ChangeStatus manifest(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    unsigned Max = InfoCache.getMaxWavesPerEU(*F);
    return emitAttributeIfNotDefault(A, 1, Max);
  }

  /// See AbstractAttribute::getName()
  const std::string getName() const override { return "AAAMDWavesPerEU"; }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDWavesPerEU
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDWavesPerEU::ID = 0;

AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDWavesPerEU(IRP, A);
  llvm_unreachable("AAAMDWavesPerEU is only valid for function position");
}

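// Returns true if the inline asm constraint string references an AGPR
// ("a"-prefixed register constraint), in which case the caller cannot claim
// "amdgpu-no-agpr".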
static bool inlineAsmUsesAGPRs(const InlineAsm *IA) {
  for (const auto &CI : IA->ParseConstraints()) {
    for (StringRef Code : CI.Codes) {
      Code.consume_front("{");
      if (Code.starts_with("a"))
        return true;
    }
  }

  return false;
}

struct AAAMDGPUNoAGPR
    : public IRAttribute<Attribute::NoUnwind,
                         StateWrapper<BooleanState, AbstractAttribute>,
                         AAAMDGPUNoAGPR> {
  AAAMDGPUNoAGPR(const IRPosition &IRP, Attributor &A) : IRAttribute(IRP) {}

  static AAAMDGPUNoAGPR &createForPosition(const IRPosition &IRP,
                                           Attributor &A) {
    if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
      return *new (A.Allocator) AAAMDGPUNoAGPR(IRP, A);
    llvm_unreachable("AAAMDGPUNoAGPR is only valid for function position");
  }

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    if (F->hasFnAttribute("amdgpu-no-agpr"))
      indicateOptimisticFixpoint();
  }

  const std::string getAsStr(Attributor *A) const override {
    return getAssumed() ? "amdgpu-no-agpr" : "amdgpu-maybe-agpr";
  }

  void trackStatistics() const override {}

  ChangeStatus updateImpl(Attributor &A) override {
    // TODO: Use AACallEdges, but then we need a way to inspect asm edges.

    auto CheckForNoAGPRs = [&](Instruction &I) {
      const auto &CB = cast<CallBase>(I);
      const Value *CalleeOp = CB.getCalledOperand();
      const Function *Callee = dyn_cast<Function>(CalleeOp);
      if (!Callee) {
        if (const InlineAsm *IA = dyn_cast<InlineAsm>(CalleeOp))
          return !inlineAsmUsesAGPRs(IA);
        return false;
      }

      // Some intrinsics may use AGPRs, but if we have a choice, we are not
      // required to use AGPRs.
      if (Callee->isIntrinsic())
        return true;

      // TODO: Handle callsite attributes
      const auto *CalleeInfo = A.getAAFor<AAAMDGPUNoAGPR>(
          *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
      return CalleeInfo && CalleeInfo->getAssumed();
    };

    bool UsedAssumedInformation = false;
    if (!A.checkForAllCallLikeInstructions(CheckForNoAGPRs, *this,
                                           UsedAssumedInformation))
      return indicatePessimisticFixpoint();
    return ChangeStatus::UNCHANGED;
  }

  ChangeStatus manifest(Attributor &A) override {
    if (!getAssumed())
      return ChangeStatus::UNCHANGED;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();
    return A.manifestAttrs(getIRPosition(),
                           {Attribute::get(Ctx, "amdgpu-no-agpr")});
  }

  const std::string getName() const override { return "AAAMDGPUNoAGPR"; }
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDGPUNoAGPR.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  static const char ID;
};

const char AAAMDGPUNoAGPR::ID = 0;

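// Mark the first few kernel arguments "inreg" as a hint that they should be
// preloaded into user SGPRs. The number of hinted arguments is bounded by the
// -amdgpu-kernarg-preload-count option above and by the subtarget's maximum
// number of user SGPRs; arguments with incompatible attributes (byref, nest)
// stop the scan. (The option would typically be driven through the usual
// -mllvm plumbing, e.g. -mllvm -amdgpu-kernarg-preload-count=N; illustrative,
// not prescriptive.)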
static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  for (unsigned I = 0;
       I < F.arg_size() &&
       I < std::min(KernargPreloadCount.getValue(), ST.getMaxNumUserSGPRs());
       ++I) {
    Argument &Arg = *F.getArg(I);
    // Check for incompatible attributes.
    if (Arg.hasByRefAttr() || Arg.hasNestAttr())
      break;

    Arg.addAttr(Attribute::InReg);
  }
}

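// Shared driver for both pass flavors: collect every non-intrinsic function,
// restrict the Attributor to the allow-listed AA kinds, seed the AMDGPU AAs
// (plus the size-range AAs for non-entry functions), and run to a fixpoint.
// Returns true if any IR was changed.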
static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM) {
  SetVector<Function *> Functions;
  for (Function &F : M) {
    if (!F.isIntrinsic())
      Functions.insert(&F);
  }

  CallGraphUpdater CGUpdater;
  BumpPtrAllocator Allocator;
  AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, TM);
  DenseSet<const char *> Allowed(
      {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
       &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
       &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID, &AACallEdges::ID,
       &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
       &AAUnderlyingObjects::ID});

  AttributorConfig AC(CGUpdater);
  AC.Allowed = &Allowed;
  AC.IsModulePass = true;
  AC.DefaultInitializeLiveInternals = false;
  AC.IPOAmendableCB = [](const Function &F) {
    return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
  };

  Attributor A(Functions, InfoCache, AC);

  for (Function &F : M) {
    if (!F.isIntrinsic()) {
      A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(F));
      A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(F));
      A.getOrCreateAAFor<AAAMDGPUNoAGPR>(IRPosition::function(F));
      CallingConv::ID CC = F.getCallingConv();
      if (!AMDGPU::isEntryFunctionCC(CC)) {
        A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(F));
        A.getOrCreateAAFor<AAAMDWavesPerEU>(IRPosition::function(F));
      } else if (CC == CallingConv::AMDGPU_KERNEL) {
        addPreloadKernArgHint(F, TM);
      }
    }
  }

  ChangeStatus Change = A.run();
  return Change == ChangeStatus::CHANGED;
}

class AMDGPUAttributorLegacy : public ModulePass {
public:
  AMDGPUAttributorLegacy() : ModulePass(ID) {}

  /// doInitialization - Virtual method overridden by subclasses to do
  /// any necessary initialization before any pass is run.
  bool doInitialization(Module &) override {
    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
    if (!TPC)
      report_fatal_error("TargetMachine is required");

    TM = &TPC->getTM<TargetMachine>();
    return false;
  }

  bool runOnModule(Module &M) override {
    AnalysisGetter AG(this);
    return runImpl(M, AG, *TM);
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<CycleInfoWrapperPass>();
  }

  StringRef getPassName() const override { return "AMDGPU Attributor"; }
  TargetMachine *TM;
  static char ID;
};
} // namespace

PreservedAnalyses llvm::AMDGPUAttributorPass::run(Module &M,
                                                  ModuleAnalysisManager &AM) {

  FunctionAnalysisManager &FAM =
      AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
  AnalysisGetter AG(FAM);

  // TODO: Probably preserves CFG
  return runImpl(M, AG, TM) ? PreservedAnalyses::none()
                            : PreservedAnalyses::all();
}

char AMDGPUAttributorLegacy::ID = 0;

Pass *llvm::createAMDGPUAttributorLegacyPass() {
  return new AMDGPUAttributorLegacy();
}
INITIALIZE_PASS_BEGIN(AMDGPUAttributorLegacy, DEBUG_TYPE, "AMDGPU Attributor",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(CycleInfoWrapperPass);
INITIALIZE_PASS_END(AMDGPUAttributorLegacy, DEBUG_TYPE, "AMDGPU Attributor",
                    false, false)