GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/clang/lib/CodeGen/CGCUDANV.cpp
//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a class for CUDA code generation targeting the NVIDIA CUDA
// runtime library.
//
//===----------------------------------------------------------------------===//

#include "CGCUDARuntime.h"
#include "CGCXXABI.h"
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "clang/AST/Decl.h"
#include "clang/Basic/Cuda.h"
#include "clang/CodeGen/CodeGenABITypes.h"
#include "clang/CodeGen/ConstantInitBuilder.h"
#include "llvm/Frontend/Offloading/Utility.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/VirtualFileSystem.h"

using namespace clang;
using namespace CodeGen;

namespace {
constexpr unsigned CudaFatMagic = 0x466243b1;
constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"

class CGNVCUDARuntime : public CGCUDARuntime {

private:
  llvm::IntegerType *IntTy, *SizeTy;
  llvm::Type *VoidTy;
  llvm::PointerType *PtrTy;

  /// Convenience reference to LLVM Context
  llvm::LLVMContext &Context;
  /// Convenience reference to the current module
  llvm::Module &TheModule;
  /// Keeps track of kernel launch stubs and handles emitted in this module
  struct KernelInfo {
    llvm::Function *Kernel; // stub function to help launch kernel
    const Decl *D;
  };
  llvm::SmallVector<KernelInfo, 16> EmittedKernels;
  // Map a kernel mangled name to a symbol for identifying the kernel in host
  // code. For CUDA, the symbol for identifying the kernel is the same as the
  // device stub function. For HIP, they are different.
  llvm::DenseMap<StringRef, llvm::GlobalValue *> KernelHandles;
  // Map a kernel handle to the kernel stub.
  llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
  struct VarInfo {
    llvm::GlobalVariable *Var;
    const VarDecl *D;
    DeviceVarFlags Flags;
  };
  llvm::SmallVector<VarInfo, 16> DeviceVars;
  /// Keeps track of the variable containing the handle of the GPU binary.
  /// Populated by ModuleCtorFunction() and used to create corresponding
  /// cleanup calls in ModuleDtorFunction()
  llvm::GlobalVariable *GpuBinaryHandle = nullptr;
  /// Whether we generate relocatable device code.
  bool RelocatableDeviceCode;
  /// Mangle context for device.
  std::unique_ptr<MangleContext> DeviceMC;

  llvm::FunctionCallee getSetupArgumentFn() const;
  llvm::FunctionCallee getLaunchFn() const;

  llvm::FunctionType *getRegisterGlobalsFnTy() const;
  llvm::FunctionType *getCallbackFnTy() const;
  llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
  std::string addPrefixToName(StringRef FuncName) const;
  std::string addUnderscoredPrefixToName(StringRef FuncName) const;

  /// Creates a function to register all kernel stubs generated in this module.
  llvm::Function *makeRegisterGlobalsFn();

  /// Helper function that generates a constant string and returns a pointer to
  /// the start of the string. The result of this function can be used anywhere
  /// where the C code specifies const char*.
  llvm::Constant *makeConstantString(const std::string &Str,
                                     const std::string &Name = "") {
    return CGM.GetAddrOfConstantCString(Str, Name.c_str()).getPointer();
  }

  /// Helper function which generates an initialized constant array from Str,
  /// and optionally sets the section name and alignment. AddNull specifies
  /// whether the array should have NUL termination.
  llvm::Constant *makeConstantArray(StringRef Str,
                                    StringRef Name = "",
                                    StringRef SectionName = "",
                                    unsigned Alignment = 0,
                                    bool AddNull = false) {
    llvm::Constant *Value =
        llvm::ConstantDataArray::getString(Context, Str, AddNull);
    auto *GV = new llvm::GlobalVariable(
        TheModule, Value->getType(), /*isConstant=*/true,
        llvm::GlobalValue::PrivateLinkage, Value, Name);
    if (!SectionName.empty()) {
      GV->setSection(SectionName);
      // Mark the address as used, which makes sure that this section isn't
      // merged and we will really have it in the object file.
      GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
    }
    if (Alignment)
      GV->setAlignment(llvm::Align(Alignment));
    return GV;
  }

  /// Helper function that generates an empty dummy function returning void.
  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
    assert(FnTy->getReturnType()->isVoidTy() &&
           "Can only generate dummy functions returning void!");
    llvm::Function *DummyFunc = llvm::Function::Create(
        FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);

    llvm::BasicBlock *DummyBlock =
        llvm::BasicBlock::Create(Context, "", DummyFunc);
    CGBuilderTy FuncBuilder(CGM, Context);
    FuncBuilder.SetInsertPoint(DummyBlock);
    FuncBuilder.CreateRetVoid();

    return DummyFunc;
  }

  void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
  void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
  std::string getDeviceSideName(const NamedDecl *ND) override;

  void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, bool Constant) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Variable, Extern, Constant,
                           VD->hasAttr<HIPManagedAttr>(),
                           /*Normalized*/ false, 0}});
  }
  void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
                          bool Extern, int Type) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
                           /*Managed*/ false,
                           /*Normalized*/ false, Type}});
  }
  void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, int Type, bool Normalized) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
                           /*Managed*/ false, Normalized, Type}});
  }

  /// Creates module constructor function
  llvm::Function *makeModuleCtorFunction();
  /// Creates module destructor function
  llvm::Function *makeModuleDtorFunction();
  /// Transform managed variables for device compilation.
  void transformManagedVars();
  /// Create offloading entries to register globals in RDC mode.
  void createOffloadingEntries();

public:
  CGNVCUDARuntime(CodeGenModule &CGM);

  llvm::GlobalValue *getKernelHandle(llvm::Function *F, GlobalDecl GD) override;
  llvm::Function *getKernelStub(llvm::GlobalValue *Handle) override {
    auto Loc = KernelStubs.find(Handle);
    assert(Loc != KernelStubs.end());
    return Loc->second;
  }
  void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
  void handleVarRegistration(const VarDecl *VD,
                             llvm::GlobalVariable &Var) override;
  void
  internalizeDeviceSideVar(const VarDecl *D,
                           llvm::GlobalValue::LinkageTypes &Linkage) override;

  llvm::Function *finalizeModule() override;
};

} // end anonymous namespace

std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("hip") + Twine(FuncName)).str());
  return ((Twine("cuda") + Twine(FuncName)).str());
}
std::string
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("__hip") + Twine(FuncName)).str());
  return ((Twine("__cuda") + Twine(FuncName)).str());
}

static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) {
  // If the host and device have different C++ ABIs, mark it as the device
  // mangle context so that the mangling needs to retrieve the additional
  // device lambda mangling number instead of the regular host one.
  if (CGM.getContext().getAuxTargetInfo() &&
      CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
      CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) {
    return std::unique_ptr<MangleContext>(
        CGM.getContext().createDeviceMangleContext(
            *CGM.getContext().getAuxTargetInfo()));
  }

  return std::unique_ptr<MangleContext>(CGM.getContext().createMangleContext(
      CGM.getContext().getAuxTargetInfo()));
}

CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
      TheModule(CGM.getModule()),
      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
      DeviceMC(InitDeviceMC(CGM)) {
  IntTy = CGM.IntTy;
  SizeTy = CGM.SizeTy;
  VoidTy = CGM.VoidTy;
  PtrTy = CGM.UnqualPtrTy;
}

llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
  // cudaError_t cudaSetupArgument(void *, size_t, size_t)
  llvm::Type *Params[] = {PtrTy, SizeTy, SizeTy};
  return CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, Params, false),
      addPrefixToName("SetupArgument"));
}

llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
  if (CGM.getLangOpts().HIP) {
    // hipError_t hipLaunchByPtr(char *);
    return CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(IntTy, PtrTy, false), "hipLaunchByPtr");
  }
  // cudaError_t cudaLaunch(char *);
  return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy, PtrTy, false),
                                   "cudaLaunch");
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
  return llvm::FunctionType::get(VoidTy, PtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
  return llvm::FunctionType::get(VoidTy, PtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
  llvm::Type *Params[] = {llvm::PointerType::getUnqual(Context), PtrTy, PtrTy,
                          llvm::PointerType::getUnqual(Context)};
  return llvm::FunctionType::get(VoidTy, Params, false);
}

std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
  GlobalDecl GD;
  // ND could be either a kernel or a variable.
  if (auto *FD = dyn_cast<FunctionDecl>(ND))
    GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
  else
    GD = GlobalDecl(ND);
  std::string DeviceSideName;
  MangleContext *MC;
  if (CGM.getLangOpts().CUDAIsDevice)
    MC = &CGM.getCXXABI().getMangleContext();
  else
    MC = DeviceMC.get();
  if (MC->shouldMangleDeclName(ND)) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    MC->mangleName(GD, Out);
    DeviceSideName = std::string(Out.str());
  } else
    DeviceSideName = std::string(ND->getIdentifier()->getName());

  // Make a unique name for a device-side static file-scope variable for HIP.
  if (CGM.getContext().shouldExternalize(ND) &&
      CGM.getLangOpts().GPURelocatableDeviceCode) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    Out << DeviceSideName;
    CGM.printPostfixForExternalizedDecl(Out, ND);
    DeviceSideName = std::string(Out.str());
  }
  return DeviceSideName;
}

void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
                                     FunctionArgList &Args) {
  EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
  if (auto *GV =
          dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn->getName()])) {
    GV->setLinkage(CGF.CurFn->getLinkage());
    GV->setInitializer(CGF.CurFn);
  }
  if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                         CudaFeature::CUDA_USES_NEW_LAUNCH) ||
      (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
    emitDeviceStubBodyNew(CGF, Args);
  else
    emitDeviceStubBodyLegacy(CGF, Args);
}

// CUDA 9.0+ uses a new way to launch kernels. Parameters are packed in a local
// array and kernels are launched using cudaLaunchKernel().
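//
// Conceptually, for `__global__ void kern(int a, float *b)` the emitted stub
// behaves like the following illustrative C++ (hypothetical names; the actual
// stub is built as IR below, and the HIP entry points differ only in name):
//
//   void __device_stub_kern(int a, float *b) {
//     void *args[] = {&a, &b};
//     dim3 grid, block; size_t shmem; cudaStream_t stream;
//     __cudaPopCallConfiguration(&grid, &block, &shmem, &stream);
//     cudaLaunchKernel((const void *)kern_handle, grid, block, args, shmem,
//                      stream);
//   }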
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
                                            FunctionArgList &Args) {
  // Build the shadow stack entry at the very start of the function.

  // Calculate the amount of space we will need for all arguments. If we have
  // no args, allocate a single pointer so we still have a valid pointer to the
  // argument array that we can pass to the runtime, even if it will be unused.
  Address KernelArgs = CGF.CreateTempAlloca(
      PtrTy, CharUnits::fromQuantity(16), "kernel_args",
      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
  // Store pointers to the arguments in a locally allocated launch_args.
  for (unsigned i = 0; i < Args.size(); ++i) {
    llvm::Value *VarPtr = CGF.GetAddrOfLocalVar(Args[i]).emitRawPointer(CGF);
    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, PtrTy);
    CGF.Builder.CreateDefaultAlignedStore(
        VoidVarPtr, CGF.Builder.CreateConstGEP1_32(
                        PtrTy, KernelArgs.emitRawPointer(CGF), i));
  }

  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");

  // Look up the cudaLaunchKernel/hipLaunchKernel function.
  // The HIP kernel launching API name depends on the -fgpu-default-stream
  // option. For the default value 'legacy', it is hipLaunchKernel. For
  // 'per-thread', it is hipLaunchKernel_spt.
  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                              void **args, size_t sharedMem,
  //                              cudaStream_t stream);
  // hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim,
  //                                  dim3 blockDim, void **args,
  //                                  size_t sharedMem, hipStream_t stream);
  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
  std::string KernelLaunchAPI = "LaunchKernel";
  if (CGF.getLangOpts().GPUDefaultStream ==
      LangOptions::GPUDefaultStreamKind::PerThread) {
    if (CGF.getLangOpts().HIP)
      KernelLaunchAPI = KernelLaunchAPI + "_spt";
    else if (CGF.getLangOpts().CUDA)
      KernelLaunchAPI = KernelLaunchAPI + "_ptsz";
  }
  auto LaunchKernelName = addPrefixToName(KernelLaunchAPI);
  const IdentifierInfo &cudaLaunchKernelII =
      CGM.getContext().Idents.get(LaunchKernelName);
  FunctionDecl *cudaLaunchKernelFD = nullptr;
  for (auto *Result : DC->lookup(&cudaLaunchKernelII)) {
    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
      cudaLaunchKernelFD = FD;
  }

  if (cudaLaunchKernelFD == nullptr) {
    CGM.Error(CGF.CurFuncDecl->getLocation(),
              "Can't find declaration for " + LaunchKernelName);
    return;
  }
  // Create temporary dim3 grid_dim, block_dim.
  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
  QualType Dim3Ty = GridDimParam->getType();
  Address GridDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
  Address BlockDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
  Address ShmemSize =
      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
  Address Stream = CGF.CreateTempAlloca(PtrTy, CGM.getPointerAlign(), "stream");
  llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy,
                              {/*gridDim=*/GridDim.getType(),
                               /*blockDim=*/BlockDim.getType(),
                               /*ShmemSize=*/ShmemSize.getType(),
                               /*Stream=*/Stream.getType()},
                              /*isVarArg=*/false),
      addUnderscoredPrefixToName("PopCallConfiguration"));

  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, {GridDim.emitRawPointer(CGF),
                                                BlockDim.emitRawPointer(CGF),
                                                ShmemSize.emitRawPointer(CGF),
                                                Stream.emitRawPointer(CGF)});

  // Emit the call to cudaLaunch
  llvm::Value *Kernel =
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn->getName()], PtrTy);
  CallArgList LaunchKernelArgs;
  LaunchKernelArgs.add(RValue::get(Kernel),
                       cudaLaunchKernelFD->getParamDecl(0)->getType());
  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::get(KernelArgs, CGF),
                       cudaLaunchKernelFD->getParamDecl(3)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
                       cudaLaunchKernelFD->getParamDecl(4)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
                       cudaLaunchKernelFD->getParamDecl(5)->getType());

  QualType QT = cudaLaunchKernelFD->getType();
  QualType CQT = QT.getCanonicalType();
  llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
  llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);

  const CGFunctionInfo &FI =
      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
  llvm::FunctionCallee cudaLaunchKernelFn =
      CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
               LaunchKernelArgs);

  // To prevent CUDA device stub functions from being merged by ICF in the MSVC
  // environment, create a unique global variable for each kernel and write to
  // the variable in the device stub.
  if (CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
      !CGF.getLangOpts().HIP) {
    llvm::Function *KernelFunction = llvm::cast<llvm::Function>(Kernel);
    std::string GlobalVarName = (KernelFunction->getName() + ".id").str();

    llvm::GlobalVariable *HandleVar =
        CGM.getModule().getNamedGlobal(GlobalVarName);
    if (!HandleVar) {
      HandleVar = new llvm::GlobalVariable(
          CGM.getModule(), CGM.Int8Ty,
          /*Constant=*/false, KernelFunction->getLinkage(),
          llvm::ConstantInt::get(CGM.Int8Ty, 0), GlobalVarName);
      HandleVar->setDSOLocal(KernelFunction->isDSOLocal());
      HandleVar->setVisibility(KernelFunction->getVisibility());
      if (KernelFunction->hasComdat())
        HandleVar->setComdat(CGM.getModule().getOrInsertComdat(GlobalVarName));
    }

    CGF.Builder.CreateAlignedStore(llvm::ConstantInt::get(CGM.Int8Ty, 1),
                                   HandleVar, CharUnits::One(),
                                   /*IsVolatile=*/true);
  }

  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

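// The legacy launch path below instead copies each argument into the pending
// launch configuration one call at a time. As an illustrative sketch
// (hypothetical stub; offsets depend on each argument's size and alignment):
//
//   void __device_stub_kern(int a, float *b) {
//     if (cudaSetupArgument(&a, sizeof(a), /*offset=*/0) != 0) return;
//     if (cudaSetupArgument(&b, sizeof(b), /*offset=*/8) != 0) return;
//     cudaLaunch((char *)kern_handle);
//   }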
void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
                                               FunctionArgList &Args) {
  // Emit a call to cudaSetupArgument for each arg in Args.
  llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
  CharUnits Offset = CharUnits::Zero();
  for (const VarDecl *A : Args) {
    auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
    Offset = Offset.alignTo(TInfo.Align);
    llvm::Value *Args[] = {
        CGF.Builder.CreatePointerCast(
            CGF.GetAddrOfLocalVar(A).emitRawPointer(CGF), PtrTy),
        llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
        llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
    };
    llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
    llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
    llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
    CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
    CGF.EmitBlock(NextBlock);
    Offset += TInfo.Width;
  }

  // Emit the call to cudaLaunch
  llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
  llvm::Value *Arg =
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn->getName()], PtrTy);
  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

// Replace the original variable Var with the address loaded from variable
// ManagedVar populated by HIP runtime.
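//
// For example (illustrative LLVM IR), a direct use of Var such as
//   %v = load i32, ptr @Var
// is rewritten into an access through the pointer held by ManagedVar:
//   %ld.managed = load ptr, ptr @ManagedVar
//   %v = load i32, ptr %ld.managed
// Constant expressions that refer to Var are first expanded into
// instructions so that they can use the loaded pointer.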
static void replaceManagedVar(llvm::GlobalVariable *Var,
                              llvm::GlobalVariable *ManagedVar) {
  SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
  for (auto &&VarUse : Var->uses()) {
    WorkList.push_back({VarUse.getUser()});
  }
  while (!WorkList.empty()) {
    auto &&WorkItem = WorkList.pop_back_val();
    auto *U = WorkItem.back();
    if (isa<llvm::ConstantExpr>(U)) {
      for (auto &&UU : U->uses()) {
        WorkItem.push_back(UU.getUser());
        WorkList.push_back(WorkItem);
        WorkItem.pop_back();
      }
      continue;
    }
    if (auto *I = dyn_cast<llvm::Instruction>(U)) {
      llvm::Value *OldV = Var;
      llvm::Instruction *NewV =
          new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false,
                             llvm::Align(Var->getAlignment()), I);
      WorkItem.pop_back();
      // Replace constant expressions directly or indirectly using the managed
      // variable with instructions.
      for (auto &&Op : WorkItem) {
        auto *CE = cast<llvm::ConstantExpr>(Op);
        auto *NewInst = CE->getAsInstruction();
        NewInst->insertBefore(*I->getParent(), I->getIterator());
        NewInst->replaceUsesOfWith(OldV, NewV);
        OldV = CE;
        NewV = NewInst;
      }
      I->replaceUsesOfWith(OldV, NewV);
    } else {
      llvm_unreachable("Invalid use of managed variable");
    }
  }
}

/// Creates a function that sets up state on the host side for CUDA objects
/// that have a presence on both the host and device sides. Specifically,
/// registers the host side of kernel functions and device global variables
/// with the CUDA runtime.
/// \code
/// void __cuda_register_globals(void** GpuBinaryHandle) {
///   __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
///   ...
///   __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
///   __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
///   ...
///   __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
  // No need to register anything
  if (EmittedKernels.empty() && DeviceVars.empty())
    return nullptr;

  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_register_globals"), &TheModule);
  llvm::BasicBlock *EntryBB =
      llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
  CGBuilderTy Builder(CGM, Context);
  Builder.SetInsertPoint(EntryBB);

  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
  //                             int, uint3*, uint3*, dim3*, dim3*, int*)
  llvm::Type *RegisterFuncParams[] = {
      PtrTy, PtrTy, PtrTy, PtrTy, IntTy,
      PtrTy, PtrTy, PtrTy, PtrTy, llvm::PointerType::getUnqual(Context)};
  llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
      addUnderscoredPrefixToName("RegisterFunction"));

  // Extract GpuBinaryHandle passed as the first argument to
  // __cuda_register_globals() and generate a __cudaRegisterFunction() call
  // for each emitted kernel.
  llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
  for (auto &&I : EmittedKernels) {
    llvm::Constant *KernelName =
        makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(PtrTy);
    llvm::Value *Args[] = {
        &GpuBinaryHandlePtr,
        KernelHandles[I.Kernel->getName()],
        KernelName,
        KernelName,
        llvm::ConstantInt::get(IntTy, -1),
        NullPtr,
        NullPtr,
        NullPtr,
        NullPtr,
        llvm::ConstantPointerNull::get(llvm::PointerType::getUnqual(Context))};
    Builder.CreateCall(RegisterFunc, Args);
  }

  llvm::Type *VarSizeTy = IntTy;
  // For HIP or CUDA 9.0+, the device variable size is of type `size_t`.
  if (CGM.getLangOpts().HIP ||
      ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
    VarSizeTy = SizeTy;

  // void __cudaRegisterVar(void **, char *, char *, const char *,
  //                        int, int, int, int)
  llvm::Type *RegisterVarParams[] = {PtrTy, PtrTy, PtrTy, PtrTy,
                                     IntTy, VarSizeTy, IntTy, IntTy};
  llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
      addUnderscoredPrefixToName("RegisterVar"));
  // void __hipRegisterManagedVar(void **, char *, char *, const char *,
  //                              size_t, unsigned)
  llvm::Type *RegisterManagedVarParams[] = {PtrTy, PtrTy, PtrTy,
                                            PtrTy, VarSizeTy, IntTy};
  llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
      addUnderscoredPrefixToName("RegisterManagedVar"));
  // void __cudaRegisterSurface(void **, const struct surfaceReference *,
  //                            const void **, const char *, int, int);
  llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy}, false),
      addUnderscoredPrefixToName("RegisterSurface"));
  // void __cudaRegisterTexture(void **, const struct textureReference *,
  //                            const void **, const char *, int, int, int)
  llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy, IntTy}, false),
      addUnderscoredPrefixToName("RegisterTexture"));
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
           "External variables should not show up here, except HIP managed "
           "variables");
    llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
    switch (Info.Flags.getKind()) {
    case DeviceVarFlags::Variable: {
      uint64_t VarSize =
          CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
      if (Info.Flags.isManaged()) {
        assert(Var->getName().ends_with(".managed") &&
               "HIP managed variables not transformed");
        auto *ManagedVar = CGM.getModule().getNamedGlobal(
            Var->getName().drop_back(StringRef(".managed").size()));
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            ManagedVar,
            Var,
            VarName,
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Var->getAlignment())};
        if (!Var->isDeclaration())
          Builder.CreateCall(RegisterManagedVar, Args);
      } else {
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            Var,
            VarName,
            VarName,
            llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
            llvm::ConstantInt::get(IntTy, 0)};
        Builder.CreateCall(RegisterVar, Args);
      }
      break;
    }
    case DeviceVarFlags::Surface:
      Builder.CreateCall(
          RegisterSurf,
          {&GpuBinaryHandlePtr, Var, VarName, VarName,
           llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    case DeviceVarFlags::Texture:
      Builder.CreateCall(
          RegisterTex,
          {&GpuBinaryHandlePtr, Var, VarName, VarName,
           llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    }
  }

  Builder.CreateRetVoid();
  return RegisterKernelsFunc;
}

/// Creates a global constructor function for the module:
///
/// For CUDA:
/// \code
/// void __cuda_module_ctor() {
///   Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
///   __cuda_register_globals(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_ctor() {
///   if (__hip_gpubin_handle == 0) {
///     __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
///     __hip_register_globals(__hip_gpubin_handle);
///   }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
  bool IsHIP = CGM.getLangOpts().HIP;
  bool IsCUDA = CGM.getLangOpts().CUDA;
  // No need to generate ctors/dtors if there is no GPU binary.
  StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
  if (CudaGpuBinaryFileName.empty() && !IsHIP)
    return nullptr;
  if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
      DeviceVars.empty())
    return nullptr;

  // void __{cuda|hip}_register_globals(void* handle);
  llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
  // We always need a function to pass in as callback. Create a dummy
  // implementation if we don't need to register anything.
  if (RelocatableDeviceCode && !RegisterGlobalsFunc)
    RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());

  // void ** __{cuda|hip}RegisterFatBinary(void *);
  llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(PtrTy, PtrTy, false),
      addUnderscoredPrefixToName("RegisterFatBinary"));
  // struct { int magic; int version; void *gpu_binary; void *dont_care; };
  llvm::StructType *FatbinWrapperTy =
      llvm::StructType::get(IntTy, IntTy, PtrTy, PtrTy);

  // Register the GPU binary with the CUDA runtime, store the returned handle
  // in a global variable and save a reference in GpuBinaryHandle to be cleaned
  // up in the destructor on exit. Then associate all known kernels with the
  // GPU binary handle so the CUDA runtime can figure out what to call on the
  // GPU side.
  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
  if (!CudaGpuBinaryFileName.empty()) {
    auto VFS = CGM.getFileSystem();
    auto CudaGpuBinaryOrErr =
        VFS->getBufferForFile(CudaGpuBinaryFileName, -1, false);
    if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
      CGM.getDiags().Report(diag::err_cannot_open_file)
          << CudaGpuBinaryFileName << EC.message();
      return nullptr;
    }
    CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
  }

  llvm::Function *ModuleCtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_ctor"), &TheModule);
  llvm::BasicBlock *CtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
  CGBuilderTy CtorBuilder(CGM, Context);

  CtorBuilder.SetInsertPoint(CtorEntryBB);

  const char *FatbinConstantName;
  const char *FatbinSectionName;
  const char *ModuleIDSectionName;
  StringRef ModuleIDPrefix;
  llvm::Constant *FatBinStr;
  unsigned FatMagic;
  if (IsHIP) {
    FatbinConstantName = ".hip_fatbin";
    FatbinSectionName = ".hipFatBinSegment";

    ModuleIDSectionName = "__hip_module_id";
    ModuleIDPrefix = "__hip_";

    if (CudaGpuBinary) {
      // If the fatbin is available from early finalization, create a string
      // literal containing the fat binary loaded from the given file.
      const unsigned HIPCodeObjectAlign = 4096;
      FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "",
                                    FatbinConstantName, HIPCodeObjectAlign);
    } else {
      // If the fatbin is not available, create an external symbol
      // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
      // to contain the fat binary but will be populated somewhere else,
      // e.g. by lld through a linker script.
      FatBinStr = new llvm::GlobalVariable(
          CGM.getModule(), CGM.Int8Ty,
          /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
          "__hip_fatbin_" + CGM.getContext().getCUIDHash(), nullptr,
          llvm::GlobalVariable::NotThreadLocal);
      cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
    }

    FatMagic = HIPFatMagic;
  } else {
    if (RelocatableDeviceCode)
      FatbinConstantName = CGM.getTriple().isMacOSX()
                               ? "__NV_CUDA,__nv_relfatbin"
                               : "__nv_relfatbin";
    else
      FatbinConstantName =
          CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
    // NVIDIA's cuobjdump looks for fatbins in this section.
    FatbinSectionName =
        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";

    ModuleIDSectionName = CGM.getTriple().isMacOSX()
                              ? "__NV_CUDA,__nv_module_id"
                              : "__nv_module_id";
    ModuleIDPrefix = "__nv_";

    // For CUDA, create a string literal containing the fat binary loaded from
    // the given file.
    FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "",
                                  FatbinConstantName, 8);
    FatMagic = CudaFatMagic;
  }

  // Create initialized wrapper structure that points to the loaded GPU binary
  ConstantInitBuilder Builder(CGM);
  auto Values = Builder.beginStruct(FatbinWrapperTy);
  // Fatbin wrapper magic.
  Values.addInt(IntTy, FatMagic);
  // Fatbin version.
  Values.addInt(IntTy, 1);
  // Data.
  Values.add(FatBinStr);
  // Unused in fatbin v1.
  Values.add(llvm::ConstantPointerNull::get(PtrTy));
  llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
      addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
      /*constant*/ true);
  FatbinWrapper->setSection(FatbinSectionName);

  // There is only one HIP fat binary per linked module, however there are
  // multiple constructor functions. Make sure the fat binary is registered
  // only once. The constructor functions are executed by the dynamic loader
  // before the program gains control. The dynamic loader cannot execute the
  // constructor functions concurrently since doing that would not guarantee
  // thread safety of the loaded program. Therefore we can assume sequential
  // execution of constructor functions here.
  if (IsHIP) {
    auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage
                                 : llvm::GlobalValue::ExternalLinkage;
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
    // The name, size, and initialization pattern of this variable are part
    // of the HIP ABI.
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, PtrTy, /*isConstant=*/false, Linkage,
        /*Initializer=*/
        CudaGpuBinary ? llvm::ConstantPointerNull::get(PtrTy) : nullptr,
        CudaGpuBinary
            ? "__hip_gpubin_handle"
            : "__hip_gpubin_handle_" + CGM.getContext().getCUIDHash());
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    // Prevent the weak symbol in different shared libraries from being merged.
    if (Linkage != llvm::GlobalValue::InternalLinkage)
      GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
    Address GpuBinaryAddr(
        GpuBinaryHandle, PtrTy,
        CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
    {
      auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
      llvm::Constant *Zero =
          llvm::Constant::getNullValue(HandleValue->getType());
      llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
      CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(IfBlock);
      // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
      llvm::CallInst *RegisterFatbinCall =
          CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
      CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
      CtorBuilder.CreateBr(ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(ExitBlock);
      // Call __hip_register_globals(GpuBinaryHandle);
      if (RegisterGlobalsFunc) {
        auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
        CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
      }
    }
  } else if (!RelocatableDeviceCode) {
    // Register binary with CUDA runtime. This is substantially different in
    // default mode vs. separate compilation!
    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
    llvm::CallInst *RegisterFatbinCall =
        CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, PtrTy, false, llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(PtrTy), "__cuda_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
                                   CGM.getPointerAlign());

    // Call __cuda_register_globals(GpuBinaryHandle);
    if (RegisterGlobalsFunc)
      CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);

    // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
    if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                           CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
      // void __cudaRegisterFatBinaryEnd(void **);
      llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
          llvm::FunctionType::get(VoidTy, PtrTy, false),
          "__cudaRegisterFatBinaryEnd");
      CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
    }
  } else {
    // Generate a unique module ID.
    SmallString<64> ModuleID;
    llvm::raw_svector_ostream OS(ModuleID);
    OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
    llvm::Constant *ModuleIDConstant = makeConstantArray(
        std::string(ModuleID), "", ModuleIDSectionName, 32, /*AddNull=*/true);

    // Create an alias for the FatbinWrapper that nvcc will look for.
    llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
                              Twine("__fatbinwrap") + ModuleID, FatbinWrapper);

    // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
    //                                           void *, void (*)(void **))
    SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
    RegisterLinkedBinaryName += ModuleID;
    llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
        getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);

    assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
    llvm::Value *Args[] = {RegisterGlobalsFunc, FatbinWrapper, ModuleIDConstant,
                           makeDummyFunction(getCallbackFnTy())};
    CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
  }

  // Create the destructor and register it with atexit() the way NVCC does it.
  // Doing it during the regular destructor phase worked in CUDA before 9.2,
  // but results in a double-free in 9.2.
  if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
    // extern "C" int atexit(void (*f)(void));
    llvm::FunctionType *AtExitTy =
        llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
    llvm::FunctionCallee AtExitFunc =
        CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
                                  /*Local=*/true);
    CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
  }

  CtorBuilder.CreateRetVoid();
  return ModuleCtorFunc;
}

/// Creates a global destructor function that unregisters the GPU code blob
/// registered by the constructor.
///
/// For CUDA:
/// \code
/// void __cuda_module_dtor() {
///   __cudaUnregisterFatBinary(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_dtor() {
///   if (__hip_gpubin_handle) {
///     __hipUnregisterFatBinary(__hip_gpubin_handle);
///     __hip_gpubin_handle = 0;
///   }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
  // No need for a destructor if we don't have a handle to unregister.
  if (!GpuBinaryHandle)
    return nullptr;

  // void __cudaUnregisterFatBinary(void ** handle);
  llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, PtrTy, false),
      addUnderscoredPrefixToName("UnregisterFatBinary"));

  llvm::Function *ModuleDtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_dtor"), &TheModule);

  llvm::BasicBlock *DtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
  CGBuilderTy DtorBuilder(CGM, Context);
  DtorBuilder.SetInsertPoint(DtorEntryBB);

  Address GpuBinaryAddr(
      GpuBinaryHandle, GpuBinaryHandle->getValueType(),
      CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
  auto *HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
  // There is only one HIP fat binary per linked module, however there are
  // multiple destructor functions. Make sure the fat binary is unregistered
  // only once.
  if (CGM.getLangOpts().HIP) {
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
    llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
    llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
    DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);

    DtorBuilder.SetInsertPoint(IfBlock);
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
    DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
    DtorBuilder.CreateBr(ExitBlock);

    DtorBuilder.SetInsertPoint(ExitBlock);
  } else {
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
  }
  DtorBuilder.CreateRetVoid();
  return ModuleDtorFunc;
}

CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
  return new CGNVCUDARuntime(CGM);
}

void CGNVCUDARuntime::internalizeDeviceSideVar(
    const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) {
  // For -fno-gpu-rdc, host-side shadows of external declarations of device-side
  // global variables become internal definitions. These have to be internal in
  // order to prevent name conflicts with global host variables with the same
  // name in different TUs.
  //
  // For -fgpu-rdc, the shadow variables should not be internalized because
  // they may be accessed by a different TU.
  if (CGM.getLangOpts().GPURelocatableDeviceCode)
    return;

  // __shared__ variables are odd. Shadows do get created, but
  // they are not registered with the CUDA runtime, so they
  // can't really be used to access their device-side
  // counterparts. It's not clear yet whether it's nvcc's bug or
  // a feature, but we've got to do the same for compatibility.
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
      D->hasAttr<CUDASharedAttr>() ||
      D->getType()->isCUDADeviceBuiltinSurfaceType() ||
      D->getType()->isCUDADeviceBuiltinTextureType()) {
    Linkage = llvm::GlobalValue::InternalLinkage;
  }
}

void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
                                            llvm::GlobalVariable &GV) {
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) {
    // Shadow variables and their properties must be registered with the CUDA
    // runtime. Skip extern global variables, which will be registered in
    // the TU where they are defined.
    //
    // Don't register a C++17 inline variable. The local symbol can be
    // discarded and referencing a discarded local symbol from outside the
    // comdat (__cuda_register_globals) is disallowed by the ELF spec.
    //
    // HIP managed variables need to be always recorded in device and host
    // compilations for transformation.
    //
    // HIP managed variables and variables in CUDADeviceVarODRUsedByHost are
    // added to llvm.compiler.used, therefore they are safe to be registered.
    if ((!D->hasExternalStorage() && !D->isInline()) ||
        CGM.getContext().CUDADeviceVarODRUsedByHost.contains(D) ||
        D->hasAttr<HIPManagedAttr>()) {
      registerDeviceVar(D, GV, !D->hasDefinition(),
                        D->hasAttr<CUDAConstantAttr>());
    }
  } else if (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
             D->getType()->isCUDADeviceBuiltinTextureType()) {
    // Builtin surfaces and textures and their template arguments are
    // also registered with the CUDA runtime.
    const auto *TD = cast<ClassTemplateSpecializationDecl>(
        D->getType()->castAs<RecordType>()->getDecl());
    const TemplateArgumentList &Args = TD->getTemplateArgs();
    if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
      assert(Args.size() == 2 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin surface type.");
      auto SurfType = Args[1].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceSurf(D, GV, !D->hasDefinition(), SurfType.getSExtValue());
    } else {
      assert(Args.size() == 3 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin texture type.");
      auto TexType = Args[1].getAsIntegral();
      auto Normalized = Args[2].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceTex(D, GV, !D->hasDefinition(), TexType.getSExtValue(),
                          Normalized.getZExtValue());
    }
  }
}

// Transform managed variables to pointers to managed variables in device code.
// Each use of the original managed variable is replaced by a load from the
// transformed managed variable. The transformed managed variable contains
// the address of managed memory which will be allocated by the runtime.
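//
// Illustratively, for `__managed__ int x;` the payload the compiler emits
// keeps the name `x.managed`, while a new pointer-typed global takes over the
// name `x`; at registration time the runtime allocates the managed memory and
// stores its address into `x`, so each former use of `x` loads that pointer
// first (see replaceManagedVar above).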
void CGNVCUDARuntime::transformManagedVars() {
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
        Info.Flags.isManaged()) {
      auto *ManagedVar = new llvm::GlobalVariable(
          CGM.getModule(), Var->getType(),
          /*isConstant=*/false, Var->getLinkage(),
          /*Init=*/Var->isDeclaration()
              ? nullptr
              : llvm::ConstantPointerNull::get(Var->getType()),
          /*Name=*/"", /*InsertBefore=*/nullptr,
          llvm::GlobalVariable::NotThreadLocal,
          CGM.getContext().getTargetAddressSpace(CGM.getLangOpts().CUDAIsDevice
                                                     ? LangAS::cuda_device
                                                     : LangAS::Default));
      ManagedVar->setDSOLocal(Var->isDSOLocal());
      ManagedVar->setVisibility(Var->getVisibility());
      ManagedVar->setExternallyInitialized(true);
      replaceManagedVar(Var, ManagedVar);
      ManagedVar->takeName(Var);
      Var->setName(Twine(ManagedVar->getName()) + ".managed");
      // Keep managed variables even if they are not used in device code since
      // they need to be allocated by the runtime.
      if (CGM.getLangOpts().CUDAIsDevice && !Var->isDeclaration()) {
        assert(!ManagedVar->isDeclaration());
        CGM.addCompilerUsedGlobal(Var);
        CGM.addCompilerUsedGlobal(ManagedVar);
      }
    }
  }
}

// Creates offloading entries for all the kernels and globals that must be
// registered. The linker will provide a pointer to this section so we can
// register the symbols with the linked device image.
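// (On ELF targets, for instance, a runtime can walk this array between the
// linker-synthesized __start_<section> and __stop_<section> symbols; that is
// an illustration of the mechanism, not logic implemented in this file.)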
void CGNVCUDARuntime::createOffloadingEntries() {
  StringRef Section = CGM.getLangOpts().HIP ? "hip_offloading_entries"
                                            : "cuda_offloading_entries";
  llvm::Module &M = CGM.getModule();
  for (KernelInfo &I : EmittedKernels)
    llvm::offloading::emitOffloadingEntry(
        M, KernelHandles[I.Kernel->getName()],
        getDeviceSideName(cast<NamedDecl>(I.D)), /*Flags=*/0, /*Data=*/0,
        llvm::offloading::OffloadGlobalEntry, Section);

  for (VarInfo &I : DeviceVars) {
    uint64_t VarSize =
        CGM.getDataLayout().getTypeAllocSize(I.Var->getValueType());
    int32_t Flags =
        (I.Flags.isExtern()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalExtern)
             : 0) |
        (I.Flags.isConstant()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalConstant)
             : 0) |
        (I.Flags.isNormalized()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalNormalized)
             : 0);
    if (I.Flags.getKind() == DeviceVarFlags::Variable) {
      llvm::offloading::emitOffloadingEntry(
          M, I.Var, getDeviceSideName(I.D), VarSize,
          (I.Flags.isManaged() ? llvm::offloading::OffloadGlobalManagedEntry
                               : llvm::offloading::OffloadGlobalEntry) |
              Flags,
          /*Data=*/0, Section);
    } else if (I.Flags.getKind() == DeviceVarFlags::Surface) {
      llvm::offloading::emitOffloadingEntry(
          M, I.Var, getDeviceSideName(I.D), VarSize,
          llvm::offloading::OffloadGlobalSurfaceEntry | Flags,
          I.Flags.getSurfTexType(), Section);
    } else if (I.Flags.getKind() == DeviceVarFlags::Texture) {
      llvm::offloading::emitOffloadingEntry(
          M, I.Var, getDeviceSideName(I.D), VarSize,
          llvm::offloading::OffloadGlobalTextureEntry | Flags,
          I.Flags.getSurfTexType(), Section);
    }
  }
}

// Returns the module constructor to be added.
llvm::Function *CGNVCUDARuntime::finalizeModule() {
  transformManagedVars();
  if (CGM.getLangOpts().CUDAIsDevice) {
    // Mark ODR-used device variables as compiler used to prevent them from
    // being eliminated by optimization. This is necessary for device variables
    // ODR-used by host functions. Sema correctly marks them as ODR-used no
    // matter whether they are ODR-used by device or host functions.
    //
    // We do not need to do this if the variable has the used attribute since
    // it has already been added.
    //
    // Static device variables have been externalized at this point, therefore
    // variables with LLVM private or internal linkage need not be added.
    for (auto &&Info : DeviceVars) {
      auto Kind = Info.Flags.getKind();
      if (!Info.Var->isDeclaration() &&
          !llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) &&
          (Kind == DeviceVarFlags::Variable ||
           Kind == DeviceVarFlags::Surface ||
           Kind == DeviceVarFlags::Texture) &&
          Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) {
        CGM.addCompilerUsedGlobal(Info.Var);
      }
    }
    return nullptr;
  }
  if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
    createOffloadingEntries();
  else
    return makeModuleCtorFunction();

  return nullptr;
}

llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
                                                    GlobalDecl GD) {
  auto Loc = KernelHandles.find(F->getName());
  if (Loc != KernelHandles.end()) {
    auto OldHandle = Loc->second;
    if (KernelStubs[OldHandle] == F)
      return OldHandle;

    // We've found the function name, but F itself has changed, so we need to
    // update the references.
    if (CGM.getLangOpts().HIP) {
      // For HIP compilation the handle itself does not change, so we only need
      // to update the Stub value.
      KernelStubs[OldHandle] = F;
      return OldHandle;
    }
    // For non-HIP compilation, erase the old Stub and fall through to creating
    // new entries.
    KernelStubs.erase(OldHandle);
  }

  if (!CGM.getLangOpts().HIP) {
    KernelHandles[F->getName()] = F;
    KernelStubs[F] = F;
    return F;
  }

  auto *Var = new llvm::GlobalVariable(
      TheModule, F->getType(), /*isConstant=*/true, F->getLinkage(),
      /*Initializer=*/nullptr,
      CGM.getMangledName(
          GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel)));
  Var->setAlignment(CGM.getPointerAlign().getAsAlign());
  Var->setDSOLocal(F->isDSOLocal());
  Var->setVisibility(F->getVisibility());
  auto *FD = cast<FunctionDecl>(GD.getDecl());
  auto *FT = FD->getPrimaryTemplate();
  if (!FT || FT->isThisDeclarationADefinition())
    CGM.maybeSetTrivialComdat(*FD, *Var);
  KernelHandles[F->getName()] = Var;
  KernelStubs[Var] = F;
  return Var;
}