CoCalc -- CGCUDANV.cpp

GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/clang/lib/CodeGen/CGCUDANV.cpp
³⁵²³³ views
1
//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// This provides a class for CUDA code generation targeting the NVIDIA CUDA
10
// runtime library.
11
//
12
//===----------------------------------------------------------------------===//
13

14
#include "CGCUDARuntime.h"
15
#include "CGCXXABI.h"
16
#include "CodeGenFunction.h"
17
#include "CodeGenModule.h"
18
#include "clang/AST/Decl.h"
19
#include "clang/Basic/Cuda.h"
20
#include "clang/CodeGen/CodeGenABITypes.h"
21
#include "clang/CodeGen/ConstantInitBuilder.h"
22
#include "llvm/Frontend/Offloading/Utility.h"
23
#include "llvm/IR/BasicBlock.h"
24
#include "llvm/IR/Constants.h"
25
#include "llvm/IR/DerivedTypes.h"
26
#include "llvm/IR/ReplaceConstant.h"
27
#include "llvm/Support/Format.h"
28
#include "llvm/Support/VirtualFileSystem.h"
29

30
using namespace clang;
31
using namespace CodeGen;
32

33
namespace {
34
constexpr unsigned CudaFatMagic = 0x466243b1;
35
constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"
36

37
class CGNVCUDARuntime : public CGCUDARuntime {
38

39
private:
40
  llvm::IntegerType *IntTy, *SizeTy;
41
  llvm::Type *VoidTy;
42
  llvm::PointerType *PtrTy;
43

44
  /// Convenience reference to LLVM Context
45
  llvm::LLVMContext &Context;
46
  /// Convenience reference to the current module
47
  llvm::Module &TheModule;
48
  /// Keeps track of kernel launch stubs and handles emitted in this module
49
  struct KernelInfo {
50
    llvm::Function *Kernel; // stub function to help launch kernel
51
    const Decl *D;
52
  };
53
  llvm::SmallVector<KernelInfo, 16> EmittedKernels;
54
  // Map a kernel mangled name to a symbol for identifying kernel in host code
55
  // For CUDA, the symbol for identifying the kernel is the same as the device
56
  // stub function. For HIP, they are different.
57
  llvm::DenseMap<StringRef, llvm::GlobalValue *> KernelHandles;
58
  // Map a kernel handle to the kernel stub.
59
  llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
60
  struct VarInfo {
61
    llvm::GlobalVariable *Var;
62
    const VarDecl *D;
63
    DeviceVarFlags Flags;
64
  };
65
  llvm::SmallVector<VarInfo, 16> DeviceVars;
66
  /// Keeps track of variable containing handle of GPU binary. Populated by
67
  /// ModuleCtorFunction() and used to create corresponding cleanup calls in
68
  /// ModuleDtorFunction()
69
  llvm::GlobalVariable *GpuBinaryHandle = nullptr;
70
  /// Whether we generate relocatable device code.
71
  bool RelocatableDeviceCode;
72
  /// Mangle context for device.
73
  std::unique_ptr<MangleContext> DeviceMC;
74

75
  llvm::FunctionCallee getSetupArgumentFn() const;
76
  llvm::FunctionCallee getLaunchFn() const;
77

78
  llvm::FunctionType *getRegisterGlobalsFnTy() const;
79
  llvm::FunctionType *getCallbackFnTy() const;
80
  llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
81
  std::string addPrefixToName(StringRef FuncName) const;
82
  std::string addUnderscoredPrefixToName(StringRef FuncName) const;
83

84
  /// Creates a function to register all kernel stubs generated in this module.
85
  llvm::Function *makeRegisterGlobalsFn();
86

87
  /// Helper function that generates a constant string and returns a pointer to
88
  /// the start of the string.  The result of this function can be used anywhere
89
  /// where the C code specifies const char*.
90
  llvm::Constant *makeConstantString(const std::string &Str,
91
                                     const std::string &Name = "") {
92
    return CGM.GetAddrOfConstantCString(Str, Name.c_str()).getPointer();
93
  }
94

95
  /// Helper function which generates an initialized constant array from Str,
96
  /// and optionally sets section name and alignment. AddNull specifies whether
97
  /// the array should nave NUL termination.
98
  llvm::Constant *makeConstantArray(StringRef Str,
99
                                    StringRef Name = "",
100
                                    StringRef SectionName = "",
101
                                    unsigned Alignment = 0,
102
                                    bool AddNull = false) {
103
    llvm::Constant *Value =
104
        llvm::ConstantDataArray::getString(Context, Str, AddNull);
105
    auto *GV = new llvm::GlobalVariable(
106
        TheModule, Value->getType(), /*isConstant=*/true,
107
        llvm::GlobalValue::PrivateLinkage, Value, Name);
108
    if (!SectionName.empty()) {
109
      GV->setSection(SectionName);
110
      // Mark the address as used which make sure that this section isn't
111
      // merged and we will really have it in the object file.
112
      GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
113
    }
114
    if (Alignment)
115
      GV->setAlignment(llvm::Align(Alignment));
116
    return GV;
117
  }
118

119
  /// Helper function that generates an empty dummy function returning void.
120
  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
121
    assert(FnTy->getReturnType()->isVoidTy() &&
122
           "Can only generate dummy functions returning void!");
123
    llvm::Function *DummyFunc = llvm::Function::Create(
124
        FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);
125

126
    llvm::BasicBlock *DummyBlock =
127
        llvm::BasicBlock::Create(Context, "", DummyFunc);
128
    CGBuilderTy FuncBuilder(CGM, Context);
129
    FuncBuilder.SetInsertPoint(DummyBlock);
130
    FuncBuilder.CreateRetVoid();
131

132
    return DummyFunc;
133
  }
134

135
  void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
136
  void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
137
  std::string getDeviceSideName(const NamedDecl *ND) override;
138

139
  void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
140
                         bool Extern, bool Constant) {
141
    DeviceVars.push_back({&Var,
142
                          VD,
143
                          {DeviceVarFlags::Variable, Extern, Constant,
144
                           VD->hasAttr<HIPManagedAttr>(),
145
                           /*Normalized*/ false, 0}});
146
  }
147
  void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
148
                          bool Extern, int Type) {
149
    DeviceVars.push_back({&Var,
150
                          VD,
151
                          {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
152
                           /*Managed*/ false,
153
                           /*Normalized*/ false, Type}});
154
  }
155
  void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
156
                         bool Extern, int Type, bool Normalized) {
157
    DeviceVars.push_back({&Var,
158
                          VD,
159
                          {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
160
                           /*Managed*/ false, Normalized, Type}});
161
  }
162

163
  /// Creates module constructor function
164
  llvm::Function *makeModuleCtorFunction();
165
  /// Creates module destructor function
166
  llvm::Function *makeModuleDtorFunction();
167
  /// Transform managed variables for device compilation.
168
  void transformManagedVars();
169
  /// Create offloading entries to register globals in RDC mode.
170
  void createOffloadingEntries();
171

172
public:
173
  CGNVCUDARuntime(CodeGenModule &CGM);
174

175
  llvm::GlobalValue *getKernelHandle(llvm::Function *F, GlobalDecl GD) override;
176
  llvm::Function *getKernelStub(llvm::GlobalValue *Handle) override {
177
    auto Loc = KernelStubs.find(Handle);
178
    assert(Loc != KernelStubs.end());
179
    return Loc->second;
180
  }
181
  void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
182
  void handleVarRegistration(const VarDecl *VD,
183
                             llvm::GlobalVariable &Var) override;
184
  void
185
  internalizeDeviceSideVar(const VarDecl *D,
186
                           llvm::GlobalValue::LinkageTypes &Linkage) override;
187

188
  llvm::Function *finalizeModule() override;
189
};
190

191
} // end anonymous namespace
192

193
std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
194
  if (CGM.getLangOpts().HIP)
195
    return ((Twine("hip") + Twine(FuncName)).str());
196
  return ((Twine("cuda") + Twine(FuncName)).str());
197
}
198
std::string
199
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
200
  if (CGM.getLangOpts().HIP)
201
    return ((Twine("__hip") + Twine(FuncName)).str());
202
  return ((Twine("__cuda") + Twine(FuncName)).str());
203
}
204

205
static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) {
206
  // If the host and device have different C++ ABIs, mark it as the device
207
  // mangle context so that the mangling needs to retrieve the additional
208
  // device lambda mangling number instead of the regular host one.
209
  if (CGM.getContext().getAuxTargetInfo() &&
210
      CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
211
      CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) {
212
    return std::unique_ptr<MangleContext>(
213
        CGM.getContext().createDeviceMangleContext(
214
            *CGM.getContext().getAuxTargetInfo()));
215
  }
216

217
  return std::unique_ptr<MangleContext>(CGM.getContext().createMangleContext(
218
      CGM.getContext().getAuxTargetInfo()));
219
}
220

221
CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
222
    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
223
      TheModule(CGM.getModule()),
224
      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
225
      DeviceMC(InitDeviceMC(CGM)) {
226
  IntTy = CGM.IntTy;
227
  SizeTy = CGM.SizeTy;
228
  VoidTy = CGM.VoidTy;
229
  PtrTy = CGM.UnqualPtrTy;
230
}
231

232
llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
233
  // cudaError_t cudaSetupArgument(void *, size_t, size_t)
234
  llvm::Type *Params[] = {PtrTy, SizeTy, SizeTy};
235
  return CGM.CreateRuntimeFunction(
236
      llvm::FunctionType::get(IntTy, Params, false),
237
      addPrefixToName("SetupArgument"));
238
}
239

240
llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
241
  if (CGM.getLangOpts().HIP) {
242
    // hipError_t hipLaunchByPtr(char *);
243
    return CGM.CreateRuntimeFunction(
244
        llvm::FunctionType::get(IntTy, PtrTy, false), "hipLaunchByPtr");
245
  }
246
  // cudaError_t cudaLaunch(char *);
247
  return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy, PtrTy, false),
248
                                   "cudaLaunch");
249
}
250

251
llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
252
  return llvm::FunctionType::get(VoidTy, PtrTy, false);
253
}
254

255
llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
256
  return llvm::FunctionType::get(VoidTy, PtrTy, false);
257
}
258

259
llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
260
  llvm::Type *Params[] = {llvm::PointerType::getUnqual(Context), PtrTy, PtrTy,
261
                          llvm::PointerType::getUnqual(Context)};
262
  return llvm::FunctionType::get(VoidTy, Params, false);
263
}
264

265
std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
266
  GlobalDecl GD;
267
  // D could be either a kernel or a variable.
268
  if (auto *FD = dyn_cast<FunctionDecl>(ND))
269
    GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
270
  else
271
    GD = GlobalDecl(ND);
272
  std::string DeviceSideName;
273
  MangleContext *MC;
274
  if (CGM.getLangOpts().CUDAIsDevice)
275
    MC = &CGM.getCXXABI().getMangleContext();
276
  else
277
    MC = DeviceMC.get();
278
  if (MC->shouldMangleDeclName(ND)) {
279
    SmallString<256> Buffer;
280
    llvm::raw_svector_ostream Out(Buffer);
281
    MC->mangleName(GD, Out);
282
    DeviceSideName = std::string(Out.str());
283
  } else
284
    DeviceSideName = std::string(ND->getIdentifier()->getName());
285

286
  // Make unique name for device side static file-scope variable for HIP.
287
  if (CGM.getContext().shouldExternalize(ND) &&
288
      CGM.getLangOpts().GPURelocatableDeviceCode) {
289
    SmallString<256> Buffer;
290
    llvm::raw_svector_ostream Out(Buffer);
291
    Out << DeviceSideName;
292
    CGM.printPostfixForExternalizedDecl(Out, ND);
293
    DeviceSideName = std::string(Out.str());
294
  }
295
  return DeviceSideName;
296
}
297

298
void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
299
                                     FunctionArgList &Args) {
300
  EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
301
  if (auto *GV =
302
          dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn->getName()])) {
303
    GV->setLinkage(CGF.CurFn->getLinkage());
304
    GV->setInitializer(CGF.CurFn);
305
  }
306
  if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
307
                         CudaFeature::CUDA_USES_NEW_LAUNCH) ||
308
      (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
309
    emitDeviceStubBodyNew(CGF, Args);
310
  else
311
    emitDeviceStubBodyLegacy(CGF, Args);
312
}
313

314
// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
315
// array and kernels are launched using cudaLaunchKernel().
316
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
317
                                            FunctionArgList &Args) {
318
  // Build the shadow stack entry at the very start of the function.
319

320
  // Calculate amount of space we will need for all arguments.  If we have no
321
  // args, allocate a single pointer so we still have a valid pointer to the
322
  // argument array that we can pass to runtime, even if it will be unused.
323
  Address KernelArgs = CGF.CreateTempAlloca(
324
      PtrTy, CharUnits::fromQuantity(16), "kernel_args",
325
      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
326
  // Store pointers to the arguments in a locally allocated launch_args.
327
  for (unsigned i = 0; i < Args.size(); ++i) {
328
    llvm::Value *VarPtr = CGF.GetAddrOfLocalVar(Args[i]).emitRawPointer(CGF);
329
    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, PtrTy);
330
    CGF.Builder.CreateDefaultAlignedStore(
331
        VoidVarPtr, CGF.Builder.CreateConstGEP1_32(
332
                        PtrTy, KernelArgs.emitRawPointer(CGF), i));
333
  }
334

335
  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
336

337
  // Lookup cudaLaunchKernel/hipLaunchKernel function.
338
  // HIP kernel launching API name depends on -fgpu-default-stream option. For
339
  // the default value 'legacy', it is hipLaunchKernel. For 'per-thread',
340
  // it is hipLaunchKernel_spt.
341
  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
342
  //                              void **args, size_t sharedMem,
343
  //                              cudaStream_t stream);
344
  // hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim,
345
  //                                  dim3 blockDim, void **args,
346
  //                                  size_t sharedMem, hipStream_t stream);
347
  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
348
  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
349
  std::string KernelLaunchAPI = "LaunchKernel";
350
  if (CGF.getLangOpts().GPUDefaultStream ==
351
      LangOptions::GPUDefaultStreamKind::PerThread) {
352
    if (CGF.getLangOpts().HIP)
353
      KernelLaunchAPI = KernelLaunchAPI + "_spt";
354
    else if (CGF.getLangOpts().CUDA)
355
      KernelLaunchAPI = KernelLaunchAPI + "_ptsz";
356
  }
357
  auto LaunchKernelName = addPrefixToName(KernelLaunchAPI);
358
  const IdentifierInfo &cudaLaunchKernelII =
359
      CGM.getContext().Idents.get(LaunchKernelName);
360
  FunctionDecl *cudaLaunchKernelFD = nullptr;
361
  for (auto *Result : DC->lookup(&cudaLaunchKernelII)) {
362
    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
363
      cudaLaunchKernelFD = FD;
364
  }
365

366
  if (cudaLaunchKernelFD == nullptr) {
367
    CGM.Error(CGF.CurFuncDecl->getLocation(),
368
              "Can't find declaration for " + LaunchKernelName);
369
    return;
370
  }
371
  // Create temporary dim3 grid_dim, block_dim.
372
  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
373
  QualType Dim3Ty = GridDimParam->getType();
374
  Address GridDim =
375
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
376
  Address BlockDim =
377
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
378
  Address ShmemSize =
379
      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
380
  Address Stream = CGF.CreateTempAlloca(PtrTy, CGM.getPointerAlign(), "stream");
381
  llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
382
      llvm::FunctionType::get(IntTy,
383
                              {/*gridDim=*/GridDim.getType(),
384
                               /*blockDim=*/BlockDim.getType(),
385
                               /*ShmemSize=*/ShmemSize.getType(),
386
                               /*Stream=*/Stream.getType()},
387
                              /*isVarArg=*/false),
388
      addUnderscoredPrefixToName("PopCallConfiguration"));
389

390
  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, {GridDim.emitRawPointer(CGF),
391
                                                BlockDim.emitRawPointer(CGF),
392
                                                ShmemSize.emitRawPointer(CGF),
393
                                                Stream.emitRawPointer(CGF)});
394

395
  // Emit the call to cudaLaunch
396
  llvm::Value *Kernel =
397
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn->getName()], PtrTy);
398
  CallArgList LaunchKernelArgs;
399
  LaunchKernelArgs.add(RValue::get(Kernel),
400
                       cudaLaunchKernelFD->getParamDecl(0)->getType());
401
  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
402
  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
403
  LaunchKernelArgs.add(RValue::get(KernelArgs, CGF),
404
                       cudaLaunchKernelFD->getParamDecl(3)->getType());
405
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
406
                       cudaLaunchKernelFD->getParamDecl(4)->getType());
407
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
408
                       cudaLaunchKernelFD->getParamDecl(5)->getType());
409

410
  QualType QT = cudaLaunchKernelFD->getType();
411
  QualType CQT = QT.getCanonicalType();
412
  llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
413
  llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);
414

415
  const CGFunctionInfo &FI =
416
      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
417
  llvm::FunctionCallee cudaLaunchKernelFn =
418
      CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
419
  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
420
               LaunchKernelArgs);
421

422
  // To prevent CUDA device stub functions from being merged by ICF in MSVC
423
  // environment, create an unique global variable for each kernel and write to
424
  // the variable in the device stub.
425
  if (CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
426
      !CGF.getLangOpts().HIP) {
427
    llvm::Function *KernelFunction = llvm::cast<llvm::Function>(Kernel);
428
    std::string GlobalVarName = (KernelFunction->getName() + ".id").str();
429

430
    llvm::GlobalVariable *HandleVar =
431
        CGM.getModule().getNamedGlobal(GlobalVarName);
432
    if (!HandleVar) {
433
      HandleVar = new llvm::GlobalVariable(
434
          CGM.getModule(), CGM.Int8Ty,
435
          /*Constant=*/false, KernelFunction->getLinkage(),
436
          llvm::ConstantInt::get(CGM.Int8Ty, 0), GlobalVarName);
437
      HandleVar->setDSOLocal(KernelFunction->isDSOLocal());
438
      HandleVar->setVisibility(KernelFunction->getVisibility());
439
      if (KernelFunction->hasComdat())
440
        HandleVar->setComdat(CGM.getModule().getOrInsertComdat(GlobalVarName));
441
    }
442

443
    CGF.Builder.CreateAlignedStore(llvm::ConstantInt::get(CGM.Int8Ty, 1),
444
                                   HandleVar, CharUnits::One(),
445
                                   /*IsVolatile=*/true);
446
  }
447

448
  CGF.EmitBranch(EndBlock);
449

450
  CGF.EmitBlock(EndBlock);
451
}
452

453
void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
454
                                               FunctionArgList &Args) {
455
  // Emit a call to cudaSetupArgument for each arg in Args.
456
  llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
457
  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
458
  CharUnits Offset = CharUnits::Zero();
459
  for (const VarDecl *A : Args) {
460
    auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
461
    Offset = Offset.alignTo(TInfo.Align);
462
    llvm::Value *Args[] = {
463
        CGF.Builder.CreatePointerCast(
464
            CGF.GetAddrOfLocalVar(A).emitRawPointer(CGF), PtrTy),
465
        llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
466
        llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
467
    };
468
    llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
469
    llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
470
    llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
471
    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
472
    CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
473
    CGF.EmitBlock(NextBlock);
474
    Offset += TInfo.Width;
475
  }
476

477
  // Emit the call to cudaLaunch
478
  llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
479
  llvm::Value *Arg =
480
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn->getName()], PtrTy);
481
  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
482
  CGF.EmitBranch(EndBlock);
483

484
  CGF.EmitBlock(EndBlock);
485
}
486

487
// Replace the original variable Var with the address loaded from variable
488
// ManagedVar populated by HIP runtime.
489
static void replaceManagedVar(llvm::GlobalVariable *Var,
490
                              llvm::GlobalVariable *ManagedVar) {
491
  SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
492
  for (auto &&VarUse : Var->uses()) {
493
    WorkList.push_back({VarUse.getUser()});
494
  }
495
  while (!WorkList.empty()) {
496
    auto &&WorkItem = WorkList.pop_back_val();
497
    auto *U = WorkItem.back();
498
    if (isa<llvm::ConstantExpr>(U)) {
499
      for (auto &&UU : U->uses()) {
500
        WorkItem.push_back(UU.getUser());
501
        WorkList.push_back(WorkItem);
502
        WorkItem.pop_back();
503
      }
504
      continue;
505
    }
506
    if (auto *I = dyn_cast<llvm::Instruction>(U)) {
507
      llvm::Value *OldV = Var;
508
      llvm::Instruction *NewV =
509
          new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false,
510
                             llvm::Align(Var->getAlignment()), I);
511
      WorkItem.pop_back();
512
      // Replace constant expressions directly or indirectly using the managed
513
      // variable with instructions.
514
      for (auto &&Op : WorkItem) {
515
        auto *CE = cast<llvm::ConstantExpr>(Op);
516
        auto *NewInst = CE->getAsInstruction();
517
        NewInst->insertBefore(*I->getParent(), I->getIterator());
518
        NewInst->replaceUsesOfWith(OldV, NewV);
519
        OldV = CE;
520
        NewV = NewInst;
521
      }
522
      I->replaceUsesOfWith(OldV, NewV);
523
    } else {
524
      llvm_unreachable("Invalid use of managed variable");
525
    }
526
  }
527
}
528

529
/// Creates a function that sets up state on the host side for CUDA objects that
530
/// have a presence on both the host and device sides. Specifically, registers
531
/// the host side of kernel functions and device global variables with the CUDA
532
/// runtime.
533
/// \code
534
/// void __cuda_register_globals(void** GpuBinaryHandle) {
535
///    __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
536
///    ...
537
///    __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
538
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
539
///    ...
540
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
541
/// }
542
/// \endcode
543
llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
544
  // No need to register anything
545
  if (EmittedKernels.empty() && DeviceVars.empty())
546
    return nullptr;
547

548
  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
549
      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
550
      addUnderscoredPrefixToName("_register_globals"), &TheModule);
551
  llvm::BasicBlock *EntryBB =
552
      llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
553
  CGBuilderTy Builder(CGM, Context);
554
  Builder.SetInsertPoint(EntryBB);
555

556
  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
557
  //                             int, uint3*, uint3*, dim3*, dim3*, int*)
558
  llvm::Type *RegisterFuncParams[] = {
559
      PtrTy, PtrTy, PtrTy, PtrTy, IntTy,
560
      PtrTy, PtrTy, PtrTy, PtrTy, llvm::PointerType::getUnqual(Context)};
561
  llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
562
      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
563
      addUnderscoredPrefixToName("RegisterFunction"));
564

565
  // Extract GpuBinaryHandle passed as the first argument passed to
566
  // __cuda_register_globals() and generate __cudaRegisterFunction() call for
567
  // each emitted kernel.
568
  llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
569
  for (auto &&I : EmittedKernels) {
570
    llvm::Constant *KernelName =
571
        makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
572
    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(PtrTy);
573
    llvm::Value *Args[] = {
574
        &GpuBinaryHandlePtr,
575
        KernelHandles[I.Kernel->getName()],
576
        KernelName,
577
        KernelName,
578
        llvm::ConstantInt::get(IntTy, -1),
579
        NullPtr,
580
        NullPtr,
581
        NullPtr,
582
        NullPtr,
583
        llvm::ConstantPointerNull::get(llvm::PointerType::getUnqual(Context))};
584
    Builder.CreateCall(RegisterFunc, Args);
585
  }
586

587
  llvm::Type *VarSizeTy = IntTy;
588
  // For HIP or CUDA 9.0+, device variable size is type of `size_t`.
589
  if (CGM.getLangOpts().HIP ||
590
      ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
591
    VarSizeTy = SizeTy;
592

593
  // void __cudaRegisterVar(void **, char *, char *, const char *,
594
  //                        int, int, int, int)
595
  llvm::Type *RegisterVarParams[] = {PtrTy, PtrTy,     PtrTy, PtrTy,
596
                                     IntTy, VarSizeTy, IntTy, IntTy};
597
  llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
598
      llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
599
      addUnderscoredPrefixToName("RegisterVar"));
600
  // void __hipRegisterManagedVar(void **, char *, char *, const char *,
601
  //                              size_t, unsigned)
602
  llvm::Type *RegisterManagedVarParams[] = {PtrTy, PtrTy,     PtrTy,
603
                                            PtrTy, VarSizeTy, IntTy};
604
  llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
605
      llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
606
      addUnderscoredPrefixToName("RegisterManagedVar"));
607
  // void __cudaRegisterSurface(void **, const struct surfaceReference *,
608
  //                            const void **, const char *, int, int);
609
  llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
610
      llvm::FunctionType::get(
611
          VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy}, false),
612
      addUnderscoredPrefixToName("RegisterSurface"));
613
  // void __cudaRegisterTexture(void **, const struct textureReference *,
614
  //                            const void **, const char *, int, int, int)
615
  llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
616
      llvm::FunctionType::get(
617
          VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy, IntTy}, false),
618
      addUnderscoredPrefixToName("RegisterTexture"));
619
  for (auto &&Info : DeviceVars) {
620
    llvm::GlobalVariable *Var = Info.Var;
621
    assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
622
           "External variables should not show up here, except HIP managed "
623
           "variables");
624
    llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
625
    switch (Info.Flags.getKind()) {
626
    case DeviceVarFlags::Variable: {
627
      uint64_t VarSize =
628
          CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
629
      if (Info.Flags.isManaged()) {
630
        assert(Var->getName().ends_with(".managed") &&
631
               "HIP managed variables not transformed");
632
        auto *ManagedVar = CGM.getModule().getNamedGlobal(
633
            Var->getName().drop_back(StringRef(".managed").size()));
634
        llvm::Value *Args[] = {
635
            &GpuBinaryHandlePtr,
636
            ManagedVar,
637
            Var,
638
            VarName,
639
            llvm::ConstantInt::get(VarSizeTy, VarSize),
640
            llvm::ConstantInt::get(IntTy, Var->getAlignment())};
641
        if (!Var->isDeclaration())
642
          Builder.CreateCall(RegisterManagedVar, Args);
643
      } else {
644
        llvm::Value *Args[] = {
645
            &GpuBinaryHandlePtr,
646
            Var,
647
            VarName,
648
            VarName,
649
            llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
650
            llvm::ConstantInt::get(VarSizeTy, VarSize),
651
            llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
652
            llvm::ConstantInt::get(IntTy, 0)};
653
        Builder.CreateCall(RegisterVar, Args);
654
      }
655
      break;
656
    }
657
    case DeviceVarFlags::Surface:
658
      Builder.CreateCall(
659
          RegisterSurf,
660
          {&GpuBinaryHandlePtr, Var, VarName, VarName,
661
           llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
662
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
663
      break;
664
    case DeviceVarFlags::Texture:
665
      Builder.CreateCall(
666
          RegisterTex,
667
          {&GpuBinaryHandlePtr, Var, VarName, VarName,
668
           llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
669
           llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
670
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
671
      break;
672
    }
673
  }
674

675
  Builder.CreateRetVoid();
676
  return RegisterKernelsFunc;
677
}
678

679
/// Creates a global constructor function for the module:
680
///
681
/// For CUDA:
682
/// \code
683
/// void __cuda_module_ctor() {
684
///     Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
685
///     __cuda_register_globals(Handle);
686
/// }
687
/// \endcode
688
///
689
/// For HIP:
690
/// \code
691
/// void __hip_module_ctor() {
692
///     if (__hip_gpubin_handle == 0) {
693
///         __hip_gpubin_handle  = __hipRegisterFatBinary(GpuBinaryBlob);
694
///         __hip_register_globals(__hip_gpubin_handle);
695
///     }
696
/// }
697
/// \endcode
698
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
699
  bool IsHIP = CGM.getLangOpts().HIP;
700
  bool IsCUDA = CGM.getLangOpts().CUDA;
701
  // No need to generate ctors/dtors if there is no GPU binary.
702
  StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
703
  if (CudaGpuBinaryFileName.empty() && !IsHIP)
704
    return nullptr;
705
  if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
706
      DeviceVars.empty())
707
    return nullptr;
708

709
  // void __{cuda|hip}_register_globals(void* handle);
710
  llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
711
  // We always need a function to pass in as callback. Create a dummy
712
  // implementation if we don't need to register anything.
713
  if (RelocatableDeviceCode && !RegisterGlobalsFunc)
714
    RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());
715

716
  // void ** __{cuda|hip}RegisterFatBinary(void *);
717
  llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
718
      llvm::FunctionType::get(PtrTy, PtrTy, false),
719
      addUnderscoredPrefixToName("RegisterFatBinary"));
720
  // struct { int magic, int version, void * gpu_binary, void * dont_care };
721
  llvm::StructType *FatbinWrapperTy =
722
      llvm::StructType::get(IntTy, IntTy, PtrTy, PtrTy);
723

724
  // Register GPU binary with the CUDA runtime, store returned handle in a
725
  // global variable and save a reference in GpuBinaryHandle to be cleaned up
726
  // in destructor on exit. Then associate all known kernels with the GPU binary
727
  // handle so CUDA runtime can figure out what to call on the GPU side.
728
  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
729
  if (!CudaGpuBinaryFileName.empty()) {
730
    auto VFS = CGM.getFileSystem();
731
    auto CudaGpuBinaryOrErr =
732
        VFS->getBufferForFile(CudaGpuBinaryFileName, -1, false);
733
    if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
734
      CGM.getDiags().Report(diag::err_cannot_open_file)
735
          << CudaGpuBinaryFileName << EC.message();
736
      return nullptr;
737
    }
738
    CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
739
  }
740

741
  llvm::Function *ModuleCtorFunc = llvm::Function::Create(
742
      llvm::FunctionType::get(VoidTy, false),
743
      llvm::GlobalValue::InternalLinkage,
744
      addUnderscoredPrefixToName("_module_ctor"), &TheModule);
745
  llvm::BasicBlock *CtorEntryBB =
746
      llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
747
  CGBuilderTy CtorBuilder(CGM, Context);
748

749
  CtorBuilder.SetInsertPoint(CtorEntryBB);
750

751
  const char *FatbinConstantName;
752
  const char *FatbinSectionName;
753
  const char *ModuleIDSectionName;
754
  StringRef ModuleIDPrefix;
755
  llvm::Constant *FatBinStr;
756
  unsigned FatMagic;
757
  if (IsHIP) {
758
    FatbinConstantName = ".hip_fatbin";
759
    FatbinSectionName = ".hipFatBinSegment";
760

761
    ModuleIDSectionName = "__hip_module_id";
762
    ModuleIDPrefix = "__hip_";
763

764
    if (CudaGpuBinary) {
765
      // If fatbin is available from early finalization, create a string
766
      // literal containing the fat binary loaded from the given file.
767
      const unsigned HIPCodeObjectAlign = 4096;
768
      FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "",
769
                                    FatbinConstantName, HIPCodeObjectAlign);
770
    } else {
771
      // If fatbin is not available, create an external symbol
772
      // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
773
      // to contain the fat binary but will be populated somewhere else,
774
      // e.g. by lld through link script.
775
      FatBinStr = new llvm::GlobalVariable(
776
          CGM.getModule(), CGM.Int8Ty,
777
          /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
778
          "__hip_fatbin_" + CGM.getContext().getCUIDHash(), nullptr,
779
          llvm::GlobalVariable::NotThreadLocal);
780
      cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
781
    }
782

783
    FatMagic = HIPFatMagic;
784
  } else {
785
    if (RelocatableDeviceCode)
786
      FatbinConstantName = CGM.getTriple().isMacOSX()
787
                               ? "__NV_CUDA,__nv_relfatbin"
788
                               : "__nv_relfatbin";
789
    else
790
      FatbinConstantName =
791
          CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
792
    // NVIDIA's cuobjdump looks for fatbins in this section.
793
    FatbinSectionName =
794
        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
795

796
    ModuleIDSectionName = CGM.getTriple().isMacOSX()
797
                              ? "__NV_CUDA,__nv_module_id"
798
                              : "__nv_module_id";
799
    ModuleIDPrefix = "__nv_";
800

801
    // For CUDA, create a string literal containing the fat binary loaded from
802
    // the given file.
803
    FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "",
804
                                  FatbinConstantName, 8);
805
    FatMagic = CudaFatMagic;
806
  }
807

808
  // Create initialized wrapper structure that points to the loaded GPU binary
809
  ConstantInitBuilder Builder(CGM);
810
  auto Values = Builder.beginStruct(FatbinWrapperTy);
811
  // Fatbin wrapper magic.
812
  Values.addInt(IntTy, FatMagic);
813
  // Fatbin version.
814
  Values.addInt(IntTy, 1);
815
  // Data.
816
  Values.add(FatBinStr);
817
  // Unused in fatbin v1.
818
  Values.add(llvm::ConstantPointerNull::get(PtrTy));
819
  llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
820
      addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
821
      /*constant*/ true);
822
  FatbinWrapper->setSection(FatbinSectionName);
823

824
  // There is only one HIP fat binary per linked module, however there are
825
  // multiple constructor functions. Make sure the fat binary is registered
826
  // only once. The constructor functions are executed by the dynamic loader
827
  // before the program gains control. The dynamic loader cannot execute the
828
  // constructor functions concurrently since doing that would not guarantee
829
  // thread safety of the loaded program. Therefore we can assume sequential
830
  // execution of constructor functions here.
831
  if (IsHIP) {
832
    auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage
833
                                 : llvm::GlobalValue::ExternalLinkage;
834
    llvm::BasicBlock *IfBlock =
835
        llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
836
    llvm::BasicBlock *ExitBlock =
837
        llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
838
    // The name, size, and initialization pattern of this variable is part
839
    // of HIP ABI.
840
    GpuBinaryHandle = new llvm::GlobalVariable(
841
        TheModule, PtrTy, /*isConstant=*/false, Linkage,
842
        /*Initializer=*/
843
        CudaGpuBinary ? llvm::ConstantPointerNull::get(PtrTy) : nullptr,
844
        CudaGpuBinary
845
            ? "__hip_gpubin_handle"
846
            : "__hip_gpubin_handle_" + CGM.getContext().getCUIDHash());
847
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
848
    // Prevent the weak symbol in different shared libraries being merged.
849
    if (Linkage != llvm::GlobalValue::InternalLinkage)
850
      GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
851
    Address GpuBinaryAddr(
852
        GpuBinaryHandle, PtrTy,
853
        CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
854
    {
855
      auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
856
      llvm::Constant *Zero =
857
          llvm::Constant::getNullValue(HandleValue->getType());
858
      llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
859
      CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
860
    }
861
    {
862
      CtorBuilder.SetInsertPoint(IfBlock);
863
      // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
864
      llvm::CallInst *RegisterFatbinCall =
865
          CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
866
      CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
867
      CtorBuilder.CreateBr(ExitBlock);
868
    }
869
    {
870
      CtorBuilder.SetInsertPoint(ExitBlock);
871
      // Call __hip_register_globals(GpuBinaryHandle);
872
      if (RegisterGlobalsFunc) {
873
        auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
874
        CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
875
      }
876
    }
877
  } else if (!RelocatableDeviceCode) {
878
    // Register binary with CUDA runtime. This is substantially different in
879
    // default mode vs. separate compilation!
880
    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
881
    llvm::CallInst *RegisterFatbinCall =
882
        CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
883
    GpuBinaryHandle = new llvm::GlobalVariable(
884
        TheModule, PtrTy, false, llvm::GlobalValue::InternalLinkage,
885
        llvm::ConstantPointerNull::get(PtrTy), "__cuda_gpubin_handle");
886
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
887
    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
888
                                   CGM.getPointerAlign());
889

890
    // Call __cuda_register_globals(GpuBinaryHandle);
891
    if (RegisterGlobalsFunc)
892
      CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
893

894
    // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
895
    if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
896
                           CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
897
      // void __cudaRegisterFatBinaryEnd(void **);
898
      llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
899
          llvm::FunctionType::get(VoidTy, PtrTy, false),
900
          "__cudaRegisterFatBinaryEnd");
901
      CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
902
    }
903
  } else {
904
    // Generate a unique module ID.
905
    SmallString<64> ModuleID;
906
    llvm::raw_svector_ostream OS(ModuleID);
907
    OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
908
    llvm::Constant *ModuleIDConstant = makeConstantArray(
909
        std::string(ModuleID), "", ModuleIDSectionName, 32, /*AddNull=*/true);
910

911
    // Create an alias for the FatbinWrapper that nvcc will look for.
912
    llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
913
                              Twine("__fatbinwrap") + ModuleID, FatbinWrapper);
914

915
    // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
916
    // void *, void (*)(void **))
917
    SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
918
    RegisterLinkedBinaryName += ModuleID;
919
    llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
920
        getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);
921

922
    assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
923
    llvm::Value *Args[] = {RegisterGlobalsFunc, FatbinWrapper, ModuleIDConstant,
924
                           makeDummyFunction(getCallbackFnTy())};
925
    CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
926
  }
927

928
  // Create destructor and register it with atexit() the way NVCC does it. Doing
929
  // it during regular destructor phase worked in CUDA before 9.2 but results in
930
  // double-free in 9.2.
931
  if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
932
    // extern "C" int atexit(void (*f)(void));
933
    llvm::FunctionType *AtExitTy =
934
        llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
935
    llvm::FunctionCallee AtExitFunc =
936
        CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
937
                                  /*Local=*/true);
938
    CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
939
  }
940

941
  CtorBuilder.CreateRetVoid();
942
  return ModuleCtorFunc;
943
}
944

945
/// Creates a global destructor function that unregisters the GPU code blob
946
/// registered by constructor.
947
///
948
/// For CUDA:
949
/// \code
950
/// void __cuda_module_dtor() {
951
///     __cudaUnregisterFatBinary(Handle);
952
/// }
953
/// \endcode
954
///
955
/// For HIP:
956
/// \code
957
/// void __hip_module_dtor() {
958
///     if (__hip_gpubin_handle) {
959
///         __hipUnregisterFatBinary(__hip_gpubin_handle);
960
///         __hip_gpubin_handle = 0;
961
///     }
962
/// }
963
/// \endcode
964
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
965
  // No need for destructor if we don't have a handle to unregister.
966
  if (!GpuBinaryHandle)
967
    return nullptr;
968

969
  // void __cudaUnregisterFatBinary(void ** handle);
970
  llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
971
      llvm::FunctionType::get(VoidTy, PtrTy, false),
972
      addUnderscoredPrefixToName("UnregisterFatBinary"));
973

974
  llvm::Function *ModuleDtorFunc = llvm::Function::Create(
975
      llvm::FunctionType::get(VoidTy, false),
976
      llvm::GlobalValue::InternalLinkage,
977
      addUnderscoredPrefixToName("_module_dtor"), &TheModule);
978

979
  llvm::BasicBlock *DtorEntryBB =
980
      llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
981
  CGBuilderTy DtorBuilder(CGM, Context);
982
  DtorBuilder.SetInsertPoint(DtorEntryBB);
983

984
  Address GpuBinaryAddr(
985
      GpuBinaryHandle, GpuBinaryHandle->getValueType(),
986
      CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
987
  auto *HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
988
  // There is only one HIP fat binary per linked module, however there are
989
  // multiple destructor functions. Make sure the fat binary is unregistered
990
  // only once.
991
  if (CGM.getLangOpts().HIP) {
992
    llvm::BasicBlock *IfBlock =
993
        llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
994
    llvm::BasicBlock *ExitBlock =
995
        llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
996
    llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
997
    llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
998
    DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);
999

1000
    DtorBuilder.SetInsertPoint(IfBlock);
1001
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
1002
    DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
1003
    DtorBuilder.CreateBr(ExitBlock);
1004

1005
    DtorBuilder.SetInsertPoint(ExitBlock);
1006
  } else {
1007
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
1008
  }
1009
  DtorBuilder.CreateRetVoid();
1010
  return ModuleDtorFunc;
1011
}
1012

1013
CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
1014
  return new CGNVCUDARuntime(CGM);
1015
}
1016

1017
void CGNVCUDARuntime::internalizeDeviceSideVar(
1018
    const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) {
1019
  // For -fno-gpu-rdc, host-side shadows of external declarations of device-side
1020
  // global variables become internal definitions. These have to be internal in
1021
  // order to prevent name conflicts with global host variables with the same
1022
  // name in a different TUs.
1023
  //
1024
  // For -fgpu-rdc, the shadow variables should not be internalized because
1025
  // they may be accessed by different TU.
1026
  if (CGM.getLangOpts().GPURelocatableDeviceCode)
1027
    return;
1028

1029
  // __shared__ variables are odd. Shadows do get created, but
1030
  // they are not registered with the CUDA runtime, so they
1031
  // can't really be used to access their device-side
1032
  // counterparts. It's not clear yet whether it's nvcc's bug or
1033
  // a feature, but we've got to do the same for compatibility.
1034
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
1035
      D->hasAttr<CUDASharedAttr>() ||
1036
      D->getType()->isCUDADeviceBuiltinSurfaceType() ||
1037
      D->getType()->isCUDADeviceBuiltinTextureType()) {
1038
    Linkage = llvm::GlobalValue::InternalLinkage;
1039
  }
1040
}
1041

1042
void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
1043
                                            llvm::GlobalVariable &GV) {
1044
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) {
1045
    // Shadow variables and their properties must be registered with CUDA
1046
    // runtime. Skip Extern global variables, which will be registered in
1047
    // the TU where they are defined.
1048
    //
1049
    // Don't register a C++17 inline variable. The local symbol can be
1050
    // discarded and referencing a discarded local symbol from outside the
1051
    // comdat (__cuda_register_globals) is disallowed by the ELF spec.
1052
    //
1053
    // HIP managed variables need to be always recorded in device and host
1054
    // compilations for transformation.
1055
    //
1056
    // HIP managed variables and variables in CUDADeviceVarODRUsedByHost are
1057
    // added to llvm.compiler-used, therefore they are safe to be registered.
1058
    if ((!D->hasExternalStorage() && !D->isInline()) ||
1059
        CGM.getContext().CUDADeviceVarODRUsedByHost.contains(D) ||
1060
        D->hasAttr<HIPManagedAttr>()) {
1061
      registerDeviceVar(D, GV, !D->hasDefinition(),
1062
                        D->hasAttr<CUDAConstantAttr>());
1063
    }
1064
  } else if (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
1065
             D->getType()->isCUDADeviceBuiltinTextureType()) {
1066
    // Builtin surfaces and textures and their template arguments are
1067
    // also registered with CUDA runtime.
1068
    const auto *TD = cast<ClassTemplateSpecializationDecl>(
1069
        D->getType()->castAs<RecordType>()->getDecl());
1070
    const TemplateArgumentList &Args = TD->getTemplateArgs();
1071
    if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
1072
      assert(Args.size() == 2 &&
1073
             "Unexpected number of template arguments of CUDA device "
1074
             "builtin surface type.");
1075
      auto SurfType = Args[1].getAsIntegral();
1076
      if (!D->hasExternalStorage())
1077
        registerDeviceSurf(D, GV, !D->hasDefinition(), SurfType.getSExtValue());
1078
    } else {
1079
      assert(Args.size() == 3 &&
1080
             "Unexpected number of template arguments of CUDA device "
1081
             "builtin texture type.");
1082
      auto TexType = Args[1].getAsIntegral();
1083
      auto Normalized = Args[2].getAsIntegral();
1084
      if (!D->hasExternalStorage())
1085
        registerDeviceTex(D, GV, !D->hasDefinition(), TexType.getSExtValue(),
1086
                          Normalized.getZExtValue());
1087
    }
1088
  }
1089
}
1090

1091
// Transform managed variables to pointers to managed variables in device code.
1092
// Each use of the original managed variable is replaced by a load from the
1093
// transformed managed variable. The transformed managed variable contains
1094
// the address of managed memory which will be allocated by the runtime.
1095
void CGNVCUDARuntime::transformManagedVars() {
1096
  for (auto &&Info : DeviceVars) {
1097
    llvm::GlobalVariable *Var = Info.Var;
1098
    if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
1099
        Info.Flags.isManaged()) {
1100
      auto *ManagedVar = new llvm::GlobalVariable(
1101
          CGM.getModule(), Var->getType(),
1102
          /*isConstant=*/false, Var->getLinkage(),
1103
          /*Init=*/Var->isDeclaration()
1104
              ? nullptr
1105
              : llvm::ConstantPointerNull::get(Var->getType()),
1106
          /*Name=*/"", /*InsertBefore=*/nullptr,
1107
          llvm::GlobalVariable::NotThreadLocal,
1108
          CGM.getContext().getTargetAddressSpace(CGM.getLangOpts().CUDAIsDevice
1109
                                                     ? LangAS::cuda_device
1110
                                                     : LangAS::Default));
1111
      ManagedVar->setDSOLocal(Var->isDSOLocal());
1112
      ManagedVar->setVisibility(Var->getVisibility());
1113
      ManagedVar->setExternallyInitialized(true);
1114
      replaceManagedVar(Var, ManagedVar);
1115
      ManagedVar->takeName(Var);
1116
      Var->setName(Twine(ManagedVar->getName()) + ".managed");
1117
      // Keep managed variables even if they are not used in device code since
1118
      // they need to be allocated by the runtime.
1119
      if (CGM.getLangOpts().CUDAIsDevice && !Var->isDeclaration()) {
1120
        assert(!ManagedVar->isDeclaration());
1121
        CGM.addCompilerUsedGlobal(Var);
1122
        CGM.addCompilerUsedGlobal(ManagedVar);
1123
      }
1124
    }
1125
  }
1126
}
1127

1128
// Creates offloading entries for all the kernels and globals that must be
1129
// registered. The linker will provide a pointer to this section so we can
1130
// register the symbols with the linked device image.
1131
void CGNVCUDARuntime::createOffloadingEntries() {
1132
  StringRef Section = CGM.getLangOpts().HIP ? "hip_offloading_entries"
1133
                                            : "cuda_offloading_entries";
1134
  llvm::Module &M = CGM.getModule();
1135
  for (KernelInfo &I : EmittedKernels)
1136
    llvm::offloading::emitOffloadingEntry(
1137
        M, KernelHandles[I.Kernel->getName()],
1138
        getDeviceSideName(cast<NamedDecl>(I.D)), /*Flags=*/0, /*Data=*/0,
1139
        llvm::offloading::OffloadGlobalEntry, Section);
1140

1141
  for (VarInfo &I : DeviceVars) {
1142
    uint64_t VarSize =
1143
        CGM.getDataLayout().getTypeAllocSize(I.Var->getValueType());
1144
    int32_t Flags =
1145
        (I.Flags.isExtern()
1146
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalExtern)
1147
             : 0) |
1148
        (I.Flags.isConstant()
1149
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalConstant)
1150
             : 0) |
1151
        (I.Flags.isNormalized()
1152
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalNormalized)
1153
             : 0);
1154
    if (I.Flags.getKind() == DeviceVarFlags::Variable) {
1155
      llvm::offloading::emitOffloadingEntry(
1156
          M, I.Var, getDeviceSideName(I.D), VarSize,
1157
          (I.Flags.isManaged() ? llvm::offloading::OffloadGlobalManagedEntry
1158
                               : llvm::offloading::OffloadGlobalEntry) |
1159
              Flags,
1160
          /*Data=*/0, Section);
1161
    } else if (I.Flags.getKind() == DeviceVarFlags::Surface) {
1162
      llvm::offloading::emitOffloadingEntry(
1163
          M, I.Var, getDeviceSideName(I.D), VarSize,
1164
          llvm::offloading::OffloadGlobalSurfaceEntry | Flags,
1165
          I.Flags.getSurfTexType(), Section);
1166
    } else if (I.Flags.getKind() == DeviceVarFlags::Texture) {
1167
      llvm::offloading::emitOffloadingEntry(
1168
          M, I.Var, getDeviceSideName(I.D), VarSize,
1169
          llvm::offloading::OffloadGlobalTextureEntry | Flags,
1170
          I.Flags.getSurfTexType(), Section);
1171
    }
1172
  }
1173
}
1174

1175
// Returns module constructor to be added.
1176
llvm::Function *CGNVCUDARuntime::finalizeModule() {
1177
  transformManagedVars();
1178
  if (CGM.getLangOpts().CUDAIsDevice) {
1179
    // Mark ODR-used device variables as compiler used to prevent it from being
1180
    // eliminated by optimization. This is necessary for device variables
1181
    // ODR-used by host functions. Sema correctly marks them as ODR-used no
1182
    // matter whether they are ODR-used by device or host functions.
1183
    //
1184
    // We do not need to do this if the variable has used attribute since it
1185
    // has already been added.
1186
    //
1187
    // Static device variables have been externalized at this point, therefore
1188
    // variables with LLVM private or internal linkage need not be added.
1189
    for (auto &&Info : DeviceVars) {
1190
      auto Kind = Info.Flags.getKind();
1191
      if (!Info.Var->isDeclaration() &&
1192
          !llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) &&
1193
          (Kind == DeviceVarFlags::Variable ||
1194
           Kind == DeviceVarFlags::Surface ||
1195
           Kind == DeviceVarFlags::Texture) &&
1196
          Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) {
1197
        CGM.addCompilerUsedGlobal(Info.Var);
1198
      }
1199
    }
1200
    return nullptr;
1201
  }
1202
  if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
1203
    createOffloadingEntries();
1204
  else
1205
    return makeModuleCtorFunction();
1206

1207
  return nullptr;
1208
}
1209

1210
llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
1211
                                                    GlobalDecl GD) {
1212
  auto Loc = KernelHandles.find(F->getName());
1213
  if (Loc != KernelHandles.end()) {
1214
    auto OldHandle = Loc->second;
1215
    if (KernelStubs[OldHandle] == F)
1216
      return OldHandle;
1217

1218
    // We've found the function name, but F itself has changed, so we need to
1219
    // update the references.
1220
    if (CGM.getLangOpts().HIP) {
1221
      // For HIP compilation the handle itself does not change, so we only need
1222
      // to update the Stub value.
1223
      KernelStubs[OldHandle] = F;
1224
      return OldHandle;
1225
    }
1226
    // For non-HIP compilation, erase the old Stub and fall-through to creating
1227
    // new entries.
1228
    KernelStubs.erase(OldHandle);
1229
  }
1230

1231
  if (!CGM.getLangOpts().HIP) {
1232
    KernelHandles[F->getName()] = F;
1233
    KernelStubs[F] = F;
1234
    return F;
1235
  }
1236

1237
  auto *Var = new llvm::GlobalVariable(
1238
      TheModule, F->getType(), /*isConstant=*/true, F->getLinkage(),
1239
      /*Initializer=*/nullptr,
1240
      CGM.getMangledName(
1241
          GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel)));
1242
  Var->setAlignment(CGM.getPointerAlign().getAsAlign());
1243
  Var->setDSOLocal(F->isDSOLocal());
1244
  Var->setVisibility(F->getVisibility());
1245
  auto *FD = cast<FunctionDecl>(GD.getDecl());
1246
  auto *FT = FD->getPrimaryTemplate();
1247
  if (!FT || FT->isThisDeclarationADefinition())
1248
    CGM.maybeSetTrivialComdat(*FD, *Var);
1249
  KernelHandles[F->getName()] = Var;
1250
  KernelStubs[Var] = F;
1251
  return Var;
1252
}
1253

1254
Product

Resources

Company