clang  8.0.0svn
CGCUDANV.cpp
Go to the documentation of this file.
1 //===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This provides a class for CUDA code generation targeting the NVIDIA CUDA
11 // runtime library.
12 //
13 //===----------------------------------------------------------------------===//
14 
#include "CGCUDARuntime.h"
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "clang/AST/Decl.h"
#include "clang/CodeGen/ConstantInitBuilder.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/Format.h"
25 
26 using namespace clang;
27 using namespace CodeGen;
28 
29 namespace {
30 constexpr unsigned CudaFatMagic = 0x466243b1;
31 constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"
32 
33 class CGNVCUDARuntime : public CGCUDARuntime {
34 
35 private:
36  llvm::IntegerType *IntTy, *SizeTy;
37  llvm::Type *VoidTy;
38  llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;
39 
40  /// Convenience reference to LLVM Context
41  llvm::LLVMContext &Context;
42  /// Convenience reference to the current module
43  llvm::Module &TheModule;
44  /// Keeps track of kernel launch stubs emitted in this module
47  /// Keeps track of variable containing handle of GPU binary. Populated by
48  /// ModuleCtorFunction() and used to create corresponding cleanup calls in
49  /// ModuleDtorFunction()
50  llvm::GlobalVariable *GpuBinaryHandle = nullptr;
51  /// Whether we generate relocatable device code.
52  bool RelocatableDeviceCode;
53 
54  llvm::Constant *getSetupArgumentFn() const;
55  llvm::Constant *getLaunchFn() const;
56 
57  llvm::FunctionType *getRegisterGlobalsFnTy() const;
58  llvm::FunctionType *getCallbackFnTy() const;
59  llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
60  std::string addPrefixToName(StringRef FuncName) const;
61  std::string addUnderscoredPrefixToName(StringRef FuncName) const;
62 
63  /// Creates a function to register all kernel stubs generated in this module.
64  llvm::Function *makeRegisterGlobalsFn();
65 
66  /// Helper function that generates a constant string and returns a pointer to
67  /// the start of the string. The result of this function can be used anywhere
68  /// where the C code specifies const char*.
69  llvm::Constant *makeConstantString(const std::string &Str,
70  const std::string &Name = "",
71  const std::string &SectionName = "",
72  unsigned Alignment = 0) {
73  llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
74  llvm::ConstantInt::get(SizeTy, 0)};
75  auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
76  llvm::GlobalVariable *GV =
77  cast<llvm::GlobalVariable>(ConstStr.getPointer());
78  if (!SectionName.empty()) {
79  GV->setSection(SectionName);
80  // Mark the address as used which make sure that this section isn't
81  // merged and we will really have it in the object file.
82  GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
83  }
84  if (Alignment)
85  GV->setAlignment(Alignment);
86 
87  return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
88  ConstStr.getPointer(), Zeros);
89  }
90 
91  /// Helper function that generates an empty dummy function returning void.
92  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
93  assert(FnTy->getReturnType()->isVoidTy() &&
94  "Can only generate dummy functions returning void!");
95  llvm::Function *DummyFunc = llvm::Function::Create(
96  FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);
97 
98  llvm::BasicBlock *DummyBlock =
99  llvm::BasicBlock::Create(Context, "", DummyFunc);
100  CGBuilderTy FuncBuilder(CGM, Context);
101  FuncBuilder.SetInsertPoint(DummyBlock);
102  FuncBuilder.CreateRetVoid();
103 
104  return DummyFunc;
105  }
106 
107  void emitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args);
108 
109 public:
110  CGNVCUDARuntime(CodeGenModule &CGM);
111 
112  void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
113  void registerDeviceVar(llvm::GlobalVariable &Var, unsigned Flags) override {
114  DeviceVars.push_back(std::make_pair(&Var, Flags));
115  }
116 
117  /// Creates module constructor function
118  llvm::Function *makeModuleCtorFunction() override;
119  /// Creates module destructor function
120  llvm::Function *makeModuleDtorFunction() override;
121 };
122 
123 }
124 
125 std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
126  if (CGM.getLangOpts().HIP)
127  return ((Twine("hip") + Twine(FuncName)).str());
128  return ((Twine("cuda") + Twine(FuncName)).str());
129 }
130 std::string
131 CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
132  if (CGM.getLangOpts().HIP)
133  return ((Twine("__hip") + Twine(FuncName)).str());
134  return ((Twine("__cuda") + Twine(FuncName)).str());
135 }
136 
137 CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
138  : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
139  TheModule(CGM.getModule()),
140  RelocatableDeviceCode(CGM.getLangOpts().CUDARelocatableDeviceCode) {
141  CodeGen::CodeGenTypes &Types = CGM.getTypes();
142  ASTContext &Ctx = CGM.getContext();
143 
144  IntTy = CGM.IntTy;
145  SizeTy = CGM.SizeTy;
146  VoidTy = CGM.VoidTy;
147 
148  CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy));
149  VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy));
150  VoidPtrPtrTy = VoidPtrTy->getPointerTo();
151 }
152 
153 llvm::Constant *CGNVCUDARuntime::getSetupArgumentFn() const {
154  // cudaError_t cudaSetupArgument(void *, size_t, size_t)
155  llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy};
156  return CGM.CreateRuntimeFunction(
157  llvm::FunctionType::get(IntTy, Params, false),
158  addPrefixToName("SetupArgument"));
159 }
160 
161 llvm::Constant *CGNVCUDARuntime::getLaunchFn() const {
162  if (CGM.getLangOpts().HIP) {
163  // hipError_t hipLaunchByPtr(char *);
164  return CGM.CreateRuntimeFunction(
165  llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr");
166  } else {
167  // cudaError_t cudaLaunch(char *);
168  return CGM.CreateRuntimeFunction(
169  llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
170  }
171 }
172 
173 llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
174  return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false);
175 }
176 
177 llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
178  return llvm::FunctionType::get(VoidTy, VoidPtrTy, false);
179 }
180 
181 llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
182  auto CallbackFnTy = getCallbackFnTy();
183  auto RegisterGlobalsFnTy = getRegisterGlobalsFnTy();
184  llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy,
185  VoidPtrTy, CallbackFnTy->getPointerTo()};
186  return llvm::FunctionType::get(VoidTy, Params, false);
187 }
188 
189 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
190  FunctionArgList &Args) {
191  EmittedKernels.push_back(CGF.CurFn);
192  emitDeviceStubBody(CGF, Args);
193 }
194 
195 void CGNVCUDARuntime::emitDeviceStubBody(CodeGenFunction &CGF,
196  FunctionArgList &Args) {
197  // Emit a call to cudaSetupArgument for each arg in Args.
198  llvm::Constant *cudaSetupArgFn = getSetupArgumentFn();
199  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
201  for (const VarDecl *A : Args) {
202  CharUnits TyWidth, TyAlign;
203  std::tie(TyWidth, TyAlign) =
204  CGM.getContext().getTypeInfoInChars(A->getType());
205  Offset = Offset.alignTo(TyAlign);
206  llvm::Value *Args[] = {
207  CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
208  VoidPtrTy),
209  llvm::ConstantInt::get(SizeTy, TyWidth.getQuantity()),
210  llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
211  };
212  llvm::CallSite CS = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
213  llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
214  llvm::Value *CSZero = CGF.Builder.CreateICmpEQ(CS.getInstruction(), Zero);
215  llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
216  CGF.Builder.CreateCondBr(CSZero, NextBlock, EndBlock);
217  CGF.EmitBlock(NextBlock);
218  Offset += TyWidth;
219  }
220 
221  // Emit the call to cudaLaunch
222  llvm::Constant *cudaLaunchFn = getLaunchFn();
223  llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy);
224  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
225  CGF.EmitBranch(EndBlock);
226 
227  CGF.EmitBlock(EndBlock);
228 }
229 
230 /// Creates a function that sets up state on the host side for CUDA objects that
231 /// have a presence on both the host and device sides. Specifically, registers
232 /// the host side of kernel functions and device global variables with the CUDA
233 /// runtime.
234 /// \code
235 /// void __cuda_register_globals(void** GpuBinaryHandle) {
236 /// __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
237 /// ...
238 /// __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
239 /// __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
240 /// ...
241 /// __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
242 /// }
243 /// \endcode
244 llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
245  // No need to register anything
246  if (EmittedKernels.empty() && DeviceVars.empty())
247  return nullptr;
248 
249  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
250  getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
251  addUnderscoredPrefixToName("_register_globals"), &TheModule);
252  llvm::BasicBlock *EntryBB =
253  llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
254  CGBuilderTy Builder(CGM, Context);
255  Builder.SetInsertPoint(EntryBB);
256 
257  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
258  // int, uint3*, uint3*, dim3*, dim3*, int*)
259  llvm::Type *RegisterFuncParams[] = {
260  VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
261  VoidPtrTy, VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
262  llvm::Constant *RegisterFunc = CGM.CreateRuntimeFunction(
263  llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
264  addUnderscoredPrefixToName("RegisterFunction"));
265 
266  // Extract GpuBinaryHandle passed as the first argument passed to
267  // __cuda_register_globals() and generate __cudaRegisterFunction() call for
268  // each emitted kernel.
269  llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
270  for (llvm::Function *Kernel : EmittedKernels) {
271  llvm::Constant *KernelName = makeConstantString(Kernel->getName());
272  llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
273  llvm::Value *Args[] = {
274  &GpuBinaryHandlePtr, Builder.CreateBitCast(Kernel, VoidPtrTy),
275  KernelName, KernelName, llvm::ConstantInt::get(IntTy, -1), NullPtr,
276  NullPtr, NullPtr, NullPtr,
277  llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
278  Builder.CreateCall(RegisterFunc, Args);
279  }
280 
281  // void __cudaRegisterVar(void **, char *, char *, const char *,
282  // int, int, int, int)
283  llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
284  CharPtrTy, IntTy, IntTy,
285  IntTy, IntTy};
286  llvm::Constant *RegisterVar = CGM.CreateRuntimeFunction(
287  llvm::FunctionType::get(IntTy, RegisterVarParams, false),
288  addUnderscoredPrefixToName("RegisterVar"));
289  for (auto &Pair : DeviceVars) {
290  llvm::GlobalVariable *Var = Pair.first;
291  unsigned Flags = Pair.second;
292  llvm::Constant *VarName = makeConstantString(Var->getName());
293  uint64_t VarSize =
294  CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
295  llvm::Value *Args[] = {
296  &GpuBinaryHandlePtr,
297  Builder.CreateBitCast(Var, VoidPtrTy),
298  VarName,
299  VarName,
300  llvm::ConstantInt::get(IntTy, (Flags & ExternDeviceVar) ? 1 : 0),
301  llvm::ConstantInt::get(IntTy, VarSize),
302  llvm::ConstantInt::get(IntTy, (Flags & ConstantDeviceVar) ? 1 : 0),
303  llvm::ConstantInt::get(IntTy, 0)};
304  Builder.CreateCall(RegisterVar, Args);
305  }
306 
307  Builder.CreateRetVoid();
308  return RegisterKernelsFunc;
309 }
310 
311 /// Creates a global constructor function for the module:
312 ///
313 /// For CUDA:
314 /// \code
315 /// void __cuda_module_ctor(void*) {
316 /// Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
317 /// __cuda_register_globals(Handle);
318 /// }
319 /// \endcode
320 ///
321 /// For HIP:
322 /// \code
323 /// void __hip_module_ctor(void*) {
324 /// if (__hip_gpubin_handle == 0) {
325 /// __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
326 /// __hip_register_globals(__hip_gpubin_handle);
327 /// }
328 /// }
329 /// \endcode
330 llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
331  bool IsHIP = CGM.getLangOpts().HIP;
332  // No need to generate ctors/dtors if there is no GPU binary.
333  StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
334  if (CudaGpuBinaryFileName.empty() && !IsHIP)
335  return nullptr;
336 
337  // void __{cuda|hip}_register_globals(void* handle);
338  llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
339  // We always need a function to pass in as callback. Create a dummy
340  // implementation if we don't need to register anything.
341  if (RelocatableDeviceCode && !RegisterGlobalsFunc)
342  RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());
343 
344  // void ** __{cuda|hip}RegisterFatBinary(void *);
345  llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction(
346  llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
347  addUnderscoredPrefixToName("RegisterFatBinary"));
348  // struct { int magic, int version, void * gpu_binary, void * dont_care };
349  llvm::StructType *FatbinWrapperTy =
350  llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);
351 
352  // Register GPU binary with the CUDA runtime, store returned handle in a
353  // global variable and save a reference in GpuBinaryHandle to be cleaned up
354  // in destructor on exit. Then associate all known kernels with the GPU binary
355  // handle so CUDA runtime can figure out what to call on the GPU side.
356  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary;
357  if (!IsHIP) {
358  llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
359  llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
360  if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
361  CGM.getDiags().Report(diag::err_cannot_open_file)
362  << CudaGpuBinaryFileName << EC.message();
363  return nullptr;
364  }
365  CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
366  }
367 
368  llvm::Function *ModuleCtorFunc = llvm::Function::Create(
369  llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
371  addUnderscoredPrefixToName("_module_ctor"), &TheModule);
372  llvm::BasicBlock *CtorEntryBB =
373  llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
374  CGBuilderTy CtorBuilder(CGM, Context);
375 
376  CtorBuilder.SetInsertPoint(CtorEntryBB);
377 
378  const char *FatbinConstantName;
379  const char *FatbinSectionName;
380  const char *ModuleIDSectionName;
381  StringRef ModuleIDPrefix;
382  llvm::Constant *FatBinStr;
383  unsigned FatMagic;
384  if (IsHIP) {
385  FatbinConstantName = ".hip_fatbin";
386  FatbinSectionName = ".hipFatBinSegment";
387 
388  ModuleIDSectionName = "__hip_module_id";
389  ModuleIDPrefix = "__hip_";
390 
391  // For HIP, create an external symbol __hip_fatbin in section .hip_fatbin.
392  // The external symbol is supposed to contain the fat binary but will be
393  // populated somewhere else, e.g. by lld through link script.
394  FatBinStr = new llvm::GlobalVariable(
395  CGM.getModule(), CGM.Int8Ty,
396  /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
397  "__hip_fatbin", nullptr,
398  llvm::GlobalVariable::NotThreadLocal);
399  cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
400 
401  FatMagic = HIPFatMagic;
402  } else {
403  if (RelocatableDeviceCode)
404  FatbinConstantName = CGM.getTriple().isMacOSX()
405  ? "__NV_CUDA,__nv_relfatbin"
406  : "__nv_relfatbin";
407  else
408  FatbinConstantName =
409  CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
410  // NVIDIA's cuobjdump looks for fatbins in this section.
411  FatbinSectionName =
412  CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
413 
414  ModuleIDSectionName = CGM.getTriple().isMacOSX()
415  ? "__NV_CUDA,__nv_module_id"
416  : "__nv_module_id";
417  ModuleIDPrefix = "__nv_";
418 
419  // For CUDA, create a string literal containing the fat binary loaded from
420  // the given file.
421  FatBinStr = makeConstantString(CudaGpuBinary->getBuffer(), "",
422  FatbinConstantName, 8);
423  FatMagic = CudaFatMagic;
424  }
425 
426  // Create initialized wrapper structure that points to the loaded GPU binary
428  auto Values = Builder.beginStruct(FatbinWrapperTy);
429  // Fatbin wrapper magic.
430  Values.addInt(IntTy, FatMagic);
431  // Fatbin version.
432  Values.addInt(IntTy, 1);
433  // Data.
434  Values.add(FatBinStr);
435  // Unused in fatbin v1.
436  Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
437  llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
438  addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
439  /*constant*/ true);
440  FatbinWrapper->setSection(FatbinSectionName);
441 
442  // There is only one HIP fat binary per linked module, however there are
443  // multiple constructor functions. Make sure the fat binary is registered
444  // only once. The constructor functions are executed by the dynamic loader
445  // before the program gains control. The dynamic loader cannot execute the
446  // constructor functions concurrently since doing that would not guarantee
447  // thread safety of the loaded program. Therefore we can assume sequential
448  // execution of constructor functions here.
449  if (IsHIP) {
450  llvm::BasicBlock *IfBlock =
451  llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
452  llvm::BasicBlock *ExitBlock =
453  llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
454  // The name, size, and initialization pattern of this variable is part
455  // of HIP ABI.
456  GpuBinaryHandle = new llvm::GlobalVariable(
457  TheModule, VoidPtrPtrTy, /*isConstant=*/false,
458  llvm::GlobalValue::LinkOnceAnyLinkage,
459  /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
460  "__hip_gpubin_handle");
461  GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity());
462  // Prevent the weak symbol in different shared libraries being merged.
463  GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
464  Address GpuBinaryAddr(
465  GpuBinaryHandle,
466  CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
467  {
468  auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
469  llvm::Constant *Zero =
470  llvm::Constant::getNullValue(HandleValue->getType());
471  llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
472  CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
473  }
474  {
475  CtorBuilder.SetInsertPoint(IfBlock);
476  // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
477  llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
478  RegisterFatbinFunc,
479  CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
480  CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
481  CtorBuilder.CreateBr(ExitBlock);
482  }
483  {
484  CtorBuilder.SetInsertPoint(ExitBlock);
485  // Call __hip_register_globals(GpuBinaryHandle);
486  if (RegisterGlobalsFunc) {
487  auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
488  CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
489  }
490  }
491  } else if (!RelocatableDeviceCode) {
492  // Register binary with CUDA runtime. This is substantially different in
493  // default mode vs. separate compilation!
494  // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
495  llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
496  RegisterFatbinFunc,
497  CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
498  GpuBinaryHandle = new llvm::GlobalVariable(
499  TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
500  llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
501  GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity());
502  CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
503  CGM.getPointerAlign());
504 
505  // Call __cuda_register_globals(GpuBinaryHandle);
506  if (RegisterGlobalsFunc)
507  CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
508  } else {
509  // Generate a unique module ID.
510  SmallString<64> ModuleID;
511  llvm::raw_svector_ostream OS(ModuleID);
512  OS << ModuleIDPrefix << llvm::format("%x", FatbinWrapper->getGUID());
513  llvm::Constant *ModuleIDConstant =
514  makeConstantString(ModuleID.str(), "", ModuleIDSectionName, 32);
515 
516  // Create an alias for the FatbinWrapper that nvcc will look for.
518  Twine("__fatbinwrap") + ModuleID, FatbinWrapper);
519 
520  // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
521  // void *, void (*)(void **))
522  SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
523  RegisterLinkedBinaryName += ModuleID;
524  llvm::Constant *RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
525  getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);
526 
527  assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
528  llvm::Value *Args[] = {RegisterGlobalsFunc,
529  CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy),
530  ModuleIDConstant,
531  makeDummyFunction(getCallbackFnTy())};
532  CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
533  }
534 
535  // Create destructor and register it with atexit() the way NVCC does it. Doing
536  // it during regular destructor phase worked in CUDA before 9.2 but results in
537  // double-free in 9.2.
538  if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
539  // extern "C" int atexit(void (*f)(void));
540  llvm::FunctionType *AtExitTy =
541  llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
542  llvm::Constant *AtExitFunc =
543  CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
544  /*Local=*/true);
545  CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
546  }
547 
548  CtorBuilder.CreateRetVoid();
549  return ModuleCtorFunc;
550 }
551 
552 /// Creates a global destructor function that unregisters the GPU code blob
553 /// registered by constructor.
554 ///
555 /// For CUDA:
556 /// \code
557 /// void __cuda_module_dtor(void*) {
558 /// __cudaUnregisterFatBinary(Handle);
559 /// }
560 /// \endcode
561 ///
562 /// For HIP:
563 /// \code
564 /// void __hip_module_dtor(void*) {
565 /// if (__hip_gpubin_handle) {
566 /// __hipUnregisterFatBinary(__hip_gpubin_handle);
567 /// __hip_gpubin_handle = 0;
568 /// }
569 /// }
570 /// \endcode
571 llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
572  // No need for destructor if we don't have a handle to unregister.
573  if (!GpuBinaryHandle)
574  return nullptr;
575 
576  // void __cudaUnregisterFatBinary(void ** handle);
577  llvm::Constant *UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
578  llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
579  addUnderscoredPrefixToName("UnregisterFatBinary"));
580 
581  llvm::Function *ModuleDtorFunc = llvm::Function::Create(
582  llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
584  addUnderscoredPrefixToName("_module_dtor"), &TheModule);
585 
586  llvm::BasicBlock *DtorEntryBB =
587  llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
588  CGBuilderTy DtorBuilder(CGM, Context);
589  DtorBuilder.SetInsertPoint(DtorEntryBB);
590 
591  Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity(
592  GpuBinaryHandle->getAlignment()));
593  auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
594  // There is only one HIP fat binary per linked module, however there are
595  // multiple destructor functions. Make sure the fat binary is unregistered
596  // only once.
597  if (CGM.getLangOpts().HIP) {
598  llvm::BasicBlock *IfBlock =
599  llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
600  llvm::BasicBlock *ExitBlock =
601  llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
602  llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
603  llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
604  DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);
605 
606  DtorBuilder.SetInsertPoint(IfBlock);
607  DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
608  DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
609  DtorBuilder.CreateBr(ExitBlock);
610 
611  DtorBuilder.SetInsertPoint(ExitBlock);
612  } else {
613  DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
614  }
615  DtorBuilder.CreateRetVoid();
616  return ModuleDtorFunc;
617 }
618 
620  return new CGNVCUDARuntime(CGM);
621 }
const llvm::DataLayout & getDataLayout() const
CharUnits alignTo(const CharUnits &Align) const
alignTo - Returns the next integer (mod 2**64) that is greater than or equal to this quantity and is ...
Definition: CharUnits.h:184
llvm::IntegerType * IntTy
int
External linkage, which indicates that the entity can be referred to from other translation units...
Definition: Linkage.h:60
CanQualType VoidPtrTy
Definition: ASTContext.h:1053
const CodeGenOptions & getCodeGenOpts() const
The standard implementation of ConstantInitBuilder used in Clang.
DiagnosticBuilder Report(SourceLocation Loc, unsigned DiagID)
Issue the message to the client.
Definition: Diagnostic.h:1294
llvm::IntegerType * Int8Ty
i8, i16, i32, and i64
Address GetAddrOfLocalVar(const VarDecl *VD)
GetAddrOfLocalVar - Return the address of a local variable.
Represents a variable declaration or definition.
Definition: Decl.h:812
Objects with "hidden" visibility are not seen by the dynamic linker.
Definition: Visibility.h:37
DiagnosticsEngine & getDiags() const
llvm::Value * getPointer() const
Definition: Address.h:38
CodeGenFunction - This class organizes the per-function state that is used while generating LLVM code...
llvm::Type * ConvertType(QualType T)
ConvertType - Convert type T into a llvm::Type.
Holds long-lived AST nodes (such as types and decls) that can be referred to throughout the semantic ...
Definition: ASTContext.h:154
static CharUnits Zero()
Zero - Construct a CharUnits quantity of zero.
Definition: CharUnits.h:53
CharUnits - This is an opaque type for sizes expressed in character units.
Definition: CharUnits.h:38
uint32_t Offset
Definition: CacheTokens.cpp:43
llvm::BasicBlock * createBasicBlock(const Twine &name="", llvm::Function *parent=nullptr, llvm::BasicBlock *before=nullptr)
createBasicBlock - Create an LLVM basic block.
llvm::Constant * CreateRuntimeFunction(llvm::FunctionType *Ty, StringRef Name, llvm::AttributeList ExtraAttrs=llvm::AttributeList(), bool Local=false)
Create a new runtime function with the specified type and name.
QuantityType getQuantity() const
getQuantity - Get the raw integer representation of this quantity.
Definition: CharUnits.h:179
static CharUnits fromQuantity(QuantityType Quantity)
fromQuantity - Construct a CharUnits quantity from a raw integer type.
Definition: CharUnits.h:63
const LangOptions & getLangOpts() const
ASTContext & getContext() const
The l-value was considered opaque, so the alignment was determined from a type.
Address CreateBitCast(Address Addr, llvm::Type *Ty, const llvm::Twine &Name="")
Definition: CGBuilder.h:142
llvm::CallSite EmitRuntimeCallOrInvoke(llvm::Value *callee, ArrayRef< llvm::Value *> args, const Twine &name="")
Emits a call or invoke instruction to the given runtime function.
Definition: CGCall.cpp:3744
An aligned address.
Definition: Address.h:25
std::pair< CharUnits, CharUnits > getTypeInfoInChars(const Type *T) const
FunctionArgList - Type for representing both the decl and type of parameters to a function...
Definition: CGCall.h:356
CanQualType CharTy
Definition: ASTContext.h:1027
This class organizes the cross-function state that is used while generating LLVM code.
std::string CudaGpuBinaryFileName
Name of file passed with -fcuda-include-gpubinary option to forward to CUDA runtime back-end for inco...
Dataflow Directional Tag Classes.
std::unique_ptr< DiagnosticConsumer > create(StringRef OutputFile, DiagnosticOptions *Diags, bool MergeChildRecords=false)
Returns a DiagnosticConsumer that serializes diagnostics to a bitcode file.
llvm::Module & getModule() const
This class organizes the cross-module state that is used while lowering AST types to LLVM types...
Definition: CodeGenTypes.h:120
Internal linkage, which indicates that the entity can be referred to from within the translation unit...
Definition: Linkage.h:32
void EmitBlock(llvm::BasicBlock *BB, bool IsFinished=false)
EmitBlock - Emit the given block.
Definition: CGStmt.cpp:445
CGCUDARuntime * CreateNVCUDARuntime(CodeGenModule &CGM)
Creates an instance of a CUDA runtime class.
Definition: CGCUDANV.cpp:619
void EmitBranch(llvm::BasicBlock *Block)
EmitBranch - Emit a branch to the specified basic block from the current insert block, taking care to avoid creation of branches from dummy blocks.
Definition: CGStmt.cpp:465
static OMPLinearClause * Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc, OpenMPLinearClauseKind Modifier, SourceLocation ModifierLoc, SourceLocation ColonLoc, SourceLocation EndLoc, ArrayRef< Expr *> VL, ArrayRef< Expr *> PL, ArrayRef< Expr *> IL, Expr *Step, Expr *CalcStep, Stmt *PreInit, Expr *PostUpdate)
Creates clause with a list of variables VL and a linear step Step.
const llvm::Triple & getTriple() const