//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a class for CUDA code generation targeting the NVIDIA CUDA
// runtime library.
//
//===----------------------------------------------------------------------===//

#include "CGCUDARuntime.h"
#include "CGCXXABI.h"
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "clang/AST/Decl.h"
#include "clang/Basic/Cuda.h"
#include "clang/CodeGen/CodeGenABITypes.h"
#include "clang/CodeGen/ConstantInitBuilder.h"
#include "llvm/Frontend/Offloading/Utility.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/VirtualFileSystem.h"

using namespace clang;
using namespace CodeGen;

namespace {
constexpr unsigned CudaFatMagic = 0x466243b1;
constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"

class CGNVCUDARuntime : public CGCUDARuntime {

private:
  llvm::IntegerType *IntTy, *SizeTy;
  llvm::Type *VoidTy;
  llvm::PointerType *PtrTy;

  /// Convenience reference to LLVM Context
  llvm::LLVMContext &Context;
  /// Convenience reference to the current module
  llvm::Module &TheModule;
  /// Keeps track of kernel launch stubs and handles emitted in this module
  struct KernelInfo {
    llvm::Function *Kernel; // stub function to help launch kernel
    const Decl *D;
  };
  llvm::SmallVector<KernelInfo, 16> EmittedKernels;
  // Map a kernel mangled name to a symbol for identifying the kernel in host
  // code. For CUDA, the symbol for identifying the kernel is the same as the
  // device stub function. For HIP, they are different.
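  // (For HIP, getKernelHandle() below creates a distinct global variable per
  // kernel to serve as the handle; emitDeviceStub() then sets that variable's
  // initializer to the stub function.)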
  llvm::DenseMap<StringRef, llvm::GlobalValue *> KernelHandles;
  // Map a kernel handle to the kernel stub.
  llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
  struct VarInfo {
    llvm::GlobalVariable *Var;
    const VarDecl *D;
    DeviceVarFlags Flags;
  };
  llvm::SmallVector<VarInfo, 16> DeviceVars;
  /// Keeps track of the variable containing the handle of the GPU binary.
  /// Populated by ModuleCtorFunction() and used to create corresponding
  /// cleanup calls in ModuleDtorFunction().
  llvm::GlobalVariable *GpuBinaryHandle = nullptr;
  /// Whether we generate relocatable device code.
  bool RelocatableDeviceCode;
  /// Mangle context for device.
  std::unique_ptr<MangleContext> DeviceMC;
  /// Some zeros used for GEPs.
  llvm::Constant *Zeros[2];

  llvm::FunctionCallee getSetupArgumentFn() const;
  llvm::FunctionCallee getLaunchFn() const;

  llvm::FunctionType *getRegisterGlobalsFnTy() const;
  llvm::FunctionType *getCallbackFnTy() const;
  llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
  std::string addPrefixToName(StringRef FuncName) const;
  std::string addUnderscoredPrefixToName(StringRef FuncName) const;

  /// Creates a function to register all kernel stubs generated in this module.
  llvm::Function *makeRegisterGlobalsFn();

  /// Helper function that generates a constant string and returns a pointer to
  /// the start of the string. The result of this function can be used anywhere
  /// where the C code specifies const char*.
  llvm::Constant *makeConstantString(const std::string &Str,
                                     const std::string &Name = "") {
    auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
    return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
                                                ConstStr.getPointer(), Zeros);
  }

  /// Helper function which generates an initialized constant array from Str,
  /// and optionally sets section name and alignment. AddNull specifies whether
  /// the array should have NUL termination.
  llvm::Constant *makeConstantArray(StringRef Str,
                                    StringRef Name = "",
                                    StringRef SectionName = "",
                                    unsigned Alignment = 0,
                                    bool AddNull = false) {
    llvm::Constant *Value =
        llvm::ConstantDataArray::getString(Context, Str, AddNull);
    auto *GV = new llvm::GlobalVariable(
        TheModule, Value->getType(), /*isConstant=*/true,
        llvm::GlobalValue::PrivateLinkage, Value, Name);
    if (!SectionName.empty()) {
      GV->setSection(SectionName);
      // Mark the address as used to make sure this section isn't merged away
      // and really ends up in the object file.
      GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
    }
    if (Alignment)
      GV->setAlignment(llvm::Align(Alignment));
    return llvm::ConstantExpr::getGetElementPtr(GV->getValueType(), GV, Zeros);
  }

  /// Helper function that generates an empty dummy function returning void.
  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
    assert(FnTy->getReturnType()->isVoidTy() &&
           "Can only generate dummy functions returning void!");
    llvm::Function *DummyFunc = llvm::Function::Create(
        FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);

    llvm::BasicBlock *DummyBlock =
        llvm::BasicBlock::Create(Context, "", DummyFunc);
    CGBuilderTy FuncBuilder(CGM, Context);
    FuncBuilder.SetInsertPoint(DummyBlock);
    FuncBuilder.CreateRetVoid();

    return DummyFunc;
  }

  void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
  void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
  std::string getDeviceSideName(const NamedDecl *ND) override;

  void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, bool Constant) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Variable, Extern, Constant,
                           VD->hasAttr<HIPManagedAttr>(),
                           /*Normalized*/ false, 0}});
  }
  void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
                          bool Extern, int Type) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
                           /*Managed*/ false,
                           /*Normalized*/ false, Type}});
  }
  void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, int Type, bool Normalized) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
                           /*Managed*/ false, Normalized, Type}});
  }

  /// Creates module constructor function
  llvm::Function *makeModuleCtorFunction();
  /// Creates module destructor function
  llvm::Function *makeModuleDtorFunction();
  /// Transform managed variables for device compilation.
  void transformManagedVars();
  /// Create offloading entries to register globals in RDC mode.
  void createOffloadingEntries();

public:
  CGNVCUDARuntime(CodeGenModule &CGM);

  llvm::GlobalValue *getKernelHandle(llvm::Function *F, GlobalDecl GD) override;
  llvm::Function *getKernelStub(llvm::GlobalValue *Handle) override {
    auto Loc = KernelStubs.find(Handle);
    assert(Loc != KernelStubs.end());
    return Loc->second;
  }
  void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
  void handleVarRegistration(const VarDecl *VD,
                             llvm::GlobalVariable &Var) override;
  void
  internalizeDeviceSideVar(const VarDecl *D,
                           llvm::GlobalValue::LinkageTypes &Linkage) override;

  llvm::Function *finalizeModule() override;
};

} // end anonymous namespace

std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("hip") + Twine(FuncName)).str());
  return ((Twine("cuda") + Twine(FuncName)).str());
}
std::string
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("__hip") + Twine(FuncName)).str());
  return ((Twine("__cuda") + Twine(FuncName)).str());
}

static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) {
  // If the host and device have different C++ ABIs, create a device mangle
  // context so that mangling retrieves the additional device lambda mangling
  // number instead of the regular host one.
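  // This arises, for example, when compiling HIP on Windows: the host uses
  // the Microsoft C++ ABI while the device uses an Itanium-family ABI, so
  // lambdas must be numbered the device way to keep names consistent on both
  // sides.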
  if (CGM.getContext().getAuxTargetInfo() &&
      CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
      CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) {
    return std::unique_ptr<MangleContext>(
        CGM.getContext().createDeviceMangleContext(
            *CGM.getContext().getAuxTargetInfo()));
  }

  return std::unique_ptr<MangleContext>(CGM.getContext().createMangleContext(
      CGM.getContext().getAuxTargetInfo()));
}

CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
      TheModule(CGM.getModule()),
      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
      DeviceMC(InitDeviceMC(CGM)) {
  IntTy = CGM.IntTy;
  SizeTy = CGM.SizeTy;
  VoidTy = CGM.VoidTy;
  Zeros[0] = llvm::ConstantInt::get(SizeTy, 0);
  Zeros[1] = Zeros[0];
  PtrTy = CGM.UnqualPtrTy;
}

llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
  // cudaError_t cudaSetupArgument(void *, size_t, size_t)
  llvm::Type *Params[] = {PtrTy, SizeTy, SizeTy};
  return CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, Params, false),
      addPrefixToName("SetupArgument"));
}

llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
  if (CGM.getLangOpts().HIP) {
    // hipError_t hipLaunchByPtr(char *);
    return CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(IntTy, PtrTy, false), "hipLaunchByPtr");
  }
  // cudaError_t cudaLaunch(char *);
  return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy, PtrTy, false),
                                   "cudaLaunch");
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
  return llvm::FunctionType::get(VoidTy, PtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
  return llvm::FunctionType::get(VoidTy, PtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
  llvm::Type *Params[] = {llvm::PointerType::getUnqual(Context), PtrTy, PtrTy,
                          llvm::PointerType::getUnqual(Context)};
  return llvm::FunctionType::get(VoidTy, Params, false);
}

std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
  GlobalDecl GD;
  // ND could be either a kernel or a variable.
  if (auto *FD = dyn_cast<FunctionDecl>(ND))
    GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
  else
    GD = GlobalDecl(ND);
  std::string DeviceSideName;
  MangleContext *MC;
  if (CGM.getLangOpts().CUDAIsDevice)
    MC = &CGM.getCXXABI().getMangleContext();
  else
    MC = DeviceMC.get();
  if (MC->shouldMangleDeclName(ND)) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    MC->mangleName(GD, Out);
    DeviceSideName = std::string(Out.str());
  } else
    DeviceSideName = std::string(ND->getIdentifier()->getName());

  // Make a unique name for the device-side static file-scope variable for HIP.
  if (CGM.getContext().shouldExternalize(ND) &&
      CGM.getLangOpts().GPURelocatableDeviceCode) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    Out << DeviceSideName;
    CGM.printPostfixForExternalizedDecl(Out, ND);
    DeviceSideName = std::string(Out.str());
  }
  return DeviceSideName;
}

void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
                                     FunctionArgList &Args) {
  EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
  if (auto *GV =
          dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn->getName()])) {
    GV->setLinkage(CGF.CurFn->getLinkage());
    GV->setInitializer(CGF.CurFn);
  }
  if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                         CudaFeature::CUDA_USES_NEW_LAUNCH) ||
      (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
    emitDeviceStubBodyNew(CGF, Args);
  else
    emitDeviceStubBodyLegacy(CGF, Args);
}

// CUDA 9.0+ uses a new way to launch kernels. Parameters are packed in a local
// array and kernels are launched using cudaLaunchKernel().
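//
// As a sketch (illustrative only; `handle` stands for the kernel handle
// created by getKernelHandle(), and T0/T1 for the kernel parameter types),
// the emitted stub behaves like:
// \code
// void __device_stub__kernel(T0 a0, T1 a1) {
//   void *kernel_args[2] = {&a0, &a1};
//   dim3 grid_dim, block_dim;
//   size_t shmem_size;
//   void *stream;
//   __cudaPopCallConfiguration(&grid_dim, &block_dim, &shmem_size, &stream);
//   cudaLaunchKernel((const void *)handle, grid_dim, block_dim, kernel_args,
//                    shmem_size, (cudaStream_t)stream);
// }
// \endcode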
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
                                            FunctionArgList &Args) {
  // Build the shadow stack entry at the very start of the function.

  // Calculate the amount of space we will need for all arguments. If we have
  // no args, allocate a single pointer so we still have a valid pointer to the
  // argument array that we can pass to the runtime, even if it will be unused.
  Address KernelArgs = CGF.CreateTempAlloca(
      PtrTy, CharUnits::fromQuantity(16), "kernel_args",
      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
  // Store pointers to the arguments in a locally allocated launch_args.
  for (unsigned i = 0; i < Args.size(); ++i) {
    llvm::Value *VarPtr = CGF.GetAddrOfLocalVar(Args[i]).emitRawPointer(CGF);
    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, PtrTy);
    CGF.Builder.CreateDefaultAlignedStore(
        VoidVarPtr, CGF.Builder.CreateConstGEP1_32(
                        PtrTy, KernelArgs.emitRawPointer(CGF), i));
  }

  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");

  // Look up the cudaLaunchKernel/hipLaunchKernel function.
  // The HIP kernel launching API name depends on the -fgpu-default-stream
  // option. For the default value 'legacy', it is hipLaunchKernel. For
  // 'per-thread', it is hipLaunchKernel_spt.
  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                              void **args, size_t sharedMem,
  //                              cudaStream_t stream);
  // hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim,
  //                                  dim3 blockDim, void **args,
  //                                  size_t sharedMem, hipStream_t stream);
  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
  std::string KernelLaunchAPI = "LaunchKernel";
  if (CGF.getLangOpts().GPUDefaultStream ==
      LangOptions::GPUDefaultStreamKind::PerThread) {
    if (CGF.getLangOpts().HIP)
      KernelLaunchAPI = KernelLaunchAPI + "_spt";
    else if (CGF.getLangOpts().CUDA)
      KernelLaunchAPI = KernelLaunchAPI + "_ptsz";
  }
  auto LaunchKernelName = addPrefixToName(KernelLaunchAPI);
  const IdentifierInfo &cudaLaunchKernelII =
      CGM.getContext().Idents.get(LaunchKernelName);
  FunctionDecl *cudaLaunchKernelFD = nullptr;
  for (auto *Result : DC->lookup(&cudaLaunchKernelII)) {
    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
      cudaLaunchKernelFD = FD;
  }

  if (cudaLaunchKernelFD == nullptr) {
    CGM.Error(CGF.CurFuncDecl->getLocation(),
              "Can't find declaration for " + LaunchKernelName);
    return;
  }
  // Create temporary dim3 grid_dim, block_dim.
  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
  QualType Dim3Ty = GridDimParam->getType();
  Address GridDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
  Address BlockDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
  Address ShmemSize =
      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
  Address Stream = CGF.CreateTempAlloca(PtrTy, CGM.getPointerAlign(), "stream");
  llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy,
                              {/*gridDim=*/GridDim.getType(),
                               /*blockDim=*/BlockDim.getType(),
                               /*ShmemSize=*/ShmemSize.getType(),
                               /*Stream=*/Stream.getType()},
                              /*isVarArg=*/false),
      addUnderscoredPrefixToName("PopCallConfiguration"));

  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, {GridDim.emitRawPointer(CGF),
                                                BlockDim.emitRawPointer(CGF),
                                                ShmemSize.emitRawPointer(CGF),
                                                Stream.emitRawPointer(CGF)});

  // Emit the call to cudaLaunchKernel.
  llvm::Value *Kernel =
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn->getName()], PtrTy);
  CallArgList LaunchKernelArgs;
  LaunchKernelArgs.add(RValue::get(Kernel),
                       cudaLaunchKernelFD->getParamDecl(0)->getType());
  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::get(KernelArgs, CGF),
                       cudaLaunchKernelFD->getParamDecl(3)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
                       cudaLaunchKernelFD->getParamDecl(4)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
                       cudaLaunchKernelFD->getParamDecl(5)->getType());

  QualType QT = cudaLaunchKernelFD->getType();
  QualType CQT = QT.getCanonicalType();
  llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
  llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);

  const CGFunctionInfo &FI =
      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
  llvm::FunctionCallee cudaLaunchKernelFn =
      CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
               LaunchKernelArgs);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

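// The legacy (pre-CUDA-9.0) protocol pushes each argument into the pending
// launch configuration with cudaSetupArgument() and then fires the launch
// with cudaLaunch(). As a sketch (illustrative only; the offsets follow the
// alignment logic below, and `handle` stands for the kernel handle), the
// emitted stub behaves like:
// \code
// void __device_stub__kernel(T0 a0, T1 a1) {
//   if (cudaSetupArgument(&a0, sizeof(a0), offset0) == 0)
//     if (cudaSetupArgument(&a1, sizeof(a1), offset1) == 0)
//       cudaLaunch((char *)handle);
// }
// \endcode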
void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
                                               FunctionArgList &Args) {
  // Emit a call to cudaSetupArgument for each arg in Args.
  llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
  CharUnits Offset = CharUnits::Zero();
  for (const VarDecl *A : Args) {
    auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
    Offset = Offset.alignTo(TInfo.Align);
    llvm::Value *Args[] = {
        CGF.Builder.CreatePointerCast(
            CGF.GetAddrOfLocalVar(A).emitRawPointer(CGF), PtrTy),
        llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
        llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
    };
    llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
    llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
    llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
    CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
    CGF.EmitBlock(NextBlock);
    Offset += TInfo.Width;
  }

  // Emit the call to cudaLaunch.
  llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
  llvm::Value *Arg =
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn->getName()], PtrTy);
  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

// Replace the original variable Var with the address loaded from the variable
// ManagedVar, which is populated by the HIP runtime.
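//
// A sketch of the rewrite (illustrative IR, not emitted verbatim; the global
// names are hypothetical): a use such as
// \code
//   %v = load i32, ptr getelementptr inbounds ([4 x i32], ptr @var, i64 0, i64 1)
// \endcode
// becomes
// \code
//   %ld.managed = load ptr, ptr @var.managed.ptr
//   %gep = getelementptr inbounds [4 x i32], ptr %ld.managed, i64 0, i64 1
//   %v = load i32, ptr %gep
// \endcode
// i.e. constant expressions that use the variable are first materialized as
// instructions so the inserted load can feed them.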
static void replaceManagedVar(llvm::GlobalVariable *Var,
                              llvm::GlobalVariable *ManagedVar) {
  SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
  for (auto &&VarUse : Var->uses()) {
    WorkList.push_back({VarUse.getUser()});
  }
  while (!WorkList.empty()) {
    auto &&WorkItem = WorkList.pop_back_val();
    auto *U = WorkItem.back();
    if (isa<llvm::ConstantExpr>(U)) {
      for (auto &&UU : U->uses()) {
        WorkItem.push_back(UU.getUser());
        WorkList.push_back(WorkItem);
        WorkItem.pop_back();
      }
      continue;
    }
    if (auto *I = dyn_cast<llvm::Instruction>(U)) {
      llvm::Value *OldV = Var;
      llvm::Instruction *NewV =
          new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false,
                             llvm::Align(Var->getAlignment()), I);
      WorkItem.pop_back();
      // Replace constant expressions directly or indirectly using the managed
      // variable with instructions.
      for (auto &&Op : WorkItem) {
        auto *CE = cast<llvm::ConstantExpr>(Op);
        auto *NewInst = CE->getAsInstruction();
        NewInst->insertBefore(*I->getParent(), I->getIterator());
        NewInst->replaceUsesOfWith(OldV, NewV);
        OldV = CE;
        NewV = NewInst;
      }
      I->replaceUsesOfWith(OldV, NewV);
    } else {
      llvm_unreachable("Invalid use of managed variable");
    }
  }
}

/// Creates a function that sets up state on the host side for CUDA objects
/// that have a presence on both the host and device sides. Specifically, it
/// registers the host side of kernel functions and device global variables
/// with the CUDA runtime.
/// \code
/// void __cuda_register_globals(void** GpuBinaryHandle) {
///    __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
///    ...
///    __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
///    ...
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
  // No need to register anything
  if (EmittedKernels.empty() && DeviceVars.empty())
    return nullptr;

  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_register_globals"), &TheModule);
  llvm::BasicBlock *EntryBB =
      llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
  CGBuilderTy Builder(CGM, Context);
  Builder.SetInsertPoint(EntryBB);

  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
  //                             int, uint3*, uint3*, dim3*, dim3*, int*)
  llvm::Type *RegisterFuncParams[] = {
      PtrTy, PtrTy, PtrTy, PtrTy, IntTy,
      PtrTy, PtrTy, PtrTy, PtrTy, llvm::PointerType::getUnqual(Context)};
  llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
      addUnderscoredPrefixToName("RegisterFunction"));

  // Extract the GpuBinaryHandle passed as the first argument to
  // __cuda_register_globals() and generate a __cudaRegisterFunction() call
  // for each emitted kernel.
  llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
  for (auto &&I : EmittedKernels) {
    llvm::Constant *KernelName =
        makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(PtrTy);
    llvm::Value *Args[] = {
        &GpuBinaryHandlePtr,
        KernelHandles[I.Kernel->getName()],
        KernelName,
        KernelName,
        llvm::ConstantInt::get(IntTy, -1),
        NullPtr,
        NullPtr,
        NullPtr,
        NullPtr,
        llvm::ConstantPointerNull::get(llvm::PointerType::getUnqual(Context))};
    Builder.CreateCall(RegisterFunc, Args);
  }

  llvm::Type *VarSizeTy = IntTy;
  // For HIP or CUDA 9.0+, the device variable size is of type `size_t`.
  if (CGM.getLangOpts().HIP ||
      ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
    VarSizeTy = SizeTy;

  // void __cudaRegisterVar(void **, char *, char *, const char *,
  //                        int, int, int, int)
  llvm::Type *RegisterVarParams[] = {PtrTy, PtrTy, PtrTy, PtrTy,
                                     IntTy, VarSizeTy, IntTy, IntTy};
  llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
      addUnderscoredPrefixToName("RegisterVar"));
  // void __hipRegisterManagedVar(void **, char *, char *, const char *,
  //                              size_t, unsigned)
  llvm::Type *RegisterManagedVarParams[] = {PtrTy, PtrTy, PtrTy,
                                            PtrTy, VarSizeTy, IntTy};
  llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
      addUnderscoredPrefixToName("RegisterManagedVar"));
  // void __cudaRegisterSurface(void **, const struct surfaceReference *,
  //                            const void **, const char *, int, int);
  llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy}, false),
      addUnderscoredPrefixToName("RegisterSurface"));
  // void __cudaRegisterTexture(void **, const struct textureReference *,
  //                            const void **, const char *, int, int, int)
  llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy, IntTy}, false),
      addUnderscoredPrefixToName("RegisterTexture"));
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
           "External variables should not show up here, except HIP managed "
           "variables");
    llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
    switch (Info.Flags.getKind()) {
    case DeviceVarFlags::Variable: {
      uint64_t VarSize =
          CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
      if (Info.Flags.isManaged()) {
        assert(Var->getName().ends_with(".managed") &&
               "HIP managed variables not transformed");
        auto *ManagedVar = CGM.getModule().getNamedGlobal(
            Var->getName().drop_back(StringRef(".managed").size()));
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            ManagedVar,
            Var,
            VarName,
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Var->getAlignment())};
        if (!Var->isDeclaration())
          Builder.CreateCall(RegisterManagedVar, Args);
      } else {
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            Var,
            VarName,
            VarName,
            llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
            llvm::ConstantInt::get(IntTy, 0)};
        Builder.CreateCall(RegisterVar, Args);
      }
      break;
    }
    case DeviceVarFlags::Surface:
      Builder.CreateCall(
          RegisterSurf,
          {&GpuBinaryHandlePtr, Var, VarName, VarName,
           llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    case DeviceVarFlags::Texture:
      Builder.CreateCall(
          RegisterTex,
          {&GpuBinaryHandlePtr, Var, VarName, VarName,
           llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    }
  }

  Builder.CreateRetVoid();
  return RegisterKernelsFunc;
}

/// Creates a global constructor function for the module:
///
/// For CUDA:
/// \code
/// void __cuda_module_ctor() {
///     Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
///     __cuda_register_globals(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_ctor() {
///     if (__hip_gpubin_handle == 0) {
///         __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
///         __hip_register_globals(__hip_gpubin_handle);
///     }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
  bool IsHIP = CGM.getLangOpts().HIP;
  bool IsCUDA = CGM.getLangOpts().CUDA;
  // No need to generate ctors/dtors if there is no GPU binary.
  StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
  if (CudaGpuBinaryFileName.empty() && !IsHIP)
    return nullptr;
  if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
      DeviceVars.empty())
    return nullptr;

  // void __{cuda|hip}_register_globals(void* handle);
  llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
  // We always need a function to pass in as callback. Create a dummy
  // implementation if we don't need to register anything.
  if (RelocatableDeviceCode && !RegisterGlobalsFunc)
    RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());

  // void ** __{cuda|hip}RegisterFatBinary(void *);
  llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(PtrTy, PtrTy, false),
      addUnderscoredPrefixToName("RegisterFatBinary"));
  // struct { int magic, int version, void * gpu_binary, void * dont_care };
  llvm::StructType *FatbinWrapperTy =
      llvm::StructType::get(IntTy, IntTy, PtrTy, PtrTy);

  // Register the GPU binary with the CUDA runtime, store the returned handle
  // in a global variable, and save a reference in GpuBinaryHandle so the
  // corresponding cleanup call can be created in the destructor on exit. Then
  // associate all known kernels with the GPU binary handle so the CUDA
  // runtime can figure out what to call on the GPU side.
  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
  if (!CudaGpuBinaryFileName.empty()) {
    auto VFS = CGM.getFileSystem();
    auto CudaGpuBinaryOrErr =
        VFS->getBufferForFile(CudaGpuBinaryFileName, -1, false);
    if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
      CGM.getDiags().Report(diag::err_cannot_open_file)
          << CudaGpuBinaryFileName << EC.message();
      return nullptr;
    }
    CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
  }

  llvm::Function *ModuleCtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_ctor"), &TheModule);
  llvm::BasicBlock *CtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
  CGBuilderTy CtorBuilder(CGM, Context);

  CtorBuilder.SetInsertPoint(CtorEntryBB);

  const char *FatbinConstantName;
  const char *FatbinSectionName;
  const char *ModuleIDSectionName;
  StringRef ModuleIDPrefix;
  llvm::Constant *FatBinStr;
  unsigned FatMagic;
  if (IsHIP) {
    FatbinConstantName = ".hip_fatbin";
    FatbinSectionName = ".hipFatBinSegment";

    ModuleIDSectionName = "__hip_module_id";
    ModuleIDPrefix = "__hip_";

    if (CudaGpuBinary) {
      // If the fatbin is available from early finalization, create a string
      // literal containing the fat binary loaded from the given file.
      const unsigned HIPCodeObjectAlign = 4096;
      FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "",
                                    FatbinConstantName, HIPCodeObjectAlign);
    } else {
      // If the fatbin is not available, create an external symbol
      // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
      // to contain the fat binary but will be populated somewhere else,
      // e.g. by lld through a linker script.
      FatBinStr = new llvm::GlobalVariable(
          CGM.getModule(), CGM.Int8Ty,
          /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
          "__hip_fatbin_" + CGM.getContext().getCUIDHash(), nullptr,
          llvm::GlobalVariable::NotThreadLocal);
      cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
    }

    FatMagic = HIPFatMagic;
  } else {
    if (RelocatableDeviceCode)
      FatbinConstantName = CGM.getTriple().isMacOSX()
                               ? "__NV_CUDA,__nv_relfatbin"
                               : "__nv_relfatbin";
    else
      FatbinConstantName =
          CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
    // NVIDIA's cuobjdump looks for fatbins in this section.
    FatbinSectionName =
        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";

    ModuleIDSectionName = CGM.getTriple().isMacOSX()
                              ? "__NV_CUDA,__nv_module_id"
                              : "__nv_module_id";
    ModuleIDPrefix = "__nv_";

    // For CUDA, create a string literal containing the fat binary loaded from
    // the given file.
    FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "",
                                  FatbinConstantName, 8);
    FatMagic = CudaFatMagic;
  }

  // Create an initialized wrapper structure that points to the loaded GPU
  // binary.
  ConstantInitBuilder Builder(CGM);
  auto Values = Builder.beginStruct(FatbinWrapperTy);
  // Fatbin wrapper magic.
  Values.addInt(IntTy, FatMagic);
  // Fatbin version.
  Values.addInt(IntTy, 1);
  // Data.
  Values.add(FatBinStr);
  // Unused in fatbin v1.
  Values.add(llvm::ConstantPointerNull::get(PtrTy));
  llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
      addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
      /*constant*/ true);
  FatbinWrapper->setSection(FatbinSectionName);

  // There is only one HIP fat binary per linked module, however there are
  // multiple constructor functions. Make sure the fat binary is registered
  // only once. The constructor functions are executed by the dynamic loader
  // before the program gains control. The dynamic loader cannot execute the
  // constructor functions concurrently since doing that would not guarantee
  // thread safety of the loaded program. Therefore we can assume sequential
  // execution of constructor functions here.
  if (IsHIP) {
    auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage
                                 : llvm::GlobalValue::ExternalLinkage;
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
    // The name, size, and initialization pattern of this variable are part
    // of the HIP ABI.
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, PtrTy, /*isConstant=*/false, Linkage,
        /*Initializer=*/
        CudaGpuBinary ? llvm::ConstantPointerNull::get(PtrTy) : nullptr,
        CudaGpuBinary
            ? "__hip_gpubin_handle"
            : "__hip_gpubin_handle_" + CGM.getContext().getCUIDHash());
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    // Prevent the weak symbol in different shared libraries from being merged.
    if (Linkage != llvm::GlobalValue::InternalLinkage)
      GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
    Address GpuBinaryAddr(
        GpuBinaryHandle, PtrTy,
        CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
    {
      auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
      llvm::Constant *Zero =
          llvm::Constant::getNullValue(HandleValue->getType());
      llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
      CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(IfBlock);
      // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
      llvm::CallInst *RegisterFatbinCall =
          CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
      CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
      CtorBuilder.CreateBr(ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(ExitBlock);
      // Call __hip_register_globals(GpuBinaryHandle);
      if (RegisterGlobalsFunc) {
        auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
        CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
      }
    }
  } else if (!RelocatableDeviceCode) {
    // Register the binary with the CUDA runtime. This is substantially
    // different in default mode vs. separate compilation!
    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
    llvm::CallInst *RegisterFatbinCall =
        CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, PtrTy, false, llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(PtrTy), "__cuda_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
                                   CGM.getPointerAlign());

    // Call __cuda_register_globals(GpuBinaryHandle);
    if (RegisterGlobalsFunc)
      CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);

    // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
    if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                           CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
      // void __cudaRegisterFatBinaryEnd(void **);
      llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
          llvm::FunctionType::get(VoidTy, PtrTy, false),
          "__cudaRegisterFatBinaryEnd");
      CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
    }
  } else {
    // Generate a unique module ID.
    SmallString<64> ModuleID;
    llvm::raw_svector_ostream OS(ModuleID);
    OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
    llvm::Constant *ModuleIDConstant = makeConstantArray(
        std::string(ModuleID), "", ModuleIDSectionName, 32, /*AddNull=*/true);

    // Create an alias for the FatbinWrapper that nvcc will look for.
    llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
                              Twine("__fatbinwrap") + ModuleID, FatbinWrapper);

    // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
    //                                           void *, void (*)(void **))
    SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
    RegisterLinkedBinaryName += ModuleID;
    llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
        getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);

    assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
    llvm::Value *Args[] = {RegisterGlobalsFunc, FatbinWrapper, ModuleIDConstant,
                           makeDummyFunction(getCallbackFnTy())};
    CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
  }

  // Create the destructor and register it with atexit() the way NVCC does it.
  // Doing it during the regular destructor phase worked in CUDA before 9.2,
  // but results in a double free in 9.2.
  if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
    // extern "C" int atexit(void (*f)(void));
    llvm::FunctionType *AtExitTy =
        llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
    llvm::FunctionCallee AtExitFunc =
        CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
                                  /*Local=*/true);
    CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
  }

  CtorBuilder.CreateRetVoid();
  return ModuleCtorFunc;
}

/// Creates a global destructor function that unregisters the GPU code blob
/// registered by the constructor.
///
/// For CUDA:
/// \code
/// void __cuda_module_dtor() {
///     __cudaUnregisterFatBinary(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_dtor() {
///     if (__hip_gpubin_handle) {
///         __hipUnregisterFatBinary(__hip_gpubin_handle);
///         __hip_gpubin_handle = 0;
///     }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
  // No need for a destructor if we don't have a handle to unregister.
  if (!GpuBinaryHandle)
    return nullptr;

  // void __cudaUnregisterFatBinary(void ** handle);
  llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, PtrTy, false),
      addUnderscoredPrefixToName("UnregisterFatBinary"));

  llvm::Function *ModuleDtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_dtor"), &TheModule);

  llvm::BasicBlock *DtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
  CGBuilderTy DtorBuilder(CGM, Context);
  DtorBuilder.SetInsertPoint(DtorEntryBB);

  Address GpuBinaryAddr(
      GpuBinaryHandle, GpuBinaryHandle->getValueType(),
      CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
  auto *HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
  // There is only one HIP fat binary per linked module, however there are
  // multiple destructor functions. Make sure the fat binary is unregistered
  // only once.
  if (CGM.getLangOpts().HIP) {
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
    llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
    llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
    DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);

    DtorBuilder.SetInsertPoint(IfBlock);
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
    DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
    DtorBuilder.CreateBr(ExitBlock);

    DtorBuilder.SetInsertPoint(ExitBlock);
  } else {
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
  }
  DtorBuilder.CreateRetVoid();
  return ModuleDtorFunc;
}

CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
  return new CGNVCUDARuntime(CGM);
}

void CGNVCUDARuntime::internalizeDeviceSideVar(
    const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) {
  // For -fno-gpu-rdc, host-side shadows of external declarations of
  // device-side global variables become internal definitions. These have to
  // be internal in order to prevent name conflicts with global host variables
  // with the same name in different TUs.
  //
  // For -fgpu-rdc, the shadow variables should not be internalized because
  // they may be accessed by a different TU.
  if (CGM.getLangOpts().GPURelocatableDeviceCode)
    return;

  // __shared__ variables are odd. Shadows do get created, but
  // they are not registered with the CUDA runtime, so they
  // can't really be used to access their device-side
  // counterparts. It's not clear yet whether it's nvcc's bug or
  // a feature, but we've got to do the same for compatibility.
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
      D->hasAttr<CUDASharedAttr>() ||
      D->getType()->isCUDADeviceBuiltinSurfaceType() ||
      D->getType()->isCUDADeviceBuiltinTextureType()) {
    Linkage = llvm::GlobalValue::InternalLinkage;
  }
}

void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
                                            llvm::GlobalVariable &GV) {
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) {
    // Shadow variables and their properties must be registered with the CUDA
    // runtime. Skip extern global variables, which will be registered in
    // the TU where they are defined.
    //
    // Don't register a C++17 inline variable. The local symbol can be
    // discarded and referencing a discarded local symbol from outside the
    // comdat (__cuda_register_globals) is disallowed by the ELF spec.
    //
    // HIP managed variables need to be always recorded in device and host
    // compilations for transformation.
    //
    // HIP managed variables and variables in CUDADeviceVarODRUsedByHost are
    // added to llvm.compiler-used, therefore they are safe to be registered.
    if ((!D->hasExternalStorage() && !D->isInline()) ||
        CGM.getContext().CUDADeviceVarODRUsedByHost.contains(D) ||
        D->hasAttr<HIPManagedAttr>()) {
      registerDeviceVar(D, GV, !D->hasDefinition(),
                        D->hasAttr<CUDAConstantAttr>());
    }
  } else if (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
             D->getType()->isCUDADeviceBuiltinTextureType()) {
    // Builtin surfaces and textures and their template arguments are
    // also registered with the CUDA runtime.
    const auto *TD = cast<ClassTemplateSpecializationDecl>(
        D->getType()->castAs<RecordType>()->getDecl());
    const TemplateArgumentList &Args = TD->getTemplateArgs();
    if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
      assert(Args.size() == 2 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin surface type.");
      auto SurfType = Args[1].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceSurf(D, GV, !D->hasDefinition(), SurfType.getSExtValue());
    } else {
      assert(Args.size() == 3 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin texture type.");
      auto TexType = Args[1].getAsIntegral();
      auto Normalized = Args[2].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceTex(D, GV, !D->hasDefinition(), TexType.getSExtValue(),
                          Normalized.getZExtValue());
    }
  }
}

// Transform managed variables to pointers to managed variables in device code.
// Each use of the original managed variable is replaced by a load from the
// transformed managed variable. The transformed managed variable contains
// the address of managed memory which will be allocated by the runtime.
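//
// As a sketch (illustrative IR, address spaces omitted), for a managed
// variable `__managed__ int x;` the device-side module ends up with:
// \code
//   @x = externally_initialized global ptr null  ; transformed variable
//   @x.managed = global i32 0                    ; original variable, renamed
//   ; every use of x first loads the managed address:
//   %ld.managed = load ptr, ptr @x
//   %v = load i32, ptr %ld.managed
// \endcode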
void CGNVCUDARuntime::transformManagedVars() {
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
        Info.Flags.isManaged()) {
      auto *ManagedVar = new llvm::GlobalVariable(
          CGM.getModule(), Var->getType(),
          /*isConstant=*/false, Var->getLinkage(),
          /*Init=*/Var->isDeclaration()
              ? nullptr
              : llvm::ConstantPointerNull::get(Var->getType()),
          /*Name=*/"", /*InsertBefore=*/nullptr,
          llvm::GlobalVariable::NotThreadLocal,
          CGM.getContext().getTargetAddressSpace(CGM.getLangOpts().CUDAIsDevice
                                                     ? LangAS::cuda_device
                                                     : LangAS::Default));
      ManagedVar->setDSOLocal(Var->isDSOLocal());
      ManagedVar->setVisibility(Var->getVisibility());
      ManagedVar->setExternallyInitialized(true);
      replaceManagedVar(Var, ManagedVar);
      ManagedVar->takeName(Var);
      Var->setName(Twine(ManagedVar->getName()) + ".managed");
      // Keep managed variables even if they are not used in device code since
      // they need to be allocated by the runtime.
      if (CGM.getLangOpts().CUDAIsDevice && !Var->isDeclaration()) {
        assert(!ManagedVar->isDeclaration());
        CGM.addCompilerUsedGlobal(Var);
        CGM.addCompilerUsedGlobal(ManagedVar);
      }
    }
  }
}

// Creates offloading entries for all the kernels and globals that must be
// registered. The linker will provide a pointer to this section so we can
// register the symbols with the linked device image.
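//
// As a sketch, each entry is roughly a constant record of the form
// \code
//   struct __tgt_offload_entry {
//     void *addr;    // kernel handle or variable address
//     char *name;    // device-side (mangled) name
//     size_t size;   // 0 for functions, alloc size for variables
//     int32_t flags; // OffloadGlobal* flags below
//     int32_t data;  // e.g. surface/texture type
//   };
// \endcode
// placed in the named section; the linker-provided __start_/__stop_ symbols
// for that section let the runtime iterate all entries. (Field names here
// follow the LLVM offloading utility and are illustrative.)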
void CGNVCUDARuntime::createOffloadingEntries() {
  StringRef Section = CGM.getLangOpts().HIP ? "hip_offloading_entries"
                                            : "cuda_offloading_entries";
  llvm::Module &M = CGM.getModule();
  for (KernelInfo &I : EmittedKernels)
    llvm::offloading::emitOffloadingEntry(
        M, KernelHandles[I.Kernel->getName()],
        getDeviceSideName(cast<NamedDecl>(I.D)), /*Flags=*/0, /*Data=*/0,
        llvm::offloading::OffloadGlobalEntry, Section);

  for (VarInfo &I : DeviceVars) {
    uint64_t VarSize =
        CGM.getDataLayout().getTypeAllocSize(I.Var->getValueType());
    int32_t Flags =
        (I.Flags.isExtern()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalExtern)
             : 0) |
        (I.Flags.isConstant()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalConstant)
             : 0) |
        (I.Flags.isNormalized()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalNormalized)
             : 0);
    if (I.Flags.getKind() == DeviceVarFlags::Variable) {
      llvm::offloading::emitOffloadingEntry(
          M, I.Var, getDeviceSideName(I.D), VarSize,
          (I.Flags.isManaged() ? llvm::offloading::OffloadGlobalManagedEntry
                               : llvm::offloading::OffloadGlobalEntry) |
              Flags,
          /*Data=*/0, Section);
    } else if (I.Flags.getKind() == DeviceVarFlags::Surface) {
      llvm::offloading::emitOffloadingEntry(
          M, I.Var, getDeviceSideName(I.D), VarSize,
          llvm::offloading::OffloadGlobalSurfaceEntry | Flags,
          I.Flags.getSurfTexType(), Section);
    } else if (I.Flags.getKind() == DeviceVarFlags::Texture) {
      llvm::offloading::emitOffloadingEntry(
          M, I.Var, getDeviceSideName(I.D), VarSize,
          llvm::offloading::OffloadGlobalTextureEntry | Flags,
          I.Flags.getSurfTexType(), Section);
    }
  }
}

// Returns the module constructor to be added.
llvm::Function *CGNVCUDARuntime::finalizeModule() {
  transformManagedVars();
  if (CGM.getLangOpts().CUDAIsDevice) {
    // Mark ODR-used device variables as compiler-used to prevent them from
    // being eliminated by optimization. This is necessary for device variables
    // ODR-used by host functions. Sema correctly marks them as ODR-used no
    // matter whether they are ODR-used by device or host functions.
    //
    // We do not need to do this if the variable has the used attribute since
    // it has already been added.
    //
    // Static device variables have been externalized at this point, therefore
    // variables with LLVM private or internal linkage need not be added.
    for (auto &&Info : DeviceVars) {
      auto Kind = Info.Flags.getKind();
      if (!Info.Var->isDeclaration() &&
          !llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) &&
          (Kind == DeviceVarFlags::Variable ||
           Kind == DeviceVarFlags::Surface ||
           Kind == DeviceVarFlags::Texture) &&
          Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) {
        CGM.addCompilerUsedGlobal(Info.Var);
      }
    }
    return nullptr;
  }
  if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
    createOffloadingEntries();
  else
    return makeModuleCtorFunction();

  return nullptr;
}

llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
                                                    GlobalDecl GD) {
  auto Loc = KernelHandles.find(F->getName());
  if (Loc != KernelHandles.end()) {
    auto OldHandle = Loc->second;
    if (KernelStubs[OldHandle] == F)
      return OldHandle;

    // We've found the function name, but F itself has changed, so we need to
    // update the references.
    if (CGM.getLangOpts().HIP) {
      // For HIP compilation the handle itself does not change, so we only need
      // to update the Stub value.
      KernelStubs[OldHandle] = F;
      return OldHandle;
    }
    // For non-HIP compilation, erase the old Stub and fall through to creating
    // new entries.
    KernelStubs.erase(OldHandle);
  }

  if (!CGM.getLangOpts().HIP) {
    KernelHandles[F->getName()] = F;
    KernelStubs[F] = F;
    return F;
  }

  auto *Var = new llvm::GlobalVariable(
      TheModule, F->getType(), /*isConstant=*/true, F->getLinkage(),
      /*Initializer=*/nullptr,
      CGM.getMangledName(
          GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel)));
  Var->setAlignment(CGM.getPointerAlign().getAsAlign());
  Var->setDSOLocal(F->isDSOLocal());
  Var->setVisibility(F->getVisibility());
  auto *FD = cast<FunctionDecl>(GD.getDecl());
  auto *FT = FD->getPrimaryTemplate();
  if (!FT || FT->isThisDeclarationADefinition())
    CGM.maybeSetTrivialComdat(*FD, *Var);
  KernelHandles[F->getName()] = Var;
  KernelStubs[Var] = F;
  return Var;
}