//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a class for CUDA code generation targeting the NVIDIA CUDA
// runtime library.
//
//===----------------------------------------------------------------------===//

#include "CGCUDARuntime.h"
#include "CGCXXABI.h"
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "clang/AST/Decl.h"
#include "clang/Basic/Cuda.h"
#include "clang/CodeGen/CodeGenABITypes.h"
#include "clang/CodeGen/ConstantInitBuilder.h"
#include "llvm/Frontend/Offloading/Utility.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/VirtualFileSystem.h"

using namespace clang;
using namespace CodeGen;
namespace {
constexpr unsigned CudaFatMagic = 0x466243b1;
constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"

class CGNVCUDARuntime : public CGCUDARuntime {

private:
  llvm::IntegerType *IntTy, *SizeTy;
  llvm::Type *VoidTy;
  llvm::PointerType *PtrTy;

  /// Convenience reference to LLVM Context
  llvm::LLVMContext &Context;
  /// Convenience reference to the current module
  llvm::Module &TheModule;
  /// Keeps track of kernel launch stubs and handles emitted in this module
  struct KernelInfo {
    llvm::Function *Kernel; // stub function to help launch kernel
    const Decl *D;
  };
  llvm::SmallVector<KernelInfo, 16> EmittedKernels;
  // Map a kernel mangled name to a symbol for identifying the kernel in host
  // code. For CUDA, the symbol for identifying the kernel is the same as the
  // device stub function. For HIP, they are different.
  llvm::DenseMap<StringRef, llvm::GlobalValue *> KernelHandles;
  // Map a kernel handle to the kernel stub.
  llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
  struct VarInfo {
    llvm::GlobalVariable *Var;
    const VarDecl *D;
    DeviceVarFlags Flags;
  };
  llvm::SmallVector<VarInfo, 16> DeviceVars;
  /// Keeps track of variable containing handle of GPU binary. Populated by
  /// ModuleCtorFunction() and used to create corresponding cleanup calls in
  /// ModuleDtorFunction()
  llvm::GlobalVariable *GpuBinaryHandle = nullptr;
  /// Whether we generate relocatable device code.
  bool RelocatableDeviceCode;
  /// Mangle context for device.
  std::unique_ptr<MangleContext> DeviceMC;
  /// Some zeros used for GEPs.
  llvm::Constant *Zeros[2];

  llvm::FunctionCallee getSetupArgumentFn() const;
  llvm::FunctionCallee getLaunchFn() const;

  llvm::FunctionType *getRegisterGlobalsFnTy() const;
  llvm::FunctionType *getCallbackFnTy() const;
  llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
  std::string addPrefixToName(StringRef FuncName) const;
  std::string addUnderscoredPrefixToName(StringRef FuncName) const;

  /// Creates a function to register all kernel stubs generated in this module.
  llvm::Function *makeRegisterGlobalsFn();

  /// Helper function that generates a constant string and returns a pointer to
  /// the start of the string. The result of this function can be used anywhere
  /// where the C code specifies const char*.
  llvm::Constant *makeConstantString(const std::string &Str,
                                     const std::string &Name = "") {
    auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
    return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
                                                ConstStr.getPointer(), Zeros);
  }

  /// Helper function which generates an initialized constant array from Str,
  /// and optionally sets section name and alignment. AddNull specifies whether
  /// the array should have NUL termination.
  llvm::Constant *makeConstantArray(StringRef Str,
                                    StringRef Name = "",
                                    StringRef SectionName = "",
                                    unsigned Alignment = 0,
                                    bool AddNull = false) {
    llvm::Constant *Value =
        llvm::ConstantDataArray::getString(Context, Str, AddNull);
    auto *GV = new llvm::GlobalVariable(
        TheModule, Value->getType(), /*isConstant=*/true,
        llvm::GlobalValue::PrivateLinkage, Value, Name);
    if (!SectionName.empty()) {
      GV->setSection(SectionName);
      // Mark the address as used, which makes sure that this section isn't
      // merged and we will really have it in the object file.
      GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
    }
    if (Alignment)
      GV->setAlignment(llvm::Align(Alignment));
    return llvm::ConstantExpr::getGetElementPtr(GV->getValueType(), GV, Zeros);
  }

  /// Helper function that generates an empty dummy function returning void.
  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
    assert(FnTy->getReturnType()->isVoidTy() &&
           "Can only generate dummy functions returning void!");
    llvm::Function *DummyFunc = llvm::Function::Create(
        FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);

    llvm::BasicBlock *DummyBlock =
        llvm::BasicBlock::Create(Context, "", DummyFunc);
    CGBuilderTy FuncBuilder(CGM, Context);
    FuncBuilder.SetInsertPoint(DummyBlock);
    FuncBuilder.CreateRetVoid();

    return DummyFunc;
  }
  void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
  void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
  std::string getDeviceSideName(const NamedDecl *ND) override;

  void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, bool Constant) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Variable, Extern, Constant,
                           VD->hasAttr<HIPManagedAttr>(),
                           /*Normalized*/ false, 0}});
  }
  void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
                          bool Extern, int Type) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
                           /*Managed*/ false,
                           /*Normalized*/ false, Type}});
  }
  void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, int Type, bool Normalized) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
                           /*Managed*/ false, Normalized, Type}});
  }

  /// Creates module constructor function
  llvm::Function *makeModuleCtorFunction();
  /// Creates module destructor function
  llvm::Function *makeModuleDtorFunction();
  /// Transform managed variables for device compilation.
  void transformManagedVars();
  /// Create offloading entries to register globals in RDC mode.
  void createOffloadingEntries();

public:
  CGNVCUDARuntime(CodeGenModule &CGM);

  llvm::GlobalValue *getKernelHandle(llvm::Function *F, GlobalDecl GD) override;
  llvm::Function *getKernelStub(llvm::GlobalValue *Handle) override {
    auto Loc = KernelStubs.find(Handle);
    assert(Loc != KernelStubs.end());
    return Loc->second;
  }
  void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
  void handleVarRegistration(const VarDecl *VD,
                             llvm::GlobalVariable &Var) override;
  void
  internalizeDeviceSideVar(const VarDecl *D,
                           llvm::GlobalValue::LinkageTypes &Linkage) override;

  llvm::Function *finalizeModule() override;
};

} // end anonymous namespace

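// A quick illustration of the two helpers below (a sketch): in CUDA mode,
// addPrefixToName("LaunchKernel") yields "cudaLaunchKernel" and
// addUnderscoredPrefixToName("RegisterFatBinary") yields
// "__cudaRegisterFatBinary"; in HIP mode the prefixes are "hip" and "__hip".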
std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("hip") + Twine(FuncName)).str());
  return ((Twine("cuda") + Twine(FuncName)).str());
}
std::string
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("__hip") + Twine(FuncName)).str());
  return ((Twine("__cuda") + Twine(FuncName)).str());
}

static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) {
  // If the host and device have different C++ ABIs, mark it as the device
  // mangle context so that the mangling needs to retrieve the additional
  // device lambda mangling number instead of the regular host one.
  if (CGM.getContext().getAuxTargetInfo() &&
      CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
      CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) {
    return std::unique_ptr<MangleContext>(
        CGM.getContext().createDeviceMangleContext(
            *CGM.getContext().getAuxTargetInfo()));
  }

  return std::unique_ptr<MangleContext>(CGM.getContext().createMangleContext(
      CGM.getContext().getAuxTargetInfo()));
}

CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
      TheModule(CGM.getModule()),
      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
      DeviceMC(InitDeviceMC(CGM)) {
  IntTy = CGM.IntTy;
  SizeTy = CGM.SizeTy;
  VoidTy = CGM.VoidTy;
  Zeros[0] = llvm::ConstantInt::get(SizeTy, 0);
  Zeros[1] = Zeros[0];
  PtrTy = CGM.UnqualPtrTy;
}

llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
  // cudaError_t cudaSetupArgument(void *, size_t, size_t)
  llvm::Type *Params[] = {PtrTy, SizeTy, SizeTy};
  return CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, Params, false),
      addPrefixToName("SetupArgument"));
}

llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
  if (CGM.getLangOpts().HIP) {
    // hipError_t hipLaunchByPtr(char *);
    return CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(IntTy, PtrTy, false), "hipLaunchByPtr");
  }
  // cudaError_t cudaLaunch(char *);
  return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy, PtrTy, false),
                                   "cudaLaunch");
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
  return llvm::FunctionType::get(VoidTy, PtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
  return llvm::FunctionType::get(VoidTy, PtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
  llvm::Type *Params[] = {llvm::PointerType::getUnqual(Context), PtrTy, PtrTy,
                          llvm::PointerType::getUnqual(Context)};
  return llvm::FunctionType::get(VoidTy, Params, false);
}

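// getDeviceSideName() below deliberately mangles with the device-side mangle
// context when compiling for the host, so the name it returns matches the
// symbol in the device image. For example (a sketch), a file-scope
// `static __device__ int v;` compiled for HIP with -fgpu-rdc gets a
// CUID-derived postfix appended, giving each TU a distinct external name.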
std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
  GlobalDecl GD;
  // ND could be either a kernel or a variable.
  if (auto *FD = dyn_cast<FunctionDecl>(ND))
    GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
  else
    GD = GlobalDecl(ND);
  std::string DeviceSideName;
  MangleContext *MC;
  if (CGM.getLangOpts().CUDAIsDevice)
    MC = &CGM.getCXXABI().getMangleContext();
  else
    MC = DeviceMC.get();
  if (MC->shouldMangleDeclName(ND)) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    MC->mangleName(GD, Out);
    DeviceSideName = std::string(Out.str());
  } else
    DeviceSideName = std::string(ND->getIdentifier()->getName());

  // Make a unique name for the device-side static file-scope variable for HIP.
  if (CGM.getContext().shouldExternalize(ND) &&
      CGM.getLangOpts().GPURelocatableDeviceCode) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    Out << DeviceSideName;
    CGM.printPostfixForExternalizedDecl(Out, ND);
    DeviceSideName = std::string(Out.str());
  }
  return DeviceSideName;
}

void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
                                     FunctionArgList &Args) {
  EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
  if (auto *GV =
          dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn->getName()])) {
    GV->setLinkage(CGF.CurFn->getLinkage());
    GV->setInitializer(CGF.CurFn);
  }
  if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                         CudaFeature::CUDA_USES_NEW_LAUNCH) ||
      (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
    emitDeviceStubBodyNew(CGF, Args);
  else
    emitDeviceStubBodyLegacy(CGF, Args);
}

// CUDA 9.0+ uses a new way to launch kernels. Parameters are packed in a local
// array and kernels are launched using cudaLaunchKernel().
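// As a sketch (illustrative kernel and parameter names), the stub emitted
// below behaves like this host code:
//
//   void __device_stub__kernel(int x, float *p) {
//     void *kernel_args[] = {&x, &p};
//     dim3 grid_dim, block_dim; size_t shmem_size; cudaStream_t stream;
//     __cudaPopCallConfiguration(&grid_dim, &block_dim, &shmem_size, &stream);
//     cudaLaunchKernel((void *)kernel_handle, grid_dim, block_dim,
//                      kernel_args, shmem_size, stream);
//   }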
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
                                            FunctionArgList &Args) {
  // Build the shadow stack entry at the very start of the function.

  // Calculate amount of space we will need for all arguments. If we have no
  // args, allocate a single pointer so we still have a valid pointer to the
  // argument array that we can pass to runtime, even if it will be unused.
  Address KernelArgs = CGF.CreateTempAlloca(
      PtrTy, CharUnits::fromQuantity(16), "kernel_args",
      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
  // Store pointers to the arguments in a locally allocated launch_args.
  for (unsigned i = 0; i < Args.size(); ++i) {
    llvm::Value *VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, PtrTy);
    CGF.Builder.CreateDefaultAlignedStore(
        VoidVarPtr,
        CGF.Builder.CreateConstGEP1_32(PtrTy, KernelArgs.getPointer(), i));
  }

  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");

  // Look up the cudaLaunchKernel/hipLaunchKernel function.
  // The HIP kernel launching API name depends on the -fgpu-default-stream
  // option: for the default value 'legacy' it is hipLaunchKernel, and for
  // 'per-thread' it is hipLaunchKernel_spt.
  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                              void **args, size_t sharedMem,
  //                              cudaStream_t stream);
  // hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim,
  //                                  dim3 blockDim, void **args,
  //                                  size_t sharedMem, hipStream_t stream);
  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
  std::string KernelLaunchAPI = "LaunchKernel";
  if (CGF.getLangOpts().GPUDefaultStream ==
      LangOptions::GPUDefaultStreamKind::PerThread) {
    if (CGF.getLangOpts().HIP)
      KernelLaunchAPI = KernelLaunchAPI + "_spt";
    else if (CGF.getLangOpts().CUDA)
      KernelLaunchAPI = KernelLaunchAPI + "_ptsz";
  }
  auto LaunchKernelName = addPrefixToName(KernelLaunchAPI);
  IdentifierInfo &cudaLaunchKernelII =
      CGM.getContext().Idents.get(LaunchKernelName);
  FunctionDecl *cudaLaunchKernelFD = nullptr;
  for (auto *Result : DC->lookup(&cudaLaunchKernelII)) {
    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
      cudaLaunchKernelFD = FD;
  }

  if (cudaLaunchKernelFD == nullptr) {
    CGM.Error(CGF.CurFuncDecl->getLocation(),
              "Can't find declaration for " + LaunchKernelName);
    return;
  }
  // Create temporary dim3 grid_dim, block_dim.
  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
  QualType Dim3Ty = GridDimParam->getType();
  Address GridDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
  Address BlockDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
  Address ShmemSize =
      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
  Address Stream = CGF.CreateTempAlloca(PtrTy, CGM.getPointerAlign(), "stream");
  llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy,
                              {/*gridDim=*/GridDim.getType(),
                               /*blockDim=*/BlockDim.getType(),
                               /*ShmemSize=*/ShmemSize.getType(),
                               /*Stream=*/Stream.getType()},
                              /*isVarArg=*/false),
      addUnderscoredPrefixToName("PopCallConfiguration"));

  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
                              {GridDim.getPointer(), BlockDim.getPointer(),
                               ShmemSize.getPointer(), Stream.getPointer()});

  // Emit the call to cudaLaunchKernel.
  llvm::Value *Kernel =
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn->getName()], PtrTy);
  CallArgList LaunchKernelArgs;
  LaunchKernelArgs.add(RValue::get(Kernel),
                       cudaLaunchKernelFD->getParamDecl(0)->getType());
  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
                       cudaLaunchKernelFD->getParamDecl(3)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
                       cudaLaunchKernelFD->getParamDecl(4)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
                       cudaLaunchKernelFD->getParamDecl(5)->getType());

  QualType QT = cudaLaunchKernelFD->getType();
  QualType CQT = QT.getCanonicalType();
  llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
  llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);

  const CGFunctionInfo &FI =
      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
  llvm::FunctionCallee cudaLaunchKernelFn =
      CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
               LaunchKernelArgs);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

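// A sketch (illustrative names) of the legacy stub emitted below: each
// argument is pushed with its size and aligned offset in the parameter
// buffer, and the launch happens only if every setup call returned success:
//
//   void __device_stub__kernel(int x, float *p) {
//     if (cudaSetupArgument(&x, sizeof(x), /*offset=*/0) == 0)
//       if (cudaSetupArgument(&p, sizeof(p), /*offset=*/8) == 0)
//         cudaLaunch((char *)kernel_handle);
//   }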
void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
                                               FunctionArgList &Args) {
  // Emit a call to cudaSetupArgument for each arg in Args.
  llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
  CharUnits Offset = CharUnits::Zero();
  for (const VarDecl *A : Args) {
    auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
    Offset = Offset.alignTo(TInfo.Align);
    llvm::Value *Args[] = {
        CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
                                      PtrTy),
        llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
        llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
    };
    llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
    llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
    llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
    CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
    CGF.EmitBlock(NextBlock);
    Offset += TInfo.Width;
  }

  // Emit the call to cudaLaunch.
  llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
  llvm::Value *Arg =
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn->getName()], PtrTy);
  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

// Replace the original variable Var with the address loaded from variable
// ManagedVar populated by the HIP runtime.
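// For example (an IR sketch, using the names assigned by the callers below):
//   %v = load i32, ptr @x        ; original use of the managed variable
// becomes
//   %p = load ptr, ptr @x        ; @x now holds the runtime-provided address
//   %v = load i32, ptr %p        ; the data itself lives in @x.managed
// Constant-expression users are first rewritten into instructions so they can
// refer to the loaded pointer.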
static void replaceManagedVar(llvm::GlobalVariable *Var,
                              llvm::GlobalVariable *ManagedVar) {
  SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
  for (auto &&VarUse : Var->uses()) {
    WorkList.push_back({VarUse.getUser()});
  }
  while (!WorkList.empty()) {
    auto &&WorkItem = WorkList.pop_back_val();
    auto *U = WorkItem.back();
    if (isa<llvm::ConstantExpr>(U)) {
      for (auto &&UU : U->uses()) {
        WorkItem.push_back(UU.getUser());
        WorkList.push_back(WorkItem);
        WorkItem.pop_back();
      }
      continue;
    }
    if (auto *I = dyn_cast<llvm::Instruction>(U)) {
      llvm::Value *OldV = Var;
      llvm::Instruction *NewV =
          new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false,
                             llvm::Align(Var->getAlignment()), I);
      WorkItem.pop_back();
      // Replace constant expressions directly or indirectly using the managed
      // variable with instructions.
      for (auto &&Op : WorkItem) {
        auto *CE = cast<llvm::ConstantExpr>(Op);
        auto *NewInst = CE->getAsInstruction(I);
        NewInst->replaceUsesOfWith(OldV, NewV);
        OldV = CE;
        NewV = NewInst;
      }
      I->replaceUsesOfWith(OldV, NewV);
    } else {
      llvm_unreachable("Invalid use of managed variable");
    }
  }
}

/// Creates a function that sets up state on the host side for CUDA objects that
/// have a presence on both the host and device sides. Specifically, registers
/// the host side of kernel functions and device global variables with the CUDA
/// runtime.
/// \code
/// void __cuda_register_globals(void** GpuBinaryHandle) {
///     __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
///     ...
///     __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
///     __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
///     ...
///     __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
  // No need to register anything
  if (EmittedKernels.empty() && DeviceVars.empty())
    return nullptr;

  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_register_globals"), &TheModule);
  llvm::BasicBlock *EntryBB =
      llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
  CGBuilderTy Builder(CGM, Context);
  Builder.SetInsertPoint(EntryBB);

  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
  //                             int, uint3*, uint3*, dim3*, dim3*, int*)
  llvm::Type *RegisterFuncParams[] = {
      PtrTy, PtrTy, PtrTy, PtrTy, IntTy,
      PtrTy, PtrTy, PtrTy, PtrTy, llvm::PointerType::getUnqual(Context)};
  llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
      addUnderscoredPrefixToName("RegisterFunction"));

  // Extract GpuBinaryHandle passed as the first argument to
  // __cuda_register_globals() and generate a __cudaRegisterFunction() call
  // for each emitted kernel.
  llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
  for (auto &&I : EmittedKernels) {
    llvm::Constant *KernelName =
        makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(PtrTy);
    llvm::Value *Args[] = {
        &GpuBinaryHandlePtr,
        KernelHandles[I.Kernel->getName()],
        KernelName,
        KernelName,
        llvm::ConstantInt::get(IntTy, -1),
        NullPtr,
        NullPtr,
        NullPtr,
        NullPtr,
        llvm::ConstantPointerNull::get(llvm::PointerType::getUnqual(Context))};
    Builder.CreateCall(RegisterFunc, Args);
  }

  llvm::Type *VarSizeTy = IntTy;
  // For HIP or CUDA 9.0+, the device variable size is of type `size_t`.
  if (CGM.getLangOpts().HIP ||
      ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
    VarSizeTy = SizeTy;

  // void __cudaRegisterVar(void **, char *, char *, const char *,
  //                        int, int, int, int)
  llvm::Type *RegisterVarParams[] = {PtrTy, PtrTy, PtrTy, PtrTy,
                                     IntTy, VarSizeTy, IntTy, IntTy};
  llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
      addUnderscoredPrefixToName("RegisterVar"));
  // void __hipRegisterManagedVar(void **, char *, char *, const char *,
  //                              size_t, unsigned)
  llvm::Type *RegisterManagedVarParams[] = {PtrTy, PtrTy, PtrTy,
                                            PtrTy, VarSizeTy, IntTy};
  llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
      addUnderscoredPrefixToName("RegisterManagedVar"));
  // void __cudaRegisterSurface(void **, const struct surfaceReference *,
  //                            const void **, const char *, int, int);
  llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy}, false),
      addUnderscoredPrefixToName("RegisterSurface"));
  // void __cudaRegisterTexture(void **, const struct textureReference *,
  //                            const void **, const char *, int, int, int)
  llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy, IntTy}, false),
      addUnderscoredPrefixToName("RegisterTexture"));
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
           "External variables should not show up here, except HIP managed "
           "variables");
    llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
    switch (Info.Flags.getKind()) {
    case DeviceVarFlags::Variable: {
      uint64_t VarSize =
          CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
      if (Info.Flags.isManaged()) {
        auto *ManagedVar = new llvm::GlobalVariable(
            CGM.getModule(), Var->getType(),
            /*isConstant=*/false, Var->getLinkage(),
            /*Init=*/Var->isDeclaration()
                ? nullptr
                : llvm::ConstantPointerNull::get(Var->getType()),
            /*Name=*/"", /*InsertBefore=*/nullptr,
            llvm::GlobalVariable::NotThreadLocal);
        ManagedVar->setDSOLocal(Var->isDSOLocal());
        ManagedVar->setVisibility(Var->getVisibility());
        ManagedVar->setExternallyInitialized(true);
        ManagedVar->takeName(Var);
        Var->setName(Twine(ManagedVar->getName() + ".managed"));
        replaceManagedVar(Var, ManagedVar);
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            ManagedVar,
            Var,
            VarName,
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Var->getAlignment())};
        if (!Var->isDeclaration())
          Builder.CreateCall(RegisterManagedVar, Args);
      } else {
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            Var,
            VarName,
            VarName,
            llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
            llvm::ConstantInt::get(IntTy, 0)};
        Builder.CreateCall(RegisterVar, Args);
      }
      break;
    }
    case DeviceVarFlags::Surface:
      Builder.CreateCall(
          RegisterSurf,
          {&GpuBinaryHandlePtr, Var, VarName, VarName,
           llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    case DeviceVarFlags::Texture:
      Builder.CreateCall(
          RegisterTex,
          {&GpuBinaryHandlePtr, Var, VarName, VarName,
           llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    }
  }

  Builder.CreateRetVoid();
  return RegisterKernelsFunc;
}

/// Creates a global constructor function for the module:
///
/// For CUDA:
/// \code
/// void __cuda_module_ctor() {
///     Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
///     __cuda_register_globals(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_ctor() {
///     if (__hip_gpubin_handle == 0) {
///         __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
///         __hip_register_globals(__hip_gpubin_handle);
///     }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
  bool IsHIP = CGM.getLangOpts().HIP;
  bool IsCUDA = CGM.getLangOpts().CUDA;
  // No need to generate ctors/dtors if there is no GPU binary.
  StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
  if (CudaGpuBinaryFileName.empty() && !IsHIP)
    return nullptr;
  if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
      DeviceVars.empty())
    return nullptr;

  // void __{cuda|hip}_register_globals(void* handle);
  llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
  // We always need a function to pass in as callback. Create a dummy
  // implementation if we don't need to register anything.
  if (RelocatableDeviceCode && !RegisterGlobalsFunc)
    RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());

  // void ** __{cuda|hip}RegisterFatBinary(void *);
  llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(PtrTy, PtrTy, false),
      addUnderscoredPrefixToName("RegisterFatBinary"));
  // struct { int magic, int version, void * gpu_binary, void * dont_care };
  llvm::StructType *FatbinWrapperTy =
      llvm::StructType::get(IntTy, IntTy, PtrTy, PtrTy);

  // Register GPU binary with the CUDA runtime, store returned handle in a
  // global variable and save a reference in GpuBinaryHandle to be cleaned up
  // in destructor on exit. Then associate all known kernels with the GPU binary
  // handle so CUDA runtime can figure out what to call on the GPU side.
  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
  if (!CudaGpuBinaryFileName.empty()) {
    auto VFS = CGM.getFileSystem();
    auto CudaGpuBinaryOrErr =
        VFS->getBufferForFile(CudaGpuBinaryFileName, -1, false);
    if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
      CGM.getDiags().Report(diag::err_cannot_open_file)
          << CudaGpuBinaryFileName << EC.message();
      return nullptr;
    }
    CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
  }

  llvm::Function *ModuleCtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_ctor"), &TheModule);
  llvm::BasicBlock *CtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
  CGBuilderTy CtorBuilder(CGM, Context);

  CtorBuilder.SetInsertPoint(CtorEntryBB);

  const char *FatbinConstantName;
  const char *FatbinSectionName;
  const char *ModuleIDSectionName;
  StringRef ModuleIDPrefix;
  llvm::Constant *FatBinStr;
  unsigned FatMagic;
  if (IsHIP) {
    FatbinConstantName = ".hip_fatbin";
    FatbinSectionName = ".hipFatBinSegment";

    ModuleIDSectionName = "__hip_module_id";
    ModuleIDPrefix = "__hip_";

    if (CudaGpuBinary) {
      // If fatbin is available from early finalization, create a string
      // literal containing the fat binary loaded from the given file.
      const unsigned HIPCodeObjectAlign = 4096;
      FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "",
                                    FatbinConstantName, HIPCodeObjectAlign);
    } else {
      // If fatbin is not available, create an external symbol
      // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
      // to contain the fat binary but will be populated somewhere else,
      // e.g. by lld through link script.
      FatBinStr = new llvm::GlobalVariable(
          CGM.getModule(), CGM.Int8Ty,
          /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
          "__hip_fatbin_" + CGM.getContext().getCUIDHash(), nullptr,
          llvm::GlobalVariable::NotThreadLocal);
      cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
    }

    FatMagic = HIPFatMagic;
  } else {
    if (RelocatableDeviceCode)
      FatbinConstantName = CGM.getTriple().isMacOSX()
                               ? "__NV_CUDA,__nv_relfatbin"
                               : "__nv_relfatbin";
    else
      FatbinConstantName =
          CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
    // NVIDIA's cuobjdump looks for fatbins in this section.
    FatbinSectionName =
        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";

    ModuleIDSectionName = CGM.getTriple().isMacOSX()
                              ? "__NV_CUDA,__nv_module_id"
                              : "__nv_module_id";
    ModuleIDPrefix = "__nv_";

    // For CUDA, create a string literal containing the fat binary loaded from
    // the given file.
    FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "",
                                  FatbinConstantName, 8);
    FatMagic = CudaFatMagic;
  }

  // Create initialized wrapper structure that points to the loaded GPU binary
  ConstantInitBuilder Builder(CGM);
  auto Values = Builder.beginStruct(FatbinWrapperTy);
  // Fatbin wrapper magic.
  Values.addInt(IntTy, FatMagic);
  // Fatbin version.
  Values.addInt(IntTy, 1);
  // Data.
  Values.add(FatBinStr);
  // Unused in fatbin v1.
  Values.add(llvm::ConstantPointerNull::get(PtrTy));
  llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
      addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
      /*constant*/ true);
  FatbinWrapper->setSection(FatbinSectionName);
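  // On the default CUDA path the wrapper comes out roughly as (an IR sketch):
  //   @__cuda_fatbin_wrapper = internal constant { i32, i32, ptr, ptr }
  //       { i32 <CudaFatMagic>, i32 1, ptr @<fatbin>, ptr null },
  //       section ".nvFatBinSegment"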

  // There is only one HIP fat binary per linked module, however there are
  // multiple constructor functions. Make sure the fat binary is registered
  // only once. The constructor functions are executed by the dynamic loader
  // before the program gains control. The dynamic loader cannot execute the
  // constructor functions concurrently since doing that would not guarantee
  // thread safety of the loaded program. Therefore we can assume sequential
  // execution of constructor functions here.
  if (IsHIP) {
    auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage
                                 : llvm::GlobalValue::ExternalLinkage;
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
    // The name, size, and initialization pattern of this variable are part
    // of the HIP ABI.
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, PtrTy, /*isConstant=*/false, Linkage,
        /*Initializer=*/
        CudaGpuBinary ? llvm::ConstantPointerNull::get(PtrTy) : nullptr,
        CudaGpuBinary
            ? "__hip_gpubin_handle"
            : "__hip_gpubin_handle_" + CGM.getContext().getCUIDHash());
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    // Prevent the weak symbol in different shared libraries from being merged.
    if (Linkage != llvm::GlobalValue::InternalLinkage)
      GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
    Address GpuBinaryAddr(
        GpuBinaryHandle, PtrTy,
        CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
    {
      auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
      llvm::Constant *Zero =
          llvm::Constant::getNullValue(HandleValue->getType());
      llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
      CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(IfBlock);
      // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
      llvm::CallInst *RegisterFatbinCall =
          CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
      CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
      CtorBuilder.CreateBr(ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(ExitBlock);
      // Call __hip_register_globals(GpuBinaryHandle);
      if (RegisterGlobalsFunc) {
        auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
        CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
      }
    }
  } else if (!RelocatableDeviceCode) {
    // Register binary with CUDA runtime. This is substantially different in
    // default mode vs. separate compilation!
    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
    llvm::CallInst *RegisterFatbinCall =
        CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, PtrTy, false, llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(PtrTy), "__cuda_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
                                   CGM.getPointerAlign());

    // Call __cuda_register_globals(GpuBinaryHandle);
    if (RegisterGlobalsFunc)
      CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);

    // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
    if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                           CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
      // void __cudaRegisterFatBinaryEnd(void **);
      llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
          llvm::FunctionType::get(VoidTy, PtrTy, false),
          "__cudaRegisterFatBinaryEnd");
      CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
    }
  } else {
    // Generate a unique module ID.
    SmallString<64> ModuleID;
    llvm::raw_svector_ostream OS(ModuleID);
    OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
    llvm::Constant *ModuleIDConstant = makeConstantArray(
        std::string(ModuleID), "", ModuleIDSectionName, 32, /*AddNull=*/true);

    // Create an alias for the FatbinWrapper that nvcc will look for.
    llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
                              Twine("__fatbinwrap") + ModuleID, FatbinWrapper);

    // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
    //                                           void *, void (*)(void **))
    SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
    RegisterLinkedBinaryName += ModuleID;
    llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
        getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);

    assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
    llvm::Value *Args[] = {RegisterGlobalsFunc, FatbinWrapper, ModuleIDConstant,
                           makeDummyFunction(getCallbackFnTy())};
    CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
  }

  // Create destructor and register it with atexit() the way NVCC does it. Doing
  // it during regular destructor phase worked in CUDA before 9.2 but results in
  // double-free in 9.2.
  if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
    // extern "C" int atexit(void (*f)(void));
    llvm::FunctionType *AtExitTy =
        llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
    llvm::FunctionCallee AtExitFunc =
        CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
                                  /*Local=*/true);
    CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
  }

  CtorBuilder.CreateRetVoid();
  return ModuleCtorFunc;
}

/// Creates a global destructor function that unregisters the GPU code blob
/// registered by constructor.
///
/// For CUDA:
/// \code
/// void __cuda_module_dtor() {
///     __cudaUnregisterFatBinary(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_dtor() {
///     if (__hip_gpubin_handle) {
///         __hipUnregisterFatBinary(__hip_gpubin_handle);
///         __hip_gpubin_handle = 0;
///     }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
  // No need for destructor if we don't have a handle to unregister.
  if (!GpuBinaryHandle)
    return nullptr;

  // void __cudaUnregisterFatBinary(void ** handle);
  llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, PtrTy, false),
      addUnderscoredPrefixToName("UnregisterFatBinary"));

  llvm::Function *ModuleDtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_dtor"), &TheModule);

  llvm::BasicBlock *DtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
  CGBuilderTy DtorBuilder(CGM, Context);
  DtorBuilder.SetInsertPoint(DtorEntryBB);

  Address GpuBinaryAddr(
      GpuBinaryHandle, GpuBinaryHandle->getValueType(),
      CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
  auto *HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
  // There is only one HIP fat binary per linked module, however there are
  // multiple destructor functions. Make sure the fat binary is unregistered
  // only once.
  if (CGM.getLangOpts().HIP) {
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
    llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
    llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
    DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);

    DtorBuilder.SetInsertPoint(IfBlock);
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
    DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
    DtorBuilder.CreateBr(ExitBlock);

    DtorBuilder.SetInsertPoint(ExitBlock);
  } else {
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
  }
  DtorBuilder.CreateRetVoid();
  return ModuleDtorFunc;
}

CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
  return new CGNVCUDARuntime(CGM);
}

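// For instance (a sketch), with -fno-gpu-rdc the host-side shadow of
//   extern __device__ int x;
// is forced to internal linkage below, so it cannot collide with an
// unrelated host-side `x` defined in another TU.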
void CGNVCUDARuntime::internalizeDeviceSideVar(
    const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) {
  // For -fno-gpu-rdc, host-side shadows of external declarations of device-side
  // global variables become internal definitions. These have to be internal in
  // order to prevent name conflicts with global host variables with the same
  // name in different TUs.
  //
  // For -fgpu-rdc, the shadow variables should not be internalized because
  // they may be accessed by a different TU.
  if (CGM.getLangOpts().GPURelocatableDeviceCode)
    return;

  // __shared__ variables are odd. Shadows do get created, but
  // they are not registered with the CUDA runtime, so they
  // can't really be used to access their device-side
  // counterparts. It's not clear yet whether it's nvcc's bug or
  // a feature, but we've got to do the same for compatibility.
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
      D->hasAttr<CUDASharedAttr>() ||
      D->getType()->isCUDADeviceBuiltinSurfaceType() ||
      D->getType()->isCUDADeviceBuiltinTextureType()) {
    Linkage = llvm::GlobalValue::InternalLinkage;
  }
}

void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
                                            llvm::GlobalVariable &GV) {
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) {
    // Shadow variables and their properties must be registered with CUDA
    // runtime. Skip Extern global variables, which will be registered in
    // the TU where they are defined.
    //
    // Don't register a C++17 inline variable. The local symbol can be
    // discarded and referencing a discarded local symbol from outside the
    // comdat (__cuda_register_globals) is disallowed by the ELF spec.
    //
    // HIP managed variables need to be always recorded in device and host
    // compilations for transformation.
    //
    // HIP managed variables and variables in CUDADeviceVarODRUsedByHost are
    // added to llvm.compiler-used, therefore they are safe to be registered.
    if ((!D->hasExternalStorage() && !D->isInline()) ||
        CGM.getContext().CUDADeviceVarODRUsedByHost.contains(D) ||
        D->hasAttr<HIPManagedAttr>()) {
      registerDeviceVar(D, GV, !D->hasDefinition(),
                        D->hasAttr<CUDAConstantAttr>());
    }
  } else if (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
             D->getType()->isCUDADeviceBuiltinTextureType()) {
    // Builtin surfaces and textures and their template arguments are
    // also registered with CUDA runtime.
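    // For example (assuming the usual CUDA texture template), a declaration
    // like
    //   texture<float, cudaTextureType2D, cudaReadModeNormalizedFloat> t;
    // is registered with Type = cudaTextureType2D and Normalized = true.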
    const auto *TD = cast<ClassTemplateSpecializationDecl>(
        D->getType()->castAs<RecordType>()->getDecl());
    const TemplateArgumentList &Args = TD->getTemplateArgs();
    if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
      assert(Args.size() == 2 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin surface type.");
      auto SurfType = Args[1].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceSurf(D, GV, !D->hasDefinition(), SurfType.getSExtValue());
    } else {
      assert(Args.size() == 3 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin texture type.");
      auto TexType = Args[1].getAsIntegral();
      auto Normalized = Args[2].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceTex(D, GV, !D->hasDefinition(), TexType.getSExtValue(),
                          Normalized.getZExtValue());
    }
  }
}

// Transform managed variables to pointers to managed variables in device code.
// Each use of the original managed variable is replaced by a load from the
// transformed managed variable. The transformed managed variable contains
// the address of managed memory which will be allocated by the runtime.
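// As a sketch, for device compilation
//   __managed__ int x;
// becomes two globals: @x, a pointer in the device address space that the
// runtime fills in with the managed allocation, and @x.managed, the original
// storage; every device-side use of x is rewritten to load the pointer first.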
void CGNVCUDARuntime::transformManagedVars() {
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
        Info.Flags.isManaged()) {
      auto *ManagedVar = new llvm::GlobalVariable(
          CGM.getModule(), Var->getType(),
          /*isConstant=*/false, Var->getLinkage(),
          /*Init=*/Var->isDeclaration()
              ? nullptr
              : llvm::ConstantPointerNull::get(Var->getType()),
          /*Name=*/"", /*InsertBefore=*/nullptr,
          llvm::GlobalVariable::NotThreadLocal,
          CGM.getContext().getTargetAddressSpace(LangAS::cuda_device));
      ManagedVar->setDSOLocal(Var->isDSOLocal());
      ManagedVar->setVisibility(Var->getVisibility());
      ManagedVar->setExternallyInitialized(true);
      replaceManagedVar(Var, ManagedVar);
      ManagedVar->takeName(Var);
      Var->setName(Twine(ManagedVar->getName()) + ".managed");
      // Keep managed variables even if they are not used in device code since
      // they need to be allocated by the runtime.
      if (!Var->isDeclaration()) {
        assert(!ManagedVar->isDeclaration());
        CGM.addCompilerUsedGlobal(Var);
        CGM.addCompilerUsedGlobal(ManagedVar);
      }
    }
  }
}

// Creates offloading entries for all the kernels and globals that must be
// registered. The linker will provide a pointer to this section so we can
// register the symbols with the linked device image.
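// Each call below appends a record to the entries section. The exact layout
// is owned by llvm/Frontend/Offloading; as a sketch (field names
// illustrative), an entry carries roughly:
//   { void *addr;    // kernel handle or variable address
//     char *name;    // device-side (mangled) name
//     size_t size;   // 0 for kernels, allocation size for variables
//     int32_t flags; int32_t data; }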
void CGNVCUDARuntime::createOffloadingEntries() {
  StringRef Section = CGM.getLangOpts().HIP ? "hip_offloading_entries"
                                            : "cuda_offloading_entries";
  llvm::Module &M = CGM.getModule();
  for (KernelInfo &I : EmittedKernels)
    llvm::offloading::emitOffloadingEntry(
        M, KernelHandles[I.Kernel->getName()],
        getDeviceSideName(cast<NamedDecl>(I.D)), /*Flags=*/0, /*Data=*/0,
        llvm::offloading::OffloadGlobalEntry, Section);

  for (VarInfo &I : DeviceVars) {
    uint64_t VarSize =
        CGM.getDataLayout().getTypeAllocSize(I.Var->getValueType());
    int32_t Flags =
        (I.Flags.isExtern()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalExtern)
             : 0) |
        (I.Flags.isConstant()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalConstant)
             : 0) |
        (I.Flags.isNormalized()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalNormalized)
             : 0);
    if (I.Flags.getKind() == DeviceVarFlags::Variable) {
      llvm::offloading::emitOffloadingEntry(
          M, I.Var, getDeviceSideName(I.D), VarSize,
          (I.Flags.isManaged() ? llvm::offloading::OffloadGlobalManagedEntry
                               : llvm::offloading::OffloadGlobalEntry) |
              Flags,
          /*Data=*/0, Section);
    } else if (I.Flags.getKind() == DeviceVarFlags::Surface) {
      llvm::offloading::emitOffloadingEntry(
          M, I.Var, getDeviceSideName(I.D), VarSize,
          llvm::offloading::OffloadGlobalSurfaceEntry | Flags,
          I.Flags.getSurfTexType(), Section);
    } else if (I.Flags.getKind() == DeviceVarFlags::Texture) {
      llvm::offloading::emitOffloadingEntry(
          M, I.Var, getDeviceSideName(I.D), VarSize,
          llvm::offloading::OffloadGlobalTextureEntry | Flags,
          I.Flags.getSurfTexType(), Section);
    }
  }
}

// Returns module constructor to be added.
llvm::Function *CGNVCUDARuntime::finalizeModule() {
  if (CGM.getLangOpts().CUDAIsDevice) {
    transformManagedVars();

    // Mark ODR-used device variables as compiler used to prevent them from
    // being eliminated by optimization. This is necessary for device variables
    // ODR-used by host functions. Sema correctly marks them as ODR-used no
    // matter whether they are ODR-used by device or host functions.
    //
    // We do not need to do this if the variable has the used attribute since
    // it has already been added.
    //
    // Static device variables have been externalized at this point, therefore
    // variables with LLVM private or internal linkage need not be added.
    for (auto &&Info : DeviceVars) {
      auto Kind = Info.Flags.getKind();
      if (!Info.Var->isDeclaration() &&
          !llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) &&
          (Kind == DeviceVarFlags::Variable ||
           Kind == DeviceVarFlags::Surface ||
           Kind == DeviceVarFlags::Texture) &&
          Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) {
        CGM.addCompilerUsedGlobal(Info.Var);
      }
    }
    return nullptr;
  }
  if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
    createOffloadingEntries();
  else
    return makeModuleCtorFunction();

  return nullptr;
}

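// A note on kernel handles (a sketch): for CUDA the handle returned below is
// the stub function itself. For HIP a dedicated global variable is created
// instead; it carries the kernel's real mangled name and, once emitDeviceStub
// runs, is initialized with the stub, roughly:
//   @<kernel mangled name> = constant ptr @<__device_stub__ function>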
llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
                                                    GlobalDecl GD) {
  auto Loc = KernelHandles.find(F->getName());
  if (Loc != KernelHandles.end()) {
    auto OldHandle = Loc->second;
    if (KernelStubs[OldHandle] == F)
      return OldHandle;

    // We've found the function name, but F itself has changed, so we need to
    // update the references.
    if (CGM.getLangOpts().HIP) {
      // For HIP compilation the handle itself does not change, so we only need
      // to update the Stub value.
      KernelStubs[OldHandle] = F;
      return OldHandle;
    }
    // For non-HIP compilation, erase the old Stub and fall-through to creating
    // new entries.
    KernelStubs.erase(OldHandle);
  }

  if (!CGM.getLangOpts().HIP) {
    KernelHandles[F->getName()] = F;
    KernelStubs[F] = F;
    return F;
  }

  auto *Var = new llvm::GlobalVariable(
      TheModule, F->getType(), /*isConstant=*/true, F->getLinkage(),
      /*Initializer=*/nullptr,
      CGM.getMangledName(
          GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel)));
  Var->setAlignment(CGM.getPointerAlign().getAsAlign());
  Var->setDSOLocal(F->isDSOLocal());
  Var->setVisibility(F->getVisibility());
  auto *FD = cast<FunctionDecl>(GD.getDecl());
  auto *FT = FD->getPrimaryTemplate();
  if (!FT || FT->isThisDeclarationADefinition())
    CGM.maybeSetTrivialComdat(*FD, *Var);
  KernelHandles[F->getName()] = Var;
  KernelStubs[Var] = F;
  return Var;
}