//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a class for CUDA code generation targeting the NVIDIA CUDA
// runtime library.
//
//===----------------------------------------------------------------------===//

#include "CGCUDARuntime.h"
#include "CGCXXABI.h"
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "clang/AST/Decl.h"
#include "clang/Basic/Cuda.h"
#include "clang/CodeGen/CodeGenABITypes.h"
#include "clang/CodeGen/ConstantInitBuilder.h"
#include "llvm/Frontend/Offloading/Utility.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/VirtualFileSystem.h"

using namespace clang;
using namespace CodeGen;

namespace {
constexpr unsigned CudaFatMagic = 0x466243b1;
constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"

class CGNVCUDARuntime : public CGCUDARuntime {

private:
  llvm::IntegerType *IntTy, *SizeTy;
  llvm::Type *VoidTy;
  llvm::PointerType *PtrTy;

  /// Convenience reference to LLVM Context
  llvm::LLVMContext &Context;
  /// Convenience reference to the current module
  llvm::Module &TheModule;
  /// Keeps track of kernel launch stubs and handles emitted in this module
  struct KernelInfo {
    llvm::Function *Kernel; // stub function to help launch kernel
    const Decl *D;
  };
  llvm::SmallVector<KernelInfo, 16> EmittedKernels;
  // Map a kernel mangled name to a symbol for identifying the kernel in host
  // code. For CUDA, the symbol for identifying the kernel is the same as the
  // device stub function. For HIP, they are different.
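  // For example (a sketch; exact names come from the mangler): a HIP kernel
  // foo() gets a host-side stub roughly named __device_stub__foo, while its
  // handle is a separate global variable carrying foo's own mangled name.
  // See getKernelHandle() below.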
  llvm::DenseMap<StringRef, llvm::GlobalValue *> KernelHandles;
  // Map a kernel handle to the kernel stub.
  llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
  struct VarInfo {
    llvm::GlobalVariable *Var;
    const VarDecl *D;
    DeviceVarFlags Flags;
  };
  llvm::SmallVector<VarInfo, 16> DeviceVars;
  /// Keeps track of variable containing handle of GPU binary. Populated by
  /// ModuleCtorFunction() and used to create corresponding cleanup calls in
  /// ModuleDtorFunction()
  llvm::GlobalVariable *GpuBinaryHandle = nullptr;
  /// Whether we generate relocatable device code.
  bool RelocatableDeviceCode;
  /// Mangle context for device.
  std::unique_ptr<MangleContext> DeviceMC;

  llvm::FunctionCallee getSetupArgumentFn() const;
  llvm::FunctionCallee getLaunchFn() const;

  llvm::FunctionType *getRegisterGlobalsFnTy() const;
  llvm::FunctionType *getCallbackFnTy() const;
  llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
  std::string addPrefixToName(StringRef FuncName) const;
  std::string addUnderscoredPrefixToName(StringRef FuncName) const;

  /// Creates a function to register all kernel stubs generated in this module.
  llvm::Function *makeRegisterGlobalsFn();

  /// Helper function that generates a constant string and returns a pointer to
  /// the start of the string. The result of this function can be used anywhere
  /// the C code expects a const char*.
  llvm::Constant *makeConstantString(const std::string &Str,
                                     const std::string &Name = "") {
    return CGM.GetAddrOfConstantCString(Str, Name.c_str()).getPointer();
  }

  /// Helper function which generates an initialized constant array from Str,
  /// and optionally sets section name and alignment. AddNull specifies whether
  /// the array should have NUL termination.
  llvm::Constant *makeConstantArray(StringRef Str,
                                    StringRef Name = "",
                                    StringRef SectionName = "",
                                    unsigned Alignment = 0,
                                    bool AddNull = false) {
    llvm::Constant *Value =
        llvm::ConstantDataArray::getString(Context, Str, AddNull);
    auto *GV = new llvm::GlobalVariable(
        TheModule, Value->getType(), /*isConstant=*/true,
        llvm::GlobalValue::PrivateLinkage, Value, Name);
    if (!SectionName.empty()) {
      GV->setSection(SectionName);
      // Mark the address as used, which makes sure that this section isn't
      // merged and we will really have it in the object file.
      GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
    }
    if (Alignment)
      GV->setAlignment(llvm::Align(Alignment));
    return GV;
  }

  /// Helper function that generates an empty dummy function returning void.
  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
    assert(FnTy->getReturnType()->isVoidTy() &&
           "Can only generate dummy functions returning void!");
    llvm::Function *DummyFunc = llvm::Function::Create(
        FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);

    llvm::BasicBlock *DummyBlock =
        llvm::BasicBlock::Create(Context, "", DummyFunc);
    CGBuilderTy FuncBuilder(CGM, Context);
    FuncBuilder.SetInsertPoint(DummyBlock);
    FuncBuilder.CreateRetVoid();

    return DummyFunc;
  }

  void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
  void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
  std::string getDeviceSideName(const NamedDecl *ND) override;

  void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, bool Constant) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Variable, Extern, Constant,
                           VD->hasAttr<HIPManagedAttr>(),
                           /*Normalized*/ false, 0}});
  }
  void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
                          bool Extern, int Type) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
                           /*Managed*/ false,
                           /*Normalized*/ false, Type}});
  }
  void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, int Type, bool Normalized) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
                           /*Managed*/ false, Normalized, Type}});
  }

  /// Creates module constructor function
  llvm::Function *makeModuleCtorFunction();
  /// Creates module destructor function
  llvm::Function *makeModuleDtorFunction();
  /// Transform managed variables for device compilation.
  void transformManagedVars();
  /// Create offloading entries to register globals in RDC mode.
  void createOffloadingEntries();

public:
  CGNVCUDARuntime(CodeGenModule &CGM);

  llvm::GlobalValue *getKernelHandle(llvm::Function *F, GlobalDecl GD) override;
  llvm::Function *getKernelStub(llvm::GlobalValue *Handle) override {
    auto Loc = KernelStubs.find(Handle);
    assert(Loc != KernelStubs.end());
    return Loc->second;
  }
  void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
  void handleVarRegistration(const VarDecl *VD,
                             llvm::GlobalVariable &Var) override;
  void
  internalizeDeviceSideVar(const VarDecl *D,
                           llvm::GlobalValue::LinkageTypes &Linkage) override;

  llvm::Function *finalizeModule() override;
};

} // end anonymous namespace

std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("hip") + Twine(FuncName)).str());
  return ((Twine("cuda") + Twine(FuncName)).str());
}
std::string
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("__hip") + Twine(FuncName)).str());
  return ((Twine("__cuda") + Twine(FuncName)).str());
}
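
// For example (a sketch of the two helpers above): in a HIP compilation,
// addPrefixToName("LaunchKernel") yields "hipLaunchKernel" and
// addUnderscoredPrefixToName("RegisterFatBinary") yields
// "__hipRegisterFatBinary"; a CUDA compilation yields "cudaLaunchKernel" and
// "__cudaRegisterFatBinary".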

static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) {
  // If the host and device have different C++ ABIs, mark it as the device
  // mangle context so that the mangling needs to retrieve the additional
  // device lambda mangling number instead of the regular host one.
  if (CGM.getContext().getAuxTargetInfo() &&
      CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
      CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) {
    return std::unique_ptr<MangleContext>(
        CGM.getContext().createDeviceMangleContext(
            *CGM.getContext().getAuxTargetInfo()));
  }

  return std::unique_ptr<MangleContext>(CGM.getContext().createMangleContext(
      CGM.getContext().getAuxTargetInfo()));
}

CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
      TheModule(CGM.getModule()),
      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
      DeviceMC(InitDeviceMC(CGM)) {
  IntTy = CGM.IntTy;
  SizeTy = CGM.SizeTy;
  VoidTy = CGM.VoidTy;
  PtrTy = CGM.UnqualPtrTy;
}

llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
  // cudaError_t cudaSetupArgument(void *, size_t, size_t)
  llvm::Type *Params[] = {PtrTy, SizeTy, SizeTy};
  return CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, Params, false),
      addPrefixToName("SetupArgument"));
}

llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
  if (CGM.getLangOpts().HIP) {
    // hipError_t hipLaunchByPtr(char *);
    return CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(IntTy, PtrTy, false), "hipLaunchByPtr");
  }
  // cudaError_t cudaLaunch(char *);
  return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy, PtrTy, false),
                                   "cudaLaunch");
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
  return llvm::FunctionType::get(VoidTy, PtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
  return llvm::FunctionType::get(VoidTy, PtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
  llvm::Type *Params[] = {llvm::PointerType::getUnqual(Context), PtrTy, PtrTy,
                          llvm::PointerType::getUnqual(Context)};
  return llvm::FunctionType::get(VoidTy, Params, false);
}

std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
  GlobalDecl GD;
  // D could be either a kernel or a variable.
  if (auto *FD = dyn_cast<FunctionDecl>(ND))
    GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
  else
    GD = GlobalDecl(ND);
  std::string DeviceSideName;
  MangleContext *MC;
  if (CGM.getLangOpts().CUDAIsDevice)
    MC = &CGM.getCXXABI().getMangleContext();
  else
    MC = DeviceMC.get();
  if (MC->shouldMangleDeclName(ND)) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    MC->mangleName(GD, Out);
    DeviceSideName = std::string(Out.str());
  } else
    DeviceSideName = std::string(ND->getIdentifier()->getName());

  // Make a unique name for device-side static file-scope variables for HIP.
  if (CGM.getContext().shouldExternalize(ND) &&
      CGM.getLangOpts().GPURelocatableDeviceCode) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    Out << DeviceSideName;
    CGM.printPostfixForExternalizedDecl(Out, ND);
    DeviceSideName = std::string(Out.str());
  }
  return DeviceSideName;
}

void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
                                     FunctionArgList &Args) {
  EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
  if (auto *GV =
          dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn->getName()])) {
    GV->setLinkage(CGF.CurFn->getLinkage());
    GV->setInitializer(CGF.CurFn);
  }
  if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                         CudaFeature::CUDA_USES_NEW_LAUNCH) ||
      (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
    emitDeviceStubBodyNew(CGF, Args);
  else
    emitDeviceStubBodyLegacy(CGF, Args);
}

// CUDA 9.0+ uses a new way to launch kernels. Parameters are packed in a local
// array and kernels are launched using cudaLaunchKernel().
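//
// In host-side pseudo-code the emitted stub is roughly the following (a
// sketch; the real body is emitted as IR, and for HIP the handle is a
// separate global rather than the stub itself):
// \code
// void __device_stub__kernel(int a, float b) {
//   void *kernel_args[2] = {&a, &b};
//   dim3 grid_dim, block_dim;
//   size_t shmem_size;
//   cudaStream_t stream;
//   __cudaPopCallConfiguration(&grid_dim, &block_dim, &shmem_size, &stream);
//   cudaLaunchKernel((void *)kernel_handle, grid_dim, block_dim, kernel_args,
//                    shmem_size, stream);
// }
// \endcode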
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
                                            FunctionArgList &Args) {
  // Build the shadow stack entry at the very start of the function.

  // Calculate the amount of space we will need for all arguments. If we have
  // no args, allocate a single pointer so we still have a valid pointer to the
  // argument array that we can pass to the runtime, even if it will be unused.
  Address KernelArgs = CGF.CreateTempAlloca(
      PtrTy, CharUnits::fromQuantity(16), "kernel_args",
      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
  // Store pointers to the arguments in the locally allocated kernel_args.
  for (unsigned i = 0; i < Args.size(); ++i) {
    llvm::Value *VarPtr = CGF.GetAddrOfLocalVar(Args[i]).emitRawPointer(CGF);
    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, PtrTy);
    CGF.Builder.CreateDefaultAlignedStore(
        VoidVarPtr, CGF.Builder.CreateConstGEP1_32(
                        PtrTy, KernelArgs.emitRawPointer(CGF), i));
  }

  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");

  // Lookup cudaLaunchKernel/hipLaunchKernel function.
  // HIP kernel launching API name depends on -fgpu-default-stream option. For
  // the default value 'legacy', it is hipLaunchKernel. For 'per-thread',
  // it is hipLaunchKernel_spt.
  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                              void **args, size_t sharedMem,
  //                              cudaStream_t stream);
  // hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim,
  //                                  dim3 blockDim, void **args,
  //                                  size_t sharedMem, hipStream_t stream);
  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
  std::string KernelLaunchAPI = "LaunchKernel";
  if (CGF.getLangOpts().GPUDefaultStream ==
      LangOptions::GPUDefaultStreamKind::PerThread) {
    if (CGF.getLangOpts().HIP)
      KernelLaunchAPI = KernelLaunchAPI + "_spt";
    else if (CGF.getLangOpts().CUDA)
      KernelLaunchAPI = KernelLaunchAPI + "_ptsz";
  }
  auto LaunchKernelName = addPrefixToName(KernelLaunchAPI);
  const IdentifierInfo &cudaLaunchKernelII =
      CGM.getContext().Idents.get(LaunchKernelName);
  FunctionDecl *cudaLaunchKernelFD = nullptr;
  for (auto *Result : DC->lookup(&cudaLaunchKernelII)) {
    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
      cudaLaunchKernelFD = FD;
  }

  if (cudaLaunchKernelFD == nullptr) {
    CGM.Error(CGF.CurFuncDecl->getLocation(),
              "Can't find declaration for " + LaunchKernelName);
    return;
  }
  // Create temporary dim3 grid_dim, block_dim.
  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
  QualType Dim3Ty = GridDimParam->getType();
  Address GridDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
  Address BlockDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
  Address ShmemSize =
      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
  Address Stream = CGF.CreateTempAlloca(PtrTy, CGM.getPointerAlign(), "stream");
  llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy,
                              {/*gridDim=*/GridDim.getType(),
                               /*blockDim=*/BlockDim.getType(),
                               /*ShmemSize=*/ShmemSize.getType(),
                               /*Stream=*/Stream.getType()},
                              /*isVarArg=*/false),
      addUnderscoredPrefixToName("PopCallConfiguration"));

  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, {GridDim.emitRawPointer(CGF),
                                                BlockDim.emitRawPointer(CGF),
                                                ShmemSize.emitRawPointer(CGF),
                                                Stream.emitRawPointer(CGF)});

  // Emit the call to cudaLaunchKernel.
  llvm::Value *Kernel =
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn->getName()], PtrTy);
  CallArgList LaunchKernelArgs;
  LaunchKernelArgs.add(RValue::get(Kernel),
                       cudaLaunchKernelFD->getParamDecl(0)->getType());
  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::get(KernelArgs, CGF),
                       cudaLaunchKernelFD->getParamDecl(3)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
                       cudaLaunchKernelFD->getParamDecl(4)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
                       cudaLaunchKernelFD->getParamDecl(5)->getType());

  QualType QT = cudaLaunchKernelFD->getType();
  QualType CQT = QT.getCanonicalType();
  llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
  llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);

  const CGFunctionInfo &FI =
      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
  llvm::FunctionCallee cudaLaunchKernelFn =
      CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
               LaunchKernelArgs);

  // To prevent CUDA device stub functions from being merged by ICF in the MSVC
  // environment, create a unique global variable for each kernel and write to
  // the variable in the device stub.
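  // For example (a sketch): kernel foo gets a companion byte variable "foo.id"
  // and its stub performs a volatile store "foo.id = 1"; the volatile store to
  // a distinct global gives each stub an observable side effect, so ICF cannot
  // fold two stubs into one function.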
  if (CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
      !CGF.getLangOpts().HIP) {
    llvm::Function *KernelFunction = llvm::cast<llvm::Function>(Kernel);
    std::string GlobalVarName = (KernelFunction->getName() + ".id").str();

    llvm::GlobalVariable *HandleVar =
        CGM.getModule().getNamedGlobal(GlobalVarName);
    if (!HandleVar) {
      HandleVar = new llvm::GlobalVariable(
          CGM.getModule(), CGM.Int8Ty,
          /*Constant=*/false, KernelFunction->getLinkage(),
          llvm::ConstantInt::get(CGM.Int8Ty, 0), GlobalVarName);
      HandleVar->setDSOLocal(KernelFunction->isDSOLocal());
      HandleVar->setVisibility(KernelFunction->getVisibility());
      if (KernelFunction->hasComdat())
        HandleVar->setComdat(CGM.getModule().getOrInsertComdat(GlobalVarName));
    }

    CGF.Builder.CreateAlignedStore(llvm::ConstantInt::get(CGM.Int8Ty, 1),
                                   HandleVar, CharUnits::One(),
                                   /*IsVolatile=*/true);
  }

  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

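// Pre-CUDA-9.0 launch: every kernel argument is copied onto the launch stack
// with cudaSetupArgument(ptr, size, offset), then the kernel is started with
// cudaLaunch(). In host-side pseudo-code the emitted stub is roughly (a
// sketch; offsets are the aligned offsets computed below):
// \code
// void __device_stub__kernel(int a, float b) {
//   if (cudaSetupArgument(&a, sizeof(a), /*offset=*/0) == 0)
//     if (cudaSetupArgument(&b, sizeof(b), /*offset=*/4) == 0)
//       cudaLaunch((char *)kernel_handle);
// }
// \endcode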
void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
                                               FunctionArgList &Args) {
  // Emit a call to cudaSetupArgument for each arg in Args.
  llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
  CharUnits Offset = CharUnits::Zero();
  for (const VarDecl *A : Args) {
    auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
    Offset = Offset.alignTo(TInfo.Align);
    llvm::Value *Args[] = {
        CGF.Builder.CreatePointerCast(
            CGF.GetAddrOfLocalVar(A).emitRawPointer(CGF), PtrTy),
        llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
        llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
    };
    llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
    llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
    llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
    CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
    CGF.EmitBlock(NextBlock);
    Offset += TInfo.Width;
  }

  // Emit the call to cudaLaunch
  llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
  llvm::Value *Arg =
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn->getName()], PtrTy);
  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

// Replace the original variable Var with the address loaded from the variable
// ManagedVar populated by the HIP runtime.
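// Uses are walked with an explicit worklist so that uses reached through
// (possibly nested) constant expressions are found as well; each constant
// expression on the path is first materialized as an instruction and only
// then rewritten to use the loaded address.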
static void replaceManagedVar(llvm::GlobalVariable *Var,
                              llvm::GlobalVariable *ManagedVar) {
  SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
  for (auto &&VarUse : Var->uses()) {
    WorkList.push_back({VarUse.getUser()});
  }
  while (!WorkList.empty()) {
    auto &&WorkItem = WorkList.pop_back_val();
    auto *U = WorkItem.back();
    if (isa<llvm::ConstantExpr>(U)) {
      for (auto &&UU : U->uses()) {
        WorkItem.push_back(UU.getUser());
        WorkList.push_back(WorkItem);
        WorkItem.pop_back();
      }
      continue;
    }
    if (auto *I = dyn_cast<llvm::Instruction>(U)) {
      llvm::Value *OldV = Var;
      llvm::Instruction *NewV =
          new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false,
                             llvm::Align(Var->getAlignment()), I);
      WorkItem.pop_back();
      // Replace constant expressions directly or indirectly using the managed
      // variable with instructions.
      for (auto &&Op : WorkItem) {
        auto *CE = cast<llvm::ConstantExpr>(Op);
        auto *NewInst = CE->getAsInstruction();
        NewInst->insertBefore(*I->getParent(), I->getIterator());
        NewInst->replaceUsesOfWith(OldV, NewV);
        OldV = CE;
        NewV = NewInst;
      }
      I->replaceUsesOfWith(OldV, NewV);
    } else {
      llvm_unreachable("Invalid use of managed variable");
    }
  }
}

/// Creates a function that sets up state on the host side for CUDA objects
/// that have a presence on both the host and device sides. Specifically,
/// registers the host side of kernel functions and device global variables
/// with the CUDA runtime.
/// \code
/// void __cuda_register_globals(void** GpuBinaryHandle) {
///    __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
///    ...
///    __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
///    ...
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
  // No need to register anything
  if (EmittedKernels.empty() && DeviceVars.empty())
    return nullptr;

  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_register_globals"), &TheModule);
  llvm::BasicBlock *EntryBB =
      llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
  CGBuilderTy Builder(CGM, Context);
  Builder.SetInsertPoint(EntryBB);

  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
  //                             int, uint3*, uint3*, dim3*, dim3*, int*)
  llvm::Type *RegisterFuncParams[] = {
      PtrTy, PtrTy, PtrTy, PtrTy, IntTy,
      PtrTy, PtrTy, PtrTy, PtrTy, llvm::PointerType::getUnqual(Context)};
  llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
      addUnderscoredPrefixToName("RegisterFunction"));

  // Extract the GpuBinaryHandle passed as the first argument to
  // __cuda_register_globals() and generate a __cudaRegisterFunction() call for
  // each emitted kernel.
  llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
  for (auto &&I : EmittedKernels) {
    llvm::Constant *KernelName =
        makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(PtrTy);
    llvm::Value *Args[] = {
        &GpuBinaryHandlePtr,
        KernelHandles[I.Kernel->getName()],
        KernelName,
        KernelName,
        llvm::ConstantInt::get(IntTy, -1),
        NullPtr,
        NullPtr,
        NullPtr,
        NullPtr,
        llvm::ConstantPointerNull::get(llvm::PointerType::getUnqual(Context))};
    Builder.CreateCall(RegisterFunc, Args);
  }

  llvm::Type *VarSizeTy = IntTy;
  // For HIP or CUDA 9.0+, the device variable size is of type `size_t`.
  if (CGM.getLangOpts().HIP ||
      ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
    VarSizeTy = SizeTy;

  // void __cudaRegisterVar(void **, char *, char *, const char *,
  //                        int, int, int, int)
  llvm::Type *RegisterVarParams[] = {PtrTy, PtrTy, PtrTy, PtrTy,
                                     IntTy, VarSizeTy, IntTy, IntTy};
  llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
      addUnderscoredPrefixToName("RegisterVar"));
  // void __hipRegisterManagedVar(void **, char *, char *, const char *,
  //                              size_t, unsigned)
  llvm::Type *RegisterManagedVarParams[] = {PtrTy, PtrTy, PtrTy,
                                            PtrTy, VarSizeTy, IntTy};
  llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
      addUnderscoredPrefixToName("RegisterManagedVar"));
  // void __cudaRegisterSurface(void **, const struct surfaceReference *,
  //                            const void **, const char *, int, int);
  llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy}, false),
      addUnderscoredPrefixToName("RegisterSurface"));
  // void __cudaRegisterTexture(void **, const struct textureReference *,
  //                            const void **, const char *, int, int, int)
  llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy, IntTy}, false),
      addUnderscoredPrefixToName("RegisterTexture"));
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
           "External variables should not show up here, except HIP managed "
           "variables");
    llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
    switch (Info.Flags.getKind()) {
    case DeviceVarFlags::Variable: {
      uint64_t VarSize =
          CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
      if (Info.Flags.isManaged()) {
        assert(Var->getName().ends_with(".managed") &&
               "HIP managed variables not transformed");
        auto *ManagedVar = CGM.getModule().getNamedGlobal(
            Var->getName().drop_back(StringRef(".managed").size()));
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            ManagedVar,
            Var,
            VarName,
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Var->getAlignment())};
        if (!Var->isDeclaration())
          Builder.CreateCall(RegisterManagedVar, Args);
      } else {
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            Var,
            VarName,
            VarName,
            llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
            llvm::ConstantInt::get(IntTy, 0)};
        Builder.CreateCall(RegisterVar, Args);
      }
      break;
    }
    case DeviceVarFlags::Surface:
      Builder.CreateCall(
          RegisterSurf,
          {&GpuBinaryHandlePtr, Var, VarName, VarName,
           llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    case DeviceVarFlags::Texture:
      Builder.CreateCall(
          RegisterTex,
          {&GpuBinaryHandlePtr, Var, VarName, VarName,
           llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    }
  }

  Builder.CreateRetVoid();
  return RegisterKernelsFunc;
}

/// Creates a global constructor function for the module:
///
/// For CUDA:
/// \code
/// void __cuda_module_ctor() {
///     Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
///     __cuda_register_globals(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_ctor() {
///     if (__hip_gpubin_handle == 0) {
///         __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
///         __hip_register_globals(__hip_gpubin_handle);
///     }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
  bool IsHIP = CGM.getLangOpts().HIP;
  bool IsCUDA = CGM.getLangOpts().CUDA;
  // No need to generate ctors/dtors if there is no GPU binary.
  StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
  if (CudaGpuBinaryFileName.empty() && !IsHIP)
    return nullptr;
  if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
      DeviceVars.empty())
    return nullptr;

  // void __{cuda|hip}_register_globals(void* handle);
  llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
  // We always need a function to pass in as callback. Create a dummy
  // implementation if we don't need to register anything.
  if (RelocatableDeviceCode && !RegisterGlobalsFunc)
    RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());

  // void ** __{cuda|hip}RegisterFatBinary(void *);
  llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(PtrTy, PtrTy, false),
      addUnderscoredPrefixToName("RegisterFatBinary"));
  // struct { int magic, int version, void * gpu_binary, void * dont_care };
  llvm::StructType *FatbinWrapperTy =
      llvm::StructType::get(IntTy, IntTy, PtrTy, PtrTy);
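
  // For example (a sketch of the wrapper emitted below for CUDA):
  //   @__cuda_fatbin_wrapper = internal constant { i32, i32, ptr, ptr }
  //       { i32 0x466243b1 /* CudaFatMagic */, i32 1 /* version */,
  //         ptr @fatbin, ptr null }, section ".nvFatBinSegment"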

  // Register GPU binary with the CUDA runtime, store returned handle in a
  // global variable and save a reference in GpuBinaryHandle to be cleaned up
  // in destructor on exit. Then associate all known kernels with the GPU binary
  // handle so CUDA runtime can figure out what to call on the GPU side.
  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
  if (!CudaGpuBinaryFileName.empty()) {
    auto VFS = CGM.getFileSystem();
    auto CudaGpuBinaryOrErr =
        VFS->getBufferForFile(CudaGpuBinaryFileName, -1, false);
    if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
      CGM.getDiags().Report(diag::err_cannot_open_file)
          << CudaGpuBinaryFileName << EC.message();
      return nullptr;
    }
    CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
  }

  llvm::Function *ModuleCtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_ctor"), &TheModule);
  llvm::BasicBlock *CtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
  CGBuilderTy CtorBuilder(CGM, Context);

  CtorBuilder.SetInsertPoint(CtorEntryBB);

  const char *FatbinConstantName;
  const char *FatbinSectionName;
  const char *ModuleIDSectionName;
  StringRef ModuleIDPrefix;
  llvm::Constant *FatBinStr;
  unsigned FatMagic;
  if (IsHIP) {
    FatbinConstantName = ".hip_fatbin";
    FatbinSectionName = ".hipFatBinSegment";

    ModuleIDSectionName = "__hip_module_id";
    ModuleIDPrefix = "__hip_";

    if (CudaGpuBinary) {
      // If fatbin is available from early finalization, create a string
      // literal containing the fat binary loaded from the given file.
      const unsigned HIPCodeObjectAlign = 4096;
      FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "",
                                    FatbinConstantName, HIPCodeObjectAlign);
    } else {
      // If the fatbin is not available, create an external symbol
      // __hip_fatbin in section .hip_fatbin. The external symbol is expected
      // to contain the fat binary and will be populated elsewhere, e.g. by lld
      // through a linker script.
      FatBinStr = new llvm::GlobalVariable(
          CGM.getModule(), CGM.Int8Ty,
          /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
          "__hip_fatbin_" + CGM.getContext().getCUIDHash(), nullptr,
          llvm::GlobalVariable::NotThreadLocal);
      cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
    }

    FatMagic = HIPFatMagic;
  } else {
    if (RelocatableDeviceCode)
      FatbinConstantName = CGM.getTriple().isMacOSX()
                               ? "__NV_CUDA,__nv_relfatbin"
                               : "__nv_relfatbin";
    else
      FatbinConstantName =
          CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
    // NVIDIA's cuobjdump looks for fatbins in this section.
    FatbinSectionName =
        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";

    ModuleIDSectionName = CGM.getTriple().isMacOSX()
                              ? "__NV_CUDA,__nv_module_id"
                              : "__nv_module_id";
    ModuleIDPrefix = "__nv_";

    // For CUDA, create a string literal containing the fat binary loaded from
    // the given file.
    FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "",
                                  FatbinConstantName, 8);
    FatMagic = CudaFatMagic;
  }

  // Create initialized wrapper structure that points to the loaded GPU binary
  ConstantInitBuilder Builder(CGM);
  auto Values = Builder.beginStruct(FatbinWrapperTy);
  // Fatbin wrapper magic.
  Values.addInt(IntTy, FatMagic);
  // Fatbin version.
  Values.addInt(IntTy, 1);
  // Data.
  Values.add(FatBinStr);
  // Unused in fatbin v1.
  Values.add(llvm::ConstantPointerNull::get(PtrTy));
  llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
      addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
      /*constant*/ true);
  FatbinWrapper->setSection(FatbinSectionName);

  // There is only one HIP fat binary per linked module, however there are
  // multiple constructor functions. Make sure the fat binary is registered
  // only once. The constructor functions are executed by the dynamic loader
  // before the program gains control. The dynamic loader cannot execute the
  // constructor functions concurrently since doing that would not guarantee
  // thread safety of the loaded program. Therefore we can assume sequential
  // execution of constructor functions here.
  if (IsHIP) {
    auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage
                                 : llvm::GlobalValue::ExternalLinkage;
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
    // The name, size, and initialization pattern of this variable are part
    // of the HIP ABI.
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, PtrTy, /*isConstant=*/false, Linkage,
        /*Initializer=*/
        CudaGpuBinary ? llvm::ConstantPointerNull::get(PtrTy) : nullptr,
        CudaGpuBinary
            ? "__hip_gpubin_handle"
            : "__hip_gpubin_handle_" + CGM.getContext().getCUIDHash());
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    // Prevent the weak symbol in different shared libraries from being merged.
    if (Linkage != llvm::GlobalValue::InternalLinkage)
      GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
    Address GpuBinaryAddr(
        GpuBinaryHandle, PtrTy,
        CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
    {
      auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
      llvm::Constant *Zero =
          llvm::Constant::getNullValue(HandleValue->getType());
      llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
      CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(IfBlock);
      // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
      llvm::CallInst *RegisterFatbinCall =
          CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
      CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
      CtorBuilder.CreateBr(ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(ExitBlock);
      // Call __hip_register_globals(GpuBinaryHandle);
      if (RegisterGlobalsFunc) {
        auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
        CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
      }
    }
  } else if (!RelocatableDeviceCode) {
    // Register binary with CUDA runtime. This is substantially different in
    // default mode vs. separate compilation!
    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
    llvm::CallInst *RegisterFatbinCall =
        CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, PtrTy, false, llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(PtrTy), "__cuda_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
                                   CGM.getPointerAlign());

    // Call __cuda_register_globals(GpuBinaryHandle);
    if (RegisterGlobalsFunc)
      CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);

    // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
    if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                           CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
      // void __cudaRegisterFatBinaryEnd(void **);
      llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
          llvm::FunctionType::get(VoidTy, PtrTy, false),
          "__cudaRegisterFatBinaryEnd");
      CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
    }
  } else {
    // Generate a unique module ID.
    SmallString<64> ModuleID;
    llvm::raw_svector_ostream OS(ModuleID);
    OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
    llvm::Constant *ModuleIDConstant = makeConstantArray(
        std::string(ModuleID), "", ModuleIDSectionName, 32, /*AddNull=*/true);

    // Create an alias for the FatbinWrapper that nvcc will look for.
    llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
                              Twine("__fatbinwrap") + ModuleID, FatbinWrapper);

    // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
    //                                           void *, void (*)(void **))
    SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
    RegisterLinkedBinaryName += ModuleID;
    llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
        getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);

    assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
    llvm::Value *Args[] = {RegisterGlobalsFunc, FatbinWrapper, ModuleIDConstant,
                           makeDummyFunction(getCallbackFnTy())};
    CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
  }

  // Create a destructor and register it with atexit() the way NVCC does.
  // Doing it during the regular destructor phase worked in CUDA before 9.2,
  // but results in a double-free in 9.2.
  if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
    // extern "C" int atexit(void (*f)(void));
    llvm::FunctionType *AtExitTy =
        llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
    llvm::FunctionCallee AtExitFunc =
        CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
                                  /*Local=*/true);
    CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
  }

  CtorBuilder.CreateRetVoid();
  return ModuleCtorFunc;
}

/// Creates a global destructor function that unregisters the GPU code blob
/// registered by the constructor.
///
/// For CUDA:
/// \code
/// void __cuda_module_dtor() {
///     __cudaUnregisterFatBinary(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_dtor() {
///     if (__hip_gpubin_handle) {
///         __hipUnregisterFatBinary(__hip_gpubin_handle);
///         __hip_gpubin_handle = 0;
///     }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
  // No need for destructor if we don't have a handle to unregister.
  if (!GpuBinaryHandle)
    return nullptr;

  // void __cudaUnregisterFatBinary(void ** handle);
  llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, PtrTy, false),
      addUnderscoredPrefixToName("UnregisterFatBinary"));

  llvm::Function *ModuleDtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_dtor"), &TheModule);

  llvm::BasicBlock *DtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
  CGBuilderTy DtorBuilder(CGM, Context);
  DtorBuilder.SetInsertPoint(DtorEntryBB);

  Address GpuBinaryAddr(
      GpuBinaryHandle, GpuBinaryHandle->getValueType(),
      CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
  auto *HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
  // There is only one HIP fat binary per linked module, however there are
  // multiple destructor functions. Make sure the fat binary is unregistered
  // only once.
  if (CGM.getLangOpts().HIP) {
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
    llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
    llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
    DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);

    DtorBuilder.SetInsertPoint(IfBlock);
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
    DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
    DtorBuilder.CreateBr(ExitBlock);

    DtorBuilder.SetInsertPoint(ExitBlock);
  } else {
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
  }
  DtorBuilder.CreateRetVoid();
  return ModuleDtorFunc;
}

CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
  return new CGNVCUDARuntime(CGM);
}

void CGNVCUDARuntime::internalizeDeviceSideVar(
    const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) {
  // For -fno-gpu-rdc, host-side shadows of external declarations of device-side
  // global variables become internal definitions. These have to be internal in
  // order to prevent name conflicts with global host variables with the same
  // name in different TUs.
  //
  // For -fgpu-rdc, the shadow variables should not be internalized because
  // they may be accessed by a different TU.
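  //
  // For example (a sketch): with -fno-gpu-rdc, the host-side shadow of
  //   extern __device__ int v;
  // becomes an internal definition in each TU that references it, so two TUs
  // shadowing the same v do not clash at host link time.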
  if (CGM.getLangOpts().GPURelocatableDeviceCode)
    return;

  // __shared__ variables are odd. Shadows do get created, but
  // they are not registered with the CUDA runtime, so they
  // can't really be used to access their device-side
  // counterparts. It's not clear yet whether it's nvcc's bug or
  // a feature, but we've got to do the same for compatibility.
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
      D->hasAttr<CUDASharedAttr>() ||
      D->getType()->isCUDADeviceBuiltinSurfaceType() ||
      D->getType()->isCUDADeviceBuiltinTextureType()) {
    Linkage = llvm::GlobalValue::InternalLinkage;
  }
}

void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
                                            llvm::GlobalVariable &GV) {
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) {
    // Shadow variables and their properties must be registered with CUDA
    // runtime. Skip Extern global variables, which will be registered in
    // the TU where they are defined.
    //
    // Don't register a C++17 inline variable. The local symbol can be
    // discarded and referencing a discarded local symbol from outside the
    // comdat (__cuda_register_globals) is disallowed by the ELF spec.
    //
    // HIP managed variables need to be always recorded in device and host
    // compilations for transformation.
    //
    // HIP managed variables and variables in CUDADeviceVarODRUsedByHost are
    // added to llvm.compiler-used, therefore they are safe to be registered.
    if ((!D->hasExternalStorage() && !D->isInline()) ||
        CGM.getContext().CUDADeviceVarODRUsedByHost.contains(D) ||
        D->hasAttr<HIPManagedAttr>()) {
      registerDeviceVar(D, GV, !D->hasDefinition(),
                        D->hasAttr<CUDAConstantAttr>());
    }
  } else if (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
             D->getType()->isCUDADeviceBuiltinTextureType()) {
    // Builtin surfaces and textures and their template arguments are
    // also registered with CUDA runtime.
    const auto *TD = cast<ClassTemplateSpecializationDecl>(
        D->getType()->castAs<RecordType>()->getDecl());
    const TemplateArgumentList &Args = TD->getTemplateArgs();
    if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
      assert(Args.size() == 2 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin surface type.");
      auto SurfType = Args[1].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceSurf(D, GV, !D->hasDefinition(), SurfType.getSExtValue());
    } else {
      assert(Args.size() == 3 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin texture type.");
      auto TexType = Args[1].getAsIntegral();
      auto Normalized = Args[2].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceTex(D, GV, !D->hasDefinition(), TexType.getSExtValue(),
                          Normalized.getZExtValue());
    }
  }
}

// Transform managed variables to pointers to managed variables in device code.
// Each use of the original managed variable is replaced by a load from the
// transformed managed variable. The transformed managed variable contains
// the address of managed memory which will be allocated by the runtime.
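//
// For example (a sketch; linkage and address spaces omitted): for
// `__managed__ int x;` the device-side IR ends up roughly as
// \code
// @x = externally_initialized global ptr null   ; filled in by the runtime
// @x.managed = global i32 0                     ; original storage
// ; every former use of x:
// %ld.managed = load ptr, ptr @x
// ; ... uses %ld.managed instead of the original storage ...
// \endcode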
void CGNVCUDARuntime::transformManagedVars() {
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
        Info.Flags.isManaged()) {
      auto *ManagedVar = new llvm::GlobalVariable(
          CGM.getModule(), Var->getType(),
          /*isConstant=*/false, Var->getLinkage(),
          /*Init=*/Var->isDeclaration()
              ? nullptr
              : llvm::ConstantPointerNull::get(Var->getType()),
          /*Name=*/"", /*InsertBefore=*/nullptr,
          llvm::GlobalVariable::NotThreadLocal,
          CGM.getContext().getTargetAddressSpace(CGM.getLangOpts().CUDAIsDevice
                                                     ? LangAS::cuda_device
                                                     : LangAS::Default));
      ManagedVar->setDSOLocal(Var->isDSOLocal());
      ManagedVar->setVisibility(Var->getVisibility());
      ManagedVar->setExternallyInitialized(true);
      replaceManagedVar(Var, ManagedVar);
      ManagedVar->takeName(Var);
      Var->setName(Twine(ManagedVar->getName()) + ".managed");
      // Keep managed variables even if they are not used in device code since
      // they need to be allocated by the runtime.
      if (CGM.getLangOpts().CUDAIsDevice && !Var->isDeclaration()) {
        assert(!ManagedVar->isDeclaration());
        CGM.addCompilerUsedGlobal(Var);
        CGM.addCompilerUsedGlobal(ManagedVar);
      }
    }
  }
}

// Creates offloading entries for all the kernels and globals that must be
// registered. The linker will provide a pointer to this section so we can
// register the symbols with the linked device image.
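//
// A sketch of what one entry carries (the exact record layout comes from
// llvm::offloading::emitOffloadingEntry and may differ across LLVM versions):
// \code
// struct __tgt_offload_entry {
//   void *addr;    // kernel handle or variable address
//   char *name;    // device-side (mangled) name
//   size_t size;   // 0 for kernels, alloc size for variables
//   int32_t flags; // OffloadGlobal* kind and attribute bits
//   int32_t data;  // extra payload, e.g. surface/texture type
// };
// \endcode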
void CGNVCUDARuntime::createOffloadingEntries() {
  StringRef Section = CGM.getLangOpts().HIP ? "hip_offloading_entries"
                                            : "cuda_offloading_entries";
  llvm::Module &M = CGM.getModule();
  for (KernelInfo &I : EmittedKernels)
    llvm::offloading::emitOffloadingEntry(
        M, KernelHandles[I.Kernel->getName()],
        getDeviceSideName(cast<NamedDecl>(I.D)), /*Flags=*/0, /*Data=*/0,
        llvm::offloading::OffloadGlobalEntry, Section);

  for (VarInfo &I : DeviceVars) {
    uint64_t VarSize =
        CGM.getDataLayout().getTypeAllocSize(I.Var->getValueType());
    int32_t Flags =
        (I.Flags.isExtern()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalExtern)
             : 0) |
        (I.Flags.isConstant()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalConstant)
             : 0) |
        (I.Flags.isNormalized()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalNormalized)
             : 0);
    if (I.Flags.getKind() == DeviceVarFlags::Variable) {
      llvm::offloading::emitOffloadingEntry(
          M, I.Var, getDeviceSideName(I.D), VarSize,
          (I.Flags.isManaged() ? llvm::offloading::OffloadGlobalManagedEntry
                               : llvm::offloading::OffloadGlobalEntry) |
              Flags,
          /*Data=*/0, Section);
    } else if (I.Flags.getKind() == DeviceVarFlags::Surface) {
      llvm::offloading::emitOffloadingEntry(
          M, I.Var, getDeviceSideName(I.D), VarSize,
          llvm::offloading::OffloadGlobalSurfaceEntry | Flags,
          I.Flags.getSurfTexType(), Section);
    } else if (I.Flags.getKind() == DeviceVarFlags::Texture) {
      llvm::offloading::emitOffloadingEntry(
          M, I.Var, getDeviceSideName(I.D), VarSize,
          llvm::offloading::OffloadGlobalTextureEntry | Flags,
          I.Flags.getSurfTexType(), Section);
    }
  }
}

// Returns module constructor to be added.
llvm::Function *CGNVCUDARuntime::finalizeModule() {
  transformManagedVars();
  if (CGM.getLangOpts().CUDAIsDevice) {
    // Mark ODR-used device variables as compiler-used to prevent them from
    // being eliminated by optimization. This is necessary for device variables
    // ODR-used by host functions. Sema correctly marks them as ODR-used no
    // matter whether they are ODR-used by device or host functions.
    //
    // We do not need to do this if the variable has the used attribute since
    // it has already been added.
    //
    // Static device variables have been externalized at this point, therefore
    // variables with LLVM private or internal linkage need not be added.
    for (auto &&Info : DeviceVars) {
      auto Kind = Info.Flags.getKind();
      if (!Info.Var->isDeclaration() &&
          !llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) &&
          (Kind == DeviceVarFlags::Variable ||
           Kind == DeviceVarFlags::Surface ||
           Kind == DeviceVarFlags::Texture) &&
          Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) {
        CGM.addCompilerUsedGlobal(Info.Var);
      }
    }
    return nullptr;
  }
  if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
    createOffloadingEntries();
  else
    return makeModuleCtorFunction();

  return nullptr;
}

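// Returns the symbol host code uses to identify the kernel. For CUDA this is
// the device-stub function itself; for HIP it is a dedicated global variable
// that carries the kernel's own mangled name, keeping the handle distinct
// from the stub (see the KernelHandles comment above).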
llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
                                                    GlobalDecl GD) {
  auto Loc = KernelHandles.find(F->getName());
  if (Loc != KernelHandles.end()) {
    auto OldHandle = Loc->second;
    if (KernelStubs[OldHandle] == F)
      return OldHandle;

    // We've found the function name, but F itself has changed, so we need to
    // update the references.
    if (CGM.getLangOpts().HIP) {
      // For HIP compilation the handle itself does not change, so we only need
      // to update the Stub value.
      KernelStubs[OldHandle] = F;
      return OldHandle;
    }
    // For non-HIP compilation, erase the old Stub and fall-through to creating
    // new entries.
    KernelStubs.erase(OldHandle);
  }

  if (!CGM.getLangOpts().HIP) {
    KernelHandles[F->getName()] = F;
    KernelStubs[F] = F;
    return F;
  }

  auto *Var = new llvm::GlobalVariable(
      TheModule, F->getType(), /*isConstant=*/true, F->getLinkage(),
      /*Initializer=*/nullptr,
      CGM.getMangledName(
          GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel)));
  Var->setAlignment(CGM.getPointerAlign().getAsAlign());
  Var->setDSOLocal(F->isDSOLocal());
  Var->setVisibility(F->getVisibility());
  auto *FD = cast<FunctionDecl>(GD.getDecl());
  auto *FT = FD->getPrimaryTemplate();
  if (!FT || FT->isThisDeclarationADefinition())
    CGM.maybeSetTrivialComdat(*FD, *Var);
  KernelHandles[F->getName()] = Var;
  KernelStubs[Var] = F;
  return Var;
}