CGCUDANV.cpp
//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a class for CUDA code generation targeting the NVIDIA CUDA
// runtime library.
//
//===----------------------------------------------------------------------===//

#include "CGCUDARuntime.h"
#include "CGCXXABI.h"
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "clang/AST/Decl.h"
#include "clang/Basic/Cuda.h"
#include "clang/CodeGen/CodeGenABITypes.h"
#include "clang/CodeGen/ConstantInitBuilder.h"
#include "llvm/Frontend/Offloading/Utility.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/VirtualFileSystem.h"

using namespace clang;
using namespace CodeGen;

namespace {
constexpr unsigned CudaFatMagic = 0x466243b1;
constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"

class CGNVCUDARuntime : public CGCUDARuntime {

private:
  llvm::IntegerType *IntTy, *SizeTy;
  llvm::Type *VoidTy;
  llvm::PointerType *PtrTy;

  /// Convenience reference to LLVM Context
  llvm::LLVMContext &Context;
  /// Convenience reference to the current module
  llvm::Module &TheModule;
  /// Keeps track of kernel launch stubs and handles emitted in this module
  struct KernelInfo {
    llvm::Function *Kernel; // stub function to help launch kernel
    const Decl *D;
  };
  llvm::SmallVector<KernelInfo, 16> EmittedKernels;
  // Map a kernel mangled name to a symbol for identifying the kernel in host
  // code. For CUDA, the symbol for identifying the kernel is the same as the
  // device stub function. For HIP, they are different.
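  // (Illustrative sketch: for a HIP kernel foo(), the handle is a global
  // variable named with foo's kernel mangling, and its initializer is set to
  // the device stub function in emitDeviceStub(); see getKernelHandle().)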
  llvm::DenseMap<StringRef, llvm::GlobalValue *> KernelHandles;
  // Map a kernel handle to the kernel stub.
  llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
  struct VarInfo {
    llvm::GlobalVariable *Var;
    const VarDecl *D;
    DeviceVarFlags Flags;
  };
  llvm::SmallVector<VarInfo, 16> DeviceVars;
  /// Keeps track of variable containing handle of GPU binary. Populated by
  /// ModuleCtorFunction() and used to create corresponding cleanup calls in
  /// ModuleDtorFunction()
  llvm::GlobalVariable *GpuBinaryHandle = nullptr;
  /// Whether we generate relocatable device code.
  bool RelocatableDeviceCode;
  /// Mangle context for device.
  std::unique_ptr<MangleContext> DeviceMC;
  /// Some zeros used for GEPs.
  llvm::Constant *Zeros[2];

  llvm::FunctionCallee getSetupArgumentFn() const;
  llvm::FunctionCallee getLaunchFn() const;

  llvm::FunctionType *getRegisterGlobalsFnTy() const;
  llvm::FunctionType *getCallbackFnTy() const;
  llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
  std::string addPrefixToName(StringRef FuncName) const;
  std::string addUnderscoredPrefixToName(StringRef FuncName) const;

  /// Creates a function to register all kernel stubs generated in this module.
  llvm::Function *makeRegisterGlobalsFn();

  /// Helper function that generates a constant string and returns a pointer to
  /// the start of the string. The result of this function can be used anywhere
  /// where the C code specifies const char*.
  llvm::Constant *makeConstantString(const std::string &Str,
                                     const std::string &Name = "") {
    auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
    return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
                                                ConstStr.getPointer(), Zeros);
  }

  /// Helper function which generates an initialized constant array from Str,
  /// and optionally sets section name and alignment. AddNull specifies whether
  /// the array should have NUL termination.
  llvm::Constant *makeConstantArray(StringRef Str,
                                    StringRef Name = "",
                                    StringRef SectionName = "",
                                    unsigned Alignment = 0,
                                    bool AddNull = false) {
    llvm::Constant *Value =
        llvm::ConstantDataArray::getString(Context, Str, AddNull);
    auto *GV = new llvm::GlobalVariable(
        TheModule, Value->getType(), /*isConstant=*/true,
        llvm::GlobalValue::PrivateLinkage, Value, Name);
    if (!SectionName.empty()) {
      GV->setSection(SectionName);
      // Mark the address as used, which makes sure that this section isn't
      // merged and we will really have it in the object file.
      GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
    }
    if (Alignment)
      GV->setAlignment(llvm::Align(Alignment));
    return llvm::ConstantExpr::getGetElementPtr(GV->getValueType(), GV, Zeros);
  }

  /// Helper function that generates an empty dummy function returning void.
  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
    assert(FnTy->getReturnType()->isVoidTy() &&
           "Can only generate dummy functions returning void!");
    llvm::Function *DummyFunc = llvm::Function::Create(
        FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);

    llvm::BasicBlock *DummyBlock =
        llvm::BasicBlock::Create(Context, "", DummyFunc);
    CGBuilderTy FuncBuilder(CGM, Context);
    FuncBuilder.SetInsertPoint(DummyBlock);
    FuncBuilder.CreateRetVoid();

    return DummyFunc;
  }

  void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
  void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
  std::string getDeviceSideName(const NamedDecl *ND) override;

  void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, bool Constant) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Variable, Extern, Constant,
                           VD->hasAttr<HIPManagedAttr>(),
                           /*Normalized*/ false, 0}});
  }
  void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
                          bool Extern, int Type) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
                           /*Managed*/ false,
                           /*Normalized*/ false, Type}});
  }
  void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, int Type, bool Normalized) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
                           /*Managed*/ false, Normalized, Type}});
  }

  /// Creates module constructor function
  llvm::Function *makeModuleCtorFunction();
  /// Creates module destructor function
  llvm::Function *makeModuleDtorFunction();
  /// Transform managed variables for device compilation.
  void transformManagedVars();
  /// Create offloading entries to register globals in RDC mode.
  void createOffloadingEntries();

public:
  CGNVCUDARuntime(CodeGenModule &CGM);

  llvm::GlobalValue *getKernelHandle(llvm::Function *F, GlobalDecl GD) override;
  llvm::Function *getKernelStub(llvm::GlobalValue *Handle) override {
    auto Loc = KernelStubs.find(Handle);
    assert(Loc != KernelStubs.end());
    return Loc->second;
  }
  void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
  void handleVarRegistration(const VarDecl *VD,
                             llvm::GlobalVariable &Var) override;
  void
  internalizeDeviceSideVar(const VarDecl *D,
                           llvm::GlobalValue::LinkageTypes &Linkage) override;

  llvm::Function *finalizeModule() override;
};

} // end anonymous namespace

std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("hip") + Twine(FuncName)).str());
  return ((Twine("cuda") + Twine(FuncName)).str());
}
std::string
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("__hip") + Twine(FuncName)).str());
  return ((Twine("__cuda") + Twine(FuncName)).str());
}

static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) {
  // If the host and device have different C++ ABIs, mark it as the device
  // mangle context so that the mangling needs to retrieve the additional
  // device lambda mangling number instead of the regular host one.
  if (CGM.getContext().getAuxTargetInfo() &&
      CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
      CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) {
    return std::unique_ptr<MangleContext>(
        CGM.getContext().createDeviceMangleContext(
            *CGM.getContext().getAuxTargetInfo()));
  }

  return std::unique_ptr<MangleContext>(CGM.getContext().createMangleContext(
      CGM.getContext().getAuxTargetInfo()));
}

CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
      TheModule(CGM.getModule()),
      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
      DeviceMC(InitDeviceMC(CGM)) {
  IntTy = CGM.IntTy;
  SizeTy = CGM.SizeTy;
  VoidTy = CGM.VoidTy;
  Zeros[0] = llvm::ConstantInt::get(SizeTy, 0);
  Zeros[1] = Zeros[0];
  PtrTy = CGM.UnqualPtrTy;
}

llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
  // cudaError_t cudaSetupArgument(void *, size_t, size_t)
  llvm::Type *Params[] = {PtrTy, SizeTy, SizeTy};
  return CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, Params, false),
      addPrefixToName("SetupArgument"));
}

llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
  if (CGM.getLangOpts().HIP) {
    // hipError_t hipLaunchByPtr(char *);
    return CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(IntTy, PtrTy, false), "hipLaunchByPtr");
  }
  // cudaError_t cudaLaunch(char *);
  return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy, PtrTy, false),
                                   "cudaLaunch");
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
  return llvm::FunctionType::get(VoidTy, PtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
  return llvm::FunctionType::get(VoidTy, PtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
  llvm::Type *Params[] = {llvm::PointerType::getUnqual(Context), PtrTy, PtrTy,
                          llvm::PointerType::getUnqual(Context)};
  return llvm::FunctionType::get(VoidTy, Params, false);
}

std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
  GlobalDecl GD;
  // D could be either a kernel or a variable.
  if (auto *FD = dyn_cast<FunctionDecl>(ND))
    GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
  else
    GD = GlobalDecl(ND);
  std::string DeviceSideName;
  MangleContext *MC;
  if (CGM.getLangOpts().CUDAIsDevice)
    MC = &CGM.getCXXABI().getMangleContext();
  else
    MC = DeviceMC.get();
  if (MC->shouldMangleDeclName(ND)) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    MC->mangleName(GD, Out);
    DeviceSideName = std::string(Out.str());
  } else
    DeviceSideName = std::string(ND->getIdentifier()->getName());

  // Make unique name for device side static file-scope variable for HIP.
  if (CGM.getContext().shouldExternalize(ND) &&
      CGM.getLangOpts().GPURelocatableDeviceCode) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    Out << DeviceSideName;
    CGM.printPostfixForExternalizedDecl(Out, ND);
    DeviceSideName = std::string(Out.str());
  }
  return DeviceSideName;
}

void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
                                     FunctionArgList &Args) {
  EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
  if (auto *GV =
          dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn->getName()])) {
    GV->setLinkage(CGF.CurFn->getLinkage());
    GV->setInitializer(CGF.CurFn);
  }
  if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                         CudaFeature::CUDA_USES_NEW_LAUNCH) ||
      (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
    emitDeviceStubBodyNew(CGF, Args);
  else
    emitDeviceStubBodyLegacy(CGF, Args);
}

// CUDA 9.0+ uses a new way to launch kernels: parameters are packed into a
// local array and the kernel is launched using cudaLaunchKernel().
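//
// A rough source-level sketch of the stub this emits (illustrative only; the
// real code is generated as IR, and the names here are hypothetical):
//   void __device_stub__kernel(int a, float b) {
//     void *kernel_args[2] = {&a, &b};
//     dim3 grid_dim, block_dim; size_t shmem_size; cudaStream_t stream;
//     __cudaPopCallConfiguration(&grid_dim, &block_dim, &shmem_size, &stream);
//     cudaLaunchKernel((const void *)handle, grid_dim, block_dim, kernel_args,
//                      shmem_size, stream);
//   }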
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
                                            FunctionArgList &Args) {
  // Build the shadow stack entry at the very start of the function.

  // Calculate amount of space we will need for all arguments. If we have no
  // args, allocate a single pointer so we still have a valid pointer to the
  // argument array that we can pass to runtime, even if it will be unused.
  Address KernelArgs = CGF.CreateTempAlloca(
      PtrTy, CharUnits::fromQuantity(16), "kernel_args",
      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
  // Store pointers to the arguments in a locally allocated launch_args.
  for (unsigned i = 0; i < Args.size(); ++i) {
    llvm::Value *VarPtr = CGF.GetAddrOfLocalVar(Args[i]).emitRawPointer(CGF);
    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, PtrTy);
    CGF.Builder.CreateDefaultAlignedStore(
        VoidVarPtr, CGF.Builder.CreateConstGEP1_32(
                        PtrTy, KernelArgs.emitRawPointer(CGF), i));
  }

  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");

  // Lookup cudaLaunchKernel/hipLaunchKernel function.
  // HIP kernel launching API name depends on -fgpu-default-stream option. For
  // the default value 'legacy', it is hipLaunchKernel. For 'per-thread',
  // it is hipLaunchKernel_spt.
  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                              void **args, size_t sharedMem,
  //                              cudaStream_t stream);
  // hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim,
  //                                  dim3 blockDim, void **args,
  //                                  size_t sharedMem, hipStream_t stream);
  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
  std::string KernelLaunchAPI = "LaunchKernel";
  if (CGF.getLangOpts().GPUDefaultStream ==
      LangOptions::GPUDefaultStreamKind::PerThread) {
    if (CGF.getLangOpts().HIP)
      KernelLaunchAPI = KernelLaunchAPI + "_spt";
    else if (CGF.getLangOpts().CUDA)
      KernelLaunchAPI = KernelLaunchAPI + "_ptsz";
  }
  auto LaunchKernelName = addPrefixToName(KernelLaunchAPI);
  const IdentifierInfo &cudaLaunchKernelII =
      CGM.getContext().Idents.get(LaunchKernelName);
  FunctionDecl *cudaLaunchKernelFD = nullptr;
  for (auto *Result : DC->lookup(&cudaLaunchKernelII)) {
    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
      cudaLaunchKernelFD = FD;
  }

  if (cudaLaunchKernelFD == nullptr) {
    CGM.Error(CGF.CurFuncDecl->getLocation(),
              "Can't find declaration for " + LaunchKernelName);
    return;
  }
  // Create temporary dim3 grid_dim, block_dim.
  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
  QualType Dim3Ty = GridDimParam->getType();
  Address GridDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
  Address BlockDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
  Address ShmemSize =
      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
  Address Stream = CGF.CreateTempAlloca(PtrTy, CGM.getPointerAlign(), "stream");
  llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy,
                              {/*gridDim=*/GridDim.getType(),
                               /*blockDim=*/BlockDim.getType(),
                               /*ShmemSize=*/ShmemSize.getType(),
                               /*Stream=*/Stream.getType()},
                              /*isVarArg=*/false),
      addUnderscoredPrefixToName("PopCallConfiguration"));

  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, {GridDim.emitRawPointer(CGF),
                                                BlockDim.emitRawPointer(CGF),
                                                ShmemSize.emitRawPointer(CGF),
                                                Stream.emitRawPointer(CGF)});

  // Emit the call to cudaLaunch
  llvm::Value *Kernel =
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn->getName()], PtrTy);
  CallArgList LaunchKernelArgs;
  LaunchKernelArgs.add(RValue::get(Kernel),
                       cudaLaunchKernelFD->getParamDecl(0)->getType());
  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::get(KernelArgs, CGF),
                       cudaLaunchKernelFD->getParamDecl(3)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
                       cudaLaunchKernelFD->getParamDecl(4)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
                       cudaLaunchKernelFD->getParamDecl(5)->getType());

  QualType QT = cudaLaunchKernelFD->getType();
  QualType CQT = QT.getCanonicalType();
  llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
  llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);

  const CGFunctionInfo &FI =
      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
  llvm::FunctionCallee cudaLaunchKernelFn =
      CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
               LaunchKernelArgs);

  // To prevent CUDA device stub functions from being merged by ICF in the MSVC
  // environment, create a unique global variable for each kernel and write to
  // the variable in the device stub.
  if (CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
      !CGF.getLangOpts().HIP) {
    llvm::Function *KernelFunction = llvm::cast<llvm::Function>(Kernel);
    std::string GlobalVarName = (KernelFunction->getName() + ".id").str();

    llvm::GlobalVariable *HandleVar =
        CGM.getModule().getNamedGlobal(GlobalVarName);
    if (!HandleVar) {
      HandleVar = new llvm::GlobalVariable(
          CGM.getModule(), CGM.Int8Ty,
          /*Constant=*/false, KernelFunction->getLinkage(),
          llvm::ConstantInt::get(CGM.Int8Ty, 0), GlobalVarName);
      HandleVar->setDSOLocal(KernelFunction->isDSOLocal());
      HandleVar->setVisibility(KernelFunction->getVisibility());
      if (KernelFunction->hasComdat())
        HandleVar->setComdat(CGM.getModule().getOrInsertComdat(GlobalVarName));
    }

    CGF.Builder.CreateAlignedStore(llvm::ConstantInt::get(CGM.Int8Ty, 1),
                                   HandleVar, CharUnits::One(),
                                   /*IsVolatile=*/true);
  }

  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
                                               FunctionArgList &Args) {
  // Emit a call to cudaSetupArgument for each arg in Args.
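  // A source-level sketch of the loop below (illustrative only; offsets come
  // from each argument's size and alignment):
  //   if (cudaSetupArgument(&arg0, sizeof(arg0), /*offset=*/0) == 0)
  //     if (cudaSetupArgument(&arg1, sizeof(arg1), offset1) == 0)
  //       cudaLaunch((char *)stub);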
  llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
  CharUnits Offset = CharUnits::Zero();
  for (const VarDecl *A : Args) {
    auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
    Offset = Offset.alignTo(TInfo.Align);
    llvm::Value *Args[] = {
        CGF.Builder.CreatePointerCast(
            CGF.GetAddrOfLocalVar(A).emitRawPointer(CGF), PtrTy),
        llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
        llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
    };
    llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
    llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
    llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
    CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
    CGF.EmitBlock(NextBlock);
    Offset += TInfo.Width;
  }

  // Emit the call to cudaLaunch
  llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
  llvm::Value *Arg =
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn->getName()], PtrTy);
  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

// Replace the original variable Var with the address loaded from variable
// ManagedVar populated by HIP runtime.
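// For example (illustrative IR sketch), a use such as
//   store i32 1, ptr @Var
// becomes
//   %ld.managed = load ptr, ptr @ManagedVar
//   store i32 1, ptr %ld.managed
// Constant-expression users are first rewritten into instructions so the
// load can be inserted before them.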
static void replaceManagedVar(llvm::GlobalVariable *Var,
                              llvm::GlobalVariable *ManagedVar) {
  SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
  for (auto &&VarUse : Var->uses()) {
    WorkList.push_back({VarUse.getUser()});
  }
  while (!WorkList.empty()) {
    auto &&WorkItem = WorkList.pop_back_val();
    auto *U = WorkItem.back();
    if (isa<llvm::ConstantExpr>(U)) {
      for (auto &&UU : U->uses()) {
        WorkItem.push_back(UU.getUser());
        WorkList.push_back(WorkItem);
        WorkItem.pop_back();
      }
      continue;
    }
    if (auto *I = dyn_cast<llvm::Instruction>(U)) {
      llvm::Value *OldV = Var;
      llvm::Instruction *NewV =
          new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false,
                             llvm::Align(Var->getAlignment()), I);
      WorkItem.pop_back();
      // Replace constant expressions directly or indirectly using the managed
      // variable with instructions.
      for (auto &&Op : WorkItem) {
        auto *CE = cast<llvm::ConstantExpr>(Op);
        auto *NewInst = CE->getAsInstruction();
        NewInst->insertBefore(*I->getParent(), I->getIterator());
        NewInst->replaceUsesOfWith(OldV, NewV);
        OldV = CE;
        NewV = NewInst;
      }
      I->replaceUsesOfWith(OldV, NewV);
    } else {
      llvm_unreachable("Invalid use of managed variable");
    }
  }
}

/// Creates a function that sets up state on the host side for CUDA objects that
/// have a presence on both the host and device sides. Specifically, registers
/// the host side of kernel functions and device global variables with the CUDA
/// runtime.
/// \code
/// void __cuda_register_globals(void** GpuBinaryHandle) {
///   __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
///   ...
///   __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
///   __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
///   ...
///   __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
  // No need to register anything
  if (EmittedKernels.empty() && DeviceVars.empty())
    return nullptr;

  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_register_globals"), &TheModule);
  llvm::BasicBlock *EntryBB =
      llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
  CGBuilderTy Builder(CGM, Context);
  Builder.SetInsertPoint(EntryBB);

  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
  //                             int, uint3*, uint3*, dim3*, dim3*, int*)
  llvm::Type *RegisterFuncParams[] = {
      PtrTy, PtrTy, PtrTy, PtrTy, IntTy,
      PtrTy, PtrTy, PtrTy, PtrTy, llvm::PointerType::getUnqual(Context)};
  llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
      addUnderscoredPrefixToName("RegisterFunction"));

  // Extract GpuBinaryHandle passed as the first argument to
  // __cuda_register_globals() and generate __cudaRegisterFunction() call for
  // each emitted kernel.
  llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
  for (auto &&I : EmittedKernels) {
    llvm::Constant *KernelName =
        makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(PtrTy);
    llvm::Value *Args[] = {
        &GpuBinaryHandlePtr,
        KernelHandles[I.Kernel->getName()],
        KernelName,
        KernelName,
        llvm::ConstantInt::get(IntTy, -1),
        NullPtr,
        NullPtr,
        NullPtr,
        NullPtr,
        llvm::ConstantPointerNull::get(llvm::PointerType::getUnqual(Context))};
    Builder.CreateCall(RegisterFunc, Args);
  }

  llvm::Type *VarSizeTy = IntTy;
  // For HIP or CUDA 9.0+, the device variable size is of type `size_t`.
  if (CGM.getLangOpts().HIP ||
      ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
    VarSizeTy = SizeTy;

  // void __cudaRegisterVar(void **, char *, char *, const char *,
  //                        int, int, int, int)
  llvm::Type *RegisterVarParams[] = {PtrTy, PtrTy,     PtrTy, PtrTy,
                                     IntTy, VarSizeTy, IntTy, IntTy};
  llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
      addUnderscoredPrefixToName("RegisterVar"));
  // void __hipRegisterManagedVar(void **, char *, char *, const char *,
  //                              size_t, unsigned)
  llvm::Type *RegisterManagedVarParams[] = {PtrTy, PtrTy,     PtrTy,
                                            PtrTy, VarSizeTy, IntTy};
  llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
      addUnderscoredPrefixToName("RegisterManagedVar"));
  // void __cudaRegisterSurface(void **, const struct surfaceReference *,
  //                            const void **, const char *, int, int);
  llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy}, false),
      addUnderscoredPrefixToName("RegisterSurface"));
  // void __cudaRegisterTexture(void **, const struct textureReference *,
  //                            const void **, const char *, int, int, int)
  llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy, IntTy}, false),
      addUnderscoredPrefixToName("RegisterTexture"));
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
           "External variables should not show up here, except HIP managed "
           "variables");
    llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
    switch (Info.Flags.getKind()) {
    case DeviceVarFlags::Variable: {
      uint64_t VarSize =
          CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
      if (Info.Flags.isManaged()) {
        assert(Var->getName().ends_with(".managed") &&
               "HIP managed variables not transformed");
        auto *ManagedVar = CGM.getModule().getNamedGlobal(
            Var->getName().drop_back(StringRef(".managed").size()));
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            ManagedVar,
            Var,
            VarName,
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Var->getAlignment())};
        if (!Var->isDeclaration())
          Builder.CreateCall(RegisterManagedVar, Args);
      } else {
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            Var,
            VarName,
            VarName,
            llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
            llvm::ConstantInt::get(IntTy, 0)};
        Builder.CreateCall(RegisterVar, Args);
      }
      break;
    }
    case DeviceVarFlags::Surface:
      Builder.CreateCall(
          RegisterSurf,
          {&GpuBinaryHandlePtr, Var, VarName, VarName,
           llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    case DeviceVarFlags::Texture:
      Builder.CreateCall(
          RegisterTex,
          {&GpuBinaryHandlePtr, Var, VarName, VarName,
           llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    }
  }

  Builder.CreateRetVoid();
  return RegisterKernelsFunc;
}

/// Creates a global constructor function for the module:
///
/// For CUDA:
/// \code
/// void __cuda_module_ctor() {
///   Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
///   __cuda_register_globals(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_ctor() {
///   if (__hip_gpubin_handle == 0) {
///     __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
///     __hip_register_globals(__hip_gpubin_handle);
///   }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
  bool IsHIP = CGM.getLangOpts().HIP;
  bool IsCUDA = CGM.getLangOpts().CUDA;
  // No need to generate ctors/dtors if there is no GPU binary.
  StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
  if (CudaGpuBinaryFileName.empty() && !IsHIP)
    return nullptr;
  if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
      DeviceVars.empty())
    return nullptr;

  // void __{cuda|hip}_register_globals(void* handle);
  llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
  // We always need a function to pass in as callback. Create a dummy
  // implementation if we don't need to register anything.
  if (RelocatableDeviceCode && !RegisterGlobalsFunc)
    RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());

  // void ** __{cuda|hip}RegisterFatBinary(void *);
  llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(PtrTy, PtrTy, false),
      addUnderscoredPrefixToName("RegisterFatBinary"));
  // struct { int magic, int version, void * gpu_binary, void * dont_care };
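  // For CUDA, for example, the wrapper below ends up initialized roughly as
  // { 0x466243b1 /*CudaFatMagic*/, 1 /*version*/, &fatbin_blob, nullptr },
  // where `fatbin_blob` stands in for the embedded fat binary constant.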
  llvm::StructType *FatbinWrapperTy =
      llvm::StructType::get(IntTy, IntTy, PtrTy, PtrTy);

  // Register GPU binary with the CUDA runtime, store returned handle in a
  // global variable and save a reference in GpuBinaryHandle to be cleaned up
  // in destructor on exit. Then associate all known kernels with the GPU binary
  // handle so CUDA runtime can figure out what to call on the GPU side.
  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
  if (!CudaGpuBinaryFileName.empty()) {
    auto VFS = CGM.getFileSystem();
    auto CudaGpuBinaryOrErr =
        VFS->getBufferForFile(CudaGpuBinaryFileName, -1, false);
    if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
      CGM.getDiags().Report(diag::err_cannot_open_file)
          << CudaGpuBinaryFileName << EC.message();
      return nullptr;
    }
    CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
  }

  llvm::Function *ModuleCtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_ctor"), &TheModule);
  llvm::BasicBlock *CtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
  CGBuilderTy CtorBuilder(CGM, Context);

  CtorBuilder.SetInsertPoint(CtorEntryBB);

  const char *FatbinConstantName;
  const char *FatbinSectionName;
  const char *ModuleIDSectionName;
  StringRef ModuleIDPrefix;
  llvm::Constant *FatBinStr;
  unsigned FatMagic;
  if (IsHIP) {
    FatbinConstantName = ".hip_fatbin";
    FatbinSectionName = ".hipFatBinSegment";

    ModuleIDSectionName = "__hip_module_id";
    ModuleIDPrefix = "__hip_";

    if (CudaGpuBinary) {
      // If fatbin is available from early finalization, create a string
      // literal containing the fat binary loaded from the given file.
      const unsigned HIPCodeObjectAlign = 4096;
      FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "",
                                    FatbinConstantName, HIPCodeObjectAlign);
    } else {
      // If fatbin is not available, create an external symbol
      // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
      // to contain the fat binary but will be populated somewhere else,
      // e.g. by lld through link script.
      FatBinStr = new llvm::GlobalVariable(
          CGM.getModule(), CGM.Int8Ty,
          /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
          "__hip_fatbin_" + CGM.getContext().getCUIDHash(), nullptr,
          llvm::GlobalVariable::NotThreadLocal);
      cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
    }

    FatMagic = HIPFatMagic;
  } else {
    if (RelocatableDeviceCode)
      FatbinConstantName = CGM.getTriple().isMacOSX()
                               ? "__NV_CUDA,__nv_relfatbin"
                               : "__nv_relfatbin";
    else
      FatbinConstantName =
          CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
    // NVIDIA's cuobjdump looks for fatbins in this section.
    FatbinSectionName =
        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";

    ModuleIDSectionName = CGM.getTriple().isMacOSX()
                              ? "__NV_CUDA,__nv_module_id"
                              : "__nv_module_id";
    ModuleIDPrefix = "__nv_";

    // For CUDA, create a string literal containing the fat binary loaded from
    // the given file.
    FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "",
                                  FatbinConstantName, 8);
    FatMagic = CudaFatMagic;
  }

  // Create initialized wrapper structure that points to the loaded GPU binary
  ConstantInitBuilder Builder(CGM);
  auto Values = Builder.beginStruct(FatbinWrapperTy);
  // Fatbin wrapper magic.
  Values.addInt(IntTy, FatMagic);
  // Fatbin version.
  Values.addInt(IntTy, 1);
  // Data.
  Values.add(FatBinStr);
  // Unused in fatbin v1.
  Values.add(llvm::ConstantPointerNull::get(PtrTy));
  llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
      addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
      /*constant*/ true);
  FatbinWrapper->setSection(FatbinSectionName);

  // There is only one HIP fat binary per linked module, however there are
  // multiple constructor functions. Make sure the fat binary is registered
  // only once. The constructor functions are executed by the dynamic loader
  // before the program gains control. The dynamic loader cannot execute the
  // constructor functions concurrently since doing that would not guarantee
  // thread safety of the loaded program. Therefore we can assume sequential
  // execution of constructor functions here.
  if (IsHIP) {
    auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage
                                 : llvm::GlobalValue::ExternalLinkage;
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
    // The name, size, and initialization pattern of this variable are part
    // of the HIP ABI.
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, PtrTy, /*isConstant=*/false, Linkage,
        /*Initializer=*/
        CudaGpuBinary ? llvm::ConstantPointerNull::get(PtrTy) : nullptr,
        CudaGpuBinary
            ? "__hip_gpubin_handle"
            : "__hip_gpubin_handle_" + CGM.getContext().getCUIDHash());
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    // Prevent the weak symbol in different shared libraries from being merged.
    if (Linkage != llvm::GlobalValue::InternalLinkage)
      GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
    Address GpuBinaryAddr(
        GpuBinaryHandle, PtrTy,
        CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
    {
      auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
      llvm::Constant *Zero =
          llvm::Constant::getNullValue(HandleValue->getType());
      llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
      CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(IfBlock);
      // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
      llvm::CallInst *RegisterFatbinCall =
          CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
      CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
      CtorBuilder.CreateBr(ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(ExitBlock);
      // Call __hip_register_globals(GpuBinaryHandle);
      if (RegisterGlobalsFunc) {
        auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
        CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
      }
    }
  } else if (!RelocatableDeviceCode) {
    // Register binary with CUDA runtime. This is substantially different in
    // default mode vs. separate compilation!
    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
    llvm::CallInst *RegisterFatbinCall =
        CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, PtrTy, false, llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(PtrTy), "__cuda_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
                                   CGM.getPointerAlign());

    // Call __cuda_register_globals(GpuBinaryHandle);
    if (RegisterGlobalsFunc)
      CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);

    // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
    if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                           CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
      // void __cudaRegisterFatBinaryEnd(void **);
      llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
          llvm::FunctionType::get(VoidTy, PtrTy, false),
          "__cudaRegisterFatBinaryEnd");
      CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
    }
  } else {
    // Generate a unique module ID.
    SmallString<64> ModuleID;
    llvm::raw_svector_ostream OS(ModuleID);
    OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
    llvm::Constant *ModuleIDConstant = makeConstantArray(
        std::string(ModuleID), "", ModuleIDSectionName, 32, /*AddNull=*/true);

    // Create an alias for the FatbinWrapper that nvcc will look for.
    llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
                              Twine("__fatbinwrap") + ModuleID, FatbinWrapper);

    // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
    //                                           void *, void (*)(void **))
    SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
    RegisterLinkedBinaryName += ModuleID;
    llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
        getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);

    assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
    llvm::Value *Args[] = {RegisterGlobalsFunc, FatbinWrapper, ModuleIDConstant,
                           makeDummyFunction(getCallbackFnTy())};
    CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
  }

  // Create destructor and register it with atexit() the way NVCC does it. Doing
  // it during regular destructor phase worked in CUDA before 9.2 but results in
  // double-free in 9.2.
  if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
    // extern "C" int atexit(void (*f)(void));
    llvm::FunctionType *AtExitTy =
        llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
    llvm::FunctionCallee AtExitFunc =
        CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
                                  /*Local=*/true);
    CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
  }

  CtorBuilder.CreateRetVoid();
  return ModuleCtorFunc;
}

/// Creates a global destructor function that unregisters the GPU code blob
/// registered by constructor.
///
/// For CUDA:
/// \code
/// void __cuda_module_dtor() {
///   __cudaUnregisterFatBinary(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_dtor() {
///   if (__hip_gpubin_handle) {
///     __hipUnregisterFatBinary(__hip_gpubin_handle);
///     __hip_gpubin_handle = 0;
///   }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
  // No need for destructor if we don't have a handle to unregister.
  if (!GpuBinaryHandle)
    return nullptr;

  // void __cudaUnregisterFatBinary(void ** handle);
  llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, PtrTy, false),
      addUnderscoredPrefixToName("UnregisterFatBinary"));

  llvm::Function *ModuleDtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_dtor"), &TheModule);

  llvm::BasicBlock *DtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
  CGBuilderTy DtorBuilder(CGM, Context);
  DtorBuilder.SetInsertPoint(DtorEntryBB);

  Address GpuBinaryAddr(
      GpuBinaryHandle, GpuBinaryHandle->getValueType(),
      CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
  auto *HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
  // There is only one HIP fat binary per linked module, however there are
  // multiple destructor functions. Make sure the fat binary is unregistered
  // only once.
  if (CGM.getLangOpts().HIP) {
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
    llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
    llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
    DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);

    DtorBuilder.SetInsertPoint(IfBlock);
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
    DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
    DtorBuilder.CreateBr(ExitBlock);

    DtorBuilder.SetInsertPoint(ExitBlock);
  } else {
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
  }
  DtorBuilder.CreateRetVoid();
  return ModuleDtorFunc;
}

CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
  return new CGNVCUDARuntime(CGM);
}

void CGNVCUDARuntime::internalizeDeviceSideVar(
    const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) {
  // For -fno-gpu-rdc, host-side shadows of external declarations of device-side
  // global variables become internal definitions. These have to be internal in
  // order to prevent name conflicts with global host variables with the same
  // name in different TUs.
  //
  // For -fgpu-rdc, the shadow variables should not be internalized because
  // they may be accessed by a different TU.
  if (CGM.getLangOpts().GPURelocatableDeviceCode)
    return;

  // __shared__ variables are odd. Shadows do get created, but
  // they are not registered with the CUDA runtime, so they
  // can't really be used to access their device-side
  // counterparts. It's not clear yet whether it's nvcc's bug or
  // a feature, but we've got to do the same for compatibility.
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
      D->hasAttr<CUDASharedAttr>() ||
      D->getType()->isCUDADeviceBuiltinSurfaceType() ||
      D->getType()->isCUDADeviceBuiltinTextureType()) {
    Linkage = llvm::GlobalValue::InternalLinkage;
  }
}

void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
                                            llvm::GlobalVariable &GV) {
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) {
    // Shadow variables and their properties must be registered with CUDA
    // runtime. Skip Extern global variables, which will be registered in
    // the TU where they are defined.
    //
    // Don't register a C++17 inline variable. The local symbol can be
    // discarded and referencing a discarded local symbol from outside the
    // comdat (__cuda_register_globals) is disallowed by the ELF spec.
    //
    // HIP managed variables need to be always recorded in device and host
    // compilations for transformation.
    //
    // HIP managed variables and variables in CUDADeviceVarODRUsedByHost are
    // added to llvm.compiler-used, therefore they are safe to be registered.
    if ((!D->hasExternalStorage() && !D->isInline()) ||
        CGM.getContext().CUDADeviceVarODRUsedByHost.contains(D) ||
        D->hasAttr<HIPManagedAttr>()) {
      registerDeviceVar(D, GV, !D->hasDefinition(),
                        D->hasAttr<CUDAConstantAttr>());
    }
  } else if (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
             D->getType()->isCUDADeviceBuiltinTextureType()) {
    // Builtin surfaces and textures and their template arguments are
    // also registered with CUDA runtime.
    const auto *TD = cast<ClassTemplateSpecializationDecl>(
        D->getType()->castAs<RecordType>()->getDecl());
    const TemplateArgumentList &Args = TD->getTemplateArgs();
    if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
      assert(Args.size() == 2 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin surface type.");
      auto SurfType = Args[1].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceSurf(D, GV, !D->hasDefinition(), SurfType.getSExtValue());
    } else {
      assert(Args.size() == 3 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin texture type.");
      auto TexType = Args[1].getAsIntegral();
      auto Normalized = Args[2].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceTex(D, GV, !D->hasDefinition(), TexType.getSExtValue(),
                          Normalized.getZExtValue());
    }
  }
}

// Transform managed variables to pointers to managed variables in device code.
// Each use of the original managed variable is replaced by a load from the
// transformed managed variable. The transformed managed variable contains
// the address of managed memory which will be allocated by the runtime.
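// For example (illustrative sketch): for a managed variable @x, a new pointer
// variable is created and takes over the name @x, while the original is
// renamed @x.managed. A use such as `load i32, ptr @x` then becomes
//   %ld.managed = load ptr, ptr @x
//   %v = load i32, ptr %ld.managed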
void CGNVCUDARuntime::transformManagedVars() {
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
        Info.Flags.isManaged()) {
      auto *ManagedVar = new llvm::GlobalVariable(
          CGM.getModule(), Var->getType(),
          /*isConstant=*/false, Var->getLinkage(),
          /*Init=*/Var->isDeclaration()
              ? nullptr
              : llvm::ConstantPointerNull::get(Var->getType()),
          /*Name=*/"", /*InsertBefore=*/nullptr,
          llvm::GlobalVariable::NotThreadLocal,
          CGM.getContext().getTargetAddressSpace(CGM.getLangOpts().CUDAIsDevice
                                                     ? LangAS::cuda_device
                                                     : LangAS::Default));
      ManagedVar->setDSOLocal(Var->isDSOLocal());
      ManagedVar->setVisibility(Var->getVisibility());
      ManagedVar->setExternallyInitialized(true);
      replaceManagedVar(Var, ManagedVar);
      ManagedVar->takeName(Var);
      Var->setName(Twine(ManagedVar->getName()) + ".managed");
      // Keep managed variables even if they are not used in device code since
      // they need to be allocated by the runtime.
      if (CGM.getLangOpts().CUDAIsDevice && !Var->isDeclaration()) {
        assert(!ManagedVar->isDeclaration());
        CGM.addCompilerUsedGlobal(Var);
        CGM.addCompilerUsedGlobal(ManagedVar);
      }
    }
  }
}

// Creates offloading entries for all the kernels and globals that must be
// registered. The linker will provide a pointer to this section so we can
// register the symbols with the linked device image.
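// Each entry is a small descriptor record, roughly {addr, name, size, flags,
// data} in the layout defined by llvm/Frontend/Offloading, placed in the
// section named below. (A sketch of the shape, not the exact struct.)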
void CGNVCUDARuntime::createOffloadingEntries() {
  StringRef Section = CGM.getLangOpts().HIP ? "hip_offloading_entries"
                                            : "cuda_offloading_entries";
  llvm::Module &M = CGM.getModule();
  for (KernelInfo &I : EmittedKernels)
    llvm::offloading::emitOffloadingEntry(
        M, KernelHandles[I.Kernel->getName()],
        getDeviceSideName(cast<NamedDecl>(I.D)), /*Flags=*/0, /*Data=*/0,
        llvm::offloading::OffloadGlobalEntry, Section);

  for (VarInfo &I : DeviceVars) {
    uint64_t VarSize =
        CGM.getDataLayout().getTypeAllocSize(I.Var->getValueType());
    int32_t Flags =
        (I.Flags.isExtern()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalExtern)
             : 0) |
        (I.Flags.isConstant()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalConstant)
             : 0) |
        (I.Flags.isNormalized()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalNormalized)
             : 0);
    if (I.Flags.getKind() == DeviceVarFlags::Variable) {
      llvm::offloading::emitOffloadingEntry(
          M, I.Var, getDeviceSideName(I.D), VarSize,
          (I.Flags.isManaged() ? llvm::offloading::OffloadGlobalManagedEntry
                               : llvm::offloading::OffloadGlobalEntry) |
              Flags,
          /*Data=*/0, Section);
    } else if (I.Flags.getKind() == DeviceVarFlags::Surface) {
      llvm::offloading::emitOffloadingEntry(
          M, I.Var, getDeviceSideName(I.D), VarSize,
          llvm::offloading::OffloadGlobalSurfaceEntry | Flags,
          I.Flags.getSurfTexType(), Section);
    } else if (I.Flags.getKind() == DeviceVarFlags::Texture) {
      llvm::offloading::emitOffloadingEntry(
          M, I.Var, getDeviceSideName(I.D), VarSize,
          llvm::offloading::OffloadGlobalTextureEntry | Flags,
          I.Flags.getSurfTexType(), Section);
    }
  }
}

// Returns module constructor to be added.
llvm::Function *CGNVCUDARuntime::finalizeModule() {
  transformManagedVars();
  if (CGM.getLangOpts().CUDAIsDevice) {
    // Mark ODR-used device variables as compiler used to prevent them from
    // being eliminated by optimization. This is necessary for device variables
    // ODR-used by host functions. Sema correctly marks them as ODR-used no
    // matter whether they are ODR-used by device or host functions.
    //
    // We do not need to do this if the variable has used attribute since it
    // has already been added.
    //
    // Static device variables have been externalized at this point, therefore
    // variables with LLVM private or internal linkage need not be added.
    for (auto &&Info : DeviceVars) {
      auto Kind = Info.Flags.getKind();
      if (!Info.Var->isDeclaration() &&
          !llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) &&
          (Kind == DeviceVarFlags::Variable ||
           Kind == DeviceVarFlags::Surface ||
           Kind == DeviceVarFlags::Texture) &&
          Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) {
        CGM.addCompilerUsedGlobal(Info.Var);
      }
    }
    return nullptr;
  }
  if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
    createOffloadingEntries();
  else
    return makeModuleCtorFunction();

  return nullptr;
}

llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
                                                    GlobalDecl GD) {
  auto Loc = KernelHandles.find(F->getName());
  if (Loc != KernelHandles.end()) {
    auto OldHandle = Loc->second;
    if (KernelStubs[OldHandle] == F)
      return OldHandle;

    // We've found the function name, but F itself has changed, so we need to
    // update the references.
    if (CGM.getLangOpts().HIP) {
      // For HIP compilation the handle itself does not change, so we only need
      // to update the Stub value.
      KernelStubs[OldHandle] = F;
      return OldHandle;
    }
    // For non-HIP compilation, erase the old Stub and fall through to creating
    // new entries.
    KernelStubs.erase(OldHandle);
  }

  if (!CGM.getLangOpts().HIP) {
    KernelHandles[F->getName()] = F;
    KernelStubs[F] = F;
    return F;
  }

  auto *Var = new llvm::GlobalVariable(
      TheModule, F->getType(), /*isConstant=*/true, F->getLinkage(),
      /*Initializer=*/nullptr,
      CGM.getMangledName(
          GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel)));
  Var->setAlignment(CGM.getPointerAlign().getAsAlign());
  Var->setDSOLocal(F->isDSOLocal());
  Var->setVisibility(F->getVisibility());
  auto *FD = cast<FunctionDecl>(GD.getDecl());
  auto *FT = FD->getPrimaryTemplate();
  if (!FT || FT->isThisDeclarationADefinition())
    CGM.maybeSetTrivialComdat(*FD, *Var);
  KernelHandles[F->getName()] = Var;
  KernelStubs[Var] = F;
  return Var;
}