22#include "llvm/Frontend/Offloading/Utility.h"
23#include "llvm/IR/BasicBlock.h"
24#include "llvm/IR/Constants.h"
25#include "llvm/IR/DerivedTypes.h"
26#include "llvm/IR/ReplaceConstant.h"
27#include "llvm/Support/Format.h"
28#include "llvm/Support/VirtualFileSystem.h"
31using namespace CodeGen;
34constexpr unsigned CudaFatMagic = 0x466243b1;
35constexpr unsigned HIPFatMagic = 0x48495046;
40 llvm::IntegerType *IntTy, *SizeTy;
42 llvm::PointerType *PtrTy;
45 llvm::LLVMContext &Context;
47 llvm::Module &TheModule;
57 llvm::DenseMap<StringRef, llvm::GlobalValue *> KernelHandles;
59 llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
61 llvm::GlobalVariable *Var;
69 llvm::GlobalVariable *GpuBinaryHandle =
nullptr;
71 bool RelocatableDeviceCode;
73 std::unique_ptr<MangleContext> DeviceMC;
75 llvm::Constant *Zeros[2];
77 llvm::FunctionCallee getSetupArgumentFn()
const;
78 llvm::FunctionCallee getLaunchFn()
const;
80 llvm::FunctionType *getRegisterGlobalsFnTy()
const;
81 llvm::FunctionType *getCallbackFnTy()
const;
82 llvm::FunctionType *getRegisterLinkedBinaryFnTy()
const;
83 std::string addPrefixToName(StringRef FuncName)
const;
84 std::string addUnderscoredPrefixToName(StringRef FuncName)
const;
87 llvm::Function *makeRegisterGlobalsFn();
92 llvm::Constant *makeConstantString(
const std::string &Str,
93 const std::string &Name =
"") {
94 auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
95 return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
96 ConstStr.getPointer(), Zeros);
102 llvm::Constant *makeConstantArray(StringRef Str,
104 StringRef SectionName =
"",
105 unsigned Alignment = 0,
106 bool AddNull =
false) {
107 llvm::Constant *
Value =
108 llvm::ConstantDataArray::getString(Context, Str, AddNull);
109 auto *GV =
new llvm::GlobalVariable(
111 llvm::GlobalValue::PrivateLinkage,
Value, Name);
112 if (!SectionName.empty()) {
113 GV->setSection(SectionName);
116 GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
119 GV->setAlignment(llvm::Align(Alignment));
120 return llvm::ConstantExpr::getGetElementPtr(GV->getValueType(), GV, Zeros);
124 llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
125 assert(FnTy->getReturnType()->isVoidTy() &&
126 "Can only generate dummy functions returning void!");
127 llvm::Function *DummyFunc = llvm::Function::Create(
128 FnTy, llvm::GlobalValue::InternalLinkage,
"dummy", &TheModule);
130 llvm::BasicBlock *DummyBlock =
131 llvm::BasicBlock::Create(Context,
"", DummyFunc);
133 FuncBuilder.SetInsertPoint(DummyBlock);
134 FuncBuilder.CreateRetVoid();
143 void registerDeviceVar(
const VarDecl *VD, llvm::GlobalVariable &Var,
144 bool Extern,
bool Constant) {
145 DeviceVars.push_back({&Var,
148 VD->hasAttr<HIPManagedAttr>(),
151 void registerDeviceSurf(
const VarDecl *VD, llvm::GlobalVariable &Var,
152 bool Extern,
int Type) {
153 DeviceVars.push_back({&Var,
159 void registerDeviceTex(
const VarDecl *VD, llvm::GlobalVariable &Var,
160 bool Extern,
int Type,
bool Normalized) {
161 DeviceVars.push_back({&Var,
164 false, Normalized,
Type}});
168 llvm::Function *makeModuleCtorFunction();
170 llvm::Function *makeModuleDtorFunction();
172 void transformManagedVars();
174 void createOffloadingEntries();
180 llvm::Function *
getKernelStub(llvm::GlobalValue *Handle)
override {
181 auto Loc = KernelStubs.find(Handle);
182 assert(Loc != KernelStubs.end());
187 llvm::GlobalVariable &Var)
override;
190 llvm::GlobalValue::LinkageTypes &
Linkage)
override;
197std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName)
const {
198 if (CGM.getLangOpts().HIP)
199 return ((Twine(
"hip") + Twine(FuncName)).str());
200 return ((Twine(
"cuda") + Twine(FuncName)).str());
203CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName)
const {
204 if (CGM.getLangOpts().HIP)
205 return ((Twine(
"__hip") + Twine(FuncName)).str());
206 return ((Twine(
"__cuda") + Twine(FuncName)).str());
216 return std::unique_ptr<MangleContext>(
227 TheModule(CGM.getModule()),
228 RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
233 Zeros[0] = llvm::ConstantInt::get(SizeTy, 0);
238llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn()
const {
240 llvm::Type *Params[] = {PtrTy, SizeTy, SizeTy};
241 return CGM.CreateRuntimeFunction(
242 llvm::FunctionType::get(IntTy, Params,
false),
243 addPrefixToName(
"SetupArgument"));
246llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn()
const {
247 if (CGM.getLangOpts().HIP) {
249 return CGM.CreateRuntimeFunction(
250 llvm::FunctionType::get(IntTy, PtrTy,
false),
"hipLaunchByPtr");
253 return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy, PtrTy,
false),
257llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy()
const {
258 return llvm::FunctionType::get(VoidTy, PtrTy,
false);
261llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy()
const {
262 return llvm::FunctionType::get(VoidTy, PtrTy,
false);
265llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy()
const {
266 llvm::Type *Params[] = {llvm::PointerType::getUnqual(Context), PtrTy, PtrTy,
267 llvm::PointerType::getUnqual(Context)};
268 return llvm::FunctionType::get(VoidTy, Params,
false);
271std::string CGNVCUDARuntime::getDeviceSideName(
const NamedDecl *ND) {
274 if (
auto *FD = dyn_cast<FunctionDecl>(ND))
275 GD =
GlobalDecl(FD, KernelReferenceKind::Kernel);
278 std::string DeviceSideName;
280 if (CGM.getLangOpts().CUDAIsDevice)
281 MC = &CGM.getCXXABI().getMangleContext();
286 llvm::raw_svector_ostream Out(Buffer);
288 DeviceSideName = std::string(Out.str());
293 if (CGM.getContext().shouldExternalize(ND) &&
294 CGM.getLangOpts().GPURelocatableDeviceCode) {
296 llvm::raw_svector_ostream Out(Buffer);
297 Out << DeviceSideName;
298 CGM.printPostfixForExternalizedDecl(Out, ND);
299 DeviceSideName = std::string(Out.str());
301 return DeviceSideName;
308 dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.
CurFn->getName()])) {
309 GV->setLinkage(CGF.
CurFn->getLinkage());
310 GV->setInitializer(CGF.
CurFn);
313 CudaFeature::CUDA_USES_NEW_LAUNCH) ||
315 emitDeviceStubBodyNew(CGF, Args);
317 emitDeviceStubBodyLegacy(CGF, Args);
331 llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
333 for (
unsigned i = 0; i < Args.size(); ++i) {
335 llvm::Value *VoidVarPtr = CGF.
Builder.CreatePointerCast(VarPtr, PtrTy);
355 std::string KernelLaunchAPI =
"LaunchKernel";
357 LangOptions::GPUDefaultStreamKind::PerThread) {
359 KernelLaunchAPI = KernelLaunchAPI +
"_spt";
361 KernelLaunchAPI = KernelLaunchAPI +
"_ptsz";
363 auto LaunchKernelName = addPrefixToName(KernelLaunchAPI);
365 CGM.getContext().Idents.get(LaunchKernelName);
367 for (
auto *Result : DC->
lookup(&cudaLaunchKernelII)) {
369 cudaLaunchKernelFD = FD;
372 if (cudaLaunchKernelFD ==
nullptr) {
374 "Can't find declaration for " + LaunchKernelName);
387 llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
388 llvm::FunctionType::get(IntTy,
394 addUnderscoredPrefixToName(
"PopCallConfiguration"));
402 CGF.
Builder.CreatePointerCast(KernelHandles[CGF.
CurFn->getName()], PtrTy);
417 llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
418 llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);
421 CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
422 llvm::FunctionCallee cudaLaunchKernelFn =
423 CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
434 llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
437 for (
const VarDecl *A : Args) {
438 auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
439 Offset = Offset.alignTo(TInfo.Align);
440 llvm::Value *Args[] = {
443 llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
444 llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
447 llvm::Constant *
Zero = llvm::ConstantInt::get(IntTy, 0);
448 llvm::Value *CBZero = CGF.
Builder.CreateICmpEQ(CB, Zero);
450 CGF.
Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
452 Offset += TInfo.Width;
456 llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
458 CGF.
Builder.CreatePointerCast(KernelHandles[CGF.
CurFn->getName()], PtrTy);
468 llvm::GlobalVariable *ManagedVar) {
470 for (
auto &&VarUse : Var->uses()) {
471 WorkList.push_back({VarUse.getUser()});
473 while (!WorkList.empty()) {
474 auto &&WorkItem = WorkList.pop_back_val();
475 auto *
U = WorkItem.back();
476 if (isa<llvm::ConstantExpr>(
U)) {
477 for (
auto &&UU :
U->uses()) {
478 WorkItem.push_back(UU.getUser());
479 WorkList.push_back(WorkItem);
484 if (
auto *I = dyn_cast<llvm::Instruction>(
U)) {
485 llvm::Value *OldV = Var;
486 llvm::Instruction *NewV =
487 new llvm::LoadInst(Var->getType(), ManagedVar,
"ld.managed",
false,
488 llvm::Align(Var->getAlignment()), I);
492 for (
auto &&Op : WorkItem) {
493 auto *CE = cast<llvm::ConstantExpr>(Op);
494 auto *NewInst = CE->getAsInstruction(I);
495 NewInst->replaceUsesOfWith(OldV, NewV);
499 I->replaceUsesOfWith(OldV, NewV);
501 llvm_unreachable(
"Invalid use of managed variable");
520llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
522 if (EmittedKernels.empty() && DeviceVars.empty())
525 llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
526 getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
527 addUnderscoredPrefixToName(
"_register_globals"), &TheModule);
528 llvm::BasicBlock *EntryBB =
529 llvm::BasicBlock::Create(Context,
"entry", RegisterKernelsFunc);
531 Builder.SetInsertPoint(EntryBB);
535 llvm::Type *RegisterFuncParams[] = {
536 PtrTy, PtrTy, PtrTy, PtrTy, IntTy,
537 PtrTy, PtrTy, PtrTy, PtrTy, llvm::PointerType::getUnqual(Context)};
538 llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
539 llvm::FunctionType::get(IntTy, RegisterFuncParams,
false),
540 addUnderscoredPrefixToName(
"RegisterFunction"));
545 llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
546 for (
auto &&I : EmittedKernels) {
547 llvm::Constant *KernelName =
548 makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
549 llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(PtrTy);
550 llvm::Value *Args[] = {
552 KernelHandles[I.Kernel->getName()],
555 llvm::ConstantInt::get(IntTy, -1),
560 llvm::ConstantPointerNull::get(llvm::PointerType::getUnqual(Context))};
561 Builder.CreateCall(RegisterFunc, Args);
564 llvm::Type *VarSizeTy = IntTy;
566 if (CGM.getLangOpts().HIP ||
567 ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
572 llvm::Type *RegisterVarParams[] = {PtrTy, PtrTy, PtrTy, PtrTy,
573 IntTy, VarSizeTy, IntTy, IntTy};
574 llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
575 llvm::FunctionType::get(VoidTy, RegisterVarParams,
false),
576 addUnderscoredPrefixToName(
"RegisterVar"));
579 llvm::Type *RegisterManagedVarParams[] = {PtrTy, PtrTy, PtrTy,
580 PtrTy, VarSizeTy, IntTy};
581 llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
582 llvm::FunctionType::get(VoidTy, RegisterManagedVarParams,
false),
583 addUnderscoredPrefixToName(
"RegisterManagedVar"));
586 llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
587 llvm::FunctionType::get(
588 VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy},
false),
589 addUnderscoredPrefixToName(
"RegisterSurface"));
592 llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
593 llvm::FunctionType::get(
594 VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy, IntTy},
false),
595 addUnderscoredPrefixToName(
"RegisterTexture"));
596 for (
auto &&Info : DeviceVars) {
597 llvm::GlobalVariable *Var = Info.Var;
598 assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
599 "External variables should not show up here, except HIP managed "
601 llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
602 switch (Info.Flags.getKind()) {
603 case DeviceVarFlags::Variable: {
605 CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
606 if (Info.Flags.isManaged()) {
607 auto *ManagedVar =
new llvm::GlobalVariable(
608 CGM.getModule(), Var->getType(),
609 false, Var->getLinkage(),
612 : llvm::ConstantPointerNull::get(Var->getType()),
614 llvm::GlobalVariable::NotThreadLocal);
615 ManagedVar->setDSOLocal(Var->isDSOLocal());
616 ManagedVar->setVisibility(Var->getVisibility());
617 ManagedVar->setExternallyInitialized(
true);
618 ManagedVar->takeName(Var);
619 Var->setName(Twine(ManagedVar->getName() +
".managed"));
621 llvm::Value *Args[] = {
626 llvm::ConstantInt::get(VarSizeTy, VarSize),
627 llvm::ConstantInt::get(IntTy, Var->getAlignment())};
628 if (!Var->isDeclaration())
629 Builder.CreateCall(RegisterManagedVar, Args);
631 llvm::Value *Args[] = {
636 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
637 llvm::ConstantInt::get(VarSizeTy, VarSize),
638 llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
639 llvm::ConstantInt::get(IntTy, 0)};
640 Builder.CreateCall(RegisterVar, Args);
644 case DeviceVarFlags::Surface:
647 {&GpuBinaryHandlePtr, Var, VarName, VarName,
648 llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
649 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
651 case DeviceVarFlags::Texture:
654 {&GpuBinaryHandlePtr, Var, VarName, VarName,
655 llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
656 llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
657 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
662 Builder.CreateRetVoid();
663 return RegisterKernelsFunc;
685llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
686 bool IsHIP = CGM.getLangOpts().HIP;
687 bool IsCUDA = CGM.getLangOpts().CUDA;
689 StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
690 if (CudaGpuBinaryFileName.empty() && !IsHIP)
692 if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
697 llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
700 if (RelocatableDeviceCode && !RegisterGlobalsFunc)
701 RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());
704 llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
705 llvm::FunctionType::get(PtrTy, PtrTy,
false),
706 addUnderscoredPrefixToName(
"RegisterFatBinary"));
708 llvm::StructType *FatbinWrapperTy =
709 llvm::StructType::get(IntTy, IntTy, PtrTy, PtrTy);
715 std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary =
nullptr;
716 if (!CudaGpuBinaryFileName.empty()) {
717 auto VFS = CGM.getFileSystem();
718 auto CudaGpuBinaryOrErr =
719 VFS->getBufferForFile(CudaGpuBinaryFileName, -1,
false);
720 if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
721 CGM.getDiags().Report(diag::err_cannot_open_file)
722 << CudaGpuBinaryFileName << EC.message();
725 CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
728 llvm::Function *ModuleCtorFunc = llvm::Function::Create(
729 llvm::FunctionType::get(VoidTy,
false),
730 llvm::GlobalValue::InternalLinkage,
731 addUnderscoredPrefixToName(
"_module_ctor"), &TheModule);
732 llvm::BasicBlock *CtorEntryBB =
733 llvm::BasicBlock::Create(Context,
"entry", ModuleCtorFunc);
736 CtorBuilder.SetInsertPoint(CtorEntryBB);
738 const char *FatbinConstantName;
739 const char *FatbinSectionName;
740 const char *ModuleIDSectionName;
741 StringRef ModuleIDPrefix;
742 llvm::Constant *FatBinStr;
745 FatbinConstantName =
".hip_fatbin";
746 FatbinSectionName =
".hipFatBinSegment";
748 ModuleIDSectionName =
"__hip_module_id";
749 ModuleIDPrefix =
"__hip_";
754 const unsigned HIPCodeObjectAlign = 4096;
755 FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()),
"",
756 FatbinConstantName, HIPCodeObjectAlign);
762 FatBinStr =
new llvm::GlobalVariable(
763 CGM.getModule(), CGM.Int8Ty,
764 true, llvm::GlobalValue::ExternalLinkage,
nullptr,
765 "__hip_fatbin_" + CGM.getContext().getCUIDHash(),
nullptr,
766 llvm::GlobalVariable::NotThreadLocal);
767 cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
770 FatMagic = HIPFatMagic;
772 if (RelocatableDeviceCode)
773 FatbinConstantName = CGM.getTriple().isMacOSX()
774 ?
"__NV_CUDA,__nv_relfatbin"
778 CGM.getTriple().isMacOSX() ?
"__NV_CUDA,__nv_fatbin" :
".nv_fatbin";
781 CGM.getTriple().isMacOSX() ?
"__NV_CUDA,__fatbin" :
".nvFatBinSegment";
783 ModuleIDSectionName = CGM.getTriple().isMacOSX()
784 ?
"__NV_CUDA,__nv_module_id"
786 ModuleIDPrefix =
"__nv_";
790 FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()),
"",
791 FatbinConstantName, 8);
792 FatMagic = CudaFatMagic;
797 auto Values = Builder.beginStruct(FatbinWrapperTy);
799 Values.addInt(IntTy, FatMagic);
801 Values.addInt(IntTy, 1);
803 Values.add(FatBinStr);
805 Values.add(llvm::ConstantPointerNull::get(PtrTy));
806 llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
807 addUnderscoredPrefixToName(
"_fatbin_wrapper"), CGM.getPointerAlign(),
809 FatbinWrapper->setSection(FatbinSectionName);
819 auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage
820 : llvm::GlobalValue::ExternalLinkage;
821 llvm::BasicBlock *IfBlock =
822 llvm::BasicBlock::Create(Context,
"if", ModuleCtorFunc);
823 llvm::BasicBlock *ExitBlock =
824 llvm::BasicBlock::Create(Context,
"exit", ModuleCtorFunc);
827 GpuBinaryHandle =
new llvm::GlobalVariable(
828 TheModule, PtrTy,
false,
Linkage,
830 CudaGpuBinary ? llvm::ConstantPointerNull::get(PtrTy) :
nullptr,
832 ?
"__hip_gpubin_handle"
833 :
"__hip_gpubin_handle_" + CGM.getContext().getCUIDHash());
834 GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
836 if (
Linkage != llvm::GlobalValue::InternalLinkage)
837 GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
839 GpuBinaryHandle, PtrTy,
842 auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
843 llvm::Constant *
Zero =
844 llvm::Constant::getNullValue(HandleValue->getType());
845 llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
846 CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
849 CtorBuilder.SetInsertPoint(IfBlock);
851 llvm::CallInst *RegisterFatbinCall =
852 CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
853 CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
854 CtorBuilder.CreateBr(ExitBlock);
857 CtorBuilder.SetInsertPoint(ExitBlock);
859 if (RegisterGlobalsFunc) {
860 auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
861 CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
864 }
else if (!RelocatableDeviceCode) {
868 llvm::CallInst *RegisterFatbinCall =
869 CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
870 GpuBinaryHandle =
new llvm::GlobalVariable(
871 TheModule, PtrTy,
false, llvm::GlobalValue::InternalLinkage,
872 llvm::ConstantPointerNull::get(PtrTy),
"__cuda_gpubin_handle");
873 GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
874 CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
875 CGM.getPointerAlign());
878 if (RegisterGlobalsFunc)
879 CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
883 CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
885 llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
886 llvm::FunctionType::get(VoidTy, PtrTy,
false),
887 "__cudaRegisterFatBinaryEnd");
888 CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
893 llvm::raw_svector_ostream OS(ModuleID);
894 OS << ModuleIDPrefix << llvm::format(
"%" PRIx64, FatbinWrapper->getGUID());
895 llvm::Constant *ModuleIDConstant = makeConstantArray(
896 std::string(ModuleID),
"", ModuleIDSectionName, 32,
true);
899 llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
900 Twine(
"__fatbinwrap") + ModuleID, FatbinWrapper);
905 RegisterLinkedBinaryName += ModuleID;
906 llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
907 getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);
909 assert(RegisterGlobalsFunc &&
"Expecting at least dummy function!");
910 llvm::Value *Args[] = {RegisterGlobalsFunc, FatbinWrapper, ModuleIDConstant,
911 makeDummyFunction(getCallbackFnTy())};
912 CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
918 if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
920 llvm::FunctionType *AtExitTy =
921 llvm::FunctionType::get(IntTy, CleanupFn->getType(),
false);
922 llvm::FunctionCallee AtExitFunc =
923 CGM.CreateRuntimeFunction(AtExitTy,
"atexit", llvm::AttributeList(),
925 CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
928 CtorBuilder.CreateRetVoid();
929 return ModuleCtorFunc;
951llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
953 if (!GpuBinaryHandle)
957 llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
958 llvm::FunctionType::get(VoidTy, PtrTy,
false),
959 addUnderscoredPrefixToName(
"UnregisterFatBinary"));
961 llvm::Function *ModuleDtorFunc = llvm::Function::Create(
962 llvm::FunctionType::get(VoidTy,
false),
963 llvm::GlobalValue::InternalLinkage,
964 addUnderscoredPrefixToName(
"_module_dtor"), &TheModule);
966 llvm::BasicBlock *DtorEntryBB =
967 llvm::BasicBlock::Create(Context,
"entry", ModuleDtorFunc);
969 DtorBuilder.SetInsertPoint(DtorEntryBB);
972 GpuBinaryHandle, GpuBinaryHandle->getValueType(),
974 auto *HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
978 if (CGM.getLangOpts().HIP) {
979 llvm::BasicBlock *IfBlock =
980 llvm::BasicBlock::Create(Context,
"if", ModuleDtorFunc);
981 llvm::BasicBlock *ExitBlock =
982 llvm::BasicBlock::Create(Context,
"exit", ModuleDtorFunc);
983 llvm::Constant *
Zero = llvm::Constant::getNullValue(HandleValue->getType());
984 llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
985 DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);
987 DtorBuilder.SetInsertPoint(IfBlock);
988 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
989 DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
990 DtorBuilder.CreateBr(ExitBlock);
992 DtorBuilder.SetInsertPoint(ExitBlock);
994 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
996 DtorBuilder.CreateRetVoid();
997 return ModuleDtorFunc;
1001 return new CGNVCUDARuntime(CGM);
1004void CGNVCUDARuntime::internalizeDeviceSideVar(
1013 if (CGM.getLangOpts().GPURelocatableDeviceCode)
1021 if (D->
hasAttr<CUDADeviceAttr>() || D->
hasAttr<CUDAConstantAttr>() ||
1022 D->
hasAttr<CUDASharedAttr>() ||
1025 Linkage = llvm::GlobalValue::InternalLinkage;
1029void CGNVCUDARuntime::handleVarRegistration(
const VarDecl *D,
1030 llvm::GlobalVariable &GV) {
1031 if (D->
hasAttr<CUDADeviceAttr>() || D->
hasAttr<CUDAConstantAttr>()) {
1046 CGM.getContext().CUDADeviceVarODRUsedByHost.contains(D) ||
1047 D->
hasAttr<HIPManagedAttr>()) {
1049 D->
hasAttr<CUDAConstantAttr>());
1055 const auto *TD = cast<ClassTemplateSpecializationDecl>(
1058 if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
1059 assert(Args.
size() == 2 &&
1060 "Unexpected number of template arguments of CUDA device "
1061 "builtin surface type.");
1062 auto SurfType = Args[1].getAsIntegral();
1064 registerDeviceSurf(D, GV, !D->
hasDefinition(), SurfType.getSExtValue());
1066 assert(Args.
size() == 3 &&
1067 "Unexpected number of template arguments of CUDA device "
1068 "builtin texture type.");
1069 auto TexType = Args[1].getAsIntegral();
1070 auto Normalized = Args[2].getAsIntegral();
1072 registerDeviceTex(D, GV, !D->
hasDefinition(), TexType.getSExtValue(),
1073 Normalized.getZExtValue());
1082void CGNVCUDARuntime::transformManagedVars() {
1083 for (
auto &&Info : DeviceVars) {
1084 llvm::GlobalVariable *Var = Info.Var;
1085 if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
1086 Info.Flags.isManaged()) {
1087 auto *ManagedVar =
new llvm::GlobalVariable(
1088 CGM.getModule(), Var->getType(),
1089 false, Var->getLinkage(),
1090 Var->isDeclaration()
1092 : llvm::ConstantPointerNull::get(Var->getType()),
1094 llvm::GlobalVariable::NotThreadLocal,
1095 CGM.getContext().getTargetAddressSpace(LangAS::cuda_device));
1096 ManagedVar->setDSOLocal(Var->isDSOLocal());
1097 ManagedVar->setVisibility(Var->getVisibility());
1098 ManagedVar->setExternallyInitialized(
true);
1100 ManagedVar->takeName(Var);
1101 Var->setName(Twine(ManagedVar->getName()) +
".managed");
1104 if (!Var->isDeclaration()) {
1105 assert(!ManagedVar->isDeclaration());
1106 CGM.addCompilerUsedGlobal(Var);
1107 CGM.addCompilerUsedGlobal(ManagedVar);
1116void CGNVCUDARuntime::createOffloadingEntries() {
1117 StringRef Section = CGM.getLangOpts().HIP ?
"hip_offloading_entries"
1118 :
"cuda_offloading_entries";
1119 llvm::Module &M = CGM.getModule();
1120 for (KernelInfo &I : EmittedKernels)
1121 llvm::offloading::emitOffloadingEntry(
1122 M, KernelHandles[I.Kernel->getName()],
1123 getDeviceSideName(cast<NamedDecl>(I.D)), 0, 0,
1124 llvm::offloading::OffloadGlobalEntry, Section);
1126 for (VarInfo &I : DeviceVars) {
1128 CGM.getDataLayout().getTypeAllocSize(I.Var->getValueType());
1131 ?
static_cast<int32_t
>(llvm::offloading::OffloadGlobalExtern)
1133 (I.Flags.isConstant()
1134 ?
static_cast<int32_t
>(llvm::offloading::OffloadGlobalConstant)
1136 (I.Flags.isNormalized()
1137 ?
static_cast<int32_t
>(llvm::offloading::OffloadGlobalNormalized)
1139 if (I.Flags.getKind() == DeviceVarFlags::Variable) {
1140 llvm::offloading::emitOffloadingEntry(
1141 M, I.Var, getDeviceSideName(I.D), VarSize,
1142 (I.Flags.isManaged() ? llvm::offloading::OffloadGlobalManagedEntry
1143 : llvm::offloading::OffloadGlobalEntry) |
1146 }
else if (I.Flags.getKind() == DeviceVarFlags::Surface) {
1147 llvm::offloading::emitOffloadingEntry(
1148 M, I.Var, getDeviceSideName(I.D), VarSize,
1149 llvm::offloading::OffloadGlobalSurfaceEntry | Flags,
1150 I.Flags.getSurfTexType(), Section);
1151 }
else if (I.Flags.getKind() == DeviceVarFlags::Texture) {
1152 llvm::offloading::emitOffloadingEntry(
1153 M, I.Var, getDeviceSideName(I.D), VarSize,
1154 llvm::offloading::OffloadGlobalTextureEntry | Flags,
1155 I.Flags.getSurfTexType(), Section);
1161llvm::Function *CGNVCUDARuntime::finalizeModule() {
1162 if (CGM.getLangOpts().CUDAIsDevice) {
1163 transformManagedVars();
1175 for (
auto &&Info : DeviceVars) {
1176 auto Kind = Info.Flags.getKind();
1177 if (!Info.Var->isDeclaration() &&
1178 !llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) &&
1179 (Kind == DeviceVarFlags::Variable ||
1180 Kind == DeviceVarFlags::Surface ||
1181 Kind == DeviceVarFlags::Texture) &&
1182 Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) {
1183 CGM.addCompilerUsedGlobal(Info.Var);
1188 if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
1189 createOffloadingEntries();
1191 return makeModuleCtorFunction();
1196llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
1198 auto Loc = KernelHandles.find(F->getName());
1199 if (Loc != KernelHandles.end()) {
1200 auto OldHandle = Loc->second;
1201 if (KernelStubs[OldHandle] == F)
1206 if (CGM.getLangOpts().HIP) {
1209 KernelStubs[OldHandle] = F;
1214 KernelStubs.erase(OldHandle);
1217 if (!CGM.getLangOpts().HIP) {
1218 KernelHandles[F->getName()] = F;
1223 auto *Var =
new llvm::GlobalVariable(
1224 TheModule, F->getType(),
true, F->getLinkage(),
1228 Var->setAlignment(CGM.getPointerAlign().getAsAlign());
1229 Var->setDSOLocal(F->isDSOLocal());
1230 Var->setVisibility(F->getVisibility());
1231 auto *FD = cast<FunctionDecl>(GD.
getDecl());
1232 auto *FT = FD->getPrimaryTemplate();
1233 if (!FT || FT->isThisDeclarationADefinition())
1234 CGM.maybeSetTrivialComdat(*FD, *Var);
1235 KernelHandles[F->getName()] = Var;
1236 KernelStubs[Var] = F;
static std::unique_ptr< MangleContext > InitDeviceMC(CodeGenModule &CGM)
static void replaceManagedVar(llvm::GlobalVariable *Var, llvm::GlobalVariable *ManagedVar)
MangleContext * createMangleContext(const TargetInfo *T=nullptr)
If T is null pointer, assume the target in ASTContext.
const TargetInfo * getAuxTargetInfo() const
MangleContext * createDeviceMangleContext(const TargetInfo &T)
Creates a device mangle context to correctly mangle lambdas in a mixed architecture compile by setting the lambda mangling number source to the device mangling number.
const TargetInfo & getTargetInfo() const
CharUnits - This is an opaque type for sizes expressed in character units.
static CharUnits fromQuantity(QuantityType Quantity)
fromQuantity - Construct a CharUnits quantity from a raw integer type.
static CharUnits Zero()
Zero - Construct a CharUnits quantity of zero.
llvm::Value * getPointer() const
llvm::PointerType * getType() const
Return the type of the pointer value.
llvm::StoreInst * CreateDefaultAlignedStore(llvm::Value *Val, llvm::Value *Addr, bool IsVolatile=false)
llvm::LoadInst * CreateLoad(Address Addr, const llvm::Twine &Name="")
virtual std::string getDeviceSideName(const NamedDecl *ND)=0
Returns function or variable name on device side even if the current compilation is for host.
virtual void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args)=0
Emits a kernel launch stub.
virtual llvm::Function * getKernelStub(llvm::GlobalValue *Handle)=0
Get kernel stub by kernel handle.
virtual void handleVarRegistration(const VarDecl *VD, llvm::GlobalVariable &Var)=0
Check whether a variable is a device variable and register it if true.
virtual llvm::Function * finalizeModule()=0
Finalize generated LLVM module.
virtual llvm::GlobalValue * getKernelHandle(llvm::Function *Stub, GlobalDecl GD)=0
Get kernel handle by stub function.
virtual void internalizeDeviceSideVar(const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage)=0
Adjust linkage of shadow variables in host compilation.
static CGCallee forDirect(llvm::Constant *functionPtr, const CGCalleeInfo &abstractInfo=CGCalleeInfo())
CGFunctionInfo - Class to encapsulate the information about a function definition.
CallArgList - Type for representing both the value and type of arguments in a call.
void add(RValue rvalue, QualType type)
CodeGenFunction - This class organizes the per-function state that is used while generating LLVM code...
llvm::BasicBlock * createBasicBlock(const Twine &name="", llvm::Function *parent=nullptr, llvm::BasicBlock *before=nullptr)
createBasicBlock - Create an LLVM basic block.
const LangOptions & getLangOpts() const
void EmitBlock(llvm::BasicBlock *BB, bool IsFinished=false)
EmitBlock - Emit the given block.
llvm::AllocaInst * CreateTempAlloca(llvm::Type *Ty, const Twine &Name="tmp", llvm::Value *ArraySize=nullptr)
CreateTempAlloca - This creates an alloca and inserts it into the entry block if ArraySize is nullptr, otherwise inserts it at the current insertion point of the builder.
RValue EmitCall(const CGFunctionInfo &CallInfo, const CGCallee &Callee, ReturnValueSlot ReturnValue, const CallArgList &Args, llvm::CallBase **callOrInvoke, bool IsMustTail, SourceLocation Loc)
EmitCall - Generate a call of the given function, expecting the given result type,...
void EmitBranch(llvm::BasicBlock *Block)
EmitBranch - Emit a branch to the specified basic block from the current insert block,...
Address CreateMemTemp(QualType T, const Twine &Name="tmp", Address *Alloca=nullptr)
CreateMemTemp - Create a temporary memory object of the given type, with appropriate alignment, and cast it to the default address space if necessary.
const Decl * CurFuncDecl
CurFuncDecl - Holds the Decl for the current outermost non-closure context.
llvm::CallBase * EmitRuntimeCallOrInvoke(llvm::FunctionCallee callee, ArrayRef< llvm::Value * > args, const Twine &name="")
Address GetAddrOfLocalVar(const VarDecl *VD)
GetAddrOfLocalVar - Return the address of a local variable.
This class organizes the cross-function state that is used while generating LLVM code.
ASTContext & getContext() const
The standard implementation of ConstantInitBuilder used in Clang.
FunctionArgList - Type for representing both the decl and type of parameters to a function.
static RValue get(llvm::Value *V)
static RValue getAggregate(Address addr, bool isVolatile=false)
ReturnValueSlot - Contains the address where the return value of a function can be stored,...
DeclContext - This is used only as base class of specific decl types that can act as declaration cont...
lookup_result lookup(DeclarationName Name) const
lookup - Find the declarations (if any) with the given Name in this context.
Decl - This represents one declaration (or definition), e.g.
SourceLocation getLocation() const
TranslationUnitDecl * getTranslationUnitDecl()
Represents a function declaration or definition.
const ParmVarDecl * getParamDecl(unsigned i) const
GlobalDecl - represents a global declaration.
GlobalDecl getWithKernelReferenceKind(KernelReferenceKind Kind)
const Decl * getDecl() const
One of these records is kept for each identifier that is lexed.
StringRef getName() const
Return the actual identifier string.
GPUDefaultStreamKind GPUDefaultStream
The default stream kind used for HIP kernel launching.
MangleContext - Context for tracking state which persists across multiple calls to the C++ name mangl...
bool shouldMangleDeclName(const NamedDecl *D)
void mangleName(GlobalDecl GD, raw_ostream &)
This represents a decl that may have a name.
IdentifierInfo * getIdentifier() const
Get the identifier that names this declaration, if there is one.
Represents a parameter to a function.
A (possibly-)qualified type.
QualType getCanonicalType() const
A helper class that allows the use of isa/cast/dyncast to detect TagType objects of structs/unions/cl...
RecordDecl * getDecl() const
bool isMicrosoft() const
Is this ABI an MSVC-compatible ABI?
bool isItaniumFamily() const
Does this ABI generally fall into the Itanium family of ABIs?
TargetCXXABI getCXXABI() const
Get the C++ ABI currently in use.
A template argument list.
unsigned size() const
Retrieve the number of template arguments in this template argument list.
The top declaration context.
static DeclContext * castToDeclContext(const TranslationUnitDecl *D)
The base class of the type hierarchy.
const T * castAs() const
Member-template castAs<specific type>.
bool isCUDADeviceBuiltinSurfaceType() const
Check if the type is the CUDA device builtin surface type.
bool isCUDADeviceBuiltinTextureType() const
Check if the type is the CUDA device builtin texture type.
Represents a variable declaration or definition.
bool isInline() const
Whether this variable is (C++1z) inline.
bool hasExternalStorage() const
Returns true if a variable has extern or private_extern storage.
DefinitionKind hasDefinition(ASTContext &) const
Check whether this variable is defined in this translation unit.
CGCUDARuntime * CreateNVCUDARuntime(CodeGenModule &CGM)
Creates an instance of a CUDA runtime class.
bool Zero(InterpState &S, CodePtr OpPC)
The JSON file list parser is used to communicate input to InstallAPI.
CudaVersion ToCudaVersion(llvm::VersionTuple)
bool CudaFeatureEnabled(llvm::VersionTuple, CudaFeature)
Linkage
Describes the different kinds of linkage (C++ [basic.link], C99 6.2.2) that an entity may have.
llvm::IntegerType * SizeTy
llvm::IntegerType * IntTy
int
llvm::PointerType * UnqualPtrTy