22#include "llvm/Frontend/Offloading/Utility.h"
23#include "llvm/IR/BasicBlock.h"
24#include "llvm/IR/Constants.h"
25#include "llvm/IR/DerivedTypes.h"
26#include "llvm/IR/ReplaceConstant.h"
27#include "llvm/Support/Format.h"
28#include "llvm/Support/VirtualFileSystem.h"
31using namespace CodeGen;
34constexpr unsigned CudaFatMagic = 0x466243b1;
35constexpr unsigned HIPFatMagic = 0x48495046;
40 llvm::IntegerType *IntTy, *SizeTy;
42 llvm::PointerType *PtrTy;
45 llvm::LLVMContext &Context;
47 llvm::Module &TheModule;
57 llvm::DenseMap<StringRef, llvm::GlobalValue *> KernelHandles;
59 llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
61 llvm::GlobalVariable *Var;
69 llvm::GlobalVariable *GpuBinaryHandle =
nullptr;
71 bool RelocatableDeviceCode;
73 std::unique_ptr<MangleContext> DeviceMC;
75 llvm::Constant *Zeros[2];
77 llvm::FunctionCallee getSetupArgumentFn()
const;
78 llvm::FunctionCallee getLaunchFn()
const;
80 llvm::FunctionType *getRegisterGlobalsFnTy()
const;
81 llvm::FunctionType *getCallbackFnTy()
const;
82 llvm::FunctionType *getRegisterLinkedBinaryFnTy()
const;
83 std::string addPrefixToName(StringRef FuncName)
const;
84 std::string addUnderscoredPrefixToName(StringRef FuncName)
const;
87 llvm::Function *makeRegisterGlobalsFn();
// Emit `Str` as a global constant C-string and return a pointer to its
// first character (GEP through the class's `Zeros` index pair).
// NOTE(review): the embedded numbers (92, 93, ...) are original source
// line numbers fused into the text by the extraction; the closing brace
// of this helper is outside the visible extract.
92 llvm::Constant *makeConstantString(
const std::string &Str,
93 const std::string &Name =
"") {
94 auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
95 return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
96 ConstStr.getPointer(), Zeros);
// Emit `Str` as a private-linkage constant byte array (optionally
// NUL-terminated, placed in `SectionName`, aligned to `Alignment`) and
// return a pointer to its first byte.
// NOTE(review): extracted view — the `Name` parameter (original line 103)
// and some statements (110, 114-115, 117-118) are missing from this extract.
102 llvm::Constant *makeConstantArray(StringRef Str,
104 StringRef SectionName =
"",
105 unsigned Alignment = 0,
106 bool AddNull =
false) {
107 llvm::Constant *
Value =
108 llvm::ConstantDataArray::getString(Context, Str, AddNull);
109 auto *GV =
new llvm::GlobalVariable(
111 llvm::GlobalValue::PrivateLinkage,
Value, Name);
112 if (!SectionName.empty()) {
113 GV->setSection(SectionName);
// Mark the address as significant so the section contents are kept as-is.
116 GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
119 GV->setAlignment(llvm::Align(Alignment));
120 return llvm::ConstantExpr::getGetElementPtr(GV->getValueType(), GV, Zeros);
// Create an internal-linkage no-op function of type `FnTy` (must return
// void) whose body is a single `ret void`; used as a placeholder callback.
124 llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
125 assert(FnTy->getReturnType()->isVoidTy() &&
126 "Can only generate dummy functions returning void!");
127 llvm::Function *DummyFunc = llvm::Function::Create(
128 FnTy, llvm::GlobalValue::InternalLinkage,
"dummy", &TheModule);
130 llvm::BasicBlock *DummyBlock =
131 llvm::BasicBlock::Create(Context,
"", DummyFunc);
// NOTE(review): FuncBuilder's declaration (original line 132) is missing
// from this extract.
133 FuncBuilder.SetInsertPoint(DummyBlock);
134 FuncBuilder.CreateRetVoid();
// Record a device variable in DeviceVars for later host-side registration.
// NOTE(review): initializer fields at original lines 146-147 are missing
// from this extract; only the managed-attribute field is visible.
143 void registerDeviceVar(
const VarDecl *VD, llvm::GlobalVariable &Var,
144 bool Extern,
bool Constant) {
145 DeviceVars.push_back({&Var,
148 VD->hasAttr<HIPManagedAttr>(),
// Record a CUDA surface variable in DeviceVars for later registration.
// NOTE(review): the remainder of the push_back initializer (original
// lines 154-156) is missing from this extract.
151 void registerDeviceSurf(
const VarDecl *VD, llvm::GlobalVariable &Var,
152 bool Extern,
int Type) {
153 DeviceVars.push_back({&Var,
// Record a CUDA texture variable (with its normalized-coordinates flag)
// in DeviceVars for later registration.
// NOTE(review): initializer fields at original lines 162-163 are missing
// from this extract.
159 void registerDeviceTex(
const VarDecl *VD, llvm::GlobalVariable &Var,
160 bool Extern,
int Type,
bool Normalized) {
161 DeviceVars.push_back({&Var,
164 false, Normalized,
Type}});
168 llvm::Function *makeModuleCtorFunction();
170 llvm::Function *makeModuleDtorFunction();
172 void transformManagedVars();
174 void createOffloadingEntries();
180 llvm::Function *
getKernelStub(llvm::GlobalValue *Handle)
override {
181 auto Loc = KernelStubs.find(Handle);
182 assert(Loc != KernelStubs.end());
187 llvm::GlobalVariable &Var)
override;
190 llvm::GlobalValue::LinkageTypes &
Linkage)
override;
// Prefix a runtime entry-point name with "hip" or "cuda" depending on
// whether this is a HIP or CUDA compilation (e.g. "SetupArgument" ->
// "cudaSetupArgument"). Closing brace lies outside this extract.
197std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName)
const {
198 if (CGM.getLangOpts().HIP)
199 return ((Twine(
"hip") + Twine(FuncName)).str());
200 return ((Twine(
"cuda") + Twine(FuncName)).str());
// Prefix a runtime entry-point name with "__hip" or "__cuda" (used for
// the internal registration functions such as "__cudaRegisterFunction").
// NOTE(review): the return type on original line 202 is outside this extract.
203CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName)
const {
204 if (CGM.getLangOpts().HIP)
205 return ((Twine(
"__hip") + Twine(FuncName)).str());
206 return ((Twine(
"__cuda") + Twine(FuncName)).str());
216 return std::unique_ptr<MangleContext>(
227 TheModule(CGM.getModule()),
228 RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
233 Zeros[0] = llvm::ConstantInt::get(SizeTy, 0);
// Return the {cuda,hip}SetupArgument runtime callee:
//   int (void *arg, size_t size, size_t offset)
238llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn()
const {
240 llvm::Type *Params[] = {PtrTy, SizeTy, SizeTy};
241 return CGM.CreateRuntimeFunction(
242 llvm::FunctionType::get(IntTy, Params,
false),
243 addPrefixToName(
"SetupArgument"));
// Return the legacy kernel-launch callee: hipLaunchByPtr for HIP,
// otherwise the CUDA equivalent (its name literal, original line 254,
// is missing from this extract). Both take the stub pointer and return int.
246llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn()
const {
247 if (CGM.getLangOpts().HIP) {
249 return CGM.CreateRuntimeFunction(
250 llvm::FunctionType::get(IntTy, PtrTy,
false),
"hipLaunchByPtr");
253 return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy, PtrTy,
false),
// Type of __{cuda,hip}_register_globals: void (void **fatbinHandle).
257llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy()
const {
258 return llvm::FunctionType::get(VoidTy, PtrTy,
false);
// Type of the linked-binary callback: void (void *).
261llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy()
const {
262 return llvm::FunctionType::get(VoidTy, PtrTy,
false);
// Type of __cudaRegisterLinkedBinary: void (callback, fatbin wrapper,
// module ID, callback arg) — four pointer parameters, void result.
265llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy()
const {
266 llvm::Type *Params[] = {llvm::PointerType::getUnqual(Context), PtrTy, PtrTy,
267 llvm::PointerType::getUnqual(Context)};
268 return llvm::FunctionType::get(VoidTy, Params,
false);
// Compute the device-side (mangled) name for a kernel or device variable,
// even when compiling for the host; externalized relocatable-device-code
// symbols get a CUID postfix appended.
// NOTE(review): several statements (declarations of GD, MC, Buffer, the
// mangling call itself at original lines 272-292) are missing from this
// extract; comments below describe only what is visible.
271std::string CGNVCUDARuntime::getDeviceSideName(
const NamedDecl *ND) {
// Functions are referenced via their kernel reference kind.
274 if (
auto *FD = dyn_cast<FunctionDecl>(ND))
275 GD =
GlobalDecl(FD, KernelReferenceKind::Kernel);
278 std::string DeviceSideName;
280 if (CGM.getLangOpts().CUDAIsDevice)
281 MC = &CGM.getCXXABI().getMangleContext();
286 llvm::raw_svector_ostream Out(Buffer);
288 DeviceSideName = std::string(Out.str());
// Externalized symbols under -fgpu-rdc get a unique postfix so shadow
// variables in different TUs do not collide.
293 if (CGM.getContext().shouldExternalize(ND) &&
294 CGM.getLangOpts().GPURelocatableDeviceCode) {
296 llvm::raw_svector_ostream Out(Buffer);
297 Out << DeviceSideName;
298 CGM.printPostfixForExternalizedDecl(Out, ND);
299 DeviceSideName = std::string(Out.str());
301 return DeviceSideName;
308 dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.
CurFn->getName()])) {
309 GV->setLinkage(CGF.
CurFn->getLinkage());
310 GV->setInitializer(CGF.
CurFn);
313 CudaFeature::CUDA_USES_NEW_LAUNCH) ||
315 emitDeviceStubBodyNew(CGF, Args);
317 emitDeviceStubBodyLegacy(CGF, Args);
331 llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
333 for (
unsigned i = 0; i < Args.size(); ++i) {
335 llvm::Value *VoidVarPtr = CGF.
Builder.CreatePointerCast(VarPtr, PtrTy);
337 VoidVarPtr, CGF.
Builder.CreateConstGEP1_32(
355 std::string KernelLaunchAPI =
"LaunchKernel";
357 LangOptions::GPUDefaultStreamKind::PerThread) {
359 KernelLaunchAPI = KernelLaunchAPI +
"_spt";
361 KernelLaunchAPI = KernelLaunchAPI +
"_ptsz";
363 auto LaunchKernelName = addPrefixToName(KernelLaunchAPI);
365 CGM.getContext().Idents.get(LaunchKernelName);
367 for (
auto *Result : DC->
lookup(&cudaLaunchKernelII)) {
369 cudaLaunchKernelFD = FD;
372 if (cudaLaunchKernelFD ==
nullptr) {
374 "Can't find declaration for " + LaunchKernelName);
387 llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
388 llvm::FunctionType::get(IntTy,
394 addUnderscoredPrefixToName(
"PopCallConfiguration"));
403 CGF.
Builder.CreatePointerCast(KernelHandles[CGF.
CurFn->getName()], PtrTy);
418 llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
419 llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);
422 CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
423 llvm::FunctionCallee cudaLaunchKernelFn =
424 CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
435 llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
438 for (
const VarDecl *A : Args) {
439 auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
440 Offset = Offset.alignTo(TInfo.Align);
441 llvm::Value *Args[] = {
444 llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
445 llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
448 llvm::Constant *
Zero = llvm::ConstantInt::get(IntTy, 0);
449 llvm::Value *CBZero = CGF.
Builder.CreateICmpEQ(CB, Zero);
451 CGF.
Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
453 Offset += TInfo.Width;
457 llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
459 CGF.
Builder.CreatePointerCast(KernelHandles[CGF.
CurFn->getName()], PtrTy);
469 llvm::GlobalVariable *ManagedVar) {
471 for (
auto &&VarUse : Var->uses()) {
472 WorkList.push_back({VarUse.getUser()});
474 while (!WorkList.empty()) {
475 auto &&WorkItem = WorkList.pop_back_val();
476 auto *
U = WorkItem.back();
477 if (isa<llvm::ConstantExpr>(
U)) {
478 for (
auto &&UU :
U->uses()) {
479 WorkItem.push_back(UU.getUser());
480 WorkList.push_back(WorkItem);
485 if (
auto *I = dyn_cast<llvm::Instruction>(
U)) {
486 llvm::Value *OldV = Var;
487 llvm::Instruction *NewV =
488 new llvm::LoadInst(Var->getType(), ManagedVar,
"ld.managed",
false,
489 llvm::Align(Var->getAlignment()), I);
493 for (
auto &&Op : WorkItem) {
494 auto *CE = cast<llvm::ConstantExpr>(Op);
495 auto *NewInst = CE->getAsInstruction();
496 NewInst->insertBefore(*I->getParent(), I->getIterator());
497 NewInst->replaceUsesOfWith(OldV, NewV);
501 I->replaceUsesOfWith(OldV, NewV);
503 llvm_unreachable(
"Invalid use of managed variable");
522llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
524 if (EmittedKernels.empty() && DeviceVars.empty())
527 llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
528 getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
529 addUnderscoredPrefixToName(
"_register_globals"), &TheModule);
530 llvm::BasicBlock *EntryBB =
531 llvm::BasicBlock::Create(Context,
"entry", RegisterKernelsFunc);
533 Builder.SetInsertPoint(EntryBB);
537 llvm::Type *RegisterFuncParams[] = {
538 PtrTy, PtrTy, PtrTy, PtrTy, IntTy,
539 PtrTy, PtrTy, PtrTy, PtrTy, llvm::PointerType::getUnqual(Context)};
540 llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
541 llvm::FunctionType::get(IntTy, RegisterFuncParams,
false),
542 addUnderscoredPrefixToName(
"RegisterFunction"));
547 llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
548 for (
auto &&I : EmittedKernels) {
549 llvm::Constant *KernelName =
550 makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
551 llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(PtrTy);
552 llvm::Value *Args[] = {
554 KernelHandles[I.Kernel->getName()],
557 llvm::ConstantInt::get(IntTy, -1),
562 llvm::ConstantPointerNull::get(llvm::PointerType::getUnqual(Context))};
563 Builder.CreateCall(RegisterFunc, Args);
566 llvm::Type *VarSizeTy = IntTy;
568 if (CGM.getLangOpts().HIP ||
569 ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
574 llvm::Type *RegisterVarParams[] = {PtrTy, PtrTy, PtrTy, PtrTy,
575 IntTy, VarSizeTy, IntTy, IntTy};
576 llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
577 llvm::FunctionType::get(VoidTy, RegisterVarParams,
false),
578 addUnderscoredPrefixToName(
"RegisterVar"));
581 llvm::Type *RegisterManagedVarParams[] = {PtrTy, PtrTy, PtrTy,
582 PtrTy, VarSizeTy, IntTy};
583 llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
584 llvm::FunctionType::get(VoidTy, RegisterManagedVarParams,
false),
585 addUnderscoredPrefixToName(
"RegisterManagedVar"));
588 llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
589 llvm::FunctionType::get(
590 VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy},
false),
591 addUnderscoredPrefixToName(
"RegisterSurface"));
594 llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
595 llvm::FunctionType::get(
596 VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy, IntTy},
false),
597 addUnderscoredPrefixToName(
"RegisterTexture"));
598 for (
auto &&Info : DeviceVars) {
599 llvm::GlobalVariable *Var = Info.Var;
600 assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
601 "External variables should not show up here, except HIP managed "
603 llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
604 switch (Info.Flags.getKind()) {
605 case DeviceVarFlags::Variable: {
607 CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
608 if (Info.Flags.isManaged()) {
609 assert(Var->getName().ends_with(
".managed") &&
610 "HIP managed variables not transformed");
611 auto *ManagedVar = CGM.getModule().getNamedGlobal(
612 Var->getName().drop_back(StringRef(
".managed").size()));
613 llvm::Value *Args[] = {
618 llvm::ConstantInt::get(VarSizeTy, VarSize),
619 llvm::ConstantInt::get(IntTy, Var->getAlignment())};
620 if (!Var->isDeclaration())
621 Builder.CreateCall(RegisterManagedVar, Args);
623 llvm::Value *Args[] = {
628 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
629 llvm::ConstantInt::get(VarSizeTy, VarSize),
630 llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
631 llvm::ConstantInt::get(IntTy, 0)};
632 Builder.CreateCall(RegisterVar, Args);
636 case DeviceVarFlags::Surface:
639 {&GpuBinaryHandlePtr, Var, VarName, VarName,
640 llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
641 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
643 case DeviceVarFlags::Texture:
646 {&GpuBinaryHandlePtr, Var, VarName, VarName,
647 llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
648 llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
649 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
654 Builder.CreateRetVoid();
655 return RegisterKernelsFunc;
677llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
678 bool IsHIP = CGM.getLangOpts().HIP;
679 bool IsCUDA = CGM.getLangOpts().CUDA;
681 StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
682 if (CudaGpuBinaryFileName.empty() && !IsHIP)
684 if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
689 llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
692 if (RelocatableDeviceCode && !RegisterGlobalsFunc)
693 RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());
696 llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
697 llvm::FunctionType::get(PtrTy, PtrTy,
false),
698 addUnderscoredPrefixToName(
"RegisterFatBinary"));
700 llvm::StructType *FatbinWrapperTy =
701 llvm::StructType::get(IntTy, IntTy, PtrTy, PtrTy);
707 std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary =
nullptr;
708 if (!CudaGpuBinaryFileName.empty()) {
709 auto VFS = CGM.getFileSystem();
710 auto CudaGpuBinaryOrErr =
711 VFS->getBufferForFile(CudaGpuBinaryFileName, -1,
false);
712 if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
713 CGM.getDiags().Report(diag::err_cannot_open_file)
714 << CudaGpuBinaryFileName << EC.message();
717 CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
720 llvm::Function *ModuleCtorFunc = llvm::Function::Create(
721 llvm::FunctionType::get(VoidTy,
false),
722 llvm::GlobalValue::InternalLinkage,
723 addUnderscoredPrefixToName(
"_module_ctor"), &TheModule);
724 llvm::BasicBlock *CtorEntryBB =
725 llvm::BasicBlock::Create(Context,
"entry", ModuleCtorFunc);
728 CtorBuilder.SetInsertPoint(CtorEntryBB);
730 const char *FatbinConstantName;
731 const char *FatbinSectionName;
732 const char *ModuleIDSectionName;
733 StringRef ModuleIDPrefix;
734 llvm::Constant *FatBinStr;
737 FatbinConstantName =
".hip_fatbin";
738 FatbinSectionName =
".hipFatBinSegment";
740 ModuleIDSectionName =
"__hip_module_id";
741 ModuleIDPrefix =
"__hip_";
746 const unsigned HIPCodeObjectAlign = 4096;
747 FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()),
"",
748 FatbinConstantName, HIPCodeObjectAlign);
754 FatBinStr =
new llvm::GlobalVariable(
755 CGM.getModule(), CGM.Int8Ty,
756 true, llvm::GlobalValue::ExternalLinkage,
nullptr,
757 "__hip_fatbin_" + CGM.getContext().getCUIDHash(),
nullptr,
758 llvm::GlobalVariable::NotThreadLocal);
759 cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
762 FatMagic = HIPFatMagic;
764 if (RelocatableDeviceCode)
765 FatbinConstantName = CGM.getTriple().isMacOSX()
766 ?
"__NV_CUDA,__nv_relfatbin"
770 CGM.getTriple().isMacOSX() ?
"__NV_CUDA,__nv_fatbin" :
".nv_fatbin";
773 CGM.getTriple().isMacOSX() ?
"__NV_CUDA,__fatbin" :
".nvFatBinSegment";
775 ModuleIDSectionName = CGM.getTriple().isMacOSX()
776 ?
"__NV_CUDA,__nv_module_id"
778 ModuleIDPrefix =
"__nv_";
782 FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()),
"",
783 FatbinConstantName, 8);
784 FatMagic = CudaFatMagic;
789 auto Values = Builder.beginStruct(FatbinWrapperTy);
791 Values.addInt(IntTy, FatMagic);
793 Values.addInt(IntTy, 1);
795 Values.add(FatBinStr);
797 Values.add(llvm::ConstantPointerNull::get(PtrTy));
798 llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
799 addUnderscoredPrefixToName(
"_fatbin_wrapper"), CGM.getPointerAlign(),
801 FatbinWrapper->setSection(FatbinSectionName);
811 auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage
812 : llvm::GlobalValue::ExternalLinkage;
813 llvm::BasicBlock *IfBlock =
814 llvm::BasicBlock::Create(Context,
"if", ModuleCtorFunc);
815 llvm::BasicBlock *ExitBlock =
816 llvm::BasicBlock::Create(Context,
"exit", ModuleCtorFunc);
819 GpuBinaryHandle =
new llvm::GlobalVariable(
820 TheModule, PtrTy,
false,
Linkage,
822 CudaGpuBinary ? llvm::ConstantPointerNull::get(PtrTy) :
nullptr,
824 ?
"__hip_gpubin_handle"
825 :
"__hip_gpubin_handle_" + CGM.getContext().getCUIDHash());
826 GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
828 if (
Linkage != llvm::GlobalValue::InternalLinkage)
829 GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
831 GpuBinaryHandle, PtrTy,
834 auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
835 llvm::Constant *
Zero =
836 llvm::Constant::getNullValue(HandleValue->getType());
837 llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
838 CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
841 CtorBuilder.SetInsertPoint(IfBlock);
843 llvm::CallInst *RegisterFatbinCall =
844 CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
845 CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
846 CtorBuilder.CreateBr(ExitBlock);
849 CtorBuilder.SetInsertPoint(ExitBlock);
851 if (RegisterGlobalsFunc) {
852 auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
853 CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
856 }
else if (!RelocatableDeviceCode) {
860 llvm::CallInst *RegisterFatbinCall =
861 CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
862 GpuBinaryHandle =
new llvm::GlobalVariable(
863 TheModule, PtrTy,
false, llvm::GlobalValue::InternalLinkage,
864 llvm::ConstantPointerNull::get(PtrTy),
"__cuda_gpubin_handle");
865 GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
866 CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
867 CGM.getPointerAlign());
870 if (RegisterGlobalsFunc)
871 CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
875 CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
877 llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
878 llvm::FunctionType::get(VoidTy, PtrTy,
false),
879 "__cudaRegisterFatBinaryEnd");
880 CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
885 llvm::raw_svector_ostream OS(ModuleID);
886 OS << ModuleIDPrefix << llvm::format(
"%" PRIx64, FatbinWrapper->getGUID());
887 llvm::Constant *ModuleIDConstant = makeConstantArray(
888 std::string(ModuleID),
"", ModuleIDSectionName, 32,
true);
891 llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
892 Twine(
"__fatbinwrap") + ModuleID, FatbinWrapper);
897 RegisterLinkedBinaryName += ModuleID;
898 llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
899 getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);
901 assert(RegisterGlobalsFunc &&
"Expecting at least dummy function!");
902 llvm::Value *Args[] = {RegisterGlobalsFunc, FatbinWrapper, ModuleIDConstant,
903 makeDummyFunction(getCallbackFnTy())};
904 CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
910 if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
912 llvm::FunctionType *AtExitTy =
913 llvm::FunctionType::get(IntTy, CleanupFn->getType(),
false);
914 llvm::FunctionCallee AtExitFunc =
915 CGM.CreateRuntimeFunction(AtExitTy,
"atexit", llvm::AttributeList(),
917 CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
920 CtorBuilder.CreateRetVoid();
921 return ModuleCtorFunc;
// Build the module destructor, which unregisters the GPU binary handle
// created by the module constructor. For HIP the unregister call is
// guarded by a null check on the handle and the handle is cleared after
// use; for CUDA the call is unconditional.
// NOTE(review): this extract omits some lines (e.g. the early return body
// after original line 945 and the GpuBinaryAddr declaration around 963).
943llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
// Nothing to tear down if the ctor never registered a binary.
945 if (!GpuBinaryHandle)
949 llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
950 llvm::FunctionType::get(VoidTy, PtrTy,
false),
951 addUnderscoredPrefixToName(
"UnregisterFatBinary"));
953 llvm::Function *ModuleDtorFunc = llvm::Function::Create(
954 llvm::FunctionType::get(VoidTy,
false),
955 llvm::GlobalValue::InternalLinkage,
956 addUnderscoredPrefixToName(
"_module_dtor"), &TheModule);
958 llvm::BasicBlock *DtorEntryBB =
959 llvm::BasicBlock::Create(Context,
"entry", ModuleDtorFunc);
961 DtorBuilder.SetInsertPoint(DtorEntryBB);
964 GpuBinaryHandle, GpuBinaryHandle->getValueType(),
966 auto *HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
// HIP: only unregister if the handle is non-null, then null it out so a
// second dtor run is a no-op.
970 if (CGM.getLangOpts().HIP) {
971 llvm::BasicBlock *IfBlock =
972 llvm::BasicBlock::Create(Context,
"if", ModuleDtorFunc);
973 llvm::BasicBlock *ExitBlock =
974 llvm::BasicBlock::Create(Context,
"exit", ModuleDtorFunc);
975 llvm::Constant *
Zero = llvm::Constant::getNullValue(HandleValue->getType());
976 llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
977 DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);
979 DtorBuilder.SetInsertPoint(IfBlock);
980 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
981 DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
982 DtorBuilder.CreateBr(ExitBlock);
984 DtorBuilder.SetInsertPoint(ExitBlock);
// CUDA path: unconditional unregister.
986 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
988 DtorBuilder.CreateRetVoid();
989 return ModuleDtorFunc;
993 return new CGNVCUDARuntime(CGM);
996void CGNVCUDARuntime::internalizeDeviceSideVar(
1005 if (CGM.getLangOpts().GPURelocatableDeviceCode)
1013 if (D->
hasAttr<CUDADeviceAttr>() || D->
hasAttr<CUDAConstantAttr>() ||
1014 D->
hasAttr<CUDASharedAttr>() ||
1017 Linkage = llvm::GlobalValue::InternalLinkage;
1021void CGNVCUDARuntime::handleVarRegistration(
const VarDecl *D,
1022 llvm::GlobalVariable &GV) {
1023 if (D->
hasAttr<CUDADeviceAttr>() || D->
hasAttr<CUDAConstantAttr>()) {
1038 CGM.getContext().CUDADeviceVarODRUsedByHost.contains(D) ||
1039 D->
hasAttr<HIPManagedAttr>()) {
1041 D->
hasAttr<CUDAConstantAttr>());
1047 const auto *TD = cast<ClassTemplateSpecializationDecl>(
1050 if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
1051 assert(Args.
size() == 2 &&
1052 "Unexpected number of template arguments of CUDA device "
1053 "builtin surface type.");
1054 auto SurfType = Args[1].getAsIntegral();
1056 registerDeviceSurf(D, GV, !D->
hasDefinition(), SurfType.getSExtValue());
1058 assert(Args.
size() == 3 &&
1059 "Unexpected number of template arguments of CUDA device "
1060 "builtin texture type.");
1061 auto TexType = Args[1].getAsIntegral();
1062 auto Normalized = Args[2].getAsIntegral();
1064 registerDeviceTex(D, GV, !D->
hasDefinition(), TexType.getSExtValue(),
1065 Normalized.getZExtValue());
// For each HIP managed variable, create a companion externally-initialized
// pointer global that takes over the original name, and rename the
// original variable with a ".managed" suffix. On the device side both
// globals are kept alive via compiler.used.
// NOTE(review): some initializer lines (original 1083, 1085, 1093, 1096-97)
// are missing from this extract.
1074void CGNVCUDARuntime::transformManagedVars() {
1075 for (
auto &&Info : DeviceVars) {
1076 llvm::GlobalVariable *Var = Info.Var;
1077 if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
1078 Info.Flags.isManaged()) {
1079 auto *ManagedVar =
new llvm::GlobalVariable(
1080 CGM.getModule(), Var->getType(),
1081 false, Var->getLinkage(),
// Declarations get no initializer; definitions start as null pointers.
1082 Var->isDeclaration()
1084 : llvm::ConstantPointerNull::get(Var->getType()),
1086 llvm::GlobalVariable::NotThreadLocal,
1087 CGM.getContext().getTargetAddressSpace(CGM.getLangOpts().CUDAIsDevice
1088 ? LangAS::cuda_device
1089 : LangAS::Default));
1090 ManagedVar->setDSOLocal(Var->isDSOLocal());
1091 ManagedVar->setVisibility(Var->getVisibility());
// The runtime writes the managed pointer at load time.
1092 ManagedVar->setExternallyInitialized(
true);
// Swap names: the new global takes the user-visible name.
1094 ManagedVar->takeName(Var);
1095 Var->setName(Twine(ManagedVar->getName()) +
".managed");
1098 if (CGM.getLangOpts().CUDAIsDevice && !Var->isDeclaration()) {
1099 assert(!ManagedVar->isDeclaration());
1100 CGM.addCompilerUsedGlobal(Var);
1101 CGM.addCompilerUsedGlobal(ManagedVar);
// Emit new-driver offloading entries for every kernel and device variable
// into the "{cuda,hip}_offloading_entries" section, encoding extern /
// constant / normalized properties as entry flags.
// NOTE(review): parts of the Flags expression (original lines 1123-1132
// in between the visible ternaries) are missing from this extract.
1110void CGNVCUDARuntime::createOffloadingEntries() {
1111 StringRef Section = CGM.getLangOpts().HIP ?
"hip_offloading_entries"
1112 :
"cuda_offloading_entries";
1113 llvm::Module &M = CGM.getModule();
1114 for (KernelInfo &I : EmittedKernels)
1115 llvm::offloading::emitOffloadingEntry(
1116 M, KernelHandles[I.Kernel->getName()],
1117 getDeviceSideName(cast<NamedDecl>(I.D)), 0, 0,
1118 llvm::offloading::OffloadGlobalEntry, Section);
1120 for (VarInfo &I : DeviceVars) {
1122 CGM.getDataLayout().getTypeAllocSize(I.Var->getValueType());
1125 ?
static_cast<int32_t
>(llvm::offloading::OffloadGlobalExtern)
1127 (I.Flags.isConstant()
1128 ?
static_cast<int32_t
>(llvm::offloading::OffloadGlobalConstant)
1130 (I.Flags.isNormalized()
1131 ?
static_cast<int32_t
>(llvm::offloading::OffloadGlobalNormalized)
// Plain variables: managed ones use the managed entry kind.
1133 if (I.Flags.getKind() == DeviceVarFlags::Variable) {
1134 llvm::offloading::emitOffloadingEntry(
1135 M, I.Var, getDeviceSideName(I.D), VarSize,
1136 (I.Flags.isManaged() ? llvm::offloading::OffloadGlobalManagedEntry
1137 : llvm::offloading::OffloadGlobalEntry) |
1140 }
else if (I.Flags.getKind() == DeviceVarFlags::Surface) {
1141 llvm::offloading::emitOffloadingEntry(
1142 M, I.Var, getDeviceSideName(I.D), VarSize,
1143 llvm::offloading::OffloadGlobalSurfaceEntry | Flags,
1144 I.Flags.getSurfTexType(), Section);
1145 }
else if (I.Flags.getKind() == DeviceVarFlags::Texture) {
1146 llvm::offloading::emitOffloadingEntry(
1147 M, I.Var, getDeviceSideName(I.D), VarSize,
1148 llvm::offloading::OffloadGlobalTextureEntry | Flags,
1149 I.Flags.getSurfTexType(), Section);
// Module finalization: rewrite managed variables, then on the device side
// keep used (but not attribute((used))) external device variables alive
// via compiler.used; on the host side either emit new-driver offloading
// entries or build the classic module constructor.
// NOTE(review): lines between original 1157-1167 and 1177-1180 are missing
// from this extract.
1155llvm::Function *CGNVCUDARuntime::finalizeModule() {
1156 transformManagedVars();
1157 if (CGM.getLangOpts().CUDAIsDevice) {
1168 for (
auto &&Info : DeviceVars) {
1169 auto Kind = Info.Flags.getKind();
// Only non-local definitions of variables/surfaces/textures that the
// program actually uses need to survive optimization.
1170 if (!Info.Var->isDeclaration() &&
1171 !llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) &&
1172 (Kind == DeviceVarFlags::Variable ||
1173 Kind == DeviceVarFlags::Surface ||
1174 Kind == DeviceVarFlags::Texture) &&
1175 Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) {
1176 CGM.addCompilerUsedGlobal(Info.Var);
1181 if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
1182 createOffloadingEntries();
1184 return makeModuleCtorFunction();
// Return the kernel handle for stub function F. For HIP the handle is a
// separate constant global pointing at the stub (so kernels can be
// identified across TUs); for CUDA the stub itself is the handle. Updates
// the KernelHandles/KernelStubs maps when a stub is re-emitted.
// NOTE(review): several return statements and the GlobalVariable
// constructor tail (original lines 1195-1220) are missing from this extract.
1189llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
1191 auto Loc = KernelHandles.find(F->getName());
1192 if (Loc != KernelHandles.end()) {
1193 auto OldHandle = Loc->second;
// Same stub as before: nothing to update.
1194 if (KernelStubs[OldHandle] == F)
1199 if (CGM.getLangOpts().HIP) {
1202 KernelStubs[OldHandle] = F;
1207 KernelStubs.erase(OldHandle);
// CUDA: the stub function doubles as the handle.
1210 if (!CGM.getLangOpts().HIP) {
1211 KernelHandles[F->getName()] = F;
1216 auto *Var =
new llvm::GlobalVariable(
1217 TheModule, F->getType(),
1219 true, F->getLinkage(),
1221 Var->setAlignment(CGM.getPointerAlign().getAsAlign());
1222 Var->setDSOLocal(F->isDSOLocal());
1223 Var->setVisibility(F->getVisibility());
1224 auto *FD = cast<FunctionDecl>(GD.
getDecl());
// Only comdat-ify non-template kernels or template definitions.
1225 auto *FT = FD->getPrimaryTemplate();
1226 if (!FT || FT->isThisDeclarationADefinition())
1227 CGM.maybeSetTrivialComdat(*FD, *Var);
1228 KernelHandles[F->getName()] = Var;
1229 KernelStubs[Var] = F;
static std::unique_ptr< MangleContext > InitDeviceMC(CodeGenModule &CGM)
static void replaceManagedVar(llvm::GlobalVariable *Var, llvm::GlobalVariable *ManagedVar)
MangleContext * createMangleContext(const TargetInfo *T=nullptr)
If T is null pointer, assume the target in ASTContext.
const TargetInfo * getAuxTargetInfo() const
MangleContext * createDeviceMangleContext(const TargetInfo &T)
Creates a device mangle context to correctly mangle lambdas in a mixed architecture compile by setting the lambda mangling numbering so that host and device sides agree on lambda names.
const TargetInfo & getTargetInfo() const
CharUnits - This is an opaque type for sizes expressed in character units.
static CharUnits fromQuantity(QuantityType Quantity)
fromQuantity - Construct a CharUnits quantity from a raw integer type.
static CharUnits Zero()
Zero - Construct a CharUnits quantity of zero.
Like RawAddress, an abstract representation of an aligned address, but the pointer contained in this class is possibly signed (pointer authentication).
llvm::Value * emitRawPointer(CodeGenFunction &CGF) const
Return the pointer contained in this class after authenticating it and adding offset to it if necessary.
llvm::PointerType * getType() const
Return the type of the pointer value.
llvm::StoreInst * CreateDefaultAlignedStore(llvm::Value *Val, llvm::Value *Addr, bool IsVolatile=false)
llvm::LoadInst * CreateLoad(Address Addr, const llvm::Twine &Name="")
virtual std::string getDeviceSideName(const NamedDecl *ND)=0
Returns function or variable name on device side even if the current compilation is for host.
virtual void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args)=0
Emits a kernel launch stub.
virtual llvm::Function * getKernelStub(llvm::GlobalValue *Handle)=0
Get kernel stub by kernel handle.
virtual void handleVarRegistration(const VarDecl *VD, llvm::GlobalVariable &Var)=0
Check whether a variable is a device variable and register it if true.
virtual llvm::Function * finalizeModule()=0
Finalize generated LLVM module.
virtual llvm::GlobalValue * getKernelHandle(llvm::Function *Stub, GlobalDecl GD)=0
Get kernel handle by stub function.
virtual void internalizeDeviceSideVar(const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage)=0
Adjust linkage of shadow variables in host compilation.
static CGCallee forDirect(llvm::Constant *functionPtr, const CGCalleeInfo &abstractInfo=CGCalleeInfo())
CGFunctionInfo - Class to encapsulate the information about a function definition.
CallArgList - Type for representing both the value and type of arguments in a call.
void add(RValue rvalue, QualType type)
CodeGenFunction - This class organizes the per-function state that is used while generating LLVM code...
llvm::BasicBlock * createBasicBlock(const Twine &name="", llvm::Function *parent=nullptr, llvm::BasicBlock *before=nullptr)
createBasicBlock - Create an LLVM basic block.
const LangOptions & getLangOpts() const
void EmitBlock(llvm::BasicBlock *BB, bool IsFinished=false)
EmitBlock - Emit the given block.
llvm::AllocaInst * CreateTempAlloca(llvm::Type *Ty, const Twine &Name="tmp", llvm::Value *ArraySize=nullptr)
CreateTempAlloca - This creates an alloca and inserts it into the entry block if ArraySize is nullptr, otherwise inserts it at the current insertion point of the builder.
RValue EmitCall(const CGFunctionInfo &CallInfo, const CGCallee &Callee, ReturnValueSlot ReturnValue, const CallArgList &Args, llvm::CallBase **callOrInvoke, bool IsMustTail, SourceLocation Loc)
EmitCall - Generate a call of the given function, expecting the given result type,...
RawAddress CreateMemTemp(QualType T, const Twine &Name="tmp", RawAddress *Alloca=nullptr)
CreateMemTemp - Create a temporary memory object of the given type, with appropriate alignment, and cast it to the default address space.
void EmitBranch(llvm::BasicBlock *Block)
EmitBranch - Emit a branch to the specified basic block from the current insert block,...
const Decl * CurFuncDecl
CurFuncDecl - Holds the Decl for the current outermost non-closure context.
llvm::CallBase * EmitRuntimeCallOrInvoke(llvm::FunctionCallee callee, ArrayRef< llvm::Value * > args, const Twine &name="")
Address GetAddrOfLocalVar(const VarDecl *VD)
GetAddrOfLocalVar - Return the address of a local variable.
This class organizes the cross-function state that is used while generating LLVM code.
ASTContext & getContext() const
The standard implementation of ConstantInitBuilder used in Clang.
FunctionArgList - Type for representing both the decl and type of parameters to a function.
static RValue get(llvm::Value *V)
static RValue getAggregate(Address addr, bool isVolatile=false)
Convert an Address to an RValue.
ReturnValueSlot - Contains the address where the return value of a function can be stored,...
DeclContext - This is used only as base class of specific decl types that can act as declaration contexts.
lookup_result lookup(DeclarationName Name) const
lookup - Find the declarations (if any) with the given Name in this context.
Decl - This represents one declaration (or definition), e.g.
SourceLocation getLocation() const
TranslationUnitDecl * getTranslationUnitDecl()
Represents a function declaration or definition.
const ParmVarDecl * getParamDecl(unsigned i) const
GlobalDecl - represents a global declaration.
GlobalDecl getWithKernelReferenceKind(KernelReferenceKind Kind)
const Decl * getDecl() const
One of these records is kept for each identifier that is lexed.
StringRef getName() const
Return the actual identifier string.
GPUDefaultStreamKind GPUDefaultStream
The default stream kind used for HIP kernel launching.
MangleContext - Context for tracking state which persists across multiple calls to the C++ name mangler.
bool shouldMangleDeclName(const NamedDecl *D)
void mangleName(GlobalDecl GD, raw_ostream &)
This represents a decl that may have a name.
IdentifierInfo * getIdentifier() const
Get the identifier that names this declaration, if there is one.
Represents a parameter to a function.
A (possibly-)qualified type.
QualType getCanonicalType() const
A helper class that allows the use of isa/cast/dyncast to detect TagType objects of structs/unions/cl...
RecordDecl * getDecl() const
bool isMicrosoft() const
Is this ABI an MSVC-compatible ABI?
bool isItaniumFamily() const
Does this ABI generally fall into the Itanium family of ABIs?
TargetCXXABI getCXXABI() const
Get the C++ ABI currently in use.
A template argument list.
unsigned size() const
Retrieve the number of template arguments in this template argument list.
The top declaration context.
static DeclContext * castToDeclContext(const TranslationUnitDecl *D)
The base class of the type hierarchy.
const T * castAs() const
Member-template castAs<specific type>.
bool isCUDADeviceBuiltinSurfaceType() const
Check if the type is the CUDA device builtin surface type.
bool isCUDADeviceBuiltinTextureType() const
Check if the type is the CUDA device builtin texture type.
Represents a variable declaration or definition.
bool isInline() const
Whether this variable is (C++1z) inline.
bool hasExternalStorage() const
Returns true if a variable has extern or private_extern storage.
DefinitionKind hasDefinition(ASTContext &) const
Check whether this variable is defined in this translation unit.
CGCUDARuntime * CreateNVCUDARuntime(CodeGenModule &CGM)
Creates an instance of a CUDA runtime class.
bool Zero(InterpState &S, CodePtr OpPC)
The JSON file list parser is used to communicate input to InstallAPI.
CudaVersion ToCudaVersion(llvm::VersionTuple)
bool CudaFeatureEnabled(llvm::VersionTuple, CudaFeature)
Linkage
Describes the different kinds of linkage (C++ [basic.link], C99 6.2.2) that an entity may have.
llvm::IntegerType * SizeTy
llvm::IntegerType * IntTy
int
llvm::PointerType * UnqualPtrTy