22#include "llvm/Frontend/Offloading/Utility.h"
23#include "llvm/IR/BasicBlock.h"
24#include "llvm/IR/Constants.h"
25#include "llvm/IR/DerivedTypes.h"
26#include "llvm/IR/ReplaceConstant.h"
27#include "llvm/Support/Format.h"
28#include "llvm/Support/VirtualFileSystem.h"
31using namespace CodeGen;
// Magic numbers for the fatbin wrapper: makeModuleCtorFunction() assigns one of
// these to FatMagic and adds it as an IntTy field of the wrapper struct
// (CudaFatMagic for CUDA, HIPFatMagic for HIP).
34constexpr unsigned CudaFatMagic = 0x466243b1;
35constexpr unsigned HIPFatMagic = 0x48495046;
40 llvm::IntegerType *IntTy, *SizeTy;
42 llvm::PointerType *PtrTy;
45 llvm::LLVMContext &Context;
47 llvm::Module &TheModule;
57 llvm::DenseMap<StringRef, llvm::GlobalValue *> KernelHandles;
59 llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
61 llvm::GlobalVariable *Var;
69 llvm::GlobalVariable *GpuBinaryHandle =
nullptr;
71 bool RelocatableDeviceCode;
73 std::unique_ptr<MangleContext> DeviceMC;
75 llvm::FunctionCallee getSetupArgumentFn()
const;
76 llvm::FunctionCallee getLaunchFn()
const;
78 llvm::FunctionType *getRegisterGlobalsFnTy()
const;
79 llvm::FunctionType *getCallbackFnTy()
const;
80 llvm::FunctionType *getRegisterLinkedBinaryFnTy()
const;
81 std::string addPrefixToName(StringRef FuncName)
const;
82 std::string addUnderscoredPrefixToName(StringRef FuncName)
const;
85 llvm::Function *makeRegisterGlobalsFn();
// Helper: returns the pointer of the constant C string that
// CGM.GetAddrOfConstantCString produces for Str. Name (default: empty) is
// forwarded as that call's second argument.
90 llvm::Constant *makeConstantString(
const std::string &Str,
91 const std::string &Name =
"") {
92 return CGM.GetAddrOfConstantCString(Str, Name.c_str()).getPointer();
// Helper: wraps the bytes of Str in a ConstantDataArray (null-terminated only
// when AddNull is true) and creates a private-linkage global variable
// initialized with it. A non-empty SectionName is applied via setSection().
// NOTE(review): several lines of this definition are not visible in this
// listing (the Name parameter, any guard around setAlignment, and the return
// statement) - confirm against the full source.
98 llvm::Constant *makeConstantArray(StringRef Str,
100 StringRef SectionName =
"",
101 unsigned Alignment = 0,
102 bool AddNull =
false) {
103 llvm::Constant *
Value =
104 llvm::ConstantDataArray::getString(Context, Str, AddNull);
105 auto *GV =
new llvm::GlobalVariable(
107 llvm::GlobalValue::PrivateLinkage,
Value, Name);
108 if (!SectionName.empty()) {
109 GV->setSection(SectionName);
// Clear the unnamed_addr attribute on the global.
112 GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
115 GV->setAlignment(llvm::Align(Alignment));
// Helper: creates an internal-linkage function named "dummy" of type FnTy in
// TheModule and emits a basic block that just returns void.
// FnTy must have a void return type (asserted).
120 llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
121 assert(FnTy->getReturnType()->isVoidTy() &&
122 "Can only generate dummy functions returning void!");
123 llvm::Function *DummyFunc = llvm::Function::Create(
124 FnTy, llvm::GlobalValue::InternalLinkage,
"dummy", &TheModule);
126 llvm::BasicBlock *DummyBlock =
127 llvm::BasicBlock::Create(Context,
"", DummyFunc);
129 FuncBuilder.SetInsertPoint(DummyBlock);
130 FuncBuilder.CreateRetVoid();
// Appends an entry for Var to DeviceVars. Whether VD carries HIPManagedAttr is
// recorded in the entry.
// NOTE(review): the rest of the initializer is not visible in this listing.
139 void registerDeviceVar(
const VarDecl *VD, llvm::GlobalVariable &Var,
140 bool Extern,
bool Constant) {
141 DeviceVars.push_back({&Var,
144 VD->hasAttr<HIPManagedAttr>(),
// Appends an entry for Var to DeviceVars.
// NOTE(review): the initializer that consumes VD, Extern and Type is not
// visible in this listing.
147 void registerDeviceSurf(
const VarDecl *VD, llvm::GlobalVariable &Var,
148 bool Extern,
int Type) {
149 DeviceVars.push_back({&Var,
// Appends an entry for Var to DeviceVars; Normalized and Type are passed into
// the entry's trailing flag initializer.
// NOTE(review): the middle of the initializer is not visible in this listing.
155 void registerDeviceTex(
const VarDecl *VD, llvm::GlobalVariable &Var,
156 bool Extern,
int Type,
bool Normalized) {
157 DeviceVars.push_back({&Var,
160 false, Normalized,
Type}});
164 llvm::Function *makeModuleCtorFunction();
166 llvm::Function *makeModuleDtorFunction();
168 void transformManagedVars();
170 void createOffloadingEntries();
// Looks Handle up in KernelStubs and asserts that an entry exists.
// NOTE(review): the return statement is not visible in this listing.
176 llvm::Function *
getKernelStub(llvm::GlobalValue *Handle)
override {
177 auto Loc = KernelStubs.find(Handle);
178 assert(
Loc != KernelStubs.end());
183 llvm::GlobalVariable &Var)
override;
186 llvm::GlobalValue::LinkageTypes &
Linkage)
override;
// Returns FuncName prefixed with "hip" when compiling HIP, otherwise with
// "cuda" (e.g. "SetupArgument" -> "hipSetupArgument" / "cudaSetupArgument").
193std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName)
const {
194 if (CGM.getLangOpts().HIP)
195 return ((Twine(
"hip") + Twine(FuncName)).str());
196 return ((Twine(
"cuda") + Twine(FuncName)).str());
// Returns FuncName prefixed with "__hip" when compiling HIP, otherwise with
// "__cuda" (e.g. "RegisterVar" -> "__hipRegisterVar" / "__cudaRegisterVar").
199CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName)
const {
200 if (CGM.getLangOpts().HIP)
201 return ((Twine(
"__hip") + Twine(FuncName)).str());
202 return ((Twine(
"__cuda") + Twine(FuncName)).str());
212 return std::unique_ptr<MangleContext>(
223 TheModule(CGM.getModule()),
224 RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
// Returns the runtime function "<cuda|hip>SetupArgument", declared as
// IntTy (PtrTy, SizeTy, SizeTy).
232llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn()
const {
234 llvm::Type *Params[] = {PtrTy, SizeTy, SizeTy};
235 return CGM.CreateRuntimeFunction(
236 llvm::FunctionType::get(IntTy, Params,
false),
237 addPrefixToName(
"SetupArgument"));
// Returns the runtime launch function, declared as IntTy (PtrTy). For HIP this
// is "hipLaunchByPtr".
// NOTE(review): the name used on the non-HIP path is not visible in this
// listing.
240llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn()
const {
241 if (CGM.getLangOpts().HIP) {
243 return CGM.CreateRuntimeFunction(
244 llvm::FunctionType::get(IntTy, PtrTy,
false),
"hipLaunchByPtr");
247 return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy, PtrTy,
false),
// Type of the register-globals function: VoidTy (PtrTy), non-variadic.
251llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy()
const {
252 return llvm::FunctionType::get(VoidTy, PtrTy,
false);
// Type of the callback passed to the register-linked-binary function:
// VoidTy (PtrTy), non-variadic.
255llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy()
const {
256 return llvm::FunctionType::get(VoidTy, PtrTy,
false);
// Type of the register-linked-binary function: VoidTy taking four pointer
// parameters (unqualified ptr, PtrTy, PtrTy, unqualified ptr), non-variadic.
259llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy()
const {
260 llvm::Type *Params[] = {llvm::PointerType::getUnqual(Context), PtrTy, PtrTy,
261 llvm::PointerType::getUnqual(Context)};
262 return llvm::FunctionType::get(VoidTy, Params,
false);
// Computes the device-side name of ND and returns it as a std::string.
// NOTE(review): the declarations of GD, MC and Buffer, and the mangling call
// itself, are not visible in this listing.
265std::string CGNVCUDARuntime::getDeviceSideName(
const NamedDecl *ND) {
// Functions are referred to through their kernel reference kind.
268 if (
auto *FD = dyn_cast<FunctionDecl>(ND))
269 GD =
GlobalDecl(FD, KernelReferenceKind::Kernel);
272 std::string DeviceSideName;
// In a device compilation the C++ ABI's own mangle context is used.
274 if (CGM.getLangOpts().CUDAIsDevice)
275 MC = &CGM.getCXXABI().getMangleContext();
280 llvm::raw_svector_ostream Out(Buffer);
282 DeviceSideName = std::string(Out.str());
// Externalized declarations in relocatable-device-code mode get the postfix
// printed by CGM.printPostfixForExternalizedDecl appended to the name.
287 if (CGM.getContext().shouldExternalize(ND) &&
288 CGM.getLangOpts().GPURelocatableDeviceCode) {
290 llvm::raw_svector_ostream Out(Buffer);
291 Out << DeviceSideName;
292 CGM.printPostfixForExternalizedDecl(Out, ND);
293 DeviceSideName = std::string(Out.str());
295 return DeviceSideName;
302 dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.
CurFn->getName()])) {
303 GV->setLinkage(CGF.
CurFn->getLinkage());
304 GV->setInitializer(CGF.
CurFn);
307 CudaFeature::CUDA_USES_NEW_LAUNCH) ||
309 emitDeviceStubBodyNew(CGF, Args);
311 emitDeviceStubBodyLegacy(CGF, Args);
325 llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
327 for (
unsigned i = 0; i < Args.size(); ++i) {
329 llvm::Value *VoidVarPtr = CGF.
Builder.CreatePointerCast(VarPtr, PtrTy);
331 VoidVarPtr, CGF.
Builder.CreateConstGEP1_32(
349 std::string KernelLaunchAPI =
"LaunchKernel";
351 LangOptions::GPUDefaultStreamKind::PerThread) {
353 KernelLaunchAPI = KernelLaunchAPI +
"_spt";
355 KernelLaunchAPI = KernelLaunchAPI +
"_ptsz";
357 auto LaunchKernelName = addPrefixToName(KernelLaunchAPI);
359 CGM.getContext().Idents.get(LaunchKernelName);
361 for (
auto *Result : DC->
lookup(&cudaLaunchKernelII)) {
363 cudaLaunchKernelFD = FD;
366 if (cudaLaunchKernelFD ==
nullptr) {
368 "Can't find declaration for " + LaunchKernelName);
381 llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
382 llvm::FunctionType::get(IntTy,
388 addUnderscoredPrefixToName(
"PopCallConfiguration"));
397 CGF.
Builder.CreatePointerCast(KernelHandles[CGF.
CurFn->getName()], PtrTy);
412 llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
413 llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);
416 CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
417 llvm::FunctionCallee cudaLaunchKernelFn =
418 CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
425 if (CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
427 llvm::Function *KernelFunction = llvm::cast<llvm::Function>(
Kernel);
428 std::string GlobalVarName = (KernelFunction->getName() +
".id").str();
430 llvm::GlobalVariable *HandleVar =
431 CGM.getModule().getNamedGlobal(GlobalVarName);
433 HandleVar =
new llvm::GlobalVariable(
434 CGM.getModule(), CGM.Int8Ty,
435 false, KernelFunction->getLinkage(),
436 llvm::ConstantInt::get(CGM.Int8Ty, 0), GlobalVarName);
437 HandleVar->setDSOLocal(KernelFunction->isDSOLocal());
438 HandleVar->setVisibility(KernelFunction->getVisibility());
439 if (KernelFunction->hasComdat())
440 HandleVar->setComdat(CGM.getModule().getOrInsertComdat(GlobalVarName));
456 llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
459 for (
const VarDecl *A : Args) {
460 auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
461 Offset = Offset.alignTo(TInfo.Align);
462 llvm::Value *Args[] = {
465 llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
466 llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
469 llvm::Constant *
Zero = llvm::ConstantInt::get(IntTy, 0);
470 llvm::Value *CBZero = CGF.
Builder.CreateICmpEQ(CB, Zero);
472 CGF.
Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
474 Offset += TInfo.Width;
478 llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
480 CGF.
Builder.CreatePointerCast(KernelHandles[CGF.
CurFn->getName()], PtrTy);
490 llvm::GlobalVariable *ManagedVar) {
492 for (
auto &&VarUse : Var->uses()) {
493 WorkList.push_back({VarUse.getUser()});
495 while (!WorkList.empty()) {
496 auto &&WorkItem = WorkList.pop_back_val();
497 auto *
U = WorkItem.back();
498 if (isa<llvm::ConstantExpr>(
U)) {
499 for (
auto &&UU :
U->uses()) {
500 WorkItem.push_back(UU.getUser());
501 WorkList.push_back(WorkItem);
506 if (
auto *I = dyn_cast<llvm::Instruction>(
U)) {
507 llvm::Value *OldV = Var;
508 llvm::Instruction *NewV =
509 new llvm::LoadInst(Var->getType(), ManagedVar,
"ld.managed",
false,
510 llvm::Align(Var->getAlignment()), I);
514 for (
auto &&Op : WorkItem) {
515 auto *CE = cast<llvm::ConstantExpr>(Op);
516 auto *NewInst = CE->getAsInstruction();
517 NewInst->insertBefore(*I->getParent(), I->getIterator());
518 NewInst->replaceUsesOfWith(OldV, NewV);
522 I->replaceUsesOfWith(OldV, NewV);
524 llvm_unreachable(
"Invalid use of managed variable");
// Builds the internal function "__<cuda|hip>_register_globals" (type from
// getRegisterGlobalsFnTy()). Its body calls "__<cuda|hip>RegisterFunction"
// once per entry in EmittedKernels and then walks DeviceVars, using the
// RegisterVar / RegisterManagedVar / RegisterSurface / RegisterTexture runtime
// functions declared below. Returns the created function.
// NOTE(review): many lines (builder declaration, early return, several
// argument-list entries and CreateCall lines) are not visible in this listing.
543llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
// Nothing to register when there are neither kernels nor device variables.
545 if (EmittedKernels.empty() && DeviceVars.empty())
548 llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
549 getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
550 addUnderscoredPrefixToName(
"_register_globals"), &TheModule);
551 llvm::BasicBlock *EntryBB =
552 llvm::BasicBlock::Create(Context,
"entry", RegisterKernelsFunc);
554 Builder.SetInsertPoint(EntryBB);
558 llvm::Type *RegisterFuncParams[] = {
559 PtrTy, PtrTy, PtrTy, PtrTy, IntTy,
560 PtrTy, PtrTy, PtrTy, PtrTy, llvm::PointerType::getUnqual(Context)};
561 llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
562 llvm::FunctionType::get(IntTy, RegisterFuncParams,
false),
563 addUnderscoredPrefixToName(
"RegisterFunction"));
// The generated function's first argument is passed on to the registration
// calls as the GPU binary handle.
568 llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
569 for (
auto &&I : EmittedKernels) {
570 llvm::Constant *KernelName =
571 makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
572 llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(PtrTy);
573 llvm::Value *Args[] = {
575 KernelHandles[I.Kernel->getName()],
578 llvm::ConstantInt::get(IntTy, -1),
583 llvm::ConstantPointerNull::get(llvm::PointerType::getUnqual(Context))};
584 Builder.CreateCall(RegisterFunc, Args);
// Integer type used for the size argument of the variable registration calls.
587 llvm::Type *VarSizeTy = IntTy;
589 if (CGM.getLangOpts().HIP ||
590 ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
595 llvm::Type *RegisterVarParams[] = {PtrTy, PtrTy, PtrTy, PtrTy,
596 IntTy, VarSizeTy, IntTy, IntTy};
597 llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
598 llvm::FunctionType::get(VoidTy, RegisterVarParams,
false),
599 addUnderscoredPrefixToName(
"RegisterVar"));
602 llvm::Type *RegisterManagedVarParams[] = {PtrTy, PtrTy, PtrTy,
603 PtrTy, VarSizeTy, IntTy};
604 llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
605 llvm::FunctionType::get(VoidTy, RegisterManagedVarParams,
false),
606 addUnderscoredPrefixToName(
"RegisterManagedVar"));
609 llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
610 llvm::FunctionType::get(
611 VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy},
false),
612 addUnderscoredPrefixToName(
"RegisterSurface"));
615 llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
616 llvm::FunctionType::get(
617 VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy, IntTy},
false),
618 addUnderscoredPrefixToName(
"RegisterTexture"));
619 for (
auto &&Info : DeviceVars) {
620 llvm::GlobalVariable *Var = Info.Var;
621 assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
622 "External variables should not show up here, except HIP managed "
624 llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
625 switch (Info.Flags.getKind()) {
626 case DeviceVarFlags::Variable: {
628 CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
629 if (Info.Flags.isManaged()) {
// A managed variable is expected to have been renamed to "<name>.managed";
// the global that now carries the original name is looked up by dropping
// that suffix.
630 assert(Var->getName().ends_with(
".managed") &&
631 "HIP managed variables not transformed");
632 auto *ManagedVar = CGM.getModule().getNamedGlobal(
633 Var->getName().drop_back(StringRef(
".managed").size()));
634 llvm::Value *Args[] = {
639 llvm::ConstantInt::get(VarSizeTy, VarSize),
640 llvm::ConstantInt::get(IntTy, Var->getAlignment())};
// Only managed variables that are defined in this module are registered.
641 if (!Var->isDeclaration())
642 Builder.CreateCall(RegisterManagedVar, Args);
644 llvm::Value *Args[] = {
649 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
650 llvm::ConstantInt::get(VarSizeTy, VarSize),
651 llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
652 llvm::ConstantInt::get(IntTy, 0)};
653 Builder.CreateCall(RegisterVar, Args);
657 case DeviceVarFlags::Surface:
660 {&GpuBinaryHandlePtr, Var, VarName, VarName,
661 llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
662 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
664 case DeviceVarFlags::Texture:
667 {&GpuBinaryHandlePtr, Var, VarName, VarName,
668 llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
669 llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
670 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
675 Builder.CreateRetVoid();
676 return RegisterKernelsFunc;
// Builds and returns the internal function "__<cuda|hip>_module_ctor". The
// visible code:
//  - obtains the register-globals function (a dummy one in relocatable device
//    code mode when none was produced),
//  - emits a "__<cuda|hip>_fatbin_wrapper" global describing the GPU binary,
//  - emits calls to "__<cuda|hip>RegisterFatBinary" and stores the returned
//    handle in GpuBinaryHandle,
//  - calls the register-globals function with that handle, and
//  - registers the module destructor (if any) through atexit().
// NOTE(review): many lines, including most branch conditions separating the
// HIP, non-RDC CUDA and RDC paths, are not visible in this listing.
698llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
699 bool IsHIP = CGM.getLangOpts().HIP;
700 bool IsCUDA = CGM.getLangOpts().CUDA;
702 StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
703 if (CudaGpuBinaryFileName.empty() && !IsHIP)
705 if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
710 llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
// In relocatable device code mode a register-globals function is always
// needed (see the assert further down), so substitute an empty one.
713 if (RelocatableDeviceCode && !RegisterGlobalsFunc)
714 RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());
717 llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
718 llvm::FunctionType::get(PtrTy, PtrTy,
false),
719 addUnderscoredPrefixToName(
"RegisterFatBinary"));
721 llvm::StructType *FatbinWrapperTy =
722 llvm::StructType::get(IntTy, IntTy, PtrTy, PtrTy);
// Read the GPU binary named by CudaGpuBinaryFileName, if one was given;
// a read failure is reported as err_cannot_open_file.
728 std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary =
nullptr;
729 if (!CudaGpuBinaryFileName.empty()) {
730 auto VFS = CGM.getFileSystem();
731 auto CudaGpuBinaryOrErr =
732 VFS->getBufferForFile(CudaGpuBinaryFileName, -1,
false);
733 if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
734 CGM.getDiags().Report(diag::err_cannot_open_file)
735 << CudaGpuBinaryFileName << EC.message();
738 CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
741 llvm::Function *ModuleCtorFunc = llvm::Function::Create(
742 llvm::FunctionType::get(VoidTy,
false),
743 llvm::GlobalValue::InternalLinkage,
744 addUnderscoredPrefixToName(
"_module_ctor"), &TheModule);
745 llvm::BasicBlock *CtorEntryBB =
746 llvm::BasicBlock::Create(Context,
"entry", ModuleCtorFunc);
749 CtorBuilder.SetInsertPoint(CtorEntryBB);
// Section names, module-ID prefix and fatbin data differ between HIP and CUDA
// and are selected below.
751 const char *FatbinConstantName;
752 const char *FatbinSectionName;
753 const char *ModuleIDSectionName;
754 StringRef ModuleIDPrefix;
755 llvm::Constant *FatBinStr;
758 FatbinConstantName =
".hip_fatbin";
759 FatbinSectionName =
".hipFatBinSegment";
761 ModuleIDSectionName =
"__hip_module_id";
762 ModuleIDPrefix =
"__hip_";
767 const unsigned HIPCodeObjectAlign = 4096;
768 FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()),
"",
769 FatbinConstantName, HIPCodeObjectAlign);
// Alternative for HIP: reference an external "__hip_fatbin_<CUID hash>"
// symbol placed in the fatbin section instead of embedding the bytes.
775 FatBinStr =
new llvm::GlobalVariable(
776 CGM.getModule(), CGM.Int8Ty,
777 true, llvm::GlobalValue::ExternalLinkage,
nullptr,
778 "__hip_fatbin_" + CGM.getContext().getCUIDHash(),
nullptr,
779 llvm::GlobalVariable::NotThreadLocal);
780 cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
783 FatMagic = HIPFatMagic;
785 if (RelocatableDeviceCode)
786 FatbinConstantName = CGM.getTriple().isMacOSX()
787 ?
"__NV_CUDA,__nv_relfatbin"
791 CGM.getTriple().isMacOSX() ?
"__NV_CUDA,__nv_fatbin" :
".nv_fatbin";
794 CGM.getTriple().isMacOSX() ?
"__NV_CUDA,__fatbin" :
".nvFatBinSegment";
796 ModuleIDSectionName = CGM.getTriple().isMacOSX()
797 ?
"__NV_CUDA,__nv_module_id"
799 ModuleIDPrefix =
"__nv_";
803 FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()),
"",
804 FatbinConstantName, 8);
805 FatMagic = CudaFatMagic;
// Fatbin wrapper fields, in the order added here: the magic number, the
// constant 1, the fatbin data pointer, and a null pointer.
810 auto Values = Builder.beginStruct(FatbinWrapperTy);
812 Values.addInt(IntTy, FatMagic);
814 Values.addInt(IntTy, 1);
816 Values.add(FatBinStr);
818 Values.add(llvm::ConstantPointerNull::get(PtrTy));
819 llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
820 addUnderscoredPrefixToName(
"_fatbin_wrapper"), CGM.getPointerAlign(),
822 FatbinWrapper->setSection(FatbinSectionName);
// "__hip_gpubin_handle*" path: the handle global is internal when a GPU binary
// was read and external otherwise; the fat binary is registered only if the
// loaded handle is null.
832 auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage
833 : llvm::GlobalValue::ExternalLinkage;
834 llvm::BasicBlock *IfBlock =
835 llvm::BasicBlock::Create(Context,
"if", ModuleCtorFunc);
836 llvm::BasicBlock *ExitBlock =
837 llvm::BasicBlock::Create(Context,
"exit", ModuleCtorFunc);
840 GpuBinaryHandle =
new llvm::GlobalVariable(
841 TheModule, PtrTy,
false,
Linkage,
843 CudaGpuBinary ? llvm::ConstantPointerNull::get(PtrTy) :
nullptr,
845 ?
"__hip_gpubin_handle"
846 :
"__hip_gpubin_handle_" + CGM.getContext().getCUIDHash());
847 GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
849 if (
Linkage != llvm::GlobalValue::InternalLinkage)
850 GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
852 GpuBinaryHandle, PtrTy,
855 auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
856 llvm::Constant *
Zero =
857 llvm::Constant::getNullValue(HandleValue->getType());
858 llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
859 CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
862 CtorBuilder.SetInsertPoint(IfBlock);
864 llvm::CallInst *RegisterFatbinCall =
865 CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
866 CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
867 CtorBuilder.CreateBr(ExitBlock);
870 CtorBuilder.SetInsertPoint(ExitBlock);
872 if (RegisterGlobalsFunc) {
873 auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
874 CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
877 }
// Without relocatable device code: register the fat binary unconditionally and
// keep the handle in the internal global "__cuda_gpubin_handle".
else if (!RelocatableDeviceCode) {
881 llvm::CallInst *RegisterFatbinCall =
882 CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
883 GpuBinaryHandle =
new llvm::GlobalVariable(
884 TheModule, PtrTy,
false, llvm::GlobalValue::InternalLinkage,
885 llvm::ConstantPointerNull::get(PtrTy),
"__cuda_gpubin_handle");
886 GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
887 CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
888 CGM.getPointerAlign());
891 if (RegisterGlobalsFunc)
892 CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
// When the CUDA_USES_FATBIN_REGISTER_END feature applies, also emit a call to
// __cudaRegisterFatBinaryEnd with the handle.
896 CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
898 llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
899 llvm::FunctionType::get(VoidTy, PtrTy,
false),
900 "__cudaRegisterFatBinaryEnd");
901 CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
// Module-ID path: derive an ID from the wrapper's GUID, emit it as a constant,
// alias the wrapper as "__fatbinwrap<ID>", and call the register-linked-binary
// function with the register-globals function, wrapper, ID and a dummy
// callback.
906 llvm::raw_svector_ostream OS(ModuleID);
907 OS << ModuleIDPrefix << llvm::format(
"%" PRIx64, FatbinWrapper->getGUID());
908 llvm::Constant *ModuleIDConstant = makeConstantArray(
909 std::string(ModuleID),
"", ModuleIDSectionName, 32,
true);
912 llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
913 Twine(
"__fatbinwrap") + ModuleID, FatbinWrapper);
918 RegisterLinkedBinaryName += ModuleID;
919 llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
920 getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);
922 assert(RegisterGlobalsFunc &&
"Expecting at least dummy function!");
923 llvm::Value *Args[] = {RegisterGlobalsFunc, FatbinWrapper, ModuleIDConstant,
924 makeDummyFunction(getCallbackFnTy())};
925 CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
// If a module destructor was produced, register it with atexit().
931 if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
933 llvm::FunctionType *AtExitTy =
934 llvm::FunctionType::get(IntTy, CleanupFn->getType(),
false);
935 llvm::FunctionCallee AtExitFunc =
936 CGM.CreateRuntimeFunction(AtExitTy,
"atexit", llvm::AttributeList(),
938 CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
941 CtorBuilder.CreateRetVoid();
942 return ModuleCtorFunc;
// Builds and returns the internal function "__<cuda|hip>_module_dtor", which
// loads the handle stored in GpuBinaryHandle and passes it to
// "__<cuda|hip>UnregisterFatBinary".
// NOTE(review): the early return for a null GpuBinaryHandle and the line(s)
// separating the HIP path from the final unconditional call are not visible in
// this listing.
964llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
// GpuBinaryHandle is only set by makeModuleCtorFunction() on some paths.
966 if (!GpuBinaryHandle)
970 llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
971 llvm::FunctionType::get(VoidTy, PtrTy,
false),
972 addUnderscoredPrefixToName(
"UnregisterFatBinary"));
974 llvm::Function *ModuleDtorFunc = llvm::Function::Create(
975 llvm::FunctionType::get(VoidTy,
false),
976 llvm::GlobalValue::InternalLinkage,
977 addUnderscoredPrefixToName(
"_module_dtor"), &TheModule);
979 llvm::BasicBlock *DtorEntryBB =
980 llvm::BasicBlock::Create(Context,
"entry", ModuleDtorFunc);
982 DtorBuilder.SetInsertPoint(DtorEntryBB);
985 GpuBinaryHandle, GpuBinaryHandle->getValueType(),
987 auto *HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
// HIP: unregister only when the handle is non-null, then store null back into
// the handle.
991 if (CGM.getLangOpts().HIP) {
992 llvm::BasicBlock *IfBlock =
993 llvm::BasicBlock::Create(Context,
"if", ModuleDtorFunc);
994 llvm::BasicBlock *ExitBlock =
995 llvm::BasicBlock::Create(Context,
"exit", ModuleDtorFunc);
996 llvm::Constant *
Zero = llvm::Constant::getNullValue(HandleValue->getType());
997 llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
998 DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);
1000 DtorBuilder.SetInsertPoint(IfBlock);
1001 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
1002 DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
1003 DtorBuilder.CreateBr(ExitBlock);
1005 DtorBuilder.SetInsertPoint(ExitBlock);
1007 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
1009 DtorBuilder.CreateRetVoid();
1010 return ModuleDtorFunc;
1014 return new CGNVCUDARuntime(CGM);
// May change Linkage to InternalLinkage; the visible part of the condition
// covers D having a CUDA device builtin surface or texture type.
// NOTE(review): the parameter list, the statement guarded by the
// GPURelocatableDeviceCode check and most of the condition are not visible in
// this listing.
1017void CGNVCUDARuntime::internalizeDeviceSideVar(
1026 if (CGM.getLangOpts().GPURelocatableDeviceCode)
1036 D->getType()->isCUDADeviceBuiltinSurfaceType() ||
1037 D->getType()->isCUDADeviceBuiltinTextureType()) {
1038 Linkage = llvm::GlobalValue::InternalLinkage;
// Registers GV for declaration D through registerDeviceVar,
// registerDeviceSurf or registerDeviceTex, depending on D's kind. In each
// call the "extern" argument is !D->hasDefinition().
// NOTE(review): the outer condition selecting the plain-variable path and the
// definition of Args are only partly visible in this listing.
1042void CGNVCUDARuntime::handleVarRegistration(
const VarDecl *
D,
1043 llvm::GlobalVariable &GV) {
1058 if ((!
D->hasExternalStorage() && !
D->isInline()) ||
1059 CGM.getContext().CUDADeviceVarODRUsedByHost.contains(
D) ||
1061 registerDeviceVar(
D, GV, !
D->hasDefinition(),
1064 }
// Builtin surface/texture types: the kind and its parameters are read from the
// template arguments of the type's class template specialization.
else if (
D->getType()->isCUDADeviceBuiltinSurfaceType() ||
1065 D->getType()->isCUDADeviceBuiltinTextureType()) {
1068 const auto *TD = cast<ClassTemplateSpecializationDecl>(
1071 if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
// Surface: exactly two template arguments; the second is the surface type.
1072 assert(Args.
size() == 2 &&
1073 "Unexpected number of template arguments of CUDA device "
1074 "builtin surface type.");
1075 auto SurfType = Args[1].getAsIntegral();
1076 if (!
D->hasExternalStorage())
1077 registerDeviceSurf(
D, GV, !
D->hasDefinition(), SurfType.getSExtValue());
// Texture: exactly three template arguments; the second is the texture type
// and the third the normalized flag.
1079 assert(Args.
size() == 3 &&
1080 "Unexpected number of template arguments of CUDA device "
1081 "builtin texture type.");
1082 auto TexType = Args[1].getAsIntegral();
1083 auto Normalized = Args[2].getAsIntegral();
1084 if (!
D->hasExternalStorage())
1085 registerDeviceTex(
D, GV, !
D->hasDefinition(), TexType.getSExtValue(),
1086 Normalized.getZExtValue());
// For every DeviceVars entry that is a managed variable, creates a second
// global whose value type is Var's pointer type. The new global takes over
// Var's name, and Var is renamed to "<name>.managed".
// NOTE(review): a few lines (part of the initializer expression and whatever
// follows setExternallyInitialized) are not visible in this listing.
1095void CGNVCUDARuntime::transformManagedVars() {
1096 for (
auto &&Info : DeviceVars) {
1097 llvm::GlobalVariable *Var = Info.Var;
1098 if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
1099 Info.Flags.isManaged()) {
1100 auto *ManagedVar =
new llvm::GlobalVariable(
1101 CGM.getModule(), Var->getType(),
1102 false, Var->getLinkage(),
1103 Var->isDeclaration()
1105 : llvm::ConstantPointerNull::get(Var->getType()),
1107 llvm::GlobalVariable::NotThreadLocal,
1108 CGM.getContext().getTargetAddressSpace(CGM.getLangOpts().CUDAIsDevice
1109 ? LangAS::cuda_device
1110 : LangAS::Default));
// The new global mirrors Var's dso_local and visibility and is marked
// externally initialized.
1111 ManagedVar->setDSOLocal(Var->isDSOLocal());
1112 ManagedVar->setVisibility(Var->getVisibility());
1113 ManagedVar->setExternallyInitialized(
true);
1115 ManagedVar->takeName(Var);
1116 Var->setName(Twine(ManagedVar->getName()) +
".managed");
// In a device compilation, a defined managed variable and its companion global
// are both added to the compiler-used list.
1119 if (CGM.getLangOpts().CUDAIsDevice && !Var->isDeclaration()) {
1120 assert(!ManagedVar->isDeclaration());
1121 CGM.addCompilerUsedGlobal(Var);
1122 CGM.addCompilerUsedGlobal(ManagedVar);
// Emits one offloading entry per emitted kernel and per DeviceVars entry via
// llvm::offloading::emitOffloadingEntry, into section "hip_offloading_entries"
// (HIP) or "cuda_offloading_entries" (otherwise).
// NOTE(review): the declarations of VarSize and Flags are only partly visible
// in this listing.
1131void CGNVCUDARuntime::createOffloadingEntries() {
1132 StringRef Section = CGM.getLangOpts().HIP ?
"hip_offloading_entries"
1133 :
"cuda_offloading_entries";
1134 llvm::Module &M = CGM.getModule();
// Kernels: keyed by kernel handle and device-side name, with size 0.
1135 for (KernelInfo &I : EmittedKernels)
1136 llvm::offloading::emitOffloadingEntry(
1137 M, KernelHandles[I.Kernel->getName()],
1138 getDeviceSideName(cast<NamedDecl>(I.D)), 0, 0,
1139 llvm::offloading::OffloadGlobalEntry, Section);
1141 for (VarInfo &I : DeviceVars) {
1143 CGM.getDataLayout().getTypeAllocSize(I.Var->getValueType());
// Extern / constant / normalized properties are translated into the
// corresponding OffloadGlobal* values.
1146 ?
static_cast<int32_t
>(llvm::offloading::OffloadGlobalExtern)
1148 (I.Flags.isConstant()
1149 ?
static_cast<int32_t
>(llvm::offloading::OffloadGlobalConstant)
1151 (I.Flags.isNormalized()
1152 ?
static_cast<int32_t
>(llvm::offloading::OffloadGlobalNormalized)
// The entry kind depends on whether this is a plain (possibly managed)
// variable, a surface, or a texture.
1154 if (I.Flags.getKind() == DeviceVarFlags::Variable) {
1155 llvm::offloading::emitOffloadingEntry(
1156 M, I.Var, getDeviceSideName(I.D), VarSize,
1157 (I.Flags.isManaged() ? llvm::offloading::OffloadGlobalManagedEntry
1158 : llvm::offloading::OffloadGlobalEntry) |
1161 }
else if (I.Flags.getKind() == DeviceVarFlags::Surface) {
1162 llvm::offloading::emitOffloadingEntry(
1163 M, I.Var, getDeviceSideName(I.D), VarSize,
1164 llvm::offloading::OffloadGlobalSurfaceEntry | Flags,
1165 I.Flags.getSurfTexType(), Section);
1166 }
else if (I.Flags.getKind() == DeviceVarFlags::Texture) {
1167 llvm::offloading::emitOffloadingEntry(
1168 M, I.Var, getDeviceSideName(I.D), VarSize,
1169 llvm::offloading::OffloadGlobalTextureEntry | Flags,
1170 I.Flags.getSurfTexType(), Section);
// Finalizes CUDA/HIP codegen for the module. Managed variables are transformed
// first. The visible code then has a device-compilation branch that marks
// certain device variables as compiler-used, an offloading-entries step for
// the new driver with relocatable device code, and a return of
// makeModuleCtorFunction().
// NOTE(review): the return statement of the device branch and any `else`
// between the last two steps are not visible in this listing.
1176llvm::Function *CGNVCUDARuntime::finalizeModule() {
1177 transformManagedVars();
1178 if (CGM.getLangOpts().CUDAIsDevice) {
// Keep defined, non-local variables/surfaces/textures that are used in the
// source but lack an explicit UsedAttr.
1189 for (
auto &&Info : DeviceVars) {
1190 auto Kind = Info.Flags.getKind();
1191 if (!Info.Var->isDeclaration() &&
1192 !llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) &&
1193 (Kind == DeviceVarFlags::Variable ||
1194 Kind == DeviceVarFlags::Surface ||
1195 Kind == DeviceVarFlags::Texture) &&
1196 Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) {
1197 CGM.addCompilerUsedGlobal(Info.Var);
1202 if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
1203 createOffloadingEntries();
1205 return makeModuleCtorFunction();
// Maintains the KernelHandles (name -> handle) and KernelStubs (handle -> stub)
// maps for stub function F. When not compiling HIP, F itself is stored as the
// handle. The remaining visible code creates a constant global of F's type
// that mirrors F's linkage, dso_local and visibility, and records it as the
// handle for F.
// NOTE(review): the GD parameter line, all return statements and the global's
// initializer/name arguments are not visible in this listing.
1210llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
// An existing handle for this name is reused; its stub mapping is refreshed
// when the stub function has changed.
1212 auto Loc = KernelHandles.find(F->getName());
1213 if (
Loc != KernelHandles.end()) {
1214 auto OldHandle =
Loc->second;
1215 if (KernelStubs[OldHandle] == F)
1220 if (CGM.getLangOpts().HIP) {
1223 KernelStubs[OldHandle] = F;
1228 KernelStubs.erase(OldHandle);
1231 if (!CGM.getLangOpts().HIP) {
1232 KernelHandles[F->getName()] = F;
1237 auto *Var =
new llvm::GlobalVariable(
1238 TheModule, F->getType(),
true, F->getLinkage(),
1242 Var->setAlignment(CGM.getPointerAlign().getAsAlign());
1243 Var->setDSOLocal(F->isDSOLocal());
1244 Var->setVisibility(F->getVisibility());
// A trivial comdat is considered only for non-templates or for templates whose
// primary template declaration is a definition.
1245 auto *FD = cast<FunctionDecl>(GD.
getDecl());
1246 auto *FT = FD->getPrimaryTemplate();
1247 if (!FT || FT->isThisDeclarationADefinition())
1248 CGM.maybeSetTrivialComdat(*FD, *Var);
1249 KernelHandles[F->getName()] = Var;
1250 KernelStubs[Var] = F;
static std::unique_ptr< MangleContext > InitDeviceMC(CodeGenModule &CGM)
static void replaceManagedVar(llvm::GlobalVariable *Var, llvm::GlobalVariable *ManagedVar)
MangleContext * createMangleContext(const TargetInfo *T=nullptr)
If T is null pointer, assume the target in ASTContext.
const TargetInfo * getAuxTargetInfo() const
MangleContext * createDeviceMangleContext(const TargetInfo &T)
Creates a device mangle context to correctly mangle lambdas in a mixed architecture compile by settin...
const TargetInfo & getTargetInfo() const
CharUnits - This is an opaque type for sizes expressed in character units.
static CharUnits One()
One - Construct a CharUnits quantity of one.
static CharUnits fromQuantity(QuantityType Quantity)
fromQuantity - Construct a CharUnits quantity from a raw integer type.
static CharUnits Zero()
Zero - Construct a CharUnits quantity of zero.
Like RawAddress, an abstract representation of an aligned address, but the pointer contained in this ...
llvm::Value * emitRawPointer(CodeGenFunction &CGF) const
Return the pointer contained in this class after authenticating it and adding offset to it if necessa...
llvm::PointerType * getType() const
Return the type of the pointer value.
llvm::StoreInst * CreateAlignedStore(llvm::Value *Val, llvm::Value *Addr, CharUnits Align, bool IsVolatile=false)
llvm::StoreInst * CreateDefaultAlignedStore(llvm::Value *Val, llvm::Value *Addr, bool IsVolatile=false)
llvm::LoadInst * CreateLoad(Address Addr, const llvm::Twine &Name="")
virtual std::string getDeviceSideName(const NamedDecl *ND)=0
Returns function or variable name on device side even if the current compilation is for host.
virtual void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args)=0
Emits a kernel launch stub.
virtual llvm::Function * getKernelStub(llvm::GlobalValue *Handle)=0
Get kernel stub by kernel handle.
virtual void handleVarRegistration(const VarDecl *VD, llvm::GlobalVariable &Var)=0
Check whether a variable is a device variable and register it if true.
virtual llvm::Function * finalizeModule()=0
Finalize generated LLVM module.
virtual llvm::GlobalValue * getKernelHandle(llvm::Function *Stub, GlobalDecl GD)=0
Get kernel handle by stub function.
virtual void internalizeDeviceSideVar(const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage)=0
Adjust linkage of shadow variables in host compilation.
static CGCallee forDirect(llvm::Constant *functionPtr, const CGCalleeInfo &abstractInfo=CGCalleeInfo())
CGFunctionInfo - Class to encapsulate the information about a function definition.
CallArgList - Type for representing both the value and type of arguments in a call.
void add(RValue rvalue, QualType type)
CodeGenFunction - This class organizes the per-function state that is used while generating LLVM code...
llvm::BasicBlock * createBasicBlock(const Twine &name="", llvm::Function *parent=nullptr, llvm::BasicBlock *before=nullptr)
createBasicBlock - Create an LLVM basic block.
const LangOptions & getLangOpts() const
void EmitBlock(llvm::BasicBlock *BB, bool IsFinished=false)
EmitBlock - Emit the given block.
llvm::AllocaInst * CreateTempAlloca(llvm::Type *Ty, const Twine &Name="tmp", llvm::Value *ArraySize=nullptr)
CreateTempAlloca - This creates an alloca and inserts it into the entry block if ArraySize is nullptr...
RValue EmitCall(const CGFunctionInfo &CallInfo, const CGCallee &Callee, ReturnValueSlot ReturnValue, const CallArgList &Args, llvm::CallBase **callOrInvoke, bool IsMustTail, SourceLocation Loc, bool IsVirtualFunctionPointerThunk=false)
EmitCall - Generate a call of the given function, expecting the given result type,...
RawAddress CreateMemTemp(QualType T, const Twine &Name="tmp", RawAddress *Alloca=nullptr)
CreateMemTemp - Create a temporary memory object of the given type, with appropriate alignment and cas...
void EmitBranch(llvm::BasicBlock *Block)
EmitBranch - Emit a branch to the specified basic block from the current insert block,...
const Decl * CurFuncDecl
CurFuncDecl - Holds the Decl for the current outermost non-closure context.
llvm::CallBase * EmitRuntimeCallOrInvoke(llvm::FunctionCallee callee, ArrayRef< llvm::Value * > args, const Twine &name="")
Address GetAddrOfLocalVar(const VarDecl *VD)
GetAddrOfLocalVar - Return the address of a local variable.
This class organizes the cross-function state that is used while generating LLVM code.
ASTContext & getContext() const
The standard implementation of ConstantInitBuilder used in Clang.
FunctionArgList - Type for representing both the decl and type of parameters to a function.
static RValue get(llvm::Value *V)
static RValue getAggregate(Address addr, bool isVolatile=false)
Convert an Address to an RValue.
ReturnValueSlot - Contains the address where the return value of a function can be stored,...
DeclContext - This is used only as base class of specific decl types that can act as declaration cont...
lookup_result lookup(DeclarationName Name) const
lookup - Find the declarations (if any) with the given Name in this context.
Decl - This represents one declaration (or definition), e.g.
SourceLocation getLocation() const
TranslationUnitDecl * getTranslationUnitDecl()
Represents a function declaration or definition.
const ParmVarDecl * getParamDecl(unsigned i) const
GlobalDecl - represents a global declaration.
GlobalDecl getWithKernelReferenceKind(KernelReferenceKind Kind)
const Decl * getDecl() const
One of these records is kept for each identifier that is lexed.
StringRef getName() const
Return the actual identifier string.
GPUDefaultStreamKind GPUDefaultStream
The default stream kind used for HIP kernel launching.
MangleContext - Context for tracking state which persists across multiple calls to the C++ name mangl...
bool shouldMangleDeclName(const NamedDecl *D)
void mangleName(GlobalDecl GD, raw_ostream &)
This represents a decl that may have a name.
IdentifierInfo * getIdentifier() const
Get the identifier that names this declaration, if there is one.
Represents a parameter to a function.
A (possibly-)qualified type.
QualType getCanonicalType() const
A helper class that allows the use of isa/cast/dyncast to detect TagType objects of structs/unions/cl...
RecordDecl * getDecl() const
bool isMicrosoft() const
Is this ABI an MSVC-compatible ABI?
bool isItaniumFamily() const
Does this ABI generally fall into the Itanium family of ABIs?
TargetCXXABI getCXXABI() const
Get the C++ ABI currently in use.
A template argument list.
unsigned size() const
Retrieve the number of template arguments in this template argument list.
The top declaration context.
static DeclContext * castToDeclContext(const TranslationUnitDecl *D)
The base class of the type hierarchy.
Represents a variable declaration or definition.
CGCUDARuntime * CreateNVCUDARuntime(CodeGenModule &CGM)
Creates an instance of a CUDA runtime class.
bool Zero(InterpState &S, CodePtr OpPC)
The JSON file list parser is used to communicate input to InstallAPI.
CudaVersion ToCudaVersion(llvm::VersionTuple)
bool CudaFeatureEnabled(llvm::VersionTuple, CudaFeature)
Linkage
Describes the different kinds of linkage (C++ [basic.link], C99 6.2.2) that an entity may have.
llvm::IntegerType * SizeTy
llvm::IntegerType * IntTy
int
llvm::PointerType * UnqualPtrTy