22#include "llvm/Frontend/Offloading/Utility.h"
23#include "llvm/IR/BasicBlock.h"
24#include "llvm/IR/Constants.h"
25#include "llvm/IR/DerivedTypes.h"
26#include "llvm/IR/ReplaceConstant.h"
27#include "llvm/Support/Format.h"
28#include "llvm/Support/VirtualFileSystem.h"
31using namespace CodeGen;
34constexpr unsigned CudaFatMagic = 0x466243b1;
35constexpr unsigned HIPFatMagic = 0x48495046;
40 llvm::IntegerType *IntTy, *SizeTy;
42 llvm::PointerType *PtrTy;
45 llvm::LLVMContext &Context;
47 llvm::Module &TheModule;
57 llvm::DenseMap<StringRef, llvm::GlobalValue *> KernelHandles;
59 llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
61 llvm::GlobalVariable *Var;
69 llvm::GlobalVariable *GpuBinaryHandle =
nullptr;
71 bool RelocatableDeviceCode;
73 std::unique_ptr<MangleContext> DeviceMC;
75 llvm::Constant *Zeros[2];
77 llvm::FunctionCallee getSetupArgumentFn()
const;
78 llvm::FunctionCallee getLaunchFn()
const;
80 llvm::FunctionType *getRegisterGlobalsFnTy()
const;
81 llvm::FunctionType *getCallbackFnTy()
const;
82 llvm::FunctionType *getRegisterLinkedBinaryFnTy()
const;
83 std::string addPrefixToName(StringRef FuncName)
const;
84 std::string addUnderscoredPrefixToName(StringRef FuncName)
const;
87 llvm::Function *makeRegisterGlobalsFn();
92 llvm::Constant *makeConstantString(
const std::string &Str,
93 const std::string &Name =
"") {
94 auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
95 return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
96 ConstStr.getPointer(), Zeros);
102 llvm::Constant *makeConstantArray(StringRef Str,
104 StringRef SectionName =
"",
105 unsigned Alignment = 0,
106 bool AddNull =
false) {
107 llvm::Constant *
Value =
108 llvm::ConstantDataArray::getString(Context, Str, AddNull);
109 auto *GV =
new llvm::GlobalVariable(
111 llvm::GlobalValue::PrivateLinkage,
Value, Name);
112 if (!SectionName.empty()) {
113 GV->setSection(SectionName);
116 GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
119 GV->setAlignment(llvm::Align(Alignment));
120 return llvm::ConstantExpr::getGetElementPtr(GV->getValueType(), GV, Zeros);
124 llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
125 assert(FnTy->getReturnType()->isVoidTy() &&
126 "Can only generate dummy functions returning void!");
127 llvm::Function *DummyFunc = llvm::Function::Create(
128 FnTy, llvm::GlobalValue::InternalLinkage,
"dummy", &TheModule);
130 llvm::BasicBlock *DummyBlock =
131 llvm::BasicBlock::Create(Context,
"", DummyFunc);
133 FuncBuilder.SetInsertPoint(DummyBlock);
134 FuncBuilder.CreateRetVoid();
143 void registerDeviceVar(
const VarDecl *VD, llvm::GlobalVariable &Var,
144 bool Extern,
bool Constant) {
145 DeviceVars.push_back({&Var,
148 VD->hasAttr<HIPManagedAttr>(),
151 void registerDeviceSurf(
const VarDecl *VD, llvm::GlobalVariable &Var,
152 bool Extern,
int Type) {
153 DeviceVars.push_back({&Var,
159 void registerDeviceTex(
const VarDecl *VD, llvm::GlobalVariable &Var,
160 bool Extern,
int Type,
bool Normalized) {
161 DeviceVars.push_back({&Var,
164 false, Normalized,
Type}});
168 llvm::Function *makeModuleCtorFunction();
170 llvm::Function *makeModuleDtorFunction();
172 void transformManagedVars();
174 void createOffloadingEntries();
180 llvm::Function *
getKernelStub(llvm::GlobalValue *Handle)
override {
181 auto Loc = KernelStubs.find(Handle);
182 assert(Loc != KernelStubs.end());
187 llvm::GlobalVariable &Var)
override;
190 llvm::GlobalValue::LinkageTypes &
Linkage)
override;
197std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName)
const {
198 if (CGM.getLangOpts().HIP)
199 return ((Twine(
"hip") + Twine(FuncName)).str());
200 return ((Twine(
"cuda") + Twine(FuncName)).str());
203CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName)
const {
204 if (CGM.getLangOpts().HIP)
205 return ((Twine(
"__hip") + Twine(FuncName)).str());
206 return ((Twine(
"__cuda") + Twine(FuncName)).str());
216 return std::unique_ptr<MangleContext>(
227 TheModule(CGM.getModule()),
228 RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
233 Zeros[0] = llvm::ConstantInt::get(SizeTy, 0);
238llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn()
const {
240 llvm::Type *Params[] = {PtrTy, SizeTy, SizeTy};
241 return CGM.CreateRuntimeFunction(
242 llvm::FunctionType::get(IntTy, Params,
false),
243 addPrefixToName(
"SetupArgument"));
246llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn()
const {
247 if (CGM.getLangOpts().HIP) {
249 return CGM.CreateRuntimeFunction(
250 llvm::FunctionType::get(IntTy, PtrTy,
false),
"hipLaunchByPtr");
253 return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy, PtrTy,
false),
257llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy()
const {
258 return llvm::FunctionType::get(VoidTy, PtrTy,
false);
261llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy()
const {
262 return llvm::FunctionType::get(VoidTy, PtrTy,
false);
265llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy()
const {
266 llvm::Type *Params[] = {llvm::PointerType::getUnqual(Context), PtrTy, PtrTy,
267 llvm::PointerType::getUnqual(Context)};
268 return llvm::FunctionType::get(VoidTy, Params,
false);
271std::string CGNVCUDARuntime::getDeviceSideName(
const NamedDecl *ND) {
274 if (
auto *FD = dyn_cast<FunctionDecl>(ND))
275 GD =
GlobalDecl(FD, KernelReferenceKind::Kernel);
278 std::string DeviceSideName;
280 if (CGM.getLangOpts().CUDAIsDevice)
281 MC = &CGM.getCXXABI().getMangleContext();
286 llvm::raw_svector_ostream Out(Buffer);
288 DeviceSideName = std::string(Out.str());
293 if (CGM.getContext().shouldExternalize(ND) &&
294 CGM.getLangOpts().GPURelocatableDeviceCode) {
296 llvm::raw_svector_ostream Out(Buffer);
297 Out << DeviceSideName;
298 CGM.printPostfixForExternalizedDecl(Out, ND);
299 DeviceSideName = std::string(Out.str());
301 return DeviceSideName;
308 dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.
CurFn->getName()])) {
309 GV->setLinkage(CGF.
CurFn->getLinkage());
310 GV->setInitializer(CGF.
CurFn);
313 CudaFeature::CUDA_USES_NEW_LAUNCH) ||
315 emitDeviceStubBodyNew(CGF, Args);
317 emitDeviceStubBodyLegacy(CGF, Args);
331 llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
333 for (
unsigned i = 0; i < Args.size(); ++i) {
335 llvm::Value *VoidVarPtr = CGF.
Builder.CreatePointerCast(VarPtr, PtrTy);
337 VoidVarPtr, CGF.
Builder.CreateConstGEP1_32(
355 std::string KernelLaunchAPI =
"LaunchKernel";
357 LangOptions::GPUDefaultStreamKind::PerThread) {
359 KernelLaunchAPI = KernelLaunchAPI +
"_spt";
361 KernelLaunchAPI = KernelLaunchAPI +
"_ptsz";
363 auto LaunchKernelName = addPrefixToName(KernelLaunchAPI);
365 CGM.getContext().Idents.get(LaunchKernelName);
367 for (
auto *Result : DC->
lookup(&cudaLaunchKernelII)) {
369 cudaLaunchKernelFD = FD;
372 if (cudaLaunchKernelFD ==
nullptr) {
374 "Can't find declaration for " + LaunchKernelName);
387 llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
388 llvm::FunctionType::get(IntTy,
394 addUnderscoredPrefixToName(
"PopCallConfiguration"));
403 CGF.
Builder.CreatePointerCast(KernelHandles[CGF.
CurFn->getName()], PtrTy);
418 llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
419 llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);
422 CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
423 llvm::FunctionCallee cudaLaunchKernelFn =
424 CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
431 if (CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
433 llvm::Function *KernelFunction = llvm::cast<llvm::Function>(
Kernel);
434 std::string GlobalVarName = (KernelFunction->getName() +
".id").str();
436 llvm::GlobalVariable *HandleVar =
437 CGM.getModule().getNamedGlobal(GlobalVarName);
439 HandleVar =
new llvm::GlobalVariable(
440 CGM.getModule(), CGM.Int8Ty,
441 false, KernelFunction->getLinkage(),
442 llvm::ConstantInt::get(CGM.Int8Ty, 0), GlobalVarName);
443 HandleVar->setDSOLocal(KernelFunction->isDSOLocal());
444 HandleVar->setVisibility(KernelFunction->getVisibility());
445 if (KernelFunction->hasComdat())
446 HandleVar->setComdat(CGM.getModule().getOrInsertComdat(GlobalVarName));
462 llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
465 for (
const VarDecl *A : Args) {
466 auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
467 Offset = Offset.alignTo(TInfo.Align);
468 llvm::Value *Args[] = {
471 llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
472 llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
475 llvm::Constant *
Zero = llvm::ConstantInt::get(IntTy, 0);
476 llvm::Value *CBZero = CGF.
Builder.CreateICmpEQ(CB, Zero);
478 CGF.
Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
480 Offset += TInfo.Width;
484 llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
486 CGF.
Builder.CreatePointerCast(KernelHandles[CGF.
CurFn->getName()], PtrTy);
496 llvm::GlobalVariable *ManagedVar) {
498 for (
auto &&VarUse : Var->uses()) {
499 WorkList.push_back({VarUse.getUser()});
501 while (!WorkList.empty()) {
502 auto &&WorkItem = WorkList.pop_back_val();
503 auto *
U = WorkItem.back();
504 if (isa<llvm::ConstantExpr>(
U)) {
505 for (
auto &&UU :
U->uses()) {
506 WorkItem.push_back(UU.getUser());
507 WorkList.push_back(WorkItem);
512 if (
auto *I = dyn_cast<llvm::Instruction>(
U)) {
513 llvm::Value *OldV = Var;
514 llvm::Instruction *NewV =
515 new llvm::LoadInst(Var->getType(), ManagedVar,
"ld.managed",
false,
516 llvm::Align(Var->getAlignment()), I);
520 for (
auto &&Op : WorkItem) {
521 auto *CE = cast<llvm::ConstantExpr>(Op);
522 auto *NewInst = CE->getAsInstruction();
523 NewInst->insertBefore(*I->getParent(), I->getIterator());
524 NewInst->replaceUsesOfWith(OldV, NewV);
528 I->replaceUsesOfWith(OldV, NewV);
530 llvm_unreachable(
"Invalid use of managed variable");
549llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
551 if (EmittedKernels.empty() && DeviceVars.empty())
554 llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
555 getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
556 addUnderscoredPrefixToName(
"_register_globals"), &TheModule);
557 llvm::BasicBlock *EntryBB =
558 llvm::BasicBlock::Create(Context,
"entry", RegisterKernelsFunc);
560 Builder.SetInsertPoint(EntryBB);
564 llvm::Type *RegisterFuncParams[] = {
565 PtrTy, PtrTy, PtrTy, PtrTy, IntTy,
566 PtrTy, PtrTy, PtrTy, PtrTy, llvm::PointerType::getUnqual(Context)};
567 llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
568 llvm::FunctionType::get(IntTy, RegisterFuncParams,
false),
569 addUnderscoredPrefixToName(
"RegisterFunction"));
574 llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
575 for (
auto &&I : EmittedKernels) {
576 llvm::Constant *KernelName =
577 makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
578 llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(PtrTy);
579 llvm::Value *Args[] = {
581 KernelHandles[I.Kernel->getName()],
584 llvm::ConstantInt::get(IntTy, -1),
589 llvm::ConstantPointerNull::get(llvm::PointerType::getUnqual(Context))};
590 Builder.CreateCall(RegisterFunc, Args);
593 llvm::Type *VarSizeTy = IntTy;
595 if (CGM.getLangOpts().HIP ||
596 ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
601 llvm::Type *RegisterVarParams[] = {PtrTy, PtrTy, PtrTy, PtrTy,
602 IntTy, VarSizeTy, IntTy, IntTy};
603 llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
604 llvm::FunctionType::get(VoidTy, RegisterVarParams,
false),
605 addUnderscoredPrefixToName(
"RegisterVar"));
608 llvm::Type *RegisterManagedVarParams[] = {PtrTy, PtrTy, PtrTy,
609 PtrTy, VarSizeTy, IntTy};
610 llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
611 llvm::FunctionType::get(VoidTy, RegisterManagedVarParams,
false),
612 addUnderscoredPrefixToName(
"RegisterManagedVar"));
615 llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
616 llvm::FunctionType::get(
617 VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy},
false),
618 addUnderscoredPrefixToName(
"RegisterSurface"));
621 llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
622 llvm::FunctionType::get(
623 VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy, IntTy},
false),
624 addUnderscoredPrefixToName(
"RegisterTexture"));
625 for (
auto &&Info : DeviceVars) {
626 llvm::GlobalVariable *Var = Info.Var;
627 assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
628 "External variables should not show up here, except HIP managed "
630 llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
631 switch (Info.Flags.getKind()) {
632 case DeviceVarFlags::Variable: {
634 CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
635 if (Info.Flags.isManaged()) {
636 assert(Var->getName().ends_with(
".managed") &&
637 "HIP managed variables not transformed");
638 auto *ManagedVar = CGM.getModule().getNamedGlobal(
639 Var->getName().drop_back(StringRef(
".managed").size()));
640 llvm::Value *Args[] = {
645 llvm::ConstantInt::get(VarSizeTy, VarSize),
646 llvm::ConstantInt::get(IntTy, Var->getAlignment())};
647 if (!Var->isDeclaration())
648 Builder.CreateCall(RegisterManagedVar, Args);
650 llvm::Value *Args[] = {
655 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
656 llvm::ConstantInt::get(VarSizeTy, VarSize),
657 llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
658 llvm::ConstantInt::get(IntTy, 0)};
659 Builder.CreateCall(RegisterVar, Args);
663 case DeviceVarFlags::Surface:
666 {&GpuBinaryHandlePtr, Var, VarName, VarName,
667 llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
668 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
670 case DeviceVarFlags::Texture:
673 {&GpuBinaryHandlePtr, Var, VarName, VarName,
674 llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
675 llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
676 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
681 Builder.CreateRetVoid();
682 return RegisterKernelsFunc;
704llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
705 bool IsHIP = CGM.getLangOpts().HIP;
706 bool IsCUDA = CGM.getLangOpts().CUDA;
708 StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
709 if (CudaGpuBinaryFileName.empty() && !IsHIP)
711 if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
716 llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
719 if (RelocatableDeviceCode && !RegisterGlobalsFunc)
720 RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());
723 llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
724 llvm::FunctionType::get(PtrTy, PtrTy,
false),
725 addUnderscoredPrefixToName(
"RegisterFatBinary"));
727 llvm::StructType *FatbinWrapperTy =
728 llvm::StructType::get(IntTy, IntTy, PtrTy, PtrTy);
734 std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary =
nullptr;
735 if (!CudaGpuBinaryFileName.empty()) {
736 auto VFS = CGM.getFileSystem();
737 auto CudaGpuBinaryOrErr =
738 VFS->getBufferForFile(CudaGpuBinaryFileName, -1,
false);
739 if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
740 CGM.getDiags().Report(diag::err_cannot_open_file)
741 << CudaGpuBinaryFileName << EC.message();
744 CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
747 llvm::Function *ModuleCtorFunc = llvm::Function::Create(
748 llvm::FunctionType::get(VoidTy,
false),
749 llvm::GlobalValue::InternalLinkage,
750 addUnderscoredPrefixToName(
"_module_ctor"), &TheModule);
751 llvm::BasicBlock *CtorEntryBB =
752 llvm::BasicBlock::Create(Context,
"entry", ModuleCtorFunc);
755 CtorBuilder.SetInsertPoint(CtorEntryBB);
757 const char *FatbinConstantName;
758 const char *FatbinSectionName;
759 const char *ModuleIDSectionName;
760 StringRef ModuleIDPrefix;
761 llvm::Constant *FatBinStr;
764 FatbinConstantName =
".hip_fatbin";
765 FatbinSectionName =
".hipFatBinSegment";
767 ModuleIDSectionName =
"__hip_module_id";
768 ModuleIDPrefix =
"__hip_";
773 const unsigned HIPCodeObjectAlign = 4096;
774 FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()),
"",
775 FatbinConstantName, HIPCodeObjectAlign);
781 FatBinStr =
new llvm::GlobalVariable(
782 CGM.getModule(), CGM.Int8Ty,
783 true, llvm::GlobalValue::ExternalLinkage,
nullptr,
784 "__hip_fatbin_" + CGM.getContext().getCUIDHash(),
nullptr,
785 llvm::GlobalVariable::NotThreadLocal);
786 cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
789 FatMagic = HIPFatMagic;
791 if (RelocatableDeviceCode)
792 FatbinConstantName = CGM.getTriple().isMacOSX()
793 ?
"__NV_CUDA,__nv_relfatbin"
797 CGM.getTriple().isMacOSX() ?
"__NV_CUDA,__nv_fatbin" :
".nv_fatbin";
800 CGM.getTriple().isMacOSX() ?
"__NV_CUDA,__fatbin" :
".nvFatBinSegment";
802 ModuleIDSectionName = CGM.getTriple().isMacOSX()
803 ?
"__NV_CUDA,__nv_module_id"
805 ModuleIDPrefix =
"__nv_";
809 FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()),
"",
810 FatbinConstantName, 8);
811 FatMagic = CudaFatMagic;
816 auto Values = Builder.beginStruct(FatbinWrapperTy);
818 Values.addInt(IntTy, FatMagic);
820 Values.addInt(IntTy, 1);
822 Values.add(FatBinStr);
824 Values.add(llvm::ConstantPointerNull::get(PtrTy));
825 llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
826 addUnderscoredPrefixToName(
"_fatbin_wrapper"), CGM.getPointerAlign(),
828 FatbinWrapper->setSection(FatbinSectionName);
838 auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage
839 : llvm::GlobalValue::ExternalLinkage;
840 llvm::BasicBlock *IfBlock =
841 llvm::BasicBlock::Create(Context,
"if", ModuleCtorFunc);
842 llvm::BasicBlock *ExitBlock =
843 llvm::BasicBlock::Create(Context,
"exit", ModuleCtorFunc);
846 GpuBinaryHandle =
new llvm::GlobalVariable(
847 TheModule, PtrTy,
false,
Linkage,
849 CudaGpuBinary ? llvm::ConstantPointerNull::get(PtrTy) :
nullptr,
851 ?
"__hip_gpubin_handle"
852 :
"__hip_gpubin_handle_" + CGM.getContext().getCUIDHash());
853 GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
855 if (
Linkage != llvm::GlobalValue::InternalLinkage)
856 GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
858 GpuBinaryHandle, PtrTy,
861 auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
862 llvm::Constant *
Zero =
863 llvm::Constant::getNullValue(HandleValue->getType());
864 llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
865 CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
868 CtorBuilder.SetInsertPoint(IfBlock);
870 llvm::CallInst *RegisterFatbinCall =
871 CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
872 CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
873 CtorBuilder.CreateBr(ExitBlock);
876 CtorBuilder.SetInsertPoint(ExitBlock);
878 if (RegisterGlobalsFunc) {
879 auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
880 CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
883 }
else if (!RelocatableDeviceCode) {
887 llvm::CallInst *RegisterFatbinCall =
888 CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
889 GpuBinaryHandle =
new llvm::GlobalVariable(
890 TheModule, PtrTy,
false, llvm::GlobalValue::InternalLinkage,
891 llvm::ConstantPointerNull::get(PtrTy),
"__cuda_gpubin_handle");
892 GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
893 CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
894 CGM.getPointerAlign());
897 if (RegisterGlobalsFunc)
898 CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
902 CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
904 llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
905 llvm::FunctionType::get(VoidTy, PtrTy,
false),
906 "__cudaRegisterFatBinaryEnd");
907 CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
912 llvm::raw_svector_ostream OS(ModuleID);
913 OS << ModuleIDPrefix << llvm::format(
"%" PRIx64, FatbinWrapper->getGUID());
914 llvm::Constant *ModuleIDConstant = makeConstantArray(
915 std::string(ModuleID),
"", ModuleIDSectionName, 32,
true);
918 llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
919 Twine(
"__fatbinwrap") + ModuleID, FatbinWrapper);
924 RegisterLinkedBinaryName += ModuleID;
925 llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
926 getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);
928 assert(RegisterGlobalsFunc &&
"Expecting at least dummy function!");
929 llvm::Value *Args[] = {RegisterGlobalsFunc, FatbinWrapper, ModuleIDConstant,
930 makeDummyFunction(getCallbackFnTy())};
931 CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
937 if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
939 llvm::FunctionType *AtExitTy =
940 llvm::FunctionType::get(IntTy, CleanupFn->getType(),
false);
941 llvm::FunctionCallee AtExitFunc =
942 CGM.CreateRuntimeFunction(AtExitTy,
"atexit", llvm::AttributeList(),
944 CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
947 CtorBuilder.CreateRetVoid();
948 return ModuleCtorFunc;
970llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
972 if (!GpuBinaryHandle)
976 llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
977 llvm::FunctionType::get(VoidTy, PtrTy,
false),
978 addUnderscoredPrefixToName(
"UnregisterFatBinary"));
980 llvm::Function *ModuleDtorFunc = llvm::Function::Create(
981 llvm::FunctionType::get(VoidTy,
false),
982 llvm::GlobalValue::InternalLinkage,
983 addUnderscoredPrefixToName(
"_module_dtor"), &TheModule);
985 llvm::BasicBlock *DtorEntryBB =
986 llvm::BasicBlock::Create(Context,
"entry", ModuleDtorFunc);
988 DtorBuilder.SetInsertPoint(DtorEntryBB);
991 GpuBinaryHandle, GpuBinaryHandle->getValueType(),
993 auto *HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
997 if (CGM.getLangOpts().HIP) {
998 llvm::BasicBlock *IfBlock =
999 llvm::BasicBlock::Create(Context,
"if", ModuleDtorFunc);
1000 llvm::BasicBlock *ExitBlock =
1001 llvm::BasicBlock::Create(Context,
"exit", ModuleDtorFunc);
1002 llvm::Constant *
Zero = llvm::Constant::getNullValue(HandleValue->getType());
1003 llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
1004 DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);
1006 DtorBuilder.SetInsertPoint(IfBlock);
1007 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
1008 DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
1009 DtorBuilder.CreateBr(ExitBlock);
1011 DtorBuilder.SetInsertPoint(ExitBlock);
1013 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
1015 DtorBuilder.CreateRetVoid();
1016 return ModuleDtorFunc;
1020 return new CGNVCUDARuntime(CGM);
1023void CGNVCUDARuntime::internalizeDeviceSideVar(
1032 if (CGM.getLangOpts().GPURelocatableDeviceCode)
1040 if (D->
hasAttr<CUDADeviceAttr>() || D->
hasAttr<CUDAConstantAttr>() ||
1041 D->
hasAttr<CUDASharedAttr>() ||
1044 Linkage = llvm::GlobalValue::InternalLinkage;
1048void CGNVCUDARuntime::handleVarRegistration(
const VarDecl *D,
1049 llvm::GlobalVariable &GV) {
1050 if (D->
hasAttr<CUDADeviceAttr>() || D->
hasAttr<CUDAConstantAttr>()) {
1065 CGM.getContext().CUDADeviceVarODRUsedByHost.contains(D) ||
1066 D->
hasAttr<HIPManagedAttr>()) {
1068 D->
hasAttr<CUDAConstantAttr>());
1074 const auto *TD = cast<ClassTemplateSpecializationDecl>(
1077 if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
1078 assert(Args.
size() == 2 &&
1079 "Unexpected number of template arguments of CUDA device "
1080 "builtin surface type.");
1081 auto SurfType = Args[1].getAsIntegral();
1083 registerDeviceSurf(D, GV, !D->
hasDefinition(), SurfType.getSExtValue());
1085 assert(Args.
size() == 3 &&
1086 "Unexpected number of template arguments of CUDA device "
1087 "builtin texture type.");
1088 auto TexType = Args[1].getAsIntegral();
1089 auto Normalized = Args[2].getAsIntegral();
1091 registerDeviceTex(D, GV, !D->
hasDefinition(), TexType.getSExtValue(),
1092 Normalized.getZExtValue());
1101void CGNVCUDARuntime::transformManagedVars() {
1102 for (
auto &&Info : DeviceVars) {
1103 llvm::GlobalVariable *Var = Info.Var;
1104 if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
1105 Info.Flags.isManaged()) {
1106 auto *ManagedVar =
new llvm::GlobalVariable(
1107 CGM.getModule(), Var->getType(),
1108 false, Var->getLinkage(),
1109 Var->isDeclaration()
1111 : llvm::ConstantPointerNull::get(Var->getType()),
1113 llvm::GlobalVariable::NotThreadLocal,
1114 CGM.getContext().getTargetAddressSpace(CGM.getLangOpts().CUDAIsDevice
1115 ? LangAS::cuda_device
1116 : LangAS::Default));
1117 ManagedVar->setDSOLocal(Var->isDSOLocal());
1118 ManagedVar->setVisibility(Var->getVisibility());
1119 ManagedVar->setExternallyInitialized(
true);
1121 ManagedVar->takeName(Var);
1122 Var->setName(Twine(ManagedVar->getName()) +
".managed");
1125 if (CGM.getLangOpts().CUDAIsDevice && !Var->isDeclaration()) {
1126 assert(!ManagedVar->isDeclaration());
1127 CGM.addCompilerUsedGlobal(Var);
1128 CGM.addCompilerUsedGlobal(ManagedVar);
1137void CGNVCUDARuntime::createOffloadingEntries() {
1138 StringRef Section = CGM.getLangOpts().HIP ?
"hip_offloading_entries"
1139 :
"cuda_offloading_entries";
1140 llvm::Module &M = CGM.getModule();
1141 for (KernelInfo &I : EmittedKernels)
1142 llvm::offloading::emitOffloadingEntry(
1143 M, KernelHandles[I.Kernel->getName()],
1144 getDeviceSideName(cast<NamedDecl>(I.D)), 0, 0,
1145 llvm::offloading::OffloadGlobalEntry, Section);
1147 for (VarInfo &I : DeviceVars) {
1149 CGM.getDataLayout().getTypeAllocSize(I.Var->getValueType());
1152 ?
static_cast<int32_t
>(llvm::offloading::OffloadGlobalExtern)
1154 (I.Flags.isConstant()
1155 ?
static_cast<int32_t
>(llvm::offloading::OffloadGlobalConstant)
1157 (I.Flags.isNormalized()
1158 ?
static_cast<int32_t
>(llvm::offloading::OffloadGlobalNormalized)
1160 if (I.Flags.getKind() == DeviceVarFlags::Variable) {
1161 llvm::offloading::emitOffloadingEntry(
1162 M, I.Var, getDeviceSideName(I.D), VarSize,
1163 (I.Flags.isManaged() ? llvm::offloading::OffloadGlobalManagedEntry
1164 : llvm::offloading::OffloadGlobalEntry) |
1167 }
else if (I.Flags.getKind() == DeviceVarFlags::Surface) {
1168 llvm::offloading::emitOffloadingEntry(
1169 M, I.Var, getDeviceSideName(I.D), VarSize,
1170 llvm::offloading::OffloadGlobalSurfaceEntry | Flags,
1171 I.Flags.getSurfTexType(), Section);
1172 }
else if (I.Flags.getKind() == DeviceVarFlags::Texture) {
1173 llvm::offloading::emitOffloadingEntry(
1174 M, I.Var, getDeviceSideName(I.D), VarSize,
1175 llvm::offloading::OffloadGlobalTextureEntry | Flags,
1176 I.Flags.getSurfTexType(), Section);
1182llvm::Function *CGNVCUDARuntime::finalizeModule() {
1183 transformManagedVars();
1184 if (CGM.getLangOpts().CUDAIsDevice) {
1195 for (
auto &&Info : DeviceVars) {
1196 auto Kind = Info.Flags.getKind();
1197 if (!Info.Var->isDeclaration() &&
1198 !llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) &&
1199 (Kind == DeviceVarFlags::Variable ||
1200 Kind == DeviceVarFlags::Surface ||
1201 Kind == DeviceVarFlags::Texture) &&
1202 Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) {
1203 CGM.addCompilerUsedGlobal(Info.Var);
1208 if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
1209 createOffloadingEntries();
1211 return makeModuleCtorFunction();
1216llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
1218 auto Loc = KernelHandles.find(F->getName());
1219 if (Loc != KernelHandles.end()) {
1220 auto OldHandle = Loc->second;
1221 if (KernelStubs[OldHandle] == F)
1226 if (CGM.getLangOpts().HIP) {
1229 KernelStubs[OldHandle] = F;
1234 KernelStubs.erase(OldHandle);
1237 if (!CGM.getLangOpts().HIP) {
1238 KernelHandles[F->getName()] = F;
1243 auto *Var =
new llvm::GlobalVariable(
1244 TheModule, F->getType(),
true, F->getLinkage(),
1248 Var->setAlignment(CGM.getPointerAlign().getAsAlign());
1249 Var->setDSOLocal(F->isDSOLocal());
1250 Var->setVisibility(F->getVisibility());
1251 auto *FD = cast<FunctionDecl>(GD.
getDecl());
1252 auto *FT = FD->getPrimaryTemplate();
1253 if (!FT || FT->isThisDeclarationADefinition())
1254 CGM.maybeSetTrivialComdat(*FD, *Var);
1255 KernelHandles[F->getName()] = Var;
1256 KernelStubs[Var] = F;
static std::unique_ptr< MangleContext > InitDeviceMC(CodeGenModule &CGM)
static void replaceManagedVar(llvm::GlobalVariable *Var, llvm::GlobalVariable *ManagedVar)
MangleContext * createMangleContext(const TargetInfo *T=nullptr)
If T is null pointer, assume the target in ASTContext.
const TargetInfo * getAuxTargetInfo() const
MangleContext * createDeviceMangleContext(const TargetInfo &T)
Creates a device mangle context to correctly mangle lambdas in a mixed architecture compile by settin...
const TargetInfo & getTargetInfo() const
CharUnits - This is an opaque type for sizes expressed in character units.
static CharUnits One()
One - Construct a CharUnits quantity of one.
static CharUnits fromQuantity(QuantityType Quantity)
fromQuantity - Construct a CharUnits quantity from a raw integer type.
static CharUnits Zero()
Zero - Construct a CharUnits quantity of zero.
Like RawAddress, an abstract representation of an aligned address, but the pointer contained in this ...
llvm::Value * emitRawPointer(CodeGenFunction &CGF) const
Return the pointer contained in this class after authenticating it and adding offset to it if necessa...
llvm::PointerType * getType() const
Return the type of the pointer value.
llvm::StoreInst * CreateAlignedStore(llvm::Value *Val, llvm::Value *Addr, CharUnits Align, bool IsVolatile=false)
llvm::StoreInst * CreateDefaultAlignedStore(llvm::Value *Val, llvm::Value *Addr, bool IsVolatile=false)
llvm::LoadInst * CreateLoad(Address Addr, const llvm::Twine &Name="")
virtual std::string getDeviceSideName(const NamedDecl *ND)=0
Returns function or variable name on device side even if the current compilation is for host.
virtual void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args)=0
Emits a kernel launch stub.
virtual llvm::Function * getKernelStub(llvm::GlobalValue *Handle)=0
Get kernel stub by kernel handle.
virtual void handleVarRegistration(const VarDecl *VD, llvm::GlobalVariable &Var)=0
Check whether a variable is a device variable and register it if true.
virtual llvm::Function * finalizeModule()=0
Finalize generated LLVM module.
virtual llvm::GlobalValue * getKernelHandle(llvm::Function *Stub, GlobalDecl GD)=0
Get kernel handle by stub function.
virtual void internalizeDeviceSideVar(const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage)=0
Adjust linkage of shadow variables in host compilation.
static CGCallee forDirect(llvm::Constant *functionPtr, const CGCalleeInfo &abstractInfo=CGCalleeInfo())
CGFunctionInfo - Class to encapsulate the information about a function definition.
CallArgList - Type for representing both the value and type of arguments in a call.
void add(RValue rvalue, QualType type)
CodeGenFunction - This class organizes the per-function state that is used while generating LLVM code...
llvm::BasicBlock * createBasicBlock(const Twine &name="", llvm::Function *parent=nullptr, llvm::BasicBlock *before=nullptr)
createBasicBlock - Create an LLVM basic block.
const LangOptions & getLangOpts() const
void EmitBlock(llvm::BasicBlock *BB, bool IsFinished=false)
EmitBlock - Emit the given block.
llvm::AllocaInst * CreateTempAlloca(llvm::Type *Ty, const Twine &Name="tmp", llvm::Value *ArraySize=nullptr)
CreateTempAlloca - This creates an alloca and inserts it into the entry block if ArraySize is nullptr...
RValue EmitCall(const CGFunctionInfo &CallInfo, const CGCallee &Callee, ReturnValueSlot ReturnValue, const CallArgList &Args, llvm::CallBase **callOrInvoke, bool IsMustTail, SourceLocation Loc)
EmitCall - Generate a call of the given function, expecting the given result type,...
RawAddress CreateMemTemp(QualType T, const Twine &Name="tmp", RawAddress *Alloca=nullptr)
CreateMemTemp - Create a temporary memory object of the given type, with appropriate alignmen and cas...
void EmitBranch(llvm::BasicBlock *Block)
EmitBranch - Emit a branch to the specified basic block from the current insert block,...
const Decl * CurFuncDecl
CurFuncDecl - Holds the Decl for the current outermost non-closure context.
llvm::CallBase * EmitRuntimeCallOrInvoke(llvm::FunctionCallee callee, ArrayRef< llvm::Value * > args, const Twine &name="")
Address GetAddrOfLocalVar(const VarDecl *VD)
GetAddrOfLocalVar - Return the address of a local variable.
This class organizes the cross-function state that is used while generating LLVM code.
ASTContext & getContext() const
The standard implementation of ConstantInitBuilder used in Clang.
FunctionArgList - Type for representing both the decl and type of parameters to a function.
static RValue get(llvm::Value *V)
static RValue getAggregate(Address addr, bool isVolatile=false)
Convert an Address to an RValue.
ReturnValueSlot - Contains the address where the return value of a function can be stored,...
DeclContext - This is used only as base class of specific decl types that can act as declaration cont...
lookup_result lookup(DeclarationName Name) const
lookup - Find the declarations (if any) with the given Name in this context.
Decl - This represents one declaration (or definition), e.g.
SourceLocation getLocation() const
TranslationUnitDecl * getTranslationUnitDecl()
Represents a function declaration or definition.
const ParmVarDecl * getParamDecl(unsigned i) const
GlobalDecl - represents a global declaration.
GlobalDecl getWithKernelReferenceKind(KernelReferenceKind Kind)
const Decl * getDecl() const
One of these records is kept for each identifier that is lexed.
StringRef getName() const
Return the actual identifier string.
GPUDefaultStreamKind GPUDefaultStream
The default stream kind used for HIP kernel launching.
MangleContext - Context for tracking state which persists across multiple calls to the C++ name mangl...
bool shouldMangleDeclName(const NamedDecl *D)
void mangleName(GlobalDecl GD, raw_ostream &)
This represents a decl that may have a name.
IdentifierInfo * getIdentifier() const
Get the identifier that names this declaration, if there is one.
Represents a parameter to a function.
A (possibly-)qualified type.
QualType getCanonicalType() const
A helper class that allows the use of isa/cast/dyncast to detect TagType objects of structs/unions/cl...
RecordDecl * getDecl() const
bool isMicrosoft() const
Is this ABI an MSVC-compatible ABI?
bool isItaniumFamily() const
Does this ABI generally fall into the Itanium family of ABIs?
TargetCXXABI getCXXABI() const
Get the C++ ABI currently in use.
A template argument list.
unsigned size() const
Retrieve the number of template arguments in this template argument list.
The top declaration context.
static DeclContext * castToDeclContext(const TranslationUnitDecl *D)
The base class of the type hierarchy.
const T * castAs() const
Member-template castAs<specific type>.
bool isCUDADeviceBuiltinSurfaceType() const
Check if the type is the CUDA device builtin surface type.
bool isCUDADeviceBuiltinTextureType() const
Check if the type is the CUDA device builtin texture type.
Represents a variable declaration or definition.
bool isInline() const
Whether this variable is (C++1z) inline.
bool hasExternalStorage() const
Returns true if a variable has extern or private_extern storage.
DefinitionKind hasDefinition(ASTContext &) const
Check whether this variable is defined in this translation unit.
CGCUDARuntime * CreateNVCUDARuntime(CodeGenModule &CGM)
Creates an instance of a CUDA runtime class.
bool Zero(InterpState &S, CodePtr OpPC)
The JSON file list parser is used to communicate input to InstallAPI.
CudaVersion ToCudaVersion(llvm::VersionTuple)
bool CudaFeatureEnabled(llvm::VersionTuple, CudaFeature)
Linkage
Describes the different kinds of linkage (C++ [basic.link], C99 6.2.2) that an entity may have.
llvm::IntegerType * SizeTy
llvm::IntegerType * IntTy
int
llvm::PointerType * UnqualPtrTy